# 📚 Libraries

In [1]:
!pip install ftfy --quiet

In [3]:
!pip install -U sentence-transformers --quiet

In [2]:
import re
from statistics import StatisticsError
from statistics import mode

import ftfy
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util

2024-07-30 07:53:44.659788: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 07:53:44.659911: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 07:53:44.783642: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
import warnings

# Silences every warning category for the rest of the session, so deprecation
# and pandas warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

# ⚙️ Data Processing

In [7]:
# Both competition files are semicolon-separated CSVs.
train = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/dataset_penyisihan_bdc_2024(in).csv',
    sep=';',
)
test = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/dataset_unlabeled_penyisihan_bdc_2024(in).csv',
    sep=';',
)

In [8]:
def bersihkan_data_duplikat(df, kolom_teks='text', kolom_label='label',
                            pertahankan_mayoritas=False):
    """Remove rows whose text occurs more than once, and rows without a label.

    The previous implementation computed the most frequent label of every
    duplicated text but assigned it to a frame that only held non-duplicated
    texts, so the result never contained a duplicated text at all.  That
    output is kept as the default here because artefacts computed from it
    (for example embeddings cached per row position) depend on the exact set
    and order of rows.  The majority-vote behaviour is available on request.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    kolom_teks : str
        Name of the column holding the text used to detect duplicates.
    kolom_label : str
        Name of the column holding the class label.
    pertahankan_mayoritas : bool, default False
        If False, every text that appears more than once is dropped entirely.
        If True, the first occurrence of each duplicated text is kept and its
        label is replaced by the most frequent label among its duplicates
        (ties resolve to the smallest label in sort order).

    Returns
    -------
    pandas.DataFrame
        Rows with a non-null label, original index labels preserved, sorted by
        index, with ``kolom_teks`` as the first column.
    """
    terduplikasi = df[kolom_teks].duplicated(keep=False)
    hasil = df[~terduplikasi]

    # Guarding on .any() keeps the function usable on frames without duplicates.
    if pertahankan_mayoritas and terduplikasi.any():
        duplikat = df[terduplikasi]
        label_terbanyak = duplikat.groupby(kolom_teks)[kolom_label].agg(
            lambda s: s.mode().iloc[0] if s.notna().any() else None
        )
        # One representative row per duplicated text, relabelled by majority.
        wakil = duplikat.drop_duplicates(subset=kolom_teks, keep='first').copy()
        wakil[kolom_label] = wakil[kolom_teks].map(label_terbanyak)
        hasil = pd.concat([hasil, wakil]).sort_index()

    # Text column first, remaining columns in their original order.
    urutan_kolom = [kolom_teks] + [k for k in df.columns if k != kolom_teks]
    return hasil.loc[hasil[kolom_label].notna(), urutan_kolom].copy()

In [9]:
# Rebinds `train` to the frame returned by the helper; re-running this cell
# therefore operates on the already-filtered frame, not on the raw CSV data.
train = bersihkan_data_duplikat(train)

In [10]:
def clean_tweet(tweet):
    """Normalise one tweet for embedding.

    Steps, in order: repair mis-encoded text with ``ftfy.fix_text``, turn
    newlines into spaces, collapse every run of whitespace into a single
    space, trim both ends, and lowercase.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased tweet.
    """
    repaired = ftfy.fix_text(tweet)
    single_line = repaired.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip().lower().strip()

In [11]:
# Clean every training tweet element-wise.
train['text'] = train['text'].map(clean_tweet)

In [13]:
# Add the cleaned tweets as a lowercase `text` column (the name used by the
# training frame) and remove the raw `Text` column in the same step.
test = test.assign(text=test['Text'].apply(clean_tweet)).drop(columns=['Text'])

In [12]:
# Rows whose cleaned text is the empty string carry nothing to embed.
empty_text_mask = train['text'].str.len().eq(0)
drop_index = train.index[empty_text_mask]
# Resetting the index gives `train` a 0..n-1 index; the prediction loop uses
# that index to look up rows of `train_embeddings`.
train = train.loc[~empty_text_mask].reset_index(drop=True)

In [None]:
# ===========================================================================================================================

# train = train.iloc[:10]

# test = test.iloc[:5]

# 🤖 Prepare Model

In [14]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Parameters
    ----------
    model_output : sequence
        Its first element is used as the token-embedding tensor.
    attention_mask : torch.Tensor
        Mask broadcast over the last dimension of the token embeddings;
        positions with 0 do not contribute to the average.

    Returns
    -------
    torch.Tensor
        Masked sum of the token embeddings divided by the mask total, with the
        divisor clamped to at least 1e-9 so an all-zero mask cannot divide by
        zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total

In [15]:
# The tokenizer and the encoder weights are loaded from two different checkpoints.
TOKENIZER_CHECKPOINT = 'indolem/indobertweet-base-uncased'
MODEL_CHECKPOINT = 'Amadeus99/indonesia-election-topic-classification'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [16]:
def get_embeddings(sentences):
    """Encode sentences into L2-normalised, mean-pooled embeddings.

    Uses the module-level ``tokenizer`` and ``model``.  Inputs are padded to
    the longest sentence in the batch and truncated to 512 tokens; the forward
    pass runs without gradient tracking.

    Parameters
    ----------
    sentences : list of str
        Sentences to encode as one batch.

    Returns
    -------
    torch.Tensor
        Mean-pooled model outputs, normalised to unit L2 norm along dim 1.
    """
    batch = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model(**batch)
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return torch.nn.functional.normalize(pooled, p=2, dim=1)

# 🔮 Prediction

In [None]:
# train_sentences = train['text'].tolist()

# batch_size = 4
# train_embeddings = []

# for i in tqdm(range(0, len(train_sentences), batch_size)):
#     batch_sentences = train_sentences[i:i+batch_size]
#     batch_embeddings = get_embeddings(batch_sentences)
#     train_embeddings.append(batch_embeddings)

# train_embeddings = torch.cat(train_embeddings)

In [17]:
# Precomputed embeddings for the training tweets.  The prediction loop indexes
# this object with the row index of `train`, so entry i must belong to row i of
# the cleaned `train` frame.
# NOTE(review): torch.load presumably unpickles the file - only load trusted files; confirm.
train_embeddings = torch.load('/kaggle/input/big-data-challenge-2024/train_embeddings_2.pt')

In [18]:
# Nearest-neighbour labelling: each test tweet receives the most common label
# among the training tweets whose cosine similarity lies within
# SIMILARITY_TOLERANCE of the best match.
SIMILARITY_TOLERANCE = 0.01

list_id_texts = []
list_texts = []
similarity_values = []
list_prediction = []
similar_texts = []

# Plain lists so that position i pairs with train_embeddings[i]; this relies on
# `train` having a 0..n-1 index.
train_texts = train['text'].tolist()
train_labels = train['label'].tolist()
n_train = len(train_texts)
if len(train_embeddings) < n_train:
    raise ValueError(
        f"train has {n_train} rows but only {len(train_embeddings)} embeddings were loaded"
    )
# Only embeddings that have a matching train row take part in the search.
train_matrix = train_embeddings[:n_train]

for idx, row in tqdm(test.iterrows(), total=len(test)):
    # Defaults used when there is nothing to compare against.
    highest_similarity = -99999
    most_similar_text = ''
    predictions = []

    if n_train:
        query_embedding = get_embeddings([row['text']])[0]
        # One call scores the query against every training embedding at once.
        similarities = util.cos_sim(query_embedding, train_matrix)[0]

        # argmax returns the first maximal position, i.e. the earliest train row on ties.
        best_position = int(torch.argmax(similarities))
        highest_similarity = similarities[best_position]
        most_similar_text = train_texts[best_position]

        # Voters: every train row close enough to the best match, in train order.
        near_best = (highest_similarity - similarities) < SIMILARITY_TOLERANCE
        voter_positions = torch.nonzero(near_best).flatten().tolist()
        predictions = [train_labels[position] for position in voter_positions]

    # Majority vote; None marks a test row for which no candidate was found.
    if predictions:
        try:
            list_prediction.append(mode(predictions))
        except StatisticsError:
            # Older Python versions raise when there is no unique mode.
            print(f"Terjadi statistical error pada test data : {row['IDText']}")
            list_prediction.append(predictions[0])
    else:
        list_prediction.append(None)

    list_id_texts.append(row['IDText'])
    list_texts.append(row['text'])
    similarity_values.append(float(highest_similarity))
    similar_texts.append(most_similar_text)

1000it [15:47,  1.06it/s]


In [19]:
# One row per test tweet: id, cleaned text, best similarity score, the closest
# training tweet, and the predicted class.
result_columns = {
    'IDText': list_id_texts,
    'Text': list_texts,
    'Similarity Values': similarity_values,
    'Similar Text': similar_texts,
    'Kelas': list_prediction,
}
result_df = pd.DataFrame(result_columns)
result_df.head()

Unnamed: 0,IDText,Text,Similarity Values,Similar Text,Kelas
0,TXT0001,lu mau org2 pro-demokrasi di negara ini bisa p...,0.621785,rt ini fakta yang aku alami di lapangan. aku s...,Politik
1,TXT0002,prabowo ditanya soal hutang luar negeri dia me...,0.775302,"rt tak ada angin, tak ada hujan, tiba-tiba sri...",Pertahanan dan Keamanan
2,TXT0003,kiki_daliyo ganjar pranowo itulah beliau sosok...,0.984255,kiki_daliyo ganjar pranowo pak ganjar jadi ins...,Ideologi
3,TXT0004,@kumparan prabowo gibran yang bisa melakukan i...,0.826801,capres cawapres ganjar pranowo - mahfud md - m...,Politik
4,TXT0005,@sniperruben45 @uda_zulhendra @ainunnajib lah ...,0.52846,@1owykgpaodd0yqezopv1bkeqxwcabxhz2ovkzukqr9u= ...,Politik


In [20]:
# Full table, including the similarity diagnostics.
result_df.to_csv('result.csv', index=False)

# Submission file: only the id and the predicted class.
submission_df = result_df.loc[:, ['IDText', 'Kelas']]
submission_df.to_csv('submission.csv', index=False)