# 📚 Libraries

In [None]:
!pip install ftfy --quiet

In [None]:
!pip install -U sentence-transformers --quiet

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ftfy
import re
import numpy as np
import os
from tqdm import tqdm
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import NearMiss, TomekLinks, EditedNearestNeighbours, CondensedNearestNeighbour
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel
from transformers import DefaultDataCollator
from sentence_transformers import SentenceTransformer,util
from huggingface_hub import HfApi, HfFolder
from torch.utils.data import DataLoader

In [None]:
import warnings

# Disable all warnings: installs a global "ignore" filter for the warnings module.
warnings.filterwarnings("ignore")

# ⚙️ Data Engineering

### 1️⃣ Read Data

In [None]:
# Load the labelled file into `train` and the unlabeled file into `test`;
# both files are parsed with ';' as the field separator.
train = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/dataset_penyisihan_bdc_2024(in).csv',
    sep=';',
)
test = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/dataset_unlabeled_penyisihan_bdc_2024(in).csv',
    sep=';',
)

In [None]:
# Bar chart of how many training rows carry each label, drawn on a fresh
# 20x12 figure.
plt.figure(figsize=(20, 12))
sns.countplot(x='label', data=train, ax=plt.gca())

### 2️⃣ Data Cleaning and Feature Extraction

In [None]:
def bersihkan_data_duplikat(df, kolom_teks='text', kolom_label='label'):
    """Collapse rows that share the same text into one row with the majority label.

    For every text that occurs more than once, the first occurrence is kept
    and its label is replaced by the label that appears most often among all
    copies of that text (ties go to the label that sorts first). Texts that
    occur once are returned unchanged. Rows whose final label is missing are
    dropped.

    The earlier implementation removed *every* copy of a duplicated text
    (``keep=False``) and assigned the majority labels onto an index that only
    held non-duplicated texts, so the majority vote never took effect and
    duplicated texts disappeared from the result.

    Args:
        df: Input frame; it is not modified.
        kolom_teks: Name of the column holding the text.
        kolom_label: Name of the column holding the label.

    Returns:
        A new DataFrame with one row per distinct text, keeping the original
        index values of the rows that survive.
    """
    # Every row whose text appears more than once (all copies).
    duplikat = df[df[kolom_teks].duplicated(keep=False)]

    # One row per distinct text; copy so the caller's frame is never touched.
    hasil = df.drop_duplicates(subset=kolom_teks, keep='first').copy()
    if duplikat.empty:
        return hasil[hasil[kolom_label].notna()]

    # Only copies that have both a text and a label can take part in the vote.
    pemilih = duplikat.dropna(subset=[kolom_teks, kolom_label])
    if pemilih.empty:
        label_terbanyak = pd.Series(dtype=object)
    else:
        frekuensi_label = (
            pemilih.groupby(kolom_teks)[kolom_label]
            .value_counts()
            .unstack(fill_value=0)
        )
        # Most frequent label per duplicated text.
        label_terbanyak = frekuensi_label.idxmax(axis=1)

    # Replace the label of each kept duplicate with its majority label. A
    # duplicated text without any usable vote gets NaN and is dropped below.
    baris_duplikat = hasil[kolom_teks].isin(duplikat[kolom_teks])
    hasil[kolom_label] = hasil[kolom_label].where(
        ~baris_duplikat, hasil[kolom_teks].map(label_terbanyak)
    )

    return hasil[hasil[kolom_label].notna()]

In [None]:
# Clean duplicated texts out of the training data.
train = bersihkan_data_duplikat(train)

In [None]:
def clean_tweet(tweet):
    """Return a normalised, lower-cased, single-line version of *tweet*.

    The text is first passed through ``ftfy.fix_text`` to repair encoding
    damage, newlines become spaces, every run of whitespace is collapsed to a
    single space, surrounding whitespace is removed and the result is
    lower-cased.
    """
    flattened = ftfy.fix_text(tweet).replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', flattened)
    # Trimming before lower-casing is enough: lower-casing does not add
    # whitespace at either end.
    return collapsed.strip().lower()

In [None]:
# Clean every tweet in both datasets. The unlabeled file stores its tweets
# under 'Text'; the cleaned values go into 'text' and 'Text' is removed.
train['text'] = train['text'].map(clean_tweet)

test = test.assign(text=test['Text'].map(clean_tweet)).drop(columns='Text')

In [None]:
# Drop rows whose cleaned text is an empty string.
drop_index = train.index[train['text'].str.len().eq(0)]
train = train.drop(index=drop_index)

In [None]:
# Show the number of rows per label.
train['label'].value_counts()

In [None]:
# Number of rows each label is resampled to by the random under-sampler.
sampling_strategy = {
    'Sumber Daya Alam': 142,
    'Politik': 294,
    'Demografi': 59,
    'Pertahanan dan Keamanan': 294,
    'Ideologi': 326,
    'Ekonomi': 287,
    'Sosial Budaya': 364,
    'Geografi': 20,
}

rus = RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy)

In [None]:
# Split the label column from the remaining columns, then under-sample both
# with the configured sampler.
y = train['label']
X = train.drop(columns='label')

X_res, y_res = rus.fit_resample(X, y)

In [None]:
# Put the resampled columns and labels back into one frame with a fresh
# 0..n-1 index.
train = pd.concat((X_res, y_res), axis='columns').reset_index(drop=True)

### 3️⃣ Prepare Dataset

In [None]:
# Give every distinct label an integer id, numbered in order of first
# appearance, and replace the label column with those ids.
label_mapping = dict(
    (label, idx) for idx, label in enumerate(train['label'].unique())
)
train['label'] = train['label'].map(label_mapping)

print(label_mapping)

In [None]:
# Hold out 20% of the rows for validation, stratified by label.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['text'],
    train['label'],
    stratify=train['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the tokenizer published with the 'indolem/indobertweet-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

class ChunkTextDataset(torch.utils.data.Dataset):
    """Dataset that turns each text into one or more fixed-length token windows.

    A text whose tokenised form is shorter than ``max_length`` becomes a single
    padded window. A longer text has its first and last token removed (the
    special tokens added by the tokenizer), is cut into consecutive
    non-overlapping chunks of ``max_length - 2`` tokens, and every chunk is
    wrapped in ``cls_token_id`` / ``sep_token_id`` and padded. Every window
    carries the label of the text it came from.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of labels, paired with ``texts`` by position.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list;
            must expose ``pad_token_id``, ``cls_token_id`` and ``sep_token_id``.
        max_length: Length of every window, including the two special tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 3 (a window could not
            hold any content token and chunking would never advance) or the
            tokenizer has no padding token.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if max_length < 3:
            raise ValueError("max_length must be at least 3")
        if tokenizer.pad_token_id is None:
            raise ValueError("tokenizer must define pad_token_id")

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.windows = []
        self.labels = []
        # Number of non-padding tokens in each window, aligned with `windows`.
        self._lengths = []

        self._create_windows(texts, labels)

    def _create_windows(self, texts, labels):
        """Tokenise every text and store its padded window(s) with its label."""
        chunk_size = self.max_length - 2
        for text, label in zip(texts, labels):
            input_ids = self.tokenizer(text, truncation=False)['input_ids']

            if len(input_ids) < self.max_length:
                self._add_window(input_ids, label)
                continue

            # Strip the tokenizer's own first/last token before chunking;
            # each chunk gets its own CLS/SEP pair.
            content = input_ids[1:-1]
            for start in range(0, len(content), chunk_size):
                chunk = content[start:start + chunk_size]
                self._add_window(
                    [self.tokenizer.cls_token_id] + chunk + [self.tokenizer.sep_token_id],
                    label,
                )

    def _add_window(self, token_ids, label):
        """Pad *token_ids* to ``max_length`` and record it together with *label*."""
        real_length = len(token_ids)
        padding = [self.tokenizer.pad_token_id] * (self.max_length - real_length)
        self.windows.append(list(token_ids) + padding)
        self.labels.append(label)
        self._lengths.append(real_length)

    def __len__(self):
        """Return the number of windows (not the number of source texts)."""
        return len(self.windows)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for window *idx*."""
        real_length = self._lengths[idx]
        # Mask by position rather than by comparing ids with pad_token_id, so
        # a real token that happens to share the pad id is still attended to.
        attention_mask = [1] * real_length + [0] * (self.max_length - real_length)
        return {
            'input_ids': torch.tensor(self.windows[idx]),
            'attention_mask': torch.tensor(attention_mask),
            # Built once; wrapping an existing tensor in torch.tensor() again
            # emits a copy-construct UserWarning.
            'labels': torch.tensor(self.labels[idx]),
        }

In [None]:
# Build the windowed datasets for the training and validation splits.
train_dataset = ChunkTextDataset(
    texts=train_texts,
    labels=train_labels.tolist(),
    tokenizer=tokenizer,
)
val_dataset = ChunkTextDataset(
    texts=val_texts,
    labels=val_labels.tolist(),
    tokenizer=tokenizer,
)

# 🧾 Prepare for Modelling

In [None]:
# Keep the Hugging Face API token in the HF_TOKEN environment variable.
# setdefault leaves a token that is already configured untouched; assigning ''
# unconditionally would overwrite it with an empty string. Supply the token
# through the environment rather than hard-coding it in the notebook.
os.environ.setdefault('HF_TOKEN', '')

In [None]:
# Log in with the API token: create the Hub API client and hand the value of
# HF_TOKEN to HfFolder.save_token.
hf_api = HfApi()
HfFolder.save_token(os.environ['HF_TOKEN'])

In [None]:
# Verify the login by asking the Hub who the current user is and printing the
# 'name' field of the response.
user_info = hf_api.whoami()
print(f"Logged in as: {user_info['name']}")

# 🤖 Modelling

In [None]:
# Load the pretrained encoder with a classification head sized to the label set.
# id2label / label2id are stored in the model config so the saved and published
# model reports the topic names instead of generic LABEL_<n> placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    'indolem/indobertweet-base-uncased',
    num_labels=len(label_mapping),
    id2label={idx: label for label, idx in label_mapping.items()},
    label2id=dict(label_mapping),
)

def compute_metrics(pred):
    """Compute the balanced accuracy of an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        A dict with the single key ``'balanced_accuracy'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    score = balanced_accuracy_score(pred.label_ids, predicted_classes)
    return {'balanced_accuracy': score}

In [None]:
training_args = TrainingArguments(
    # Where checkpoints go and whether they are published.
    output_dir='model/indonesia-election-topic-classification-undersampling-double',
    push_to_hub=True,
    report_to="none",
    # Training schedule and batch sizes.
    num_train_epochs=14,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Restore the checkpoint with the best balanced accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
)

# Wire the model, datasets, collator and metric function into a Trainer.
data_collator = DefaultDataCollator()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()

In [None]:
# Run training with the arguments configured on the trainer.
trainer.train()

In [None]:
# Evaluate on the trainer's evaluation dataset and print the returned metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# Save the trainer's model to a local directory with this name.
trainer.save_model("indonesia-election-topic-classification-undersampling-double")

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Upload the tokenizer to the Hub repository with this name.
tokenizer.push_to_hub("indonesia-election-topic-classification-undersampling-double")