# 📚 Libraries

In [1]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install --quiet ftfy

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ftfy
import re
import numpy as np
import os
from tqdm import tqdm

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel

2024-07-30 08:12:02.436252: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 08:12:02.436416: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 08:12:02.700743: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import warnings

# Nonaktifkan semua warning
warnings.filterwarnings("ignore")

# ⚙️ Data Engineering

### 1️⃣ Read Data

In [4]:
# Load the unlabeled dataset; the file uses ';' as its field separator.
test = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/dataset_unlabeled_penyisihan_bdc_2024(in).csv',
    sep=';',
)

### 2️⃣ Data Cleaning and Feature Extraction

In [5]:
def clean_tweet(tweet):
    """Normalize a raw tweet for tokenization.

    Steps, in order:
      1. Repair text garbled by encoding errors (``ftfy.fix_text``).
      2. Collapse every run of whitespace (newlines included) into a single
         space and trim both ends.
      3. Lowercase the result.

    Parameters
    ----------
    tweet : str or missing value
        Raw tweet text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        accepted so that an empty cell in the CSV does not abort the whole
        ``Series.apply`` call.

    Returns
    -------
    str
        The cleaned tweet; ``''`` when the input is missing.
    """
    if not isinstance(tweet, str):
        # Empty CSV cells are read by pandas as NaN, which ftfy cannot process.
        if pd.isna(tweet):
            return ''
        # Anything else (e.g. a cell parsed as a number) is treated as text.
        tweet = str(tweet)

    # Repair text distorted by encoding errors.
    tweet = ftfy.fix_text(tweet)
    # r'\s+' already matches '\n', so no separate newline replacement is needed.
    tweet = re.sub(r'\s+', ' ', tweet)

    return tweet.strip().lower()

In [6]:
# Clean every tweet in the dataset and store the result in a new column.
test['cleaned_text'] = test['Text'].map(clean_tweet)

In [7]:
# Discard the raw text column; only its cleaned counterpart is used from here on.
test = test.drop('Text', axis=1)

### 3️⃣ Prepare Dataset

In [8]:
# Series of cleaned tweets that will be fed to the classifier one by one.
test_texts = test.loc[:, 'cleaned_text']

# 🤖 Prediction

In [9]:
# Single checkpoint identifier shared by the tokenizer and the classifier so
# both are always loaded from the same source.
MODEL_CHECKPOINT = 'Amadeus99/indonesia-election-topic-classification-undersampling-double'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/738k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [10]:
# Maps the class index predicted by the model to its topic label.
label_mapping = {
    0: 'Demografi',
    1: 'Ekonomi',
    2: 'Geografi',
    3: 'Ideologi',
    4: 'Pertahanan dan Keamanan',
    5: 'Politik',
    6: 'Sosial Budaya',
    7: 'Sumber Daya Alam',
}

In [11]:
def predict_text(text):
    """Return the index of the most probable class for ``text``.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 512 tokens, returned as PyTorch tensors) and passed through the
    module-level ``model`` with gradient tracking disabled.

    Parameters
    ----------
    text : str
        The (already cleaned) text to classify.

    Returns
    -------
    int
        Position of the largest softmax probability; on ties the first
        maximal position is returned.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded)

    # Softmax over the last axis of the first output element, flattened to a
    # plain Python list of per-class probabilities.
    class_probs = torch.nn.functional.softmax(model_output[0], dim=-1).squeeze().tolist()

    # First index holding the maximum probability.
    return max(range(len(class_probs)), key=class_probs.__getitem__)

In [12]:
# Classify every cleaned text (with a progress bar) and translate each
# predicted class index into its label name.
y_pred = [label_mapping[predict_text(text)] for text in tqdm(test_texts)]

100%|██████████| 1000/1000 [01:41<00:00,  9.86it/s]


In [13]:
# Load the answer template (';'-separated) and fill its 'Kelas' column with
# the predicted labels, in row order.
submission_df = pd.read_csv(
    '/kaggle/input/big-data-challenge-2024/template_jawaban_penyisihan_bdc_2024.csv',
    sep=';',
).assign(Kelas=y_pred)

In [14]:
# Write the filled template to the working directory without the DataFrame index.
# NOTE(review): the template is read with ';' as delimiter but written here with
# the default ',' — confirm which separator the submission is expected to use.
submission_df.to_csv(path_or_buf='submission.csv', index=False)