In [None]:
import pandas as pd
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# --- STEP 1: LOAD DATA ---
df = pd.read_csv('../dataset/merged.csv')
ks = pd.read_csv('../dataset/kamus_slang.csv')

# --- STEP 2: BUILD THE SLANG DICTIONARY & STOPWORDS ---
# Slang mapping: trimmed, lower-cased slang token -> trimmed, lower-cased
# formal form. Rows missing either column are skipped; when a slang key
# appears more than once, the last row wins.
ks_valid = ks.dropna(subset=['slang', 'formal'])
slang_dict = {
    str(slang).strip().lower(): str(formal).strip().lower()
    for slang, formal in zip(ks_valid['slang'], ks_valid['formal'])
}

# Non-emotion word lists (result of an earlier frequency analysis).
# General function words.
stop_umum = [
    'yang', 'dan', 'di', 'ini', 'itu', 'dari', 'untuk', 'dengan', 'ada',
    'adalah', 'pada', 'sebagai', 'akan', 'sudah', 'bisa', 'saat', 'dalam',
    'ke', 'oleh', 'bagi', 'serta', 'bahwa', 'maka', 'namun', 'tersebut',
]
# Topic/context words.
stop_konteks = [
    'indonesia', 'pendidikan', 'ai', 'teknologi', 'digital', 'guru',
    'sekolah', 'negara', 'ekonomi', 'politik', 'pemerintah', 'rakyat',
    'startup', 'program', 'sistem', 'data', 'manusia', 'dunia', 'prabowo',
]
# Social-media filler words plus a few English leftovers.
stop_sosmed = [
    'ya', 'buat', 'saja', 'juga', 'kalau', 'memang', 'banyak', 'lagi',
    'bikin', 'pakai', 'punya', 'deh', 'sih', 'kok', 'amp', 'the', 'and',
    'to', 'in', 'of', 'a',
]
# Union of the three lists as a set, for O(1) membership tests.
custom_stopwords = {*stop_umum, *stop_konteks, *stop_sosmed}

# --- LANGKAH 3: FUNGSI PREPROCESSING TERINTEGRASI ---
# Patterns are compiled once. IGNORECASE makes "HTTP://..." and "WWW..." match
# as well; it has no effect on the mention/hashtag alternatives.
_NOISE_RE = re.compile(r'http\S+|www\S+|@\w+|#\w+', re.IGNORECASE)
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def final_preprocess(text, *, slang=None, stopwords=None):
    """Clean a caption: strip noise, lower-case, map slang, drop stopwords.

    Steps, in order:
      a. remove URLs, @mentions and #hashtags, then replace every character
         that is not an ASCII letter or whitespace with a space;
      b. lower-case the text;
      c. replace each token by its formal form from the slang mapping and
         drop every resulting token that is a stopword.

    Args:
        text: Raw caption. Any non-string value (e.g. NaN) yields "".
        slang: Optional mapping ``slang token -> formal form``. Defaults to
            the module-level ``slang_dict``.
        stopwords: Optional container of tokens to drop. Defaults to the
            module-level ``custom_stopwords``.

    Returns:
        The cleaned caption as a single-space-separated string; "" when
        nothing survives.
    """
    if not isinstance(text, str):
        return ""
    # Resolve defaults at call time so the module-level tables can be
    # rebuilt (e.g. by re-running a notebook cell) without redefining this.
    if slang is None:
        slang = slang_dict
    if stopwords is None:
        stopwords = custom_stopwords

    # a. Cleaning: URLs, mentions, hashtags, then non-letters.
    text = _NOISE_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    # b. Case folding.
    text = text.lower()

    # c. Slang mapping and stopword removal.
    clean_words = []
    for word in text.split():
        formal = slang.get(word, word)
        # A formal form may expand to several words (or to nothing); filter
        # each resulting token so stopwords cannot slip through inside a
        # multi-word replacement and no empty token is emitted.
        clean_words.extend(tok for tok in formal.split() if tok not in stopwords)
    return " ".join(clean_words)

# Run preprocessing
df['caption_cleaned'] = df['caption'].apply(final_preprocess)

# Drop rows that are missing or empty after cleaning
df = df.dropna(subset=['caption_cleaned'])
df = df[df['caption_cleaned'] != '']

# De-duplicate on the cleaned caption BEFORE splitting. If duplicates are
# removed only from the validation/test files afterwards, the same caption
# can still sit in both train and validation/test, which leaks evaluation
# data into training. The first occurrence of each caption is kept.
df = df.drop_duplicates(subset=['caption_cleaned'])

# --- STEP 4: SPLITTING (70/15/15) ---
# First cut 70% train / 30% holdout, then halve the holdout into validation
# and test. Both cuts are stratified on the emotion label and seeded.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['emotion']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['emotion']
)

# --- STEP 5: NATURAL QUASI-BALANCING (train split only) ---
# Each minority class is oversampled until it closes about 75% of its gap to
# the largest class, plus a small jitter so class sizes do not look uniform.
max_c = train_df['emotion'].value_counts().max()
# Dedicated seeded generator: the jitter (and so the class sizes) is the same
# on every run and does not depend on the global `random` state.
jitter_rng = random.Random(42)
quasi_list = []
for emo in train_df['emotion'].unique():
    d = train_df[train_df['emotion'] == emo]
    if len(d) == max_c:
        # Majority class (or a tie with it): keep as is.
        quasi_list.append(d)
        continue

    # Target: original + 75% of the gap + random jitter
    target = int(len(d) + (max_c - len(d)) * 0.75) + jitter_rng.randint(-15, 15)
    # Clamp so the jitter can neither shrink the class below its real size
    # (or to a non-positive sample count) nor push it past the majority.
    target = max(len(d), min(target, max_c))

    # Keep every genuine row and draw only the missing rows with replacement;
    # bootstrapping the whole class would discard part of the original data.
    extra = target - len(d)
    if extra > 0:
        d = pd.concat([d, resample(d, replace=True, n_samples=extra, random_state=42)])
    quasi_list.append(d)

train_final = pd.concat(quasi_list)

# --- STEP 6: SAVE DATA ---
# The training set is written as produced by the balancing step; validation
# and test are de-duplicated on the cleaned caption before being written.
outputs = {
    'train_final.csv': train_final,
    'val_final.csv': val_df.drop_duplicates(subset=['caption_cleaned']),
    'test_final.csv': test_df.drop_duplicates(subset=['caption_cleaned']),
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)

print("Pemrosesan ulang selesai. Data siap digunakan untuk model!")

Pemrosesan ulang selesai. Data siap digunakan untuk model!
