In [3]:
import pandas as pd
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# 1. LOAD DATA
df = pd.read_csv('../dataset/merged.csv')
ks = pd.read_csv('../dataset/kamus_slang.csv')

# 2. BUILD THE SLANG DICTIONARY (slang -> formal mapping)
# Keys and values are stripped and lower-cased. Later sources overwrite
# earlier ones when the same slang key appears more than once.
slang_dict = {}

# Source 1: the dedicated 'slang' / 'formal' columns. Rows missing either
# value are skipped.
complete_rows = ks.dropna(subset=['slang', 'formal'])
for slang_term, formal_term in zip(complete_rows['slang'], complete_rows['formal']):
    slang_dict[str(slang_term).strip().lower()] = str(formal_term).strip().lower()

# Source 2: one hard-coded "slang;formal" pair.
# NOTE(review): presumably this pair is the header cell of the third column
# (a data row that pandas consumed as the column name) - confirm against the CSV.
header_pair = 'aamiin;amin'.split(';')
header_slang, header_formal = header_pair[0], header_pair[1]
slang_dict[header_slang.strip().lower()] = header_formal.strip().lower()

# Source 3: the third column, whose cells hold "slang;formal" strings.
# Cells without a ';' are ignored; only the first two fields are used.
for cell in ks[ks.columns[2]].dropna():
    cell_text = str(cell)
    if ';' not in cell_text:
        continue
    fields = cell_text.split(';')
    slang_dict[fields[0].strip().lower()] = fields[1].strip().lower()
# 3. DETAILED PREPROCESSING FUNCTION
def detail_preprocess(text, mapping):
    """Clean a raw caption and replace slang tokens with their mapped form.

    Steps, in order:
      a. strip URLs (anything starting with ``http`` or ``www``),
      b. strip ``@mentions`` and ``#hashtags`` (the whole token is removed),
      c. replace every character that is not an ASCII letter or whitespace
         with a space (this also removes digits, punctuation and emoji),
      d. lower-case the text,
      e. collapse any run of a repeated character down to two occurrences
         (e.g. "horeeee" -> "horee"),
      f. split on whitespace and look each token up in ``mapping``.

    Args:
        text: The raw caption. Any non-``str`` value (e.g. NaN) yields "".
        mapping: Dict-like object mapping a slang token to its replacement;
            tokens that are not present are kept unchanged.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs applied sequentially; order matters because
    # URLs and mentions must be removed before their punctuation is blanked.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # a. URLs
        (r'@\w+|#\w+', ''),                # b. mentions and hashtags
        (r'[^a-zA-Z\s]', ' '),             # c. non-alphabetic characters
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # d. Case folding happens before (e) so that "EeE" counts as one run.
    cleaned = cleaned.lower()

    # e. Squeeze runs of the same character to exactly two.
    cleaned = re.sub(r'(.)\1+', r'\1\1', cleaned)

    # f. Token-wise slang normalisation.
    tokens = (mapping.get(token, token) for token in cleaned.split())
    return " ".join(tokens).strip()

# Apply the detailed preprocessing to every caption.
print("Sedang melakukan preprocessing...")
df['caption_cleaned'] = df['caption'].apply(lambda x: detail_preprocess(x, slang_dict))

# 4. DATA SPLITTING (70/15/15), stratified on 'emotion'.
# Done before balancing so that validation/test keep the original class
# distribution and no oversampled duplicate can end up outside the train set.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['emotion'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['emotion'])

# 5. NATURAL QUASI-BALANCING (training data only)
max_count = train_df['emotion'].value_counts().max()
quasi_list = []

# Dedicated seeded generator: the jitter is reproducible across runs (like the
# random_state=42 used everywhere else) without touching the global RNG state.
jitter_rng = random.Random(42)

for emotion in train_df['emotion'].unique():
    df_emo = train_df[train_df['emotion'] == emotion]
    current_count = len(df_emo)

    # The majority class is kept exactly as it is.
    if current_count == max_count:
        quasi_list.append(df_emo)
        continue

    # Target: original size + 75% of the gap to the majority class, plus a
    # small random jitter so the class sizes do not look artificially uniform.
    target = int(current_count + (max_count - current_count) * 0.75)
    target += jitter_rng.randint(-25, 25)
    # For small gaps the jitter could push the target below the original size
    # or above the majority class; keep it inside [current_count, max_count].
    target = max(current_count, min(target, max_count))

    # Keep every original row and draw only the missing rows with replacement.
    # Bootstrapping the whole class to `target` rows would drop part of the
    # original samples, because resample() does not guarantee each row is drawn.
    n_extra = target - current_count
    if n_extra > 0:
        df_extra = resample(df_emo, replace=True, n_samples=n_extra, random_state=42)
        df_resampled = pd.concat([df_emo, df_extra])
    else:
        df_resampled = df_emo
    quasi_list.append(df_resampled)

train_natural = pd.concat(quasi_list)

# 6. SAVE THE FINAL RESULTS
train_natural.to_csv('../train_natural_quasi.csv', index=False)
val_df.to_csv('../val_final.csv', index=False)
test_df.to_csv('../test_final.csv', index=False)

print("--- Hasil Akhir ---")
print(f"Jumlah Data Training (Quasi): {len(train_natural)}")
print(f"Jumlah Data Validation (Orig): {len(val_df)}")
print(f"Jumlah Data Testing (Orig)   : {len(test_df)}")
print("\nDistribusi Emosi Training:")
print(train_natural['emotion'].value_counts())

Sedang melakukan preprocessing...
--- Hasil Akhir ---
Jumlah Data Training (Quasi): 7826
Jumlah Data Validation (Orig): 763
Jumlah Data Testing (Orig)   : 764

Distribusi Emosi Training:
emotion
Anticipation    1162
Trust           1010
Fear             972
Sadness          954
Anger            942
Joy              933
Surprise         929
Disgust          924
Name: count, dtype: int64
