In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [5]:
# 1. Load the main dataset and the slang dictionary.
df = pd.read_csv('../dataset/merged.csv')
ks = pd.read_csv('../dataset/kamus_slang.csv')

# 2. Build the slang -> formal mapping from the dictionary CSV.
# Source (a): the dedicated 'slang' / 'formal' columns. Rows missing either
# value are skipped; keys and values are trimmed and lower-cased.
slang_dict = {
    str(entry['slang']).strip().lower(): str(entry['formal']).strip().lower()
    for _, entry in ks.dropna(subset=['slang', 'formal']).iterrows()
}

# Source (b): one hard-coded "slang;formal" pair.
# NOTE(review): presumably this is the text of the third column's header, which
# read_csv consumed as a column name instead of a data row -- confirm.
header_pair = 'aamiin;amin'.split(';')
slang_dict.update({header_pair[0].strip().lower(): header_pair[1].strip().lower()})

# Source (c): the third column, whose cells hold "slang;formal" strings.
# A cell contributes only if splitting on ';' yields at least two pieces, i.e.
# it contains a ';'. Only the first two pieces are used. Entries are applied in
# row order, so later rows overwrite earlier ones (and sources (a)/(b)).
third_col = ks.columns[2]
slang_dict.update(
    (pieces[0].strip().lower(), pieces[1].strip().lower())
    for pieces in (str(cell).split(';') for cell in ks[third_col].dropna())
    if len(pieces) >= 2
)

# 3. Final preprocessing function
# The patterns are compiled once here because the function runs once per caption.
# URLs, @mentions and #hashtags. IGNORECASE is required because this pattern is
# applied before case folding, so "HTTPS://..." or "WWW...." must match as well.
# ("https..." needs no alternative of its own: `http\S+` already matches it.)
_NOISE_RE = re.compile(r'http\S+|www\S+|@\w+|#\w+', re.IGNORECASE)
# Any character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text_final(text: object, mapping: dict) -> str:
    """Clean one caption and normalize its slang words.

    Steps, in order:
      a. remove URLs, @mentions and #hashtags (case-insensitively);
      b. replace every character that is not an ASCII letter or whitespace
         with a space (digits, punctuation, emoji and accented letters go);
      c. lower-case the text;
      d. replace each whitespace-separated word with ``mapping[word]`` when
         the word is a key of ``mapping``, otherwise keep the word.

    Args:
        text: The raw caption. Anything that is not a ``str`` (e.g. a NaN
            cell) yields an empty string.
        mapping: Lookup from lower-case slang word to its replacement; only
            its ``.get`` method is used.

    Returns:
        The cleaned words joined by single spaces, without leading or
        trailing whitespace. Words whose replacement is empty are dropped so
        they do not leave a double space behind.
    """
    if not isinstance(text, str):
        return ""

    # a. Cleaning: drop URLs, mentions, hashtags.
    text = _NOISE_RE.sub('', text)

    # b. Replace everything that is not a letter with a space.
    text = _NON_LETTER_RE.sub(' ', text)

    # c. Case folding.
    text = text.lower()

    # d. Slang normalization; unknown words pass through unchanged.
    normalized_words = (mapping.get(word, word) for word in text.split())

    # Skip empty replacements: joining them would produce consecutive spaces.
    return " ".join(word for word in normalized_words if word).strip()

# Apply the preprocessing to every caption in the dataset.
df['caption_cleaned'] = df['caption'].apply(preprocess_text_final, args=(slang_dict,))

# 4. Data splitting (per the lecturer's advice: split BEFORE balancing, so the
# duplicated rows created by oversampling can never end up in validation/test).
# Train (70%), Validation (15%), Test (15%): 30% is held out first, then that
# hold-out is halved. Both splits are stratified on the 'emotion' label.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['emotion'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['emotion'])

# 5. Balance the data (training set only).
# Every class is brought up to the size of the largest class. All original rows
# of a class are kept and only the shortfall is drawn with replacement.
# Bootstrapping a whole class instead (the majority class included) would
# silently discard part of its distinct rows and duplicate others.
n_samples = train_df['emotion'].value_counts().max()
balanced_list = []
for emotion in train_df['emotion'].unique():
    df_emo = train_df[train_df['emotion'] == emotion]
    shortfall = n_samples - len(df_emo)
    if shortfall > 0:
        df_extra = resample(df_emo, replace=True, n_samples=shortfall, random_state=42)
        df_upsampled = pd.concat([df_emo, df_extra])
    else:
        # Already at the target size (the largest class): keep it untouched.
        df_upsampled = df_emo
    balanced_list.append(df_upsampled)

# Rows stay grouped by class, in order of first appearance in train_df.
train_balanced = pd.concat(balanced_list)

# 6. Save the final results (the index is not written to the CSV files).
train_balanced.to_csv('../dataset/train_ready_final.csv', index=False)
val_df.to_csv('../dataset/val_ready.csv', index=False)
test_df.to_csv('../dataset/test_ready.csv', index=False)

print("Proses selesai! Data telah dibersihkan dengan kamus alay kustom.")

Proses selesai! Data telah dibersihkan dengan kamus alay kustom.
