In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# File and parameter configuration

# Split parameters: fraction of rows held out for the test set, and the
# seed passed to the splitter.
TEST_SIZE = 0.2
RANDOM_SEED = 42

# Input CSV and the two output CSV paths (relative to the working directory).
NAMA_FILE_INPUT = "ulasan_shopee_44303086179_1053_reviews_final.csv"
OUTPUT_DEV_SET = "development_set.csv"
OUTPUT_TEST_SET = "final_test_set.csv"

def hitung_panjang_teks(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The value to measure. Strings are split on whitespace.
            Missing scalars (``None``, ``NaN``, ``pd.NA``, ``NaT``) count as
            zero words. Any other non-string value is converted with
            ``str()`` before counting.

    Returns:
        int: The word count; ``0`` for empty or missing input.
    """
    if isinstance(text, str):
        return len(text.split())
    # A missing cell must not be stringified: str(float('nan')) is "nan",
    # which would be counted as a one-word text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(str(text).split())

In [3]:
# Load the source data
print(f"[INFO] Membaca file sumber: {NAMA_FILE_INPUT}")
df = pd.read_csv(NAMA_FILE_INPUT)

# Feature extraction (text length)
# The word count is computed before the data is split so that it is
# consistent across the resulting subsets.
df = df.assign(panjang_teks=df['comment'].apply(hitung_panjang_teks))
print("[INFO] Kolom 'panjang_teks' berhasil ditambahkan.")

[INFO] Membaca file sumber: ulasan_shopee_44303086179_1053_reviews_final.csv
[INFO] Kolom 'panjang_teks' berhasil ditambahkan.


In [4]:
# Split the data into a development set and a final test set.
# TEST_SIZE is the held-out fraction (80:20 with the configured 0.2);
# RANDOM_SEED is passed as random_state so the shuffle is repeatable.
dev_set, test_set = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [5]:
# Save both subsets as CSV files, without writing the DataFrame index.
dev_set.to_csv(OUTPUT_DEV_SET, index=False)
test_set.to_csv(OUTPUT_TEST_SET, index=False)

# Summary report: row counts and output paths, framed by separator lines.
print(
    "-" * 30,
    "[INFO] Proses Selesai.",
    f"Total Data Awal : {len(df)}",
    f"Development Set : {len(dev_set)} (Disimpan ke: {OUTPUT_DEV_SET})",
    f"Final Test Set  : {len(test_set)} (Disimpan ke: {OUTPUT_TEST_SET})",
    "-" * 30,
    sep="\n",
)

------------------------------
[INFO] Proses Selesai.
Total Data Awal : 1053
Development Set : 842 (Disimpan ke: development_set.csv)
Final Test Set  : 211 (Disimpan ke: final_test_set.csv)
------------------------------
