## Data Preparation

Label:
- 0 = Proud
- 1 = Trust
- 2 = Joy
- 3 = Surprise
- 4 = Neutral
- 5 = Sadness
- 6 = Fear
- 7 = Anger

Langkah-langkah yang akan dijalankan di sel berikutnya:
1. Baca `data/datatrain.csv` mentah.
2. Normalisasi label `emotion` ke kolom `emotion_clean` (mapping sederhana untuk typo/varian bahasa).
3. Normalisasi nilai `video` ke `video_norm` (strip, ganti newline).
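4. Resolve duplikat berdasarkan `video_key` (versi huruf kecil dari `video_norm` dengan spasi berlebih dirapikan): grup dengan label seragam disimpan satu baris (kemunculan pertama), sedangkan grup dengan label yang bertentangan dicatat sebagai konflik untuk audit (kebijakan majority-vote tersedia lewat `resolve_conflicts_majority`).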
5. Simpan dataset final yang bersih ke `data/datatrain_clean.csv` (overwrite) dan simpan laporan konflik ke `data/duplicate_conflicts.csv`.

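Catatan: Saya berasumsi kebijakan resolusi otomatis yang wajar adalah majority-vote per video (fungsi `resolve_conflicts_majority`); jika ada tie, dipilih label pertama menurut urutan alfabet. Pada runner saat ini, grup yang berkonflik dikeluarkan seluruhnya dari dataset final dan hanya disimpan untuk ditinjau manual. Jika Anda ingin aturan berbeda (mis. prioritas sumber), beri tahu dan saya ubah.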

In [1]:
# Imports and canonical mapping
import pandas as pd
from collections import Counter

# Lookup table from a lowercase raw spelling to its canonical emotion label.
# Spellings are grouped per canonical label (English, Indonesian, and typos
# seen in the data) and flattened into a single dict.
# NOTE: 'percaya ' (trailing space) can never match via normalize_label,
# which strips the label before the lookup; it is kept so the table's
# contents stay unchanged for any other reader of CANONICAL.
CANONICAL = {
    variant: canonical
    for canonical, variants in (
        ('Surprise', ('surprise', 'terkejut', 'trkejut', 'kaget', 'trekejut')),
        ('Joy', ('joy', 'happy')),
        ('Trust', ('trust', 'faith', 'loyalty', 'percaya', 'percaya ')),
        ('Proud', ('proud', 'pride', 'bangga', 'love')),
        ('Sadness', ('sadness', 'sad')),
        ('Anger', ('anger', 'angry', 'marah', 'marh')),
        ('Fear', ('fear',)),
        ('Neutral', ('neutral',)),
    )
    for variant in variants
}


def normalize_label(raw_label: str) -> str:
    """Map a raw emotion label to its canonical spelling.

    Missing values (as detected by ``pd.isna``) become the empty string.
    Any other value is stringified and stripped, then looked up in
    ``CANONICAL`` by its lowercase form. Labels without an entry fall back
    to the stripped text in title case.
    """
    if pd.isna(raw_label):
        return ''
    text = str(raw_label).strip()
    lookup_key = text.lower()
    if lookup_key in CANONICAL:
        return CANONICAL[lookup_key]
    return text.title()


def normalize_video(v: str) -> str:
    """Return the video value as a trimmed, single-line string.

    Missing values (as detected by ``pd.isna``) become the empty string.
    Otherwise the value is stringified, surrounding whitespace is removed,
    and each newline or carriage return is replaced by one space.
    """
    if pd.isna(v):
        return ''
    # One pass over the text: map both line-break characters to a space.
    line_breaks_to_space = str.maketrans({'\n': ' ', '\r': ' '})
    flattened = str(v).strip().translate(line_breaks_to_space)
    return flattened.strip()


In [2]:
def dedupe_and_report(df: pd.DataFrame):
    """Split rows into label-consistent duplicates and conflicting groups.

    Works on a copy of ``df`` and adds three columns: ``emotion_clean``
    (``normalize_label`` applied to ``emotion``), ``video_norm``
    (``normalize_video`` applied to ``video``) and ``video_key``
    (``video_norm`` lowercased, whitespace runs collapsed to one space,
    stripped). Rows are then grouped by ``video_key``.

    Returns a tuple ``(df_keep, df_conflicts, report_df)``:
      * ``df_keep`` - the first row of every group with at most one
        distinct ``emotion_clean`` value;
      * ``df_conflicts`` - every row of every group with more than one
        distinct ``emotion_clean`` value;
      * ``report_df`` - one row per conflicting group with ``video_key``,
        ``n_rows`` and ``labels`` (a ``Counter`` of the group's labels).
    """
    work = df.copy()

    # Derived columns: cleaned label, cleaned video value, grouping key.
    work['emotion_clean'] = work['emotion'].apply(normalize_label)
    work['video_norm'] = work['video'].apply(normalize_video)
    lowered = work['video_norm'].str.lower()
    collapsed = lowered.str.replace('\\s+', ' ', regex=True)
    work['video_key'] = collapsed.str.strip()

    keep_rows, conflict_rows, report = [], [], []

    for key, members in work.groupby('video_key'):
        tally = Counter(members['emotion_clean'].dropna().tolist())

        if len(tally) > 1:
            # Labels disagree: set the whole group aside for review.
            conflict_rows.extend(row.to_dict() for _, row in members.iterrows())
            report.append({'video_key': key, 'n_rows': len(members), 'labels': tally})
            continue

        # Labels agree (or none present): the first row represents the group.
        keep_rows.append(members.iloc[0].to_dict())

    return pd.DataFrame(keep_rows), pd.DataFrame(conflict_rows), pd.DataFrame(report)


def resolve_conflicts_majority(df_conflicts: pd.DataFrame):
    """Collapse each conflicting ``video_key`` group to one majority-labelled row.

    For every group, the most frequent non-null ``emotion_clean`` value
    wins; when several labels share the top count, the alphabetically
    first one is chosen. A group with no non-null labels gets ``''``.
    The output row is the group's first row with ``emotion_clean``
    overwritten by the chosen label.

    Returns an empty ``DataFrame`` when ``df_conflicts`` is empty.
    """
    if df_conflicts.empty:
        return pd.DataFrame()

    winners = []
    for _, members in df_conflicts.groupby('video_key'):
        tally = Counter(members['emotion_clean'].dropna().tolist())

        winning_label = ''
        if tally:
            top_votes = max(tally.values())
            # Smallest label among those tied for the top count.
            winning_label = min(
                label for label, votes in tally.items() if votes == top_votes
            )

        representative = members.iloc[0].to_dict()
        representative['emotion_clean'] = winning_label
        winners.append(representative)

    return pd.DataFrame(winners)


In [3]:
# Runner: run pipeline and write outputs
# Reads the raw CSV, splits it with dedupe_and_report, then writes the kept
# rows and the conflicting rows to two separate CSV files.
raw_path = 'data/datatrain.csv'
out_clean = 'data/datatrain_clean.csv'  # final overwrite
out_conflicts = 'data/duplicate_conflicts.csv'

raw = pd.read_csv(raw_path)
print('Raw rows:', len(raw))

# keep: one row per video group whose labels agree;
# conflicts: every row of the groups whose labels disagree;
# report: one summary row per conflicting group.
keep, conflicts, report = dedupe_and_report(raw)
print('Unique groups with conflicts (report rows):', len(report), 'conflict rows:', len(conflicts))

# Save conflict rows for manual review (do not include in final)
conflicts.to_csv(out_conflicts, index=False)

# Exclude conflicts entirely from final dataset
# NOTE(review): resolve_conflicts_majority is not called here, so conflicting
# groups are dropped rather than resolved by majority vote - confirm intended.
final = keep.copy()
# ensure columns order like original and emotion_clean
# (helper columns video_norm / video_key are left out unless the raw file
# already has columns with those names)
cols = list(raw.columns) + ['emotion_clean']
# Only select columns that actually exist in `final`.
cols = [c for c in cols if c in final.columns]
final = final[cols]

print('Final rows (conflicts excluded):', len(final))
final.to_csv(out_clean, index=False)
print('Wrote final clean file (conflicts excluded) ->', out_clean)
print('Also wrote conflicts ->', out_conflicts)


Raw rows: 803
Unique groups with conflicts (report rows): 6 conflict rows: 31
Final rows (conflicts excluded): 769
Wrote final clean file (conflicts excluded) -> data/datatrain_clean.csv
Also wrote conflicts -> data/duplicate_conflicts.csv
Unique groups with conflicts (report rows): 6 conflict rows: 31
Final rows (conflicts excluded): 769
Wrote final clean file (conflicts excluded) -> data/datatrain_clean.csv
Also wrote conflicts -> data/duplicate_conflicts.csv
