In [None]:
# Cell 1: Import library dan setup logging
import pandas as pd
import re
import logging
from deep_translator import GoogleTranslator
from langdetect import detect, LangDetectException
import random
import time
from retry import retry
from nltk.tokenize import word_tokenize
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from multiprocessing import Pool, cpu_count
import numpy as np
import os

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Cell 2: Download the NLTK tokenizer data (only needs to succeed once).
# Each resource is probed on its own: checking only 'punkt' would skip the
# download of 'punkt_tab' on machines where 'punkt' is already installed.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)
print("Paket NLTK sudah siap.")


In [None]:
# Cell 3: Utilitas dan kamus

def truncate_text(text, max_length=4500):
    """Return *text* limited to its first *max_length* characters.

    Text whose length does not exceed *max_length* is returned unchanged.
    """
    fits = len(text) <= max_length
    return text if fits else text[:max_length]

# Per-process memo of finished round trips, keyed by
# (truncated source text, intermediate language).
translation_cache = {}


@retry(tries=3, delay=2, backoff=2)
def _translate_round_trip(text, intermediate_lang):
    """Translate *text* from 'id' to *intermediate_lang* and back to 'id'.

    Exceptions are allowed to escape so that the ``retry`` decorator can
    actually re-run the request. Returns the back-translated string, or
    ``None`` when the first hop produced nothing to translate back.
    """
    translated = GoogleTranslator(source='id', target=intermediate_lang).translate(text)
    time.sleep(0.5)  # pause between requests
    if not translated:
        return None
    back_translated = GoogleTranslator(source=intermediate_lang, target='id').translate(translated)
    time.sleep(0.5)
    return back_translated


def back_translate(text, intermediate_lang='en'):
    """Paraphrase *text* by translating it to another language and back.

    Args:
        text: Source text; it is cut with ``truncate_text`` before use.
        intermediate_lang: Language code used for the intermediate hop.

    Returns:
        The back-translated text. If the round trip yields nothing, or any
        error occurs (after the retries are exhausted), the input text is
        returned instead. This function never raises.
    """
    try:
        text = truncate_text(text)
        # The language is part of the key: the same text sent through a
        # different intermediate language is a different result.
        cache_key = (text, intermediate_lang)
        if cache_key in translation_cache:
            return translation_cache[cache_key]
        back_translated = _translate_round_trip(text, intermediate_lang)
        if back_translated:
            translation_cache[cache_key] = back_translated
            return back_translated
        return text
    except Exception as e:
        logger.error(f"Error pada back-translation: {e}")
        return text

# Lower-case word -> candidate replacements used by synonym_replacement().
synonym_dict = {
    'tidak': [
        'bukan',
        'tak',
        'jangan',
    ],
    'pembangunan': [
        'konstruksi',
        'pengembangan',
        'pendirian',
    ],
    'proyek': [
        'rencana',
        'pekerjaan',
        'program',
    ],
    'investasi': [
        'penanaman modal',
        'pendanaan',
        'modal',
    ],
    'infrastruktur': [
        'sarana',
        'prasarana',
        'fasilitas',
    ],
    'layanan': [
        'jasa',
        'servis',
    ],
    'driver': [
        'pengemudi',
        'mitra',
    ],
    'tarif': [
        'ongkos',
        'biaya',
    ],
    'aplikasi': [
        'platform',
        'sistem',
    ],
}

# Topic keywords: exempt from stemming, swapped by simple_paraphrase(),
# and counted when validating augmented text.
gojek_keywords = [
    'gojek',
    'goto',
    'ojol',
    'driver',
    'aplikasi',
    'layanan',
    'tarif',
    'gofood',
    'gopay',
    'transportasi',
    'ojek',
    'online',
]

# Single Sastrawi stemmer instance, built once at import time and reused by
# clean_augmented_text() for every token it stems.
stemmer = StemmerFactory().create_stemmer()


def synonym_replacement(text, n=3):
    """Replace up to *n* words of *text* with a synonym from ``synonym_dict``.

    Tokens are visited in random order; a token is replaced when its
    lower-cased form is a key of ``synonym_dict``. The replacement is picked
    at random among that key's synonyms.

    Args:
        text: Text to augment.
        n: Maximum number of replacements. ``n <= 0`` replaces nothing.

    Returns:
        The tokens joined by single spaces (so spacing around punctuation may
        differ from the input even when nothing was replaced), or the
        original *text* if an error occurs.
    """
    try:
        words = word_tokenize(text)
        new_words = words.copy()
        indices = list(range(len(words)))
        random.shuffle(indices)
        replaced = 0
        for i in indices:
            # Check the budget before replacing so that n=0 really means
            # "no replacement" instead of allowing one.
            if replaced >= n:
                break
            word = words[i].lower()
            synonyms = [syn for syn in synonym_dict.get(word, ()) if syn != word]
            if synonyms:
                new_words[i] = random.choice(synonyms)
                replaced += 1
        return " ".join(new_words)
    except Exception as e:
        logger.error(f"Error in synonym_replacement: {e}")
        return text


def simple_paraphrase(text):
    """Swap two randomly chosen keyword tokens of *text*.

    The text is returned untouched when it has fewer than five tokens, when
    fewer than two tokens are in ``gojek_keywords`` (case-insensitive), or
    when an error occurs. Otherwise the tokens are re-joined with single
    spaces after the swap.
    """
    try:
        tokens = word_tokenize(text)
        keyword_slots = []
        if len(tokens) >= 5:
            for idx, token in enumerate(tokens):
                if token.lower() in gojek_keywords:
                    keyword_slots.append(idx)
        # Too short, or not enough keywords to swap.
        if len(keyword_slots) < 2:
            return text
        first, second = random.sample(keyword_slots, 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
        return " ".join(tokens)
    except Exception as e:
        logger.error(f"Error in simple_paraphrase: {e}")
        return text


def validate_augmented_text(text):
    """Decide whether an augmented text is good enough to keep.

    A text is accepted only when all of the following hold:

    * it has at least five whitespace-separated words;
    * ``detect`` classifies it as ``'id'``;
    * it mentions at least two distinct entries of ``gojek_keywords``.

    Keywords are matched case-insensitively inside each word, so suffixed
    forms still count. A keyword that is merely part of a longer keyword
    found in the same word is ignored: "gojek" counts as 'gojek' only, not
    as 'gojek' plus 'ojek'.

    Returns:
        True when the text passes every check; False otherwise, including
        when an unexpected error occurs.
    """
    try:
        if len(text.split()) < 5:
            return False
        try:
            is_id = detect(text) == 'id'
        except LangDetectException:
            is_id = False
        if not is_id:
            return False
        keywords = [keyword.lower() for keyword in gojek_keywords]
        matched = set()
        for token in set(re.findall(r'\w+', text.lower())):
            hits = [keyword for keyword in keywords if keyword in token]
            # Drop hits nested inside a longer hit of the same word.
            matched.update(
                keyword for keyword in hits
                if not any(keyword != other and keyword in other for other in hits)
            )
        return len(matched) >= 2
    except Exception as e:
        logger.error(f"Error in validate_augmented_text: {e}")
        return False


def clean_augmented_text(text):
    """Normalise an augmented text and stem its non-keyword tokens.

    Whitespace runs are collapsed, characters outside the allowed set are
    removed, and every token that is not in ``gojek_keywords``
    (case-insensitive) is passed through ``stemmer.stem``. Tokens are joined
    with single spaces. If an error occurs, the text as it stood at that
    point is returned.
    """
    try:
        # `text` is rebound on purpose: the except path returns whatever
        # stage of cleaning had been reached.
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'[^\w\s.,!?éê]', '', text)
        pieces = []
        for token in word_tokenize(text):
            if token.lower() in gojek_keywords:
                pieces.append(token)
            else:
                pieces.append(stemmer.stem(token))
        return " ".join(pieces)
    except Exception as e:
        logger.error(f"Error in clean_augmented_text: {e}")
        return text


def augment_row(row, intermediate_langs):
    """Produce one augmented copy of *row*, or ``None`` if that fails.

    One of three strategies is drawn at random: back-translation (used only
    for texts longer than 50 characters; shorter ones fall through to
    paraphrasing), synonym replacement, or keyword-swap paraphrasing. The
    result is cleaned and validated before being accepted.

    Args:
        row: Record with a 'cleaned_content' entry; it must support
            ``copy()`` and item assignment.
        intermediate_langs: Language codes to choose from for
            back-translation.

    Returns:
        A copy of *row* with 'cleaned_content' replaced and 'is_augmented'
        set to True, or ``None`` when the text did not change, failed
        validation, or an error occurred.
    """
    try:
        source_text = row['cleaned_content']
        strategy = random.choice(['back_translate', 'synonym', 'paraphrase'])

        if strategy == 'synonym':
            candidate = synonym_replacement(source_text, n=3)
        elif strategy == 'back_translate' and len(source_text) > 50:
            pivot_lang = random.choice(intermediate_langs)
            candidate = back_translate(source_text, intermediate_lang=pivot_lang)
        else:
            candidate = simple_paraphrase(source_text)

        # Nothing usable came back, or the strategy left the text as is.
        if not candidate or candidate == source_text:
            return None

        cleaned_candidate = clean_augmented_text(candidate)
        if not validate_augmented_text(cleaned_candidate):
            return None

        augmented = row.copy()
        augmented['cleaned_content'] = cleaned_candidate
        augmented['is_augmented'] = True
        return augmented
    except Exception as e:
        logger.error(f"Error dalam augment_row: {e}")
        return None


In [None]:
# Cell 4: Main augmentation pipeline
if __name__ == '__main__':
    # Load the output of the preprocessing step.
    try:
        df = pd.read_csv('data/gojek_news_preprocessed.csv')
        logger.info(f"Berhasil memuat file dengan {len(df)} baris.")
    except FileNotFoundError:
        logger.error("File 'data/gojek_news_preprocessed.csv' tidak ditemukan. Jalankan notebook preprocessing terlebih dahulu.")
        raise SystemExit(1)

    # Make sure a label column exists; fall back to random dummy labels.
    if 'label' not in df.columns:
        logger.warning("Kolom 'label' tidak ditemukan. Membuat kolom label dummy [positif, netral, negatif].")
        df['label'] = np.random.choice(['positif', 'netral', 'negatif'], size=len(df), p=[0.2, 0.6, 0.2])

    # Rows without a label cannot be balanced: a NaN label is reported by
    # unique() but matches no rows, which would make the per-label row count
    # zero and break the modulo-based oversampling below.
    missing_label = df['label'].isna()
    if missing_label.any():
        logger.warning(f"{int(missing_label.sum())} baris tanpa label dilewati.")
        df = df[~missing_label]

    # With no rows there is nothing to augment and nothing to concatenate.
    if df.empty:
        logger.error("Tidak ada data berlabel untuk diaugmentasi.")
        raise SystemExit(1)

    TARGET_PER_LABEL = 150
    INTERMEDIATE_LANGS = ['en', 'fr', 'de', 'es']

    final_dfs = []
    logger.info(f"Memulai proses augmentasi untuk mencapai target {TARGET_PER_LABEL} data per label...")

    for label in df['label'].unique():
        subset = df[df['label'] == label].copy()
        count = len(subset)
        logger.info(f"Label '{label}' memiliki {count} data.")

        if count >= TARGET_PER_LABEL:
            # Enough data: down-sample to the target size.
            sampled = subset.sample(n=TARGET_PER_LABEL, random_state=42).copy()
            sampled['is_augmented'] = False
            final_dfs.append(sampled)
        else:
            # Keep every original row, then top up with augmented ones.
            subset['is_augmented'] = False
            final_dfs.append(subset)
            n_to_augment = TARGET_PER_LABEL - count
            logger.info(f"Label '{label}': Perlu augmentasi sebanyak {n_to_augment} data.")

            # Attempt twice as many rows as needed, because some attempts
            # are rejected by validation.
            rows_to_augment = [subset.iloc[i % count].copy() for i in range(n_to_augment * 2)]

            with Pool(processes=cpu_count()) as pool:
                results = pool.starmap(augment_row, [(row, INTERMEDIATE_LANGS) for row in rows_to_augment])

            augmented_rows = [row for row in results if row is not None][:n_to_augment]

            if len(augmented_rows) < n_to_augment:
                # Not enough successful augmentations: pad with plain copies
                # of original rows so the label still reaches the target.
                logger.warning(f"Label '{label}': Hanya {len(augmented_rows)} data berhasil diaugmentasi. Sisanya diisi dengan duplikasi.")
                remaining = n_to_augment - len(augmented_rows)
                fallback_rows = [subset.iloc[i % count].copy() for i in range(remaining)]
                for row in fallback_rows:
                    row['is_augmented'] = False
                augmented_rows.extend(fallback_rows)

            df_aug = pd.DataFrame(augmented_rows)
            final_dfs.append(df_aug)

    df_final = pd.concat(final_dfs, ignore_index=True)

    output_path_csv = 'data/gojek_news_augmented.csv'
    output_path_xlsx = 'data/gojek_news_augmented.xlsx'
    df_final.to_csv(output_path_csv, index=False)
    df_final.to_excel(output_path_xlsx, index=False)

    logger.info("--- PROSES SELESAI ---")
    logger.info(f"Data augmented disimpan di '{output_path_csv}' dan '{output_path_xlsx}'")
    logger.info(f"Total data akhir: {len(df_final)}")
    logger.info(f"Distribusi label baru:\n{df_final['label'].value_counts().to_string()}")

    print("\nSampel data hasil augmentasi:")
    # `display` only exists inside IPython/Jupyter; fall back to print.
    try:
        display(df_final.sample(10)[['cleaned_content', 'label', 'is_augmented']])
    except Exception:
        print(df_final.sample(10)[['cleaned_content', 'label', 'is_augmented']])
