In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from typing import Dict, List, Set
import warnings
warnings.filterwarnings('ignore')

# Download the NLTK resources this notebook relies on.
nltk.download('punkt')
# NLTK >= 3.8.2 makes word_tokenize load its models from 'punkt_tab' instead
# of the pickled 'punkt' package; fetch both so the tokenizer works on old
# and new NLTK releases alike.
nltk.download('punkt_tab')
nltk.download('stopwords')
# NOTE(review): nothing in the visible code uses WordNet; kept so any other
# cell that relies on it keeps working.
nltk.download('wordnet')

class IndonesianTextPreprocessor:
    """Clean and normalize informal Indonesian text for text classification.

    The pipeline strips emojis and social-media artefacts (URLs, e-mail
    addresses, mentions, hashtags, retweet markers, laughter tokens),
    lowercases, expands abbreviations and slang through lookup tables,
    drops everything that is not an ASCII letter, removes stopwords and
    optionally stems the remaining tokens.

    Attributes
    ----------
    stemmer
        Stemmer created by ``StemmerFactory().create_stemmer()``.
    contractions : Dict[str, str]
        Abbreviation -> full form lookup table.
    slang : Dict[str, str]
        Slang -> normalized form lookup table.
    stopwords : Set[str]
        Tokens removed after tokenization.
    special_patterns : List[str]
        Regular expressions removed from the raw text, applied in list order.
    """

    # Compiled once at class creation instead of on every call.
    _EMOJI_PATTERN = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        # Very wide range: it also spans non-emoji code points. All of them
        # are non-ASCII and would be dropped by remove_special_characters.
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )

    # Maps every ASCII punctuation character to a space, so that words joined
    # only by punctuation ("bagus,tapi") are separated instead of merged.
    _PUNCTUATION_TO_SPACE = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )

    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.contractions: Dict[str, str] = self._init_contractions()
        self.slang: Dict[str, str] = self._init_slang()
        self.stopwords: Set[str] = self._init_stopwords()
        self.special_patterns: List[str] = self._init_special_patterns()

    def _init_special_patterns(self) -> List[str]:
        """Return the regexes stripped from raw text, in application order.

        Order matters: URLs and e-mail addresses are removed before
        mentions, otherwise ``@\\w+`` would eat the ``@domain`` part of an
        address and leave the rest behind. Underscores are removed before
        the laughter patterns because ``_`` counts as a word character and
        would hide the word boundary those patterns rely on.
        """
        return [
            r'https?://\S+',  # URLs
            r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+',  # Email addresses
            r'@\w+',  # Username mentions
            r'#\w+',  # Hashtags
            r'\bRT\s+',  # Retweet marker (whole word only, case-sensitive)
            r'_+',  # Repeated underscores
            # Laughter tokens are matched as whole words and case-insensitively
            # (they run before lowercasing); unanchored patterns would delete
            # letters inside ordinary words such as "harus" or "mahasiswa".
            r'(?i)\b[wk]*wk[wk]*\b',  # "wkwk" variants
            r'(?i)\bh[ha]+\b',  # "haha" variants
        ]

    def _init_contractions(self) -> Dict[str, str]:
        """Initialize contraction dictionary with common Indonesian contractions"""
        base_contractions = {
            'dgn': 'dengan', 'yg': 'yang', 'utk': 'untuk', 'tdk': 'tidak',
            'krn': 'karena', 'hrs': 'harus', 'sdh': 'sudah', 'spy': 'supaya',
            'trs': 'terus', 'gk': 'tidak', 'ga': 'tidak', 'nggak': 'tidak',
            'gak': 'tidak', 'udh': 'sudah', 'udah': 'sudah', 'klo': 'kalau',
            'kalo': 'kalau', 'gitu': 'begitu', 'gmn': 'bagaimana', 'emg': 'memang',
            'emang': 'memang', 'bgt': 'banget', 'bngt': 'banget', 'dpat': 'dapat',
            'dpt': 'dapat', 'tp': 'tapi', 'tpi': 'tapi', 'skrg': 'sekarang',
            'skg': 'sekarang', 'org': 'orang', 'orng': 'orang', 'sy': 'saya',
            'sya': 'saya', 'km': 'kamu', 'kmu': 'kamu', 'ak': 'aku', 'aq': 'aku',
            'bs': 'bisa', 'bsa': 'bisa', 'br': 'baru', 'bru': 'baru',
            'pd': 'pada', 'sprti': 'seperti', 'spt': 'seperti', 'msh': 'masih',
            'msi': 'masih', 'dr': 'dari', 'dri': 'dari', 'dlm': 'dalam',
            'dal': 'dalam', 'sbg': 'sebagai', 'sbgi': 'sebagai', 'smp': 'sampai',
            'smpe': 'sampai', 'cb': 'coba', 'cba': 'coba', 'tdak': 'tidak',
            # Additions from the new dataset
            'gue': 'saya', 'lu': 'kamu', 'ni': 'ini', 'tu': 'itu',
            'kl': 'kalau', 'gt': 'begitu', 'mah': 'memang', 'ya': 'iya',
            'yh': 'iya', 'kaya': 'seperti', 'cos': 'karena', 'coz': 'karena',
            'gr': 'gara-gara', 'ngga': 'tidak', 'gada': 'tidak ada',
            'aj': 'saja', 'aja': 'saja', 'dah': 'sudah',
        }
        return base_contractions

    def _init_slang(self) -> Dict[str, str]:
        """Initialize slang dictionary with common Indonesian informal terms"""
        base_slang = {
            'mantap': 'mantap', 'mantul': 'mantap', 'keren': 'bagus',
            'kereen': 'bagus', 'kece': 'bagus', 'oke': 'baik', 'ok': 'baik',
            'sip': 'baik', 'sippp': 'baik', 'good': 'baik', 'nice': 'baik',
            'mantab': 'mantap', 'kenceng': 'cepat', 'lemot': 'lambat',
            'lelet': 'lambat', 'telat': 'terlambat', 'telmi': 'lambat mengerti',
            'gercep': 'cepat', 'slow': 'lambat', 'fast': 'cepat', 'asik': 'asyik',
            'asek': 'asyik', 'asiik': 'asyik', 'asykk': 'asyik',
            # Additions from the new dataset
            'njirr': 'anjing', 'anjinggg': 'anjing', 'asu': 'anjing',
            'bangsat': 'buruk', 'bangsatttt': 'buruk',
            'worthit': 'bermanfaat', 'worth': 'bermanfaat',
            'php': 'palsu', 'gemes': 'kesal',
            'mayan': 'lumayan', 'lumayanlah': 'lumayan',
            'bagusnya': 'bagus', 'bagusss': 'bagus',
            'seruuu': 'seru', 'seruu': 'seru'
        }
        return base_slang

    def _init_stopwords(self) -> Set[str]:
        """Initialize comprehensive Indonesian stopwords set"""
        # NOTE(review): "tidak" (negation) is listed here, and several
        # contractions expand to it, so negations are dropped from the
        # output. Confirm this is intended for sentiment classification.
        base_stopwords = {
            "yang", "di", "ke", "dari", "pada", "dalam", "untuk", "dengan", "dan",
            "akan", "tentang", "seperti", "dapat", "juga", "sudah", "saya", "anda",
            "dia", "mereka", "kita", "ada", "tidak", "saat", "oleh", "setelah",
            "kepada", "sebagai", "ini", "itu", "jika", "sehingga", "karena",
            "dimana", "ketika", "yaitu", "yakni", "daripada", "sejak", "sambil",
            "bahwa", "namun", "menurut", "hampir", "dimana", "bagaimana", "selama",
            "siapa", "mengapa", "kapan", "kemana", "apakah", "harus", "samping",
            "sedang", "selagi", "sementara", "tetap", "apabila", "sebelum",
            "sesudah", "supaya", "dengan", "agar", "lain", "pula", "padahal",
            "berada", "terhadap", "semua", "belum", "atas", "bawah", "telah",
            "guna", "kali", "cara", "dalam", "tak", "per", "bagi", "serta",
            # Domain-specific stopwords for the Kampus Merdeka context
            "kampus", "merdeka", "kuliah", "mahasiswa", "program", "semester",
            "belajar", "magang", "msib", "iisma", "pmm", "pendidikan",
            "universitas", "fakultas", "dosen", "mata", "sks", "sistem",
            "kurikulum", "nadiem"
        }
        return base_stopwords

    def _remove_emoji(self, text: str) -> str:
        """Replace each run of emoji characters with a single space.

        A space (rather than nothing) keeps words that were separated only
        by an emoji from being merged; surplus spaces are collapsed later in
        the pipeline.
        """
        return self._EMOJI_PATTERN.sub(' ', text)

    def remove_special_characters(self, text: str) -> str:
        """Remove special characters and numbers"""
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text

    def remove_multiple_spaces(self, text: str) -> str:
        """Remove multiple spaces"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def remove_repeated_characters(self, text: str) -> str:
        """Shorten any run of a repeated character to exactly two occurrences"""
        return re.sub(r'(.)\1+', r'\1\1', text)

    def expand_contractions(self, text: str) -> str:
        """Expand contractions using the contractions dictionary"""
        words = text.split()
        return ' '.join([self.contractions.get(word.lower(), word) for word in words])

    def normalize_slang(self, text: str) -> str:
        """Normalize slang words using the slang dictionary"""
        words = text.split()
        return ' '.join([self.slang.get(word.lower(), word) for word in words])

    def _clean_text(self, text: str) -> str:
        """Run every character-level cleaning step that precedes tokenization.

        Returns lowercase text made of ASCII letters separated by single
        spaces.
        """
        text = self._remove_emoji(text)

        # Substitute a space so that neighbouring words are not glued together.
        for pattern in self.special_patterns:
            text = re.sub(pattern, ' ', text)

        # Case folding
        text = text.lower()

        # Strip punctuation BEFORE the dictionary lookups so that tokens such
        # as "yg," or "gak." are still recognised.
        text = text.translate(self._PUNCTUATION_TO_SPACE)

        # Slang lookup must run before repeated characters are collapsed:
        # the slang table contains elongated spellings such as "sippp".
        text = self.expand_contractions(text)
        text = self.normalize_slang(text)

        # Drop digits, non-ASCII characters and the hyphen introduced by the
        # "gara-gara" expansion.
        text = self.remove_special_characters(text)

        text = self.remove_repeated_characters(text)
        return self.remove_multiple_spaces(text)

    def preprocess(self, text: str, stem: bool = True) -> str:
        """
        Complete preprocessing pipeline for Indonesian text

        Parameters:
        -----------
        text : str
            Input text to preprocess. Non-string values (e.g. ``None`` or the
            NaN pandas uses for missing cells) yield an empty string.
        stem : bool, optional (default=True)
            Whether to apply stemming

        Returns:
        --------
        str
            Preprocessed text: remaining tokens joined by single spaces
        """
        if not isinstance(text, str):
            return ''

        text = self._clean_text(text)

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords
        tokens = [word for word in tokens if word not in self.stopwords]

        # Stemming (optional)
        if stem:
            tokens = [self.stemmer.stem(token) for token in tokens]

        return ' '.join(tokens)

# 1. LOAD DATA
def load_data(file_path):
    """Read the CSV at *file_path* into a DataFrame and report its row count.

    Parameters
    ----------
    file_path
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    frame = pd.read_csv(file_path)
    row_count = len(frame)
    print("Jumlah data awal:", row_count)
    return frame

# 2. PEMROSESAN DATA
def prepare_data(df):
    """Clean the texts, split into train/test sets and build TF-IDF features.

    The split is performed BEFORE the vectorizer is fitted, so vocabulary and
    IDF weights are learned from the training texts only; the test texts are
    merely transformed. Fitting on the full corpus would leak test-set
    statistics into the features and inflate the evaluation scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``full_text`` (raw text) and ``label``.
        A ``cleaned_text`` column is added to it in place.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, vectorizer)`` where the ``X``
        values are TF-IDF matrices and ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    preprocessor = IndonesianTextPreprocessor()

    print("Melakukan preprocessing...")
    # Missing texts (NaN) are treated as empty documents instead of crashing
    # the preprocessor.
    raw_texts = df['full_text'].fillna('').astype(str)
    df['cleaned_text'] = raw_texts.apply(lambda x: preprocessor.preprocess(x, stem=True))
    y = df['label']

    # Stratified 80/20 split on the cleaned texts (same seed as before).
    text_train, text_test, y_train, y_test = train_test_split(
        df['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
    )

    print("Melakukan vectorization...")
    vectorizer = TfidfVectorizer(
        max_features=5000,
        min_df=2,
        max_df=0.95,
        ngram_range=(1, 2)  # unigrams and bigrams
    )
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    return X_train, X_test, y_train, y_test, vectorizer

def train_evaluate_models(X_train, X_test, y_train, y_test):
    """Fit four classifiers and score each one on the held-out test set.

    The candidates are a random forest, Bernoulli naive Bayes, logistic
    regression and a linear-kernel SVM. For every model a classification
    report is printed, and accuracy plus weighted precision, recall and F1
    are collected.

    Returns
    -------
    dict
        ``{model name: {'accuracy', 'precision', 'recall', 'f1'}}``.
    """
    candidates = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Naive Bayes': BernoulliNB(),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'SVM': SVC(kernel='linear', random_state=42)
    }
    # Metrics that are averaged with class-support weights.
    weighted_metrics = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    scoreboard = {}
    for label, estimator in candidates.items():
        print(f"\nMelatih model {label}...")
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        scores = {'accuracy': accuracy_score(y_test, predictions)}
        for metric_name, metric_fn in weighted_metrics.items():
            scores[metric_name] = metric_fn(y_test, predictions, average='weighted')
        scoreboard[label] = scores

        print(f"Hasil evaluasi {label}:")
        print(classification_report(y_test, predictions))

    return scoreboard

def analyze_results(results):
    """Print a comparison table and pick the model with the highest F1-score.

    Parameters
    ----------
    results : dict
        ``{model name: {metric name: score}}``; every entry needs an ``'f1'``
        key.

    Returns
    -------
    tuple
        ``(model name, metrics dict)`` of the best model. On a tie the model
        that appears first in *results* wins.
    """
    print("\nPerbandingan Performa Model:")
    # Scores rounded to three decimals and shown as percentages.
    table = pd.DataFrame(results).round(3).mul(100)
    print(table)

    # Choose the best model by F1-score.
    winner = max(results, key=lambda model_name: results[model_name]['f1'])
    winner_scores = results[winner]
    print(f"\nModel terbaik adalah {winner} dengan F1-score: {winner_scores['f1']:.3f}")

    return (winner, winner_scores)

def main():
    """Run the pipeline end to end: load, prepare, train/evaluate, summarize."""
    print("Loading data...")
    dataset = load_data('kampus_merdeka_cleaned.csv')

    # The fitted vectorizer is returned as well but is not needed here.
    X_train, X_test, y_train, y_test, _ = prepare_data(dataset)

    print("\nMelatih dan mengevaluasi model...")
    scores_by_model = train_evaluate_models(X_train, X_test, y_train, y_test)

    best_name, best_metrics = analyze_results(scores_by_model)

    # Final summary of the winning model.
    print("\nKESIMPULAN:")
    print(f"1. Model {best_name} menunjukkan performa terbaik dalam klasifikasi sentimen.")
    print("2. Metrik evaluasi model terbaik:")
    for metric_name in best_metrics:
        print(f"   - {metric_name}: {best_metrics[metric_name]:.3f}")

# Run the pipeline only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading data...
Jumlah data awal: 600
Melakukan preprocessing...
Melakukan vectorization...

Melatih dan mengevaluasi model...

Melatih model Random Forest...
Hasil evaluasi Random Forest:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75        63
           1       0.73      0.67      0.70        57

    accuracy                           0.72       120
   macro avg       0.73      0.72      0.72       120
weighted avg       0.73      0.72      0.72       120


Melatih model Naive Bayes...
Hasil evaluasi Naive Bayes:
              precision    recall  f1-score   support

           0       0.72      0.86      0.78        63
           1       0.80      0.63      0.71        57

    accuracy                           0.75       120
   macro avg       0.76      0.74      0.74       120
weighted avg       0.76      0.75      0.75       120


Melatih model Logistic Regression...
Hasil evaluasi Logistic Regression:
              precision  