# Sentiment Analysis (Bahasa Indonesia) — Versi Tuned (Per Bagian)

Konfigurasi dataset & output:
- **TEXT_COL** = `tweet`
- **LABEL_COL** = `sentimen`
- **CSV_PATH** = `"/content/drive/My Drive/Proyek/Data/"`
- **DATA_PATH** = `os.path.join(CSV_PATH, "tweet.csv")`
- **OUTPUT_DIR & MODEL_DIR** = `"/content/drive/My Drive/Proyek/OutputSentimentNew"` (semua artefak tersimpan di Drive)


## 0) Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## 1) Setup: Instalasi Paket (Colab)

In [2]:
# Jalankan hanya sekali (boleh di-skip kalau sudah terpasang)
!pip -q install pandas scikit-learn matplotlib joblib nltk Sastrawi imbalanced-learn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2) Import & Konfigurasi Dasar

In [3]:
import os, re, json, joblib, numpy as np, pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import loguniform # Changed import location
import matplotlib.pyplot as plt
import nltk

# ====== Konfigurasi ======
SEED = 42
np.random.seed(SEED)

# Kolom & path dataset
TEXT_COL = "tweet"
LABEL_COL = "sentimen"

CSV_PATH = "/content/drive/My Drive/Proyek/Data/"
DATA_PATH = os.path.join(CSV_PATH, "tweet.csv")

# Output langsung ke Google Drive (folder baru)
OUTPUT_DIR = "/content/drive/My Drive/Proyek/OutputSentimentNew"
MODEL_DIR  = OUTPUT_DIR  # sama saja
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Stopwords Indonesia
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Stemmer Sastrawi
stemmer = StemmerFactory().create_stemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 3) Load Dataset

In [4]:
# Membaca CSV dan drop kolom index lama jika ada
df = pd.read_csv(DATA_PATH)
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

assert TEXT_COL in df.columns and LABEL_COL in df.columns, f"Kolom {TEXT_COL} dan/atau {LABEL_COL} tidak ditemukan!"

print("Jumlah data:", len(df))
print("Distribusi label:")
print(df[LABEL_COL].value_counts())
df.head()


Jumlah data: 1815
Distribusi label:
sentimen
positif    612
netral     607
negatif    596
Name: count, dtype: int64


Unnamed: 0,sentimen,tweet
0,negatif,Kata @prabowo Indonesia tidak dihargai bangsa ...
1,netral,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib..."
2,netral,"Di era Jokowi, ekonomi Indonesia semakin baik...."
3,positif,"Bagi Sumatera Selatan, Asian Games berdampak p..."
4,negatif,Negara kita ngutang buat bngun infrastruktur y...


## 4) Preprocessing: Slang Normalization, Negation Marking, Hashtag

In [5]:
import re

NEGATIONS = {"tidak","tak","nggak","ga","gak","enggak","bukan"}

SLANG = {
    "bgt":"banget","bgtt":"banget","bener2":"benar-benar","bner":"benar",
    "tp":"tapi","yg":"yang","ga":"gak","gk":"gak","ngga":"gak","nggak":"gak",
    "sm":"sama","dr":"dari","krn":"karena","krna":"karena","krnnya":"karena"
}

def normalize_elongation(w: str) -> str:
    # batasi pengulangan karakter: 'baguuuusss' -> 'baguus'
    return re.sub(r"(.)\1{2,}", r"\1\1", w)

def normalize_slang(w: str) -> str:
    w = normalize_elongation(w)
    return SLANG.get(w, w)

def extract_hashtags(text: str):
    return re.findall(r"#(\w+)", text)

def mark_negation(tokens):
    out = []
    negate = False
    for w in tokens:
        if w in NEGATIONS:
            out.append(w)   # simpan kata negasinya
            negate = True
            continue
        out.append((f"NEG_{w}") if negate else w)
        if negate:
            negate = False  # rule sederhana: negasi hanya memodifikasi token berikutnya
    return out

def clean_text_sentiment(text: str) -> str:
    text = str(text).lower()
    hashtags = extract_hashtags(text)  # simpan isi hashtag (tanpa simbol #)
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)  # hapus URL
    text = re.sub(r"@\w+", " ", text)  # hapus mention
    text = re.sub(r"#\w+", " ", text)  # hapus simbol #, kontennya sudah di 'hashtags'
    text = re.sub(r"[^a-z\s]", " ", text)  # buang non-alfabet

    tokens = [normalize_slang(w) for w in text.split()]

    # jangan buang kata negasi meski ada di stopwords
    tokens = [w for w in tokens if (w not in stop_words or w in NEGATIONS)]

    # stemming terakhir agar bentuk konsisten
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = mark_negation(tokens)

    # tambahkan konten hashtag sebagai token (distem)
    tokens += [stemmer.stem(h) for h in hashtags]
    return " ".join(tokens)

# Terapkan cleaning
df["clean_text"] = df[TEXT_COL].astype(str).apply(clean_text_sentiment)
df[[TEXT_COL, "clean_text", LABEL_COL]].head(10)


Unnamed: 0,tweet,clean_text,sentimen
0,Kata @prabowo Indonesia tidak dihargai bangsa ...,indonesia tidak NEG_harga bangsa asing berita ...,negatif
1,"Batuan Langka, Tasbih Jokowi Hadiah dari Habib...",batu langka tasbih jokowi hadiah habib luthfi ...,netral
2,"Di era Jokowi, ekonomi Indonesia semakin baik....",era jokowi ekonomi indonesia pic twitter com w...,netral
3,"Bagi Sumatera Selatan, Asian Games berdampak p...",sumatera selatan asi games dampak pd ekonomi l...,positif
4,Negara kita ngutang buat bngun infrastruktur y...,negara ngutang bngun infrastruktur udah dipake...,negatif
5,"Yg bisikin pak jokowi, cm mikirin perputaran d...",bisikin jokowi cm mikirin putar duit golong e ...,netral
6,Masa tenang msih ngoceh aja..ttp jokowi harga ...,tenang msih ngoceh aja ttp jokowi harga mati,positif
7,#UASdifitnahKejiBalasDiTPS kerjasa ekonomi b...,kerjasa ekonomi bilateral multilateral doa tps...,positif
8,"Iya bener Aa, kita MANTAP kan pilihan ke Pemim...",iya bener aa mantap pilih pimpin bs solusi buk...,netral
9,Prabowo-Sandi Sepakat Tak Ambil Gaji karena Ne...,prabowo sandi sepakat tak NEG_ambil gaji negar...,negatif


## 5) Split Data: Train / Test

In [6]:
from sklearn.model_selection import train_test_split
X = df["clean_text"].values
y = df[LABEL_COL].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

print("Train size:", len(X_train), "| Test size:", len(X_test))


Train size: 1452 | Test size: 363


## 6) Vectorization: Word TF-IDF + Char TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

word_vect = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=20000,
    min_df=2,
    sublinear_tf=True
)
char_vect = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    min_df=2,
    sublinear_tf=True
)

X_train_word = word_vect.fit_transform(X_train)
X_test_word  = word_vect.transform(X_test)
X_train_char = char_vect.fit_transform(X_train)
X_test_char  = char_vect.transform(X_test)

X_train_all = hstack([X_train_word, X_train_char])
X_test_all  = hstack([X_test_word, X_test_char])

X_train_all.shape, X_test_all.shape


((1452, 32577), (363, 32577))

## 7) (Opsional) Oversampling Training Set

In [8]:
from imblearn.over_sampling import RandomOverSampler
apply_oversampling = False  # ubah ke True jika ingin oversampling
if apply_oversampling:
    ros = RandomOverSampler(random_state=SEED)
    X_train_all, y_train = ros.fit_resample(X_train_all, y_train)
    print('Oversampling diterapkan.')
    from collections import Counter
    print('Distribusi label (train) setelah ROS:', Counter(y_train))
else:
    print('Oversampling tidak diterapkan.')


Oversampling tidak diterapkan.


## 8) Model Selection: RandomizedSearchCV (LR & NB)

In [10]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import loguniform # Corrected import location

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Logistic Regression
lr = LogisticRegression(max_iter=1000, solver="liblinear", class_weight="balanced", random_state=SEED)
param_lr = {"C": loguniform(1e-2, 1e2), "penalty": ["l1","l2"]}
rs_lr = RandomizedSearchCV(lr, param_lr, n_iter=20, scoring="f1_macro", cv=cv, n_jobs=-1, random_state=SEED, verbose=1)
rs_lr.fit(X_train_all, y_train)

# MultinomialNB
nb = MultinomialNB()
param_nb = {"alpha": loguniform(1e-3, 10)}
rs_nb = RandomizedSearchCV(nb, param_nb, n_iter=20, scoring="f1_macro", cv=cv, n_jobs=-1, random_state=SEED, verbose=1)
rs_nb.fit(X_train_all, y_train)

cands = [("LR", rs_lr.best_estimator_, rs_lr.best_score_), ("NB", rs_nb.best_estimator_, rs_nb.best_score_)]
best_name, best_model, best_cv = max(cands, key=lambda x: x[2])
print(f"[CV] Best: {best_name} | mean F1-macro: {best_cv:.4f}")
best_model

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] Best: NB | mean F1-macro: 0.6261


## 9) Training Ulang Model Terbaik & Evaluasi di Test Set

In [11]:
best_model.fit(X_train_all, y_train)
y_pred = best_model.predict(X_test_all)

from sklearn.metrics import classification_report, confusion_matrix, f1_score
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

f1m = f1_score(y_test, y_pred, average="macro")
print(f"F1-macro (test): {f1m:.4f}")


              precision    recall  f1-score   support

     negatif     0.5931    0.7227    0.6515       119
      netral     0.6083    0.6033    0.6058       121
     positif     0.6020    0.4797    0.5339       123

    accuracy                         0.6006       363
   macro avg     0.6012    0.6019    0.5971       363
weighted avg     0.6012    0.6006    0.5964       363

Confusion matrix:
[[86 19 14]
 [23 73 25]
 [36 28 59]]
F1-macro (test): 0.5971


## 10) Error Analysis (Contoh)

In [12]:
import pandas as pd
test_df = pd.DataFrame({
    "text": X_test,
    "clean_text": [t for t in X_test],
    "true": y_test,
    "pred": y_pred
})
test_df["correct"] = (test_df["true"] == test_df["pred"]).astype(int)
print("Akurasi sederhana:", test_df["correct"].mean())

# Tampilkan contoh prediksi salah
errors = test_df[test_df["correct"]==0].copy()
print("Total errors:", len(errors))
errors.head(20)


Akurasi sederhana: 0.6005509641873278
Total errors: 145


Unnamed: 0,text,clean_text,true,pred,correct
0,mohon bilang pangeran muhammad bin salman tolo...,mohon bilang pangeran muhammad bin salman tolo...,netral,negatif,0
7,klo jokowi butuh gaji utk nafkah kluarga jokow...,klo jokowi butuh gaji utk nafkah kluarga jokow...,positif,negatif,0
8,petinggi partai demokrat terima nyata hina pra...,petinggi partai demokrat terima nyata hina pra...,negatif,netral,0
9,maaf rakyat tdk butuh peluk rakyat butuh harga...,maaf rakyat tdk butuh peluk rakyat butuh harga...,netral,negatif,0
12,bicara pajak naik pajak serba pajak pajak baya...,bicara pajak naik pajak serba pajak pajak baya...,negatif,netral,0
14,fyi digital ekonomi papar cawapres laku era pe...,fyi digital ekonomi papar cawapres laku era pe...,positif,netral,0
17,asli ngakak jokowi ekonomi digital jokowimenan...,asli ngakak jokowi ekonomi digital jokowimenan...,positif,netral,0
18,gaji prabowo sandi sumbang pimpin 2019gantipre...,gaji prabowo sandi sumbang pimpin 2019gantipre...,positif,netral,0
21,wowo salah arah ekonomi akibat deindustrialisa...,wowo salah arah ekonomi akibat deindustrialisa...,negatif,netral,0
23,sodaqohkan gaji nya uasdifitnahkejidanbrutal,sodaqohkan gaji nya uasdifitnahkejidanbrutal,netral,positif,0


## 11) Simpan Model & Vectorizer (Drive)

In [18]:
import zipfile
import os

zip_path = os.path.join(OUTPUT_DIR, "artefak_model.zip")

# Use zipfile directly for better control and ZIP64 support
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zipf:
    for foldername, subfolders, filenames in os.walk(OUTPUT_DIR):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)
            # Add file to zip, preserving the directory structure within the zip
            # Ensure that we are not trying to zip the zip file itself
            if file_path != zip_path:
                zipf.write(file_path, os.path.relpath(file_path, OUTPUT_DIR))

print("ZIP created:", zip_path)

ZIP created: /content/drive/My Drive/Proyek/OutputSentimentNew/artefak_model.zip


## 12) Inference: Prediksi Data Baru (Single & Batch)

In [19]:
from scipy.sparse import hstack

# ----- Single text -----
sample_text = "Filmnya tidak bagus sama sekali, aku kecewa bgt."
def predict_text(text, model=best_model, word_v=word_vect, char_v=char_vect):
    ct = clean_text_sentiment(text)
    Xw = word_v.transform([ct])
    Xc = char_v.transform([ct])
    Xall = hstack([Xw, Xc])
    pred = model.predict(Xall)[0]
    return pred, ct

pred, cleaned = predict_text(sample_text)
print("Teks:", sample_text)
print("Bersih:", cleaned)
print("Prediksi:", pred)

# ----- Batch (CSV dengan kolom TEXT_COL) -----
# csv_path = os.path.join(CSV_PATH, "data_baru.csv")
# df_new = pd.read_csv(csv_path)
# df_new["clean_text"] = df_new[TEXT_COL].astype(str).apply(clean_text_sentiment)
# Xw = word_vect.transform(df_new["clean_text"])
# Xc = char_vect.transform(df_new["clean_text"])
# Xall = hstack([Xw, Xc])
# df_new["pred"] = best_model.predict(Xall)
# out_path = os.path.join(OUTPUT_DIR, "prediksi_baru.csv")
# df_new.to_csv(out_path, index=False)
# print("Prediksi batch disimpan ke:", out_path)
# df_new.head()


Teks: Filmnya tidak bagus sama sekali, aku kecewa bgt.
Bersih: film tidak NEG_bagus kecewa banget
Prediksi: netral
