
# 🇮🇩 Sentiment Analysis (Bahasa Indonesia) — Enhanced (Per Bagian)
Notebook ini menggabungkan *pipeline lama* dengan **tiga peningkatan** yang diminta:
1) Normalisasi **singkatan** (gk→tidak, tp→tapi, dll)  
2) Konversi **emoji → kata** (🙂→"senyum", 😢→"sedih", dsb)  
3) Pembuangan **kata sangat jarang** lewat `min_df` pada TF‑IDF (lebih rapi dibanding hapus manual)

Tambahan opsional yang sudah disiapkan:
- **Char n‑grams** (menolong typo & ejaan informal) — aktifkan dengan mengubah `use_char_ngrams = True`
- **Stopword removal + Sastrawi stemming** — aktifkan dengan membuka komentar pada blok terkait


In [None]:

# ============================================================
# Setup & Paths
# ============================================================

# (Opsional) Jalankan di Colab untuk mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

import os
from pathlib import Path

# Preferred CSV location (edit as needed).
# Example on Drive: '/content/drive/My Drive/Proyek/Data/tweet.csv'
CSV_PATH = '/content/drive/My Drive/Proyek/Data/tweet.csv'

# When the preferred CSV is missing, switch to the local demo copy, but only
# if that copy exists; otherwise CSV_PATH is left unchanged.
if not os.path.exists(CSV_PATH):
    alt = '/mnt/data/tweet.csv'
    CSV_PATH = alt if os.path.exists(alt) else CSV_PATH

# Model output directory: the Drive folder when '/content/drive' exists,
# otherwise a folder relative to the current working directory.
OUTPUT_DIR = (
    '/content/drive/My Drive/Proyek/OutputSentimentNew'
    if os.path.exists('/content/drive')
    else './OutputSentimentNew'
)

# Create the output directory up front (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

print('CSV_PATH:', CSV_PATH)
print('OUTPUT_DIR:', OUTPUT_DIR)


In [None]:

# ============================================================
# Install & Import
# ============================================================

# Jalankan sekali (di Colab) jika paket belum terinstal
# !pip -q install emoji Sastrawi

import re, unicodedata, zipfile, joblib
import numpy as np
import pandas as pd

import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score


In [None]:

# ============================================================
# Load Data
# ============================================================
# Read the whole CSV into a DataFrame.
df = pd.read_csv(CSV_PATH)

# Column names used throughout the notebook: raw text and sentiment label.
TEXT_COL = "tweet"
LABEL_COL = "sentimen"

# Quick sanity check: row count and the first three rows.
print("Jumlah baris:", len(df))
print(df.head(3))


In [None]:

# ============================================================
# Normalisasi: encoding, emoji -> kata, slang mapping
# ============================================================

# Informal abbreviation -> standard Indonesian word. Keys are lowercase
# because lookups are performed on lowercased tokens.
SLANG_MAP = {
    # negation
    "gk": "tidak",
    "ga": "tidak",
    "gak": "tidak",
    "nggak": "tidak",
    "ngga": "tidak",
    "tdk": "tidak",
    "engga": "tidak",
    "enggak": "tidak",
    # conjunctions / prepositions
    "tp": "tapi",
    "tpi": "tapi",
    "yg": "yang",
    "dr": "dari",
    "krn": "karena",
    "karna": "karena",
    "dgn": "dengan",
    "dg": "dengan",
    # time / aspect
    "sdh": "sudah",
    "udh": "sudah",
    "udah": "sudah",
    "blm": "belum",
    # miscellaneous
    "sm": "sama",
    "sy": "saya",
    "bgt": "banget",
    "bngt": "banget",
    "dlm": "dalam",
    "bkn": "bukan",
    "utk": "untuk",
    "aja": "saja",
    "jd": "jadi",
    "jg": "juga",
    "krg": "kurang",
    "skrg": "sekarang",
}

# Emoji shortcode (colon-delimited) -> Indonesian word used in its place.
EMOJI_SPECIAL = {
    # smiling / laughing
    ":smiling_face_with_smiling_eyes:": "senyum",
    ":slightly_smiling_face:": "senyum",
    ":grinning_face:": "senyum",
    ":face_with_tears_of_joy:": "tertawa",
    # sad
    ":loudly_crying_face:": "sedih",
    ":crying_face:": "sedih",
    # angry
    ":pouting_face:": "marah",
    ":angry_face:": "marah",
    # other
    ":red_heart:": "cinta",
    ":thumbs_up:": "jempol",
    ":fire:": "api",
}

# Noise patterns removed during cleaning.
URL_RE = re.compile(r"http\S+|www\.\S+")  # http(s) links and www.* links
MENT_RE = re.compile(r"@\w+")             # @mentions
HASH_RE = re.compile(r"#\w+")             # #hashtags (the tag text goes too)
NONAL_RE = re.compile(r"[^a-z\s]")        # anything but a-z and whitespace

def fix_encoding(text: str) -> str:
    """Return *text* as a string with stray characters spaced out and NFKC-normalized.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Non-breaking spaces and the character "Â" are each replaced by
    a plain space before Unicode NFKC normalization is applied.
    """
    cleaned = str(text)
    for stray in ("\xa0", "Â"):
        cleaned = cleaned.replace(stray, " ")
    return unicodedata.normalize("NFKC", cleaned)

def convert_emoji_to_words(text: str) -> str:
    """Replace emoji in *text* with space-padded words.

    Steps, in order:
    1. ``emoji.demojize`` rewrites each emoji as a shortcode delimited by
       " :" and ": ".
    2. Shortcodes listed in ``EMOJI_SPECIAL`` are replaced by their mapped
       word.
    3. Any remaining ``:name:`` made of lowercase letters, digits and
       underscores is replaced by the name with underscores turned into
       spaces.
    """
    converted = emoji.demojize(text, delimiters=(" :", ": "))

    # Curated mappings go first so they win over the generic fallback below.
    for shortcode, word in EMOJI_SPECIAL.items():
        converted = converted.replace(shortcode, " " + word + " ")

    def _spell_out(match):
        # ":red_apple:" -> " red apple "
        return " " + match.group(1).replace("_", " ") + " "

    return re.sub(r":([a-z0-9_]+):", _spell_out, converted)

def normalize_slang(text: str) -> str:
    """Lowercase *text*, expand known slang words, and re-join tokens with spaces.

    The text is tokenized into runs of word characters and single
    non-whitespace characters. Word tokens found in ``SLANG_MAP`` are
    replaced by their mapped form; every other token is kept. Tokens are
    joined with single spaces, so punctuation ends up separated from
    neighbouring words.
    """
    pieces = []
    # Group 1 is set only for word tokens; a lone symbol matches the \S branch.
    for found in re.finditer(r"(\w+)|\S", text.lower()):
        token = found.group(0)
        is_word = found.group(1) is not None
        if is_word and token in SLANG_MAP:
            pieces.append(SLANG_MAP[token])
        else:
            pieces.append(token)
    return " ".join(pieces)

def clean_text_v2(text: str) -> str:
    """Run the full cleaning pipeline on one text and return the result.

    Order of operations: encoding fix, lowercasing, removal of URLs,
    mentions and hashtags, emoji-to-word conversion, slang normalization,
    replacement of every character outside a-z/whitespace with a space, and
    finally whitespace collapsing plus trimming.
    """
    cleaned = fix_encoding(text).lower()

    # Drop URLs first, then @mentions and #hashtags; each becomes a space.
    for noise_pattern in (URL_RE, MENT_RE, HASH_RE):
        cleaned = noise_pattern.sub(" ", cleaned)

    # Emoji must become words before non-letters are stripped below.
    cleaned = normalize_slang(convert_emoji_to_words(cleaned))

    cleaned = NONAL_RE.sub(" ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()


In [None]:

# ============================================================
# Terapkan Cleaning (opsional: stopword & stemming)
# ============================================================

# Clean every tweet (cast to str first) and keep the result in a new column.
df["clean_tweet"] = df[TEXT_COL].astype(str).map(clean_text_v2)

# --- OPTIONAL: uncomment to add stopword removal + Sastrawi stemming ---
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# stop_factory = StopWordRemoverFactory()
# stop_remover = stop_factory.create_stop_word_remover()
# stemmer = StemmerFactory().create_stemmer()
# def indo_stop_stem(text):
#     text = stop_remover.remove(text)
#     text = stemmer.stem(text)
#     return text
# df["clean_tweet"] = df["clean_tweet"].apply(indo_stop_stem)

# Notebook preview: raw text, cleaned text and label for the first 5 rows.
df[[TEXT_COL, "clean_tweet", LABEL_COL]].head(5)


In [None]:

# ============================================================
# Feature: TF-IDF (min_df untuk buang kata sangat jarang)
# (Opsional) gabungan char n-grams untuk typo/ejaan informal
# ============================================================

from scipy.sparse import hstack

# Word-level TF-IDF over unigrams and bigrams.
# min_df=5 drops terms that appear in fewer than 5 documents.
word_vectorizer = TfidfVectorizer(
    analyzer="word", ngram_range=(1, 2), min_df=5, max_df=0.95, max_features=20000
)

# OPTIONAL: set to True to append character n-gram features.
use_char_ngrams = False
char_vectorizer = TfidfVectorizer(
    analyzer="char", ngram_range=(3, 5), min_df=5, max_df=0.95
)

# NOTE(review): the vectorizers are fitted on every row of df here, before
# the train/test split that happens afterwards, so test rows contribute to
# the vocabulary and IDF weights. Consider fitting on training texts only.
X_word = word_vectorizer.fit_transform(df["clean_tweet"])

# Default to word features; stack character features beside them on request.
X = X_word
if use_char_ngrams:
    X_char = char_vectorizer.fit_transform(df["clean_tweet"])
    X = hstack([X_word, X_char]).tocsr()

y = df[LABEL_COL]
print("Shape features:", X.shape)


In [None]:

# ============================================================
# Split Train/Test
# ============================================================

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train:", X_train.shape, " Test:", X_test.shape)


In [None]:

# ============================================================
# Modeling: Naive Bayes & Logistic Regression
# ============================================================

def _print_evaluation(header, y_true, y_pred):
    """Print *header*, then accuracy, macro-F1, the per-class report and the confusion matrix."""
    print(header)
    print("Accuracy :", round(accuracy_score(y_true, y_pred), 4))
    print("F1-macro :", round(f1_score(y_true, y_pred, average="macro"), 4))
    print(classification_report(y_true, y_pred, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


# Multinomial Naive Bayes with alpha=1.0.
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
_print_evaluation("=== MultinomialNB ===", y_test, y_pred_nb)

# Logistic Regression with class_weight="balanced".
lr = LogisticRegression(max_iter=1000, solver="liblinear", class_weight="balanced")
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
_print_evaluation("\n=== Logistic Regression ===", y_test, y_pred_lr)


In [None]:

# ============================================================
# Save Model & Vectorizer
# ============================================================

def _dump_to_output(obj, filename):
    """Serialize *obj* with joblib into OUTPUT_DIR under *filename*."""
    joblib.dump(obj, os.path.join(OUTPUT_DIR, filename))


# Vectorizers first; the character vectorizer only when the flag is defined
# and truthy.
_dump_to_output(word_vectorizer, "tfidf_word_vectorizer.joblib")
if globals().get('use_char_ngrams'):
    _dump_to_output(char_vectorizer, "tfidf_char_vectorizer.joblib")

# Then both trained classifiers.
_dump_to_output(nb, "model_nb.joblib")
_dump_to_output(lr, "model_lr.joblib")

print("Saved to:", OUTPUT_DIR)
print("Files:", os.listdir(OUTPUT_DIR))


In [None]:

# ============================================================
# Inference (Contoh Pemakaian)
# ============================================================
def predict_texts(texts, model="lr"):
    """Clean, vectorize and classify *texts* with one of the trained models.

    Args:
        texts: Iterable of raw texts. Each element is cleaned with
            ``clean_text_v2``; note that a bare string would be iterated
            character by character.
        model: ``"lr"`` selects the logistic-regression model; any other
            value selects the Naive Bayes model.

    Returns:
        A tuple ``(predictions, probabilities)`` from the chosen model's
        ``predict`` and ``predict_proba``.
    """
    cleaned = list(map(clean_text_v2, texts))
    features = word_vectorizer.transform(cleaned)

    # Mirror training: append character n-gram features only when the flag
    # is defined and truthy.
    if globals().get('use_char_ngrams'):
        char_features = char_vectorizer.transform(cleaned)
        from scipy.sparse import hstack
        features = hstack([features, char_features]).tocsr()

    if model == "lr":
        classifier = lr
    else:
        classifier = nb

    labels = classifier.predict(features)
    return labels, classifier.predict_proba(features)

# Three example inputs run through predict_texts with the
# logistic-regression model.
sample_texts = [
    "Senang banget! Pelayanannya cepat :)",
    "Biasa aja sih, standar.",
    "Kecewa berat, parah nih kualitasnya 😡"
]
preds, probs = predict_texts(sample_texts, model="lr")
# Show each text with its predicted label and the largest class probability.
for t, p, pr in zip(sample_texts, preds, probs):
    print(f"- {t} => {p} | proba={np.max(pr):.3f}")


In [None]:

# ============================================================
# (Opsional) Zip Output Model untuk diunduh
# ============================================================
# Bundle the saved .joblib artifacts from OUTPUT_DIR into a single ZIP file.
zip_path = os.path.join(OUTPUT_DIR, "OutputSentimentNew_models.zip")
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    # sorted() makes the archive member order deterministic.
    for fn in sorted(os.listdir(OUTPUT_DIR)):
        fp = os.path.join(OUTPUT_DIR, fn)
        # Archive regular .joblib files only. This skips subdirectories,
        # unrelated files, and the ZIP being written: it already exists in
        # OUTPUT_DIR at this point, and the previous filter let .zip files
        # through, so the archive was added to itself.
        if not (os.path.isfile(fp) and fn.endswith(".joblib")):
            continue
        zf.write(fp, arcname=fn)

print("ZIP ready:", zip_path)
