In [2]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m112.6/209.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [3]:
# ========================
# 1. Import Library
# ========================
import pandas as pd
import numpy as np
import re
import random
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# ========================
# 2. Dummy data
# ========================
# Hand-written sample comments that receive the "judi" label below.
judi_komentar = [
    "ayo join judi online gampang menang",
    "slot gacor hari ini pasti jackpot",
    "link judi terpercaya bonus besar",
    "deposit pulsa tanpa potongan untuk judi",
    "main slot online modal kecil untung besar",
    "situs judi bola resmi aman terpercaya",
    "daftar judi online hadiah menarik",
    "agen slot online 24 jam gacor",
    "bandar judi terpercaya anti kalah",
    "bonus new member judi slot 100%",
]

# Hand-written sample comments that receive the "non_judi" label below.
non_judi_komentar = [
    "promo diskon belanja di toko kami",
    "ayo dukung UMKM lokal biar makin maju",
    "belanja hemat banyak promo menarik",
    "jangan lupa makan buah dan sayur",
    "tips menjaga kesehatan tubuh",
    "cara belajar python untuk pemula",
    "film terbaru minggu ini wajib ditonton",
    "pergi liburan ke pantai bersama keluarga",
    "kelas online gratis untuk mahasiswa",
    "belajar machine learning lebih mudah",
]

# Replicate the dummy data: ten rounds, each contributing every "judi" comment
# followed by every "non_judi" comment, stored as (text, label) pairs.
dataset = [
    (text, label)
    for _round in range(10)
    for label, texts in (("judi", judi_komentar), ("non_judi", non_judi_komentar))
    for text in texts
]

df = pd.DataFrame(dataset, columns=["komentar", "label"])
print("Jumlah data:", df.shape[0])
df.head()


Jumlah data: 200


Unnamed: 0,komentar,label
0,ayo join judi online gampang menang,judi
1,slot gacor hari ini pasti jackpot,judi
2,link judi terpercaya bonus besar,judi
3,deposit pulsa tanpa potongan untuk judi,judi
4,main slot online modal kecil untung besar,judi


In [4]:
# ========================
# 3. Preprocessing
# ========================
# Build the stop-word collection from Sastrawi's StopWordRemoverFactory.
# It is converted to a set because clean_text() checks every token against it
# with `in`.
stop_factory = StopWordRemoverFactory()
stopwords = set(stop_factory.get_stop_words())

def clean_text(text, stop_words=None):
    """Normalise a raw comment into a space-separated string of kept tokens.

    Steps, in order: lowercase, strip URLs, replace everything that is not an
    ASCII letter with a space, split on whitespace, and drop stop words.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing values) is
            treated as an empty comment instead of raising.
        stop_words: Optional collection of tokens to drop. When omitted, the
            module-level ``stopwords`` set is used, so existing
            ``clean_text(text)`` calls behave as before.

    Returns:
        The cleaned comment; an empty string when nothing survives cleaning.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError.
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = stopwords

    text = text.lower()
    # "http\S+" already covers "https..." links, so no separate alternative.
    text = re.sub(r"http\S+|www\S+", "", text)
    # Digits, punctuation and whitespace runs all collapse to a single space;
    # the text is already lowercase, so only a-z needs to be preserved.
    text = re.sub(r"[^a-z]+", " ", text)
    return " ".join(word for word in text.split() if word not in stop_words)

# Add the cleaned text as a new column next to the raw comment.
df["clean"] = df["komentar"].apply(clean_text)
# Seed the preview so the displayed rows are the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,komentar,label,clean
29,bonus new member judi slot 100%,judi,bonus new member judi slot
183,deposit pulsa tanpa potongan untuk judi,judi,deposit pulsa potongan judi
6,daftar judi online hadiah menarik,judi,daftar judi online hadiah menarik
188,bandar judi terpercaya anti kalah,judi,bandar judi terpercaya anti kalah
159,belajar machine learning lebih mudah,non_judi,belajar machine learning lebih mudah


In [5]:
# ========================
# 4. Split Data
# ========================
# The dummy corpus contains every comment several times. A row-level split
# would place copies of the same sentence in both train and test, so the test
# score would only measure memorisation (data leakage). Instead, split the
# distinct cleaned comments, then send every copy of a comment to the side its
# text was assigned to.
unique_comments = df.drop_duplicates(subset="clean")
train_texts, test_texts = train_test_split(
    unique_comments["clean"],
    test_size=0.2,
    random_state=42,
    stratify=unique_comments["label"],
)

# Boolean row mask: True for rows whose cleaned text was held out for testing.
is_test = df["clean"].isin(test_texts)
X_train, y_train = df.loc[~is_test, "clean"], df.loc[~is_test, "label"]
X_test, y_test = df.loc[is_test, "clean"], df.loc[is_test, "label"]

# Guard against any cleaned text appearing on both sides of the split.
assert set(X_train).isdisjoint(set(X_test)), "train/test share identical comments"

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 160
Test size: 40


In [6]:
# ========================
# 5. TF-IDF + Logistic Regression
# ========================
# Learn the vocabulary and IDF weights from the training comments only, then
# project both splits into that same feature space.
vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)

# Evaluation: per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        judi       1.00      1.00      1.00        20
    non_judi       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [7]:
# ========================
# 6. Save the model
# ========================
# Persist the classifier and the fitted vectorizer as two separate joblib
# files; both are written relative to the current working directory.
for artifact, filename in ((model, "model.joblib"), (vectorizer, "vectorizer.joblib")):
    joblib.dump(artifact, filename)
print("Model dan vectorizer berhasil disimpan!")

Model dan vectorizer berhasil disimpan!
