In [7]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Fetch the NLTK resources used by the later cells (word_tokenize and the
# stopword corpus are both needed during preprocessing).
for resource_name in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource_name)

print("Library dan Resource siap!")

Library dan Resource siap!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the CSV and build the modelling frame in one chain:
#   1. replace missing headline/description values with '' so the string
#      concatenation below never produces NaN,
#   2. join headline and description into a single `text` column,
#   3. keep only the text and its category label.
df = (
    pd.read_csv('backend/data/dataset_5_kategori_balanced.csv')
    .fillna({'headline': '', 'short_description': ''})
    .assign(text=lambda frame: frame['headline'] + " " + frame['short_description'])
    .loc[:, ['text', 'category']]
)

print(f"Data berhasil dimuat. Total data: {len(df)}")
df.head()

Data berhasil dimuat. Total data: 25385


Unnamed: 0,text,category
0,How to Manage Your Personal Brand Make no mist...,Economy
1,It Looks Like Uber's Winning Its War With New ...,Economy
2,The Progressive Promise of Today's Technology ...,Economy
3,Don't Let These 5 Confusing Words Mar Your Ima...,Economy
4,What You Don't Know About Overnight Success I'...,Economy


In [9]:
# Compiled/built once instead of on every call; preprocess_text runs once per
# dataset row, so rebuilding these each time is pure overhead.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NUMBER_PATTERN = re.compile(r'[-+]?[0-9]+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps, in order: lowercase, strip URLs, strip (optionally signed) digit
    runs, delete ASCII punctuation, tokenise with NLTK ``word_tokenize``, and
    drop NLTK English stopwords.

    Args:
        text: The raw string to clean. ``None`` and float NaN (pandas' missing
            value marker) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for missing input.

    NOTE(review): punctuation is removed *before* stopword filtering, so
    contractions such as "don't" become "dont" and appear to survive the
    filter (visible in the sample output of the preprocessing cell). This is
    left unchanged so features stay identical to the already-trained model.
    """
    # Missing values would otherwise crash on .lower() with AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # 1. Case folding.
    text = text.lower()

    # 2. Remove URLs, then numbers.
    text = _URL_PATTERN.sub('', text)
    text = _NUMBER_PATTERN.sub('', text)

    # 3. Remove punctuation.
    text = text.translate(_PUNCTUATION_TABLE)

    # 4. Tokenise into words.
    tokens = word_tokenize(text)

    # 5. Drop stopwords.
    stop_words = _english_stop_words()
    return " ".join(token for token in tokens if token not in stop_words)

# Clean every row of the dataset (may take a few seconds).
df['text_clean'] = df['text'].map(preprocess_text)

print("Pre-processing selesai!")
df[['text', 'text_clean']].head()

Pre-processing selesai!


Unnamed: 0,text,text_clean
0,How to Manage Your Personal Brand Make no mist...,manage personal brand make mistake facebook ac...
1,It Looks Like Uber's Winning Its War With New ...,looks like ubers winning war new york grab pop...
2,The Progressive Promise of Today's Technology ...,progressive promise todays technology digital ...
3,Don't Let These 5 Confusing Words Mar Your Ima...,dont let confusing words mar image toms articu...
4,What You Don't Know About Overnight Success I'...,dont know overnight success ive fighting thing...


In [10]:
# Features are the cleaned text; labels are the categories.
X, y = df['text_clean'], df['category']

# 80% for training, 20% for testing. The split is seeded for reproducibility
# and stratified on the labels.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Data Latihan: {len(X_train)}")
print(f"Data Ujian: {len(X_test)}")

Data Latihan: 20308
Data Ujian: 5077


In [11]:
# TF-IDF configuration (unigrams + bigrams):
#   ngram_range  - single words and two-word phrases,
#                  e.g. "stock", "market", "stock market"
#   max_features - vocabulary cap, raised because bigrams add many more terms
#   sublinear_tf - dampens raw term frequency (helps the SVM)
tfidf_settings = {
    'ngram_range': (1, 2),
    'max_features': 10000,
    'sublinear_tf': True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Fit on the training split only, then reuse the fitted vectorizer for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("Vectorization dengan N-Grams selesai!")

Vectorization dengan N-Grams selesai!


In [12]:
# Build a linear-kernel SVM (C=1.0) and train it on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(kernel='linear', C=1.0).fit(X_train_tfidf, y_train)

print("Model AI sudah selesai dilatih!")

Model AI sudah selesai dilatih!


In [13]:
# Predict labels for the held-out test split.
y_pred = svm_model.predict(X_test_tfidf)

# Build the per-class report first, then print accuracy followed by the report.
detail_report = classification_report(y_test, y_pred)

print(f"Akurasi Model: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nLaporan Detail:")
print(detail_report)

Akurasi Model: 84.32%

Laporan Detail:
               precision    recall  f1-score   support

      Economy       0.78      0.78      0.78      1016
Entertainment       0.86      0.84      0.85      1015
       Health       0.86      0.88      0.87      1015
     Politics       0.83      0.84      0.84      1016
        Sport       0.90      0.87      0.89      1015

     accuracy                           0.84      5077
    macro avg       0.84      0.84      0.84      5077
 weighted avg       0.84      0.84      0.84      5077



In [14]:
def prediksi_kategori(kalimat_baru):
    """Predict the category label for a single piece of raw text.

    The text is cleaned with ``preprocess_text``, vectorised with the fitted
    ``tfidf`` vectorizer, and classified by ``svm_model``.

    Args:
        kalimat_baru: Raw input text to classify.

    Returns:
        The first (only) element of the model's prediction for this text.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([preprocess_text(kalimat_baru)])
    return svm_model.predict(features)[0]

# Example: classify one sample sentence and show the input next to the result.
input_user = "The team won the championship after a very intense match"
kategori = prediksi_kategori(input_user)

for output_line in (f"Input: {input_user}", f"Kategori Terdeteksi: {kategori}"):
    print(output_line)

Input: The team won the championship after a very intense match
Kategori Terdeteksi: Sport


In [None]:
# Persist the trained artefacts. `pickle` is already imported in the notebook's
# import cell, so it is not re-imported here.
#
# The fitted TF-IDF vectorizer must be saved together with the SVM: new text has
# to be transformed by the same fitted vectorizer before the model can predict.
# Only load these files from trusted locations; unpickling can execute
# arbitrary code.
artifacts = (
    ('model_svm.pkl', svm_model),   # 1. the SVM classifier
    ('vectorizer.pkl', tfidf),      # 2. the fitted TF-IDF vectorizer
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("File model_svm.pkl dan vectorizer.pkl berhasil disimpan!")

File model_svm.pkl dan vectorizer.pkl berhasil disimpan!


: 