In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE  
from collections import Counter           
from sklearn.model_selection import train_test_split 

print("Memuat dataset...")

train_df = pd.read_csv("../dataset/train.csv")
test_df  = pd.read_csv("../dataset/test.csv")

train_df.columns = ["Class Index", "Title", "Description"]
test_df.columns  = ["Class Index", "Title", "Description"]

train_df["text"] = train_df["Title"] + " " + train_df["Description"]
test_df["text"]  = test_df["Title"]  + " " + test_df["Description"]

# Sampling
train_df = train_df.sample(25000, random_state=42)
test_df  = test_df.sample(2000, random_state=42)

X_train_raw = train_df["text"].values
X_test_raw  = test_df["text"].values
y_train_raw = train_df["Class Index"].values - 1
y_test_raw  = test_df["Class Index"].values - 1

# TF-IDF 
vectorizer = TfidfVectorizer(
    max_features=4000,
    stop_words="english",
    ngram_range=(1,2)
)
X_train_vec = vectorizer.fit_transform(X_train_raw).toarray()
X_test_vec  = vectorizer.transform(X_test_raw).toarray()

print("TF-IDF selesai.")
print("Shape X_train_vec:", X_train_vec.shape)
print("Shape X_test_vec :", X_test_vec.shape)


Memuat dataset...
TF-IDF selesai.
Shape X_train_vec: (25000, 4000)
Shape X_test_vec : (2000, 4000)


In [15]:
# MISSING VALUE
print("Missing value train:\n", train_df.isnull().sum())
print("\nMissing value test:\n", test_df.isnull().sum())

train_df = train_df.dropna()
test_df  = test_df.dropna()

print("\nSetelah drop missing:")
print("Train:", train_df.shape)
print("Test :", test_df.shape)


Missing value train:
 Class Index    0
Title          0
Description    0
text           0
dtype: int64

Missing value test:
 Class Index    0
Title          0
Description    0
text           0
dtype: int64

Setelah drop missing:
Train: (25000, 4)
Test : (2000, 4)


In [16]:
# TRANSFORMASI MINMAX SCALER

scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(X_train_vec)
X_test_sc  = scaler.transform(X_test_vec)

print("Transformasi MinMaxScaler selesai.")
print("Range:", X_train_sc.min(), "→", X_train_sc.max())
print("Shape train:", X_train_sc.shape)


Transformasi MinMaxScaler selesai.
Range: 0.0 → 1.0
Shape train: (25000, 4000)


In [17]:
# 3. IMPLEMENTASI SMOTE
smote = SMOTE(random_state=42)

print("\n--- Sebelum SMOTE (Data Training) ---")
print(f"Jumlah sampel X_train: {X_train_sc.shape[0]}")
print(f"Distribusi Kelas y_train: {Counter(y_train_raw)}")

X_train_smote, y_train_smote = smote.fit_resample(
    X_train_sc, y_train_raw
)

print("\n--- Setelah SMOTE (Data Training) ---")
print(f"Jumlah sampel X_train_smote: {X_train_smote.shape[0]}")
print(f"Distribusi Kelas y_train_smote: {Counter(y_train_smote)}")



--- Sebelum SMOTE (Data Training) ---
Jumlah sampel X_train: 25000
Distribusi Kelas y_train: Counter({np.int64(3): 6329, np.int64(1): 6303, np.int64(0): 6215, np.int64(2): 6153})

--- Setelah SMOTE (Data Training) ---
Jumlah sampel X_train_smote: 25316
Distribusi Kelas y_train_smote: Counter({np.int64(2): 6329, np.int64(1): 6329, np.int64(3): 6329, np.int64(0): 6329})


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
import time

# LSTM butuh reshape: (samples, time_step, features)
X_train_lstm = X_train_sc.reshape((X_train_sc.shape[0], 1, X_train_sc.shape[1]))
X_test_lstm  = X_test_sc.reshape((X_test_sc.shape[0], 1, X_test_sc.shape[1]))
y_train_cat  = to_categorical(y_train_raw, 4)

def build_lstm(shape):
    model = Sequential()
    model.add(Input(shape=shape))
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.4))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.4))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(4, activation="softmax"))
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

print("Training model LSTM...")

model_lstm = build_lstm((1, X_train_lstm.shape[2]))

start = time.time()
history = model_lstm.fit(
    X_train_lstm,
    y_train_cat,
    epochs=50,
    batch_size=32,
    verbose=1
)
train_time = time.time() - start


Training model LSTM...
Epoch 1/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 44ms/step - accuracy: 0.8400 - loss: 0.4811
Epoch 2/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 45ms/step - accuracy: 0.9189 - loss: 0.2474
Epoch 3/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 41ms/step - accuracy: 0.9326 - loss: 0.1989
Epoch 4/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 41ms/step - accuracy: 0.9425 - loss: 0.1636
Epoch 5/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 44ms/step - accuracy: 0.9510 - loss: 0.1343
Epoch 6/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 40ms/step - accuracy: 0.9562 - loss: 0.1143
Epoch 7/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 47ms/step - accuracy: 0.9613 - loss: 0.0982
Epoch 8/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 44ms/step - accuracy: 0.9667 - loss: 0.0863
E

In [20]:
# EVALUASI 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

start_test = time.time()
y_prob = model_lstm.predict(X_test_lstm)
test_time = time.time() - start_test

y_pred = np.argmax(y_prob, axis=1)

acc = accuracy_score(y_test_raw, y_pred)
prec = precision_score(y_test_raw, y_pred, average="weighted", zero_division=0)
rec = recall_score(y_test_raw, y_pred, average="weighted", zero_division=0)
auc = roc_auc_score(to_categorical(y_test_raw, 4), y_prob, multi_class="ovr")

print("\n=== EVALUASI PROSES 4 ===")
print("Akurasi :", acc)
print("Presisi :", prec)
print("Recall  :", rec)
print("AUC/ROC :", auc)
print("Waktu training :", train_time)
print("Waktu testing  :", test_time)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

=== EVALUASI PROSES 4 ===
Akurasi : 0.871
Presisi : 0.8710797376726673
Recall  : 0.871
AUC/ROC : 0.9706378625115468
Waktu training : 1664.164442539215
Waktu testing  : 0.6033868789672852
