In [1]:
# Import semua library utama yang dibutuhkan
import os
import sys
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


In [16]:
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


In [17]:
# Context manager that silences TensorFlow's noisy console output.
# Used around model training so the notebook output stays tidy.

class SuppressStderr:
    """Temporarily redirect OS-level stdout (fd 1) and stderr (fd 2) to os.devnull.

    Despite the name, both streams are silenced while the ``with`` block runs.
    The descriptors are acquired in ``__enter__`` and released in ``__exit__``,
    so one instance can be reused for several ``with`` blocks and an instance
    that is never entered holds no open descriptors.
    """

    def __init__(self):
        # Populated only while a ``with`` block is active.
        self.null_fds = []
        self.save_fds = []

    @staticmethod
    def _flush_python_streams():
        """Flush sys.stdout / sys.stderr if they exist and support flushing."""
        for stream in (sys.stdout, sys.stderr):
            flush = getattr(stream, "flush", None)
            if flush is None:
                continue
            try:
                flush()
            except (OSError, ValueError):
                # A closed or broken stream must not prevent the redirect.
                pass

    def __enter__(self):
        # Emit anything already buffered before the descriptors are swapped,
        # otherwise earlier output could be swallowed by the redirect.
        self._flush_python_streams()
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        self.save_fds = [os.dup(1), os.dup(2)]
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        return self

    def __exit__(self, *_):
        # Push text buffered inside the block into devnull so it does not
        # show up after the real descriptors are restored.
        self._flush_python_streams()
        try:
            os.dup2(self.save_fds[0], 1)
            os.dup2(self.save_fds[1], 2)
        finally:
            # Always release every descriptor taken in __enter__.
            for fd in self.null_fds + self.save_fds:
                os.close(fd)
            self.null_fds = []
            self.save_fds = []


In [None]:
# Load the AG News train and test splits.
# - Title and Description are joined into a single "text" column
# - Each split is subsampled to keep training lightweight
# - Labels are shifted from 1-4 down to 0-3

# --- training split ---
train_df = pd.read_csv("../dataset/train.csv")
train_df.columns = ["Class Index", "Title", "Description"]
train_df["text"] = train_df["Title"] + " " + train_df["Description"]
train_df = train_df.sample(n=25000, random_state=42)

X_train_raw = train_df["text"].values
y_train_raw = train_df["Class Index"].values - 1

# --- test split ---
test_df = pd.read_csv("../dataset/test.csv")
test_df.columns = ["Class Index", "Title", "Description"]
test_df["text"] = test_df["Title"] + " " + test_df["Description"]
test_df = test_df.sample(n=2000, random_state=42)

X_test_raw = test_df["text"].values
y_test_raw = test_df["Class Index"].values - 1


In [5]:
# TF-IDF vectorization
# - Turns the raw text into numeric features
# - max_features=4000 caps the vocabulary at 4000 terms
# - ngram_range=(1, 2) uses unigrams and bigrams
# These are the initial features before process 3.

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=4000,
)

# The vocabulary is learned on the training text only; the test text is
# projected onto that same vocabulary. Both results are densified.
X_train_vec = vectorizer.fit_transform(X_train_raw).toarray()
X_test_vec = vectorizer.transform(X_test_raw).toarray()

for label, matrix in (
    ("Shape TF-IDF train :", X_train_vec),
    ("Shape TF-IDF test  :", X_test_vec),
):
    print(label, matrix.shape)


Shape TF-IDF train : (25000, 4000)
Shape TF-IDF test  : (2000, 4000)


In [6]:
# Helper that checks a feature matrix for missing values
# and reports how many NaN entries it contains.
def cek_missing_value(X, name="X"):
    """Print the number of NaN entries in ``X``.

    Args:
        X: Numeric array to inspect.
        name: Label used in the printed message.
    """
    nan_mask = np.isnan(X)
    nan_total = nan_mask.sum()
    print(f"Missing value pada {name}: {nan_total}")

# Report the NaN count for both TF-IDF matrices.
cek_missing_value(X_train_vec, name="TF-IDF Train")
cek_missing_value(X_test_vec, name="TF-IDF Test")


Missing value pada TF-IDF Train: 0
Missing value pada TF-IDF Test: 0


In [7]:
# Normalization helper built on MinMaxScaler.
# Normalization matters for the performance of the LSTM model.

def fitur_transformasi_scaler(X_train, X_test):
    """Scale both splits with a MinMaxScaler fitted on the training split.

    Args:
        X_train: Training features; the scaler is fitted on these.
        X_test: Test features; transformed with the already-fitted scaler.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = MinMaxScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)


In [8]:
# Apply the scaling to the TF-IDF matrices

X_train_scaled, X_test_scaled = fitur_transformasi_scaler(
    X_train=X_train_vec,
    X_test=X_test_vec,
)

for label, matrix in (
    ("Scaled train shape:", X_train_scaled),
    ("Scaled test shape :", X_test_scaled),
):
    print(label, matrix.shape)


Scaled train shape: (25000, 4000)
Scaled test shape : (2000, 4000)


In [9]:
# Chi-square feature selection helper.
# Keeps the K highest-scoring features.
# Process 3 is required to use chi-square.

def fitur_seleksi_chisquare(X_train, y_train, X_test, k=1500):
    """Select the ``k`` best features according to the chi-square score.

    Args:
        X_train: Training features; the selector is fitted on these.
        y_train: Training labels used for scoring.
        X_test: Test features; reduced to the same selected columns.
        k: Number of features to keep.

    Returns:
        Tuple ``(X_train_sel, X_test_sel)``.
    """
    fitted_selector = SelectKBest(score_func=chi2, k=k).fit(X_train, y_train)
    return fitted_selector.transform(X_train), fitted_selector.transform(X_test)


In [10]:
# Chi-square feature selection
# Out of the 4000 features, the 1500 highest-scoring ones are kept

X_train_sel, X_test_sel = fitur_seleksi_chisquare(
    X_train=X_train_scaled,
    y_train=y_train_raw,
    X_test=X_test_scaled,
    k=1500,
)

print("Setelah Chi-square:")
print(X_train_sel.shape, X_test_sel.shape)


Setelah Chi-square:
(25000, 1500) (2000, 1500)


In [11]:
# The LSTM expects input shaped (samples, timesteps, features).
# Each sample is a flat feature vector, so a timesteps axis of length 1
# is inserted to make the arrays 3-D.

X_train_lstm = np.expand_dims(X_train_sel, axis=1)
X_test_lstm = np.expand_dims(X_test_sel, axis=1)

print("Shape untuk LSTM:")
print(X_train_lstm.shape, X_test_lstm.shape)


Shape untuk LSTM:
(25000, 1, 1500) (2000, 1, 1500)


In [12]:
# One-hot encode the labels for the softmax output of the LSTM

y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes=4)
    for labels in (y_train_raw, y_test_raw)
)


In [13]:
def build_vanilla_lstm_model(input_shape):
    """Build and compile the stacked-LSTM classifier.

    Layer stack: LSTM(128, sequences) -> Dropout(0.4) -> LSTM(64) ->
    Dropout(0.4) -> Dense(64, relu) -> Dense(4, softmax).

    Args:
        input_shape: Shape of one sample, passed to ``Input(shape=...)``.

    Returns:
        The model, compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    layer_stack = [
        Input(shape=input_shape),
        # First recurrent layer keeps the full sequence for the next LSTM.
        LSTM(128, return_sequences=True),
        Dropout(0.4),
        # Second recurrent layer emits only its final output.
        LSTM(64, return_sequences=False),
        Dropout(0.4),
        Dense(64, activation="relu"),
        Dense(4, activation="softmax"),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network


In [14]:
# Train the LSTM model with early stopping.
# This is the core of the third process.

# Derive the per-sample input shape (timesteps, features) from the prepared
# data instead of hard-coding it, so changing the number of selected features
# upstream cannot leave the model with a mismatched input shape.
model = build_vanilla_lstm_model(tuple(X_train_lstm.shape[1:]))

# Stop when the validation loss has not improved for 3 epochs and roll the
# weights back to the best epoch seen.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

# perf_counter is a monotonic clock intended for measuring elapsed time;
# unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
with SuppressStderr():
    history = model.fit(
        X_train_lstm, y_train_cat,
        validation_split=0.2,  # last 20% of the training data is held out
        epochs=10,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )
training_time = time.perf_counter() - start_time

print("Training selesai dalam", round(training_time, 2), "detik")


Training selesai dalam 19.24 detik


In [15]:
# Evaluation: accuracy, precision, recall and ROC-AUC on the test split

# Time the inference pass
start_test = time.time()
y_pred = model.predict(X_test_lstm)
end_test = time.time()
testing_time = end_test - start_test

# Predicted label = class with the highest predicted probability
y_pred_label = y_pred.argmax(axis=1)

# Evaluation metrics (precision and recall are macro-averaged)
acc = accuracy_score(y_test_raw, y_pred_label)
prec, rec = (
    scorer(y_test_raw, y_pred_label, average="macro")
    for scorer in (precision_score, recall_score)
)
roc = roc_auc_score(y_test_cat, y_pred, multi_class="ovr")

# Print the evaluation summary for process 3
print("=== EVALUASI PROSES 3 ===")
for label, value in (
    ("Akurasi        :", acc),
    ("Presisi        :", prec),
    ("Recall         :", rec),
    ("AUC/ROC        :", roc),
    ("Waktu training :", training_time),
    ("Waktu testing  :", testing_time),
):
    print(label, value)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
=== EVALUASI PROSES 3 ===
Akurasi        : 0.873
Presisi        : 0.8721628515478452
Recall         : 0.872667832220625
AUC/ROC        : 0.9745992582151052
Waktu training : 19.239001274108887
Waktu testing  : 0.7469232082366943
