In [3]:
# =========================================================
# 1. SETUP & PREPARASI DATA
# =========================================================
import zipfile
import pandas as pd
import numpy as np
import pickle
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, GlobalMaxPool1D, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Ekstrak Dataset
with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
    zip_ref.extractall('dataset_news')

# Load Data
df = pd.read_json('dataset_news/News_Category_Dataset_v3.json', lines=True)

# Mapping 4 Topic Utama
category_map = {
    'BUSINESS': 'economy', 'MONEY': 'economy',
    'ENTERTAINMENT': 'entertainment', 'COMEDY': 'entertainment',
    'WELLNESS': 'health', 'HEALTHY LIVING': 'health',
    'TECH': 'technology', 'SCIENCE': 'technology'
}
df['target_topic'] = df['category'].map(category_map)
df = df.dropna(subset=['target_topic']).reset_index(drop=True)

# Ambil 20.000 data agar stabil
df = df.sample(20000, random_state=42)

In [4]:
# =========================================================
# 2. MODEL 1: LSTM CLASSIFIER
# =========================================================
print("--- Memulai Training Model 1: LSTM Classifier ---")

df['combined_text'] = df['headline'].astype(str) + " " + df['short_description'].astype(str)
X_class = df['combined_text']
y_class = pd.get_dummies(df['target_topic'])
categories = y_class.columns.tolist()

num_words_class = 5000
class_tokenizer = Tokenizer(num_words=num_words_class)
class_tokenizer.fit_on_texts(X_class)
X_class_pad = pad_sequences(class_tokenizer.texts_to_sequences(X_class), maxlen=70, padding='post')

# Arsitektur Sederhana agar akurasi tidak "over-performer"
class_input = Input(shape=(70,))
x_cl = Embedding(input_dim=num_words_class, output_dim=64)(class_input)
x_cl = LSTM(64, return_sequences=True)(x_cl)
x_cl = GlobalMaxPool1D()(x_cl)
x_cl = Dropout(0.5)(x_cl)
class_output = Dense(len(categories), activation='softmax')(x_cl)

model_class = Model(class_input, class_output)
model_class.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Mengubah epochs menjadi 5
model_class.fit(X_class_pad, y_class, epochs=5, batch_size=64, validation_split=0.1)

# Simpan Model Klasifikasi
model_class.save('lstm_genre_classifier.h5')
with open('class_tokenizer.pkl', 'wb') as f: pickle.dump(class_tokenizer, f)
with open('categories_label.pkl', 'wb') as f: pickle.dump(categories, f)

--- Memulai Training Model 1: LSTM Classifier ---
Epoch 1/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.5522 - loss: 1.0850 - val_accuracy: 0.7260 - val_loss: 0.7506
Epoch 2/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.7486 - loss: 0.7137 - val_accuracy: 0.7700 - val_loss: 0.6393
Epoch 3/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.8157 - loss: 0.5427 - val_accuracy: 0.7870 - val_loss: 0.5886
Epoch 4/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8496 - loss: 0.4443 - val_accuracy: 0.8235 - val_loss: 0.5417
Epoch 5/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8884 - loss: 0.3481 - val_accuracy: 0.8220 - val_loss: 0.5630




In [5]:
# =========================================================
# 3. MODEL 2: LSTM SUMMARIZER
# =========================================================
print("\n--- Memulai Training Model 2: LSTM Summarizer ---")

df['headline_clean'] = df['headline'].apply(lambda x: 'sostok ' + str(x) + ' eostok')
max_len_text = 60
max_len_summary = 15

# Tokenizer Summarizer
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(df['short_description']))
x_tr = pad_sequences(x_tokenizer.texts_to_sequences(df['short_description']), maxlen=max_len_text, padding='post')
x_voc_size = len(x_tokenizer.word_index) + 1

y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(df['headline_clean']))
y_tr = pad_sequences(y_tokenizer.texts_to_sequences(df['headline_clean']), maxlen=max_len_summary, padding='post')
y_voc_size = len(y_tokenizer.word_index) + 1

# Arsitektur Seq2Seq
latent_dim = 256
embedding_dim = 128

encoder_inputs = Input(shape=(max_len_text,))
enc_emb = Embedding(x_voc_size, embedding_dim, trainable=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_voc_size, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

model_summ = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model_summ.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

model_summ.fit([x_tr, y_tr[:,:-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:,1:],
               epochs=5, batch_size=64, validation_split=0.1)


--- Memulai Training Model 2: LSTM Summarizer ---
Epoch 1/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 464ms/step - loss: 6.6766 - val_loss: 5.5040
Epoch 2/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 466ms/step - loss: 5.4097 - val_loss: 5.4628
Epoch 3/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 461ms/step - loss: 5.3448 - val_loss: 5.4193
Epoch 4/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 464ms/step - loss: 5.2759 - val_loss: 5.3580
Epoch 5/5
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 460ms/step - loss: 5.2185 - val_loss: 5.3094


<keras.src.callbacks.history.History at 0x189bc6cd520>

In [6]:
# =========================================================
# 4. EXPORT & FIX INFERENCE
# =========================================================
# Fix koneksi input-output agar tidak ValueError
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Input hanya yang terhubung ke Output
decoder_model = Model(
    inputs=[decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs2, state_h2, state_c2]
)

encoder_model.save('encoder_model.h5')
decoder_model.save('decoder_model.h5')
with open('x_tokenizer.pkl', 'wb') as f: pickle.dump(x_tokenizer, f)
with open('y_tokenizer.pkl', 'wb') as f: pickle.dump(y_tokenizer, f)

print("\n[SELESAI] Semua file model siap dikirim ke Backend Flask!")




[SELESAI] Semua file model siap dikirim ke Backend Flask!
