# Import Library

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np
import csv

# Text Processing & NLP
import emoji
import re
import string
import spacy
import nltk
import gensim.downloader as api

# Machine Learning & Deep Learning
import torch
import tensorflow as tf
import optuna
import joblib

# Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Utility & System
import os
import requests


In [None]:
# I/O Handling
from io import StringIO

# NLP Processing (Tokenization, Stopwords, Lemmatization)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download as nltk_download

# Word Embeddings & Language Models
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

# Feature Engineering (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Evaluation
from sklearn.metrics import accuracy_score, classification_report

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Deep Learning (TensorFlow & Keras)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, 
    LSTM, Bidirectional, GRU
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Visualization
from wordcloud import WordCloud
from collections import Counter

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


# Load Dataset

In [None]:
data = pd.read_csv('../datasets/mlbb_reviews2.csv')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.isnull().sum()

# Data Pre-processing

## Irrelevant Columns Removal

In [None]:
data = data.drop(columns=['reviewId', 'userName', 'userImage', 'thumbsUpCount', 'reviewCreatedVersion', 'replyContent', 'repliedAt', 'appVersion'])

In [None]:
data.head()

## Case Folding

Convert all text in content to lowercase

In [None]:
data['content_casefolding'] = data['content'].str.lower()

In [None]:
data.head()

## Text Cleaning

### Remove Emojis & Emoticons

In [None]:
def remove_emoji_with_library(text):
    """Return ``text`` with every emoji recognised by the ``emoji`` package removed.

    Each emoji is replaced by the empty string; all other characters are kept.
    """
    stripped = emoji.replace_emoji(text, replace='')
    return stripped

data['content_no_emoji'] = data['content_casefolding'].apply(remove_emoji_with_library)

### Remove Mentions

Remove mentions like @username.

In [None]:
data['content_no_mentions'] = data['content_no_emoji'].apply(
    lambda text: re.sub(r'@[A-Za-z0-9]+', '', text)
)

### Remove Hashtags

Remove hashtags such as #awesome.

In [None]:
data['content_no_hastags'] = data['content_no_mentions'].apply(
    lambda text: re.sub(r'#[A-Za-z0-9]+', '', text)
)

### Remove URLs

Remove web links from the text.

In [None]:
data['content_no_urls'] = data['content_no_hastags'].apply(
    lambda text: re.sub(r'https?://\S+|www\.\S+', '', text)
)

### Remove Numbers

Remove all numeric values from the text.

In [None]:
data['content_no_numbers'] = data['content_no_urls'].apply(
    lambda text: re.sub(r'[0-9]+', '', text)
)

### Remove Superscript

In [None]:
data['content_no_superscript'] = data['content_no_numbers'].apply(
    lambda text: re.sub(r'[\u2070-\u209F]', '', text)
)

### Remove Punctuation

Remove punctuation marks from the text.

In [None]:
# Build the deletion table once instead of once per row: it only depends on
# string.punctuation, so rebuilding it inside the lambda is wasted work.
_punctuation_table = str.maketrans('', '', string.punctuation)

# Delete every ASCII punctuation character from each review.
data['content_no_punctuation'] = data['content_no_superscript'].apply(
    lambda text: text.translate(_punctuation_table)
)

### Replace Newlines with Spaces

Replace newline characters (\n) with spaces to keep the text in one line.

In [None]:
data['content_no_newlines'] = data['content_no_punctuation'].apply(
    lambda text: text.replace('\n', ' ')
)

### Remove Multiple Spaces

Replace multiple consecutive spaces with a single space.

In [None]:
data['content_no_multiplespaces'] = data['content_no_newlines'].apply(
    lambda text: re.sub(r'\s+', ' ', text)
)

### Strip Leading & Trailing Spaces

Remove unnecessary spaces at the beginning and end of the text.

In [None]:
data['content_no_unnecessaryspaces'] = data['content_no_multiplespaces'].apply(
    lambda text: text.strip()
)

In [None]:
data.head()

## Tokenizing

In [None]:
data['content_tokenized'] = data['content_no_unnecessaryspaces'].apply(lambda text: word_tokenize(text))

In [None]:
data.head()

## Slang Normalization

In [None]:
lexicon = pd.read_csv('../lexicon/colloquial-indonesian-lexicon.csv')

In [None]:
lexicon[['slang', 'formal']]

In [None]:
slang_dict = dict(zip(lexicon['slang'], lexicon['formal']))

In [None]:
def normalize_slang(tokens, mapping):
    """Replace each token that has an entry in ``mapping`` with its mapped form.

    Args:
        tokens: Iterable of tokens to normalise.
        mapping: Dict-like lookup from a slang token to its replacement.

    Returns:
        A new list with the same length and order as ``tokens``; tokens that
        are not keys of ``mapping`` are passed through unchanged.
    """
    normalized = []
    for token in tokens:
        # Fall back to the token itself when there is no replacement.
        normalized.append(mapping.get(token, token))
    return normalized

In [None]:
data['content_slangnormalized'] = data['content_tokenized'].apply(
    lambda tokens: normalize_slang(tokens, slang_dict)
)

In [None]:
data[['content_tokenized', 'content_slangnormalized']]

## Stopword Removal

In [None]:
listStopwords = set(stopwords.words('indonesian'))
listStopwords1 = set(stopwords.words('english'))

In [None]:
listStopwords = listStopwords.union(listStopwords1)

In [None]:
stoplist = pd.read_csv('../stoplist/stopwordbahasa.csv', header=None)

In [None]:
stopwords_from_csv = set(stoplist[0].astype(str).str.lower())

In [None]:
listStopwords = listStopwords.union(stopwords_from_csv)

In [None]:
def remove_stopwords(text):
    """Drop every token whose lowercase form is in ``listStopwords``.

    Args:
        text: Iterable of string tokens (despite the name, not a raw string).

    Returns:
        List of the remaining tokens, in their original order and casing.
    """
    kept = []
    for word in text:
        if word.lower() in listStopwords:
            continue
        kept.append(word)
    return kept

In [None]:
data['content_no_stopwords'] = data['content_slangnormalized'].apply(remove_stopwords)

In [None]:
data[['content_slangnormalized', 'content_no_stopwords']].head()

## Lemmatization

In [None]:
nlp = spacy.blank("id")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
nlp.initialize()

In [None]:
def lemmatize_tokens(tokens):
    """Lemmatise a token list with the module-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and passed through ``nlp``; the
    lemma of every token in the resulting document is returned.

    NOTE(review): the text is re-tokenised by spaCy, so the output length is
    not guaranteed to equal ``len(tokens)`` - confirm this is acceptable.
    """
    doc = nlp(" ".join(tokens))
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
data['content_lemmatized'] = data['content_no_stopwords'].apply(lemmatize_tokens)

In [None]:
data[['content_no_stopwords', 'content_lemmatized']].head()

## Detokenizing

In [None]:
data['content_clean'] = data['content_lemmatized'].apply(lambda tokens: " ".join(tokens))

In [None]:
data.head()

# Labeling

In [None]:
def _load_lexicon(url, timeout=30):
    """Download a two-column ``word,score`` CSV and return it as a dict.

    Args:
        url: Location of the CSV file.
        timeout: Seconds to wait for the server before giving up, so the
            notebook cannot hang indefinitely on a stalled connection.

    Returns:
        Dict mapping each word (first column) to its integer score
        (second column).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a score cannot be parsed as an integer.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as CSV.
    response.raise_for_status()

    lexicon = {}
    for row in csv.reader(StringIO(response.text), delimiter=','):
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to store.
            continue
        lexicon[row[0]] = int(row[1])
    return lexicon


lexicon_positive = _load_lexicon(
    'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv'
)
lexicon_negative = _load_lexicon(
    'https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv'
)


In [None]:
def sentiment_analysis_lexicon_indonesia(text):
    """Score a token sequence against the two lexicons and label its polarity.

    Args:
        text: Sequence of tokens (iterated twice, once per lexicon).

    Returns:
        Tuple ``(score, polarity)`` where ``score`` is the sum of the weights
        of every token found in ``lexicon_positive`` plus the weights of every
        token found in ``lexicon_negative``, and ``polarity`` is:

        * ``'positive'`` when ``score >= 0``
        * ``'negative'`` when ``score <= -7``
        * ``'neutral'`` otherwise (``-7 < score < 0``)
    """
    # Weights from both lexicons are *added*; presumably the negative lexicon
    # stores negative numbers - TODO confirm against the CSV.
    score = sum(lexicon_positive[word] for word in text if word in lexicon_positive)
    score += sum(lexicon_negative[word] for word in text if word in lexicon_negative)

    # NOTE(review): a score of exactly 0 (e.g. no lexicon hit at all) is
    # labelled 'positive' by these thresholds - confirm this is intended.
    if score >= 0:
        return score, 'positive'
    if score <= -7:
        return score, 'negative'
    return score, 'neutral'

In [None]:
results = data['content_lemmatized'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
data['sentiment_score'] = results[0]
data['sentiment'] = results[1]

In [None]:
data['sentiment'].value_counts()

In [None]:
label_encoder = LabelEncoder()

In [None]:
data['sentiment_numeric'] = label_encoder.fit_transform(data['sentiment'])

In [None]:
data.head()

# Exploratory Data Analysis (EDA)

## Sentiment Distribution

In [None]:
# Count reviews per sentiment label and give the columns stable names.
sentiment_counts = data['sentiment'].value_counts().reset_index()
sentiment_counts.columns = ['sentiment', 'count']

fig = px.bar(
    sentiment_counts,
    x='sentiment',
    y='count',
    title='Sentiment Counts',
    labels={'sentiment': 'Sentiment', 'count': 'Count'},
    color='sentiment',
    # Keys must match the label values exactly; the labels are lowercase, so
    # capitalised keys would be ignored and plotly's default colours used.
    color_discrete_map={'positive': 'green', 'neutral': 'orange', 'negative': 'red'},
    text='count',
    width=800,
)

# Show the raw count above each bar.
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside'
)

fig.show()


## Word Count Analysis

In [None]:
data['word_count'] = data['content_clean'].apply(lambda x: len(x.split()))

fig_hist = px.histogram(
    data, 
    x='word_count', 
    title='Word Count Distribution',
    nbins=50,
    labels={'word_count': 'Word Count'},
    color_discrete_sequence=['blue']
)
fig_hist.update_layout(title_x=0.5)
fig_hist.show()


In [None]:
fig_scatter = px.scatter(
    data, 
    x='word_count', 
    y='sentiment', 
    title='Sentiment by Word Count',
    color='sentiment',
    labels={'word_count': 'Word Count', 'sentiment': 'Sentiment'},
    color_discrete_map={'positive': 'green', 'neutral': 'orange', 'negative': 'red'}
)
fig_scatter.update_layout(title_x=0.5)
fig_scatter.show()

## Word Cloud

In [None]:
positive_text = " ".join(data[data['sentiment'] == 'positive']['content_clean'])
neutral_text = " ".join(data[data['sentiment'] == 'neutral']['content_clean'])
negative_text = " ".join(data[data['sentiment'] == 'negative']['content_clean'])

In [None]:
def generate_wordcloud(text, title, color):
    """Render and display a word cloud for ``text``.

    Args:
        text: Whitespace-separated words to build the cloud from.
        title: Title shown above the figure.
        color: Matplotlib colormap name used to colour the words.
    """
    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=color,
        max_words=200,
    )
    rendered_cloud = cloud_builder.generate(text)

    plt.figure(figsize=(15, 8))
    plt.imshow(rendered_cloud, interpolation='bilinear')
    plt.axis("off")  # axes carry no meaning for a word cloud
    plt.title(title, fontsize=16)
    plt.show()

generate_wordcloud(positive_text, "Word Cloud for Positive Sentiments", "Greens")
generate_wordcloud(neutral_text, "Word Cloud for Neutral Sentiments", "Oranges")
generate_wordcloud(negative_text, "Word Cloud for Negative Sentiments", "Reds")


## Unique Word Count Analysis

In [None]:
data['unique_word_count'] = data['content_clean'].apply(lambda x: len(set(x.split())))

In [None]:
fig_unique = px.histogram(
    data, 
    x='unique_word_count', 
    title='Unique Word Count Distribution',
    nbins=50,
    labels={'unique_word_count': 'Unique Word Count'},
    color_discrete_sequence=['purple']
)

fig_unique.show()

## Sentiment Proportion by Rating

In [None]:
fig_bar_score = px.histogram(
    data, 
    x='score', 
    color='sentiment', 
    barmode='group',
    title='Sentiment by Rating',
    labels={'score': 'Score', 'count': 'Number of Reviews'},
    color_discrete_map={'positive': 'green', 'neutral': 'orange', 'negative': 'red'}
)

fig_bar_score.show()

## Sentiment Over Time

In [None]:
data['date'] = pd.to_datetime(data['at']).dt.date

In [None]:
grouped_data = data.groupby(['date', 'sentiment']).size().reset_index(name='count')

In [None]:
# One line per sentiment label showing the number of reviews per day.
fig_time = px.line(
    grouped_data,
    x='date',
    y='count',
    color='sentiment',
    title='Sentiments Over Time',
    labels={'date': 'Date', 'count': 'Number of Reviews'},
    # Keys must match the lowercase label values, otherwise the mapping is
    # ignored and plotly falls back to its default colour sequence.
    color_discrete_map={'positive': 'green', 'neutral': 'orange', 'negative': 'red'}
)

fig_time.show()


## Top Frequent Words

In [None]:
def get_top_words(data, sentiment, n=10):
    """Return the ``n`` most frequent tokens among reviews with one sentiment.

    Args:
        data: DataFrame with a ``'sentiment'`` column and a
            ``'content_lemmatized'`` column holding token lists.
        sentiment: Label value to filter on.
        n: Number of ``(word, count)`` pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    matches_label = data['sentiment'] == sentiment
    tally = Counter()
    for tokens in data.loc[matches_label, 'content_lemmatized']:
        # Accumulate row by row, in row order, so ties keep first-seen order.
        tally.update(tokens)
    return tally.most_common(n)

In [None]:
top_positive = get_top_words(data, 'positive', n=10)
top_neutral = get_top_words(data, 'neutral', n=10)
top_negative = get_top_words(data, 'negative', n=10)

In [None]:
df_positive = pd.DataFrame(top_positive, columns=['word', 'count'])

fig_positive = px.bar(
    df_positive,
    x='word',
    y='count',
    title='Top Words for Positive Sentiment',
    labels={'word': 'Word', 'count': 'Frequency'},
    color_discrete_sequence=['green']
)

fig_positive.show()

In [None]:
df_neutral = pd.DataFrame(top_neutral, columns=['word', 'count'])

fig_neutral = px.bar(
    df_neutral,
    x='word',
    y='count',
    title='Top Words for Neutral Sentiment',
    labels={'word': 'Word', 'count': 'Frequency'},
    color_discrete_sequence=['orange']
)
fig_neutral.update_layout(title_x=0.5)
fig_neutral.show()


In [None]:
df_negative = pd.DataFrame(top_negative, columns=['word', 'count'])

fig_negative = px.bar(
    df_negative,
    x='word',
    y='count',
    title='Top Words for Negative Sentiment',
    labels={'word': 'Word', 'count': 'Frequency'},
    color_discrete_sequence=['red']
)

fig_negative.show()

# Data Balancing
In this process, Synthetic Minority Over-sampling Technique (SMOTE) is applied to balance the sentiment classification dataset. SMOTE generates synthetic samples for the minority classes (e.g., neutral or positive sentiment) to ensure a more balanced distribution.

In [None]:
#data['sentiment_label'] = data['sentiment'].map({'negative': 0, 'positive': 1, 'neutral': 2})

In [None]:
#vectorizer = TfidfVectorizer()
#X_tfidf = vectorizer.fit_transform(data['content_clean'])
#y = data['sentiment_label']

In [None]:
#smote = SMOTE(sampling_strategy='auto', random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

In [None]:
#smote_labels = pd.DataFrame({'sentiment_smote': y_resampled})

In [None]:
#data = pd.concat([data, smote_labels.iloc[:len(data)]], axis=1)

In [None]:
#y_resampled.value_counts()

In [None]:
#data.head()

# Feature Extraction

## TF-IDF
10000 feature dimensions

In [None]:
vectorizer_tfidf = TfidfVectorizer(max_features=10000)

In [None]:
X_tfidf = vectorizer_tfidf.fit_transform(data['content_clean'])

In [None]:
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer_tfidf.get_feature_names_out())

In [None]:
tfidf_df['sentiment'] = data['sentiment']

In [None]:
tfidf_df.head()

In [None]:
label_encoder = LabelEncoder()

In [None]:
tfidf_df['sentiment_numeric'] = label_encoder.fit_transform(tfidf_df['sentiment'])

In [None]:
tfidf_df = tfidf_df.drop(columns=['sentiment'])

In [None]:
tfidf_df.head()

## FastText
300 feature dimensions

In [None]:
fasttext_dir = "../models/fasttext_model"

In [None]:
fasttext_path = os.path.join(fasttext_dir, "fasttext.kv")

In [None]:
# Download the pretrained vectors only when no local copy exists yet.
if not os.path.exists(fasttext_path):
    print("Download model FastText.")

    # Create the target directory up front so the save below cannot fail on a
    # missing folder after the (large) download has already completed.
    os.makedirs(fasttext_dir, exist_ok=True)

    fasttext_model = api.load("fasttext-wiki-news-subwords-300")
    fasttext_model.save(fasttext_path)

    print(f"FastText model successfully saved in {fasttext_path}")
else:
    print("FastText model already exists, no need to download.")

In [None]:
fasttext_model = KeyedVectors.load(fasttext_path)

In [None]:
def get_fasttext_vector(text):
    """Embed ``text`` as the mean of the FastText vectors of its known words.

    The text is split on whitespace; words missing from ``fasttext_model`` are
    ignored. When no word is known, a zero vector of length 300 is returned.
    """
    known_vectors = []
    for word in text.split():
        if word in fasttext_model:
            known_vectors.append(fasttext_model[word])

    if not known_vectors:
        return np.zeros(300)
    return np.mean(known_vectors, axis=0)

In [None]:
data['fasttext_features'] = data['content_clean'].apply(get_fasttext_vector)

In [None]:
fasttext_df = pd.DataFrame(data['fasttext_features'].tolist(), columns=[f'fasttext_{i}' for i in range(300)])

In [None]:
data_with_fasttext = pd.concat([data, fasttext_df], axis=1)

In [None]:
data_with_fasttext.drop(columns=['fasttext_features'], inplace=True)

In [None]:
data_with_fasttext.head()

In [None]:
data.head()

## IndoBERT
768 feature dimensions

In [None]:
#indobert_dir = "../models/indobert_model"

In [None]:
#os.makedirs(indobert_dir, exist_ok=True)

In [None]:
#if not os.path.exists(os.path.join(indobert_dir, "pytorch_model.bin")):
#    print("Download model IndoBERT.")
#
#    tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
#    model = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
#
#    tokenizer.save_pretrained(indobert_dir)
#    model.save_pretrained(indobert_dir)
#
#    print(f"IndoBERT model successfully saved in {indobert_dir}")
#else:
#    print("IndoBERT model already exists, no need to download.")

In [None]:
#tokenizer = BertTokenizer.from_pretrained(indobert_dir)
#model = BertModel.from_pretrained(indobert_dir)

In [None]:
#def get_bert_embedding(text):
#    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#
#    with torch.no_grad():
#        outputs = model(**inputs)
#
#    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
#data['bert_features'] = data['content_clean'].apply(get_bert_embedding)

In [None]:
#bert_df = pd.DataFrame(data['bert_features'].tolist(), columns=[f'bert_{i}' for i in range(768)])

In [None]:
#data_with_bert = pd.concat([data, bert_df], axis=1)

In [None]:
#data_with_bert.drop(columns=['bert_features'], inplace=True)

In [None]:
#data_with_bert.head()

In [None]:
#data.head()

# Modeling

In [None]:
MAX_WORDS = 20000
MAX_LENGTH = 100
NUM_CLASSES = 3

In [None]:
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(data['content_clean'])
X_sequences = tokenizer.texts_to_sequences(data['content_clean'])
X_padded = pad_sequences(X_sequences, maxlen=MAX_LENGTH)
y_labels = data['sentiment_numeric'].values

In [None]:
def plot_history(history, model_name):
    """Plot training vs. validation loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute is a dict containing the
            keys ``'loss'``, ``'val_loss'``, ``'accuracy'`` and
            ``'val_accuracy'`` (one value per epoch).
        model_name: Used in the panel titles and in the output file name
            ``{model_name}_training_history.png`` (written to the current
            working directory).
    """
    metrics = history.history
    fig, axs = plt.subplots(1, 2, figsize=(12, 4))

    # (axis, training key, validation key, display label) for each panel.
    panels = (
        (axs[0], 'loss', 'val_loss', 'Loss'),
        (axs[1], 'accuracy', 'val_accuracy', 'Accuracy'),
    )
    for ax, train_key, val_key, label in panels:
        ax.plot(metrics[train_key], label=f'Train {label}')
        ax.plot(metrics[val_key], label=f'Val {label}')
        ax.set_title(f'{model_name} {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()

    plt.tight_layout()
    # Save before show(): showing first can leave an empty figure to save.
    plt.savefig(f'{model_name}_training_history.png')
    plt.show()

In [None]:
X_tfidf_features = tfidf_df.drop(columns=['sentiment_numeric']).values
y_tfidf = tfidf_df['sentiment_numeric'].values

In [None]:
X_fasttext = fasttext_df.values
y_fasttext = data['sentiment_numeric'].values

## SVM with TF-IDF

In [None]:
X_train_tfidf_svm, X_test_tfidf_svm, y_train_tfidf_svm, y_test_tfidf_svm = train_test_split(
    X_tfidf_features, y_tfidf, test_size=0.2, random_state=42
)

In [None]:
def objective_tfidf_svm(trial):
    """Optuna objective: accuracy of an SVC trained on the TF-IDF features.

    Searches over the kernel (``linear``/``rbf``), ``C`` (log scale,
    1e-3..1e3) and, for the RBF kernel only, ``gamma`` (log scale,
    1e-4..1e-1).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on ``X_test_tfidf_svm``.

    NOTE(review): the score is computed on the same split that is later
    reported as the test accuracy, so that figure is optimistically biased;
    consider tuning on a separate validation split.
    """
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches the API already used by the neural-network objectives.
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    if kernel == "rbf":
        # gamma only affects the RBF kernel, so it is sampled conditionally.
        gamma = trial.suggest_float("gamma", 1e-4, 1e-1, log=True)
        model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)
    else:
        model = SVC(kernel=kernel, C=C, random_state=42)

    model.fit(X_train_tfidf_svm, y_train_tfidf_svm)
    y_pred = model.predict(X_test_tfidf_svm)
    return accuracy_score(y_test_tfidf_svm, y_pred)

In [None]:
study_tfidf_svm = optuna.create_study(direction="maximize")
study_tfidf_svm.optimize(objective_tfidf_svm, n_trials=5)

In [None]:
# Ensure the output directory exists so the finished study is not lost to a
# FileNotFoundError when the folder has not been created yet.
os.makedirs("../models/svm", exist_ok=True)
joblib.dump(study_tfidf_svm, "../models/svm/tfidf_svm_optuna_study.pkl")

In [None]:
print("Best TF-IDF SVM hyperparameters:", study_tfidf_svm.best_params)
print("Best TF-IDF SVM Accuracy (Validation): {:.4f}".format(study_tfidf_svm.best_value))

In [None]:
best_params_tfidf_svm = study_tfidf_svm.best_params

In [None]:
if best_params_tfidf_svm["kernel"] == "rbf":
    best_model_tfidf_svm = SVC(kernel=best_params_tfidf_svm["kernel"], C=best_params_tfidf_svm["C"], gamma=best_params_tfidf_svm["gamma"], random_state=42)
else:
    best_model_tfidf_svm = SVC(kernel=best_params_tfidf_svm["kernel"], C=best_params_tfidf_svm["C"], random_state=42)

In [None]:
best_model_tfidf_svm.fit(X_train_tfidf_svm, y_train_tfidf_svm)

In [None]:
y_train_pred_tfidf_svm = best_model_tfidf_svm.predict(X_train_tfidf_svm)
y_test_pred_tfidf_svm = best_model_tfidf_svm.predict(X_test_tfidf_svm)

In [None]:
train_acc_tfidf_svm = accuracy_score(y_train_tfidf_svm, y_train_pred_tfidf_svm)
test_acc_tfidf_svm = accuracy_score(y_test_tfidf_svm, y_test_pred_tfidf_svm)

In [None]:
print("TF-IDF + SVM Training Accuracy: {:.4f}".format(train_acc_tfidf_svm))
print("TF-IDF + SVM Testing Accuracy:  {:.4f}".format(test_acc_tfidf_svm))

In [None]:
print(classification_report(y_test_tfidf_svm, y_test_pred_tfidf_svm))

In [None]:
joblib.dump(best_model_tfidf_svm, "../models/svm/best_tfidf_svm_model.pkl")

## SVM with FastText

In [None]:
X_train_fasttext_svm, X_test_fasttext_svm, y_train_fasttext_svm, y_test_fasttext_svm = train_test_split(
    X_fasttext, y_fasttext, test_size=0.2, random_state=42
)

In [None]:
def objective_fasttext_svm(trial):
    """Optuna objective: accuracy of an SVC trained on the FastText features.

    Searches over the kernel (``linear``/``rbf``), ``C`` (log scale,
    1e-3..1e3) and, for the RBF kernel only, ``gamma`` (log scale,
    1e-4..1e-1).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted model on ``X_test_fasttext_svm``.

    NOTE(review): the score is computed on the same split that is later
    reported as the test accuracy, so that figure is optimistically biased;
    consider tuning on a separate validation split.
    """
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches the API already used by the neural-network objectives.
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    if kernel == "rbf":
        # gamma only affects the RBF kernel, so it is sampled conditionally.
        gamma = trial.suggest_float("gamma", 1e-4, 1e-1, log=True)
        model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)
    else:
        model = SVC(kernel=kernel, C=C, random_state=42)

    model.fit(X_train_fasttext_svm, y_train_fasttext_svm)
    y_pred = model.predict(X_test_fasttext_svm)
    return accuracy_score(y_test_fasttext_svm, y_pred)

In [None]:
study_fasttext_svm = optuna.create_study(direction="maximize")
study_fasttext_svm.optimize(objective_fasttext_svm, n_trials=5)

In [None]:
joblib.dump(study_fasttext_svm, "../models/svm/fasttext_svm_optuna_study.pkl")

In [None]:
print("Best FastText SVM hyperparameters:", study_fasttext_svm.best_params)
print("Best FastText SVM Accuracy (Validation): {:.4f}".format(study_fasttext_svm.best_value))

In [None]:
best_params_fasttext_svm = study_fasttext_svm.best_params

In [None]:
if best_params_fasttext_svm["kernel"] == "rbf":
    best_model_fasttext_svm = SVC(kernel=best_params_fasttext_svm["kernel"], C=best_params_fasttext_svm["C"], gamma=best_params_fasttext_svm["gamma"], random_state=42)
else:
    best_model_fasttext_svm = SVC(kernel=best_params_fasttext_svm["kernel"], C=best_params_fasttext_svm["C"], random_state=42)

In [None]:
best_model_fasttext_svm.fit(X_train_fasttext_svm, y_train_fasttext_svm)

In [None]:
y_train_pred_fasttext_svm = best_model_fasttext_svm.predict(X_train_fasttext_svm)
y_test_pred_fasttext_svm = best_model_fasttext_svm.predict(X_test_fasttext_svm)

In [None]:
train_acc_fasttext_svm = accuracy_score(y_train_fasttext_svm, y_train_pred_fasttext_svm)
test_acc_fasttext_svm = accuracy_score(y_test_fasttext_svm, y_test_pred_fasttext_svm)

In [None]:
print("FastText + SVM Training Accuracy: {:.4f}".format(train_acc_fasttext_svm))
print("FastText + SVM Testing Accuracy:  {:.4f}".format(test_acc_fasttext_svm))

In [None]:
print(classification_report(y_test_fasttext_svm, y_test_pred_fasttext_svm))

In [None]:
joblib.dump(best_model_fasttext_svm, "../models/svm/best_fasttext_svm_model.pkl")

## RF with TF-IDF

In [None]:
X_train_tfidf_rf, X_test_tfidf_rf, y_train_tfidf_rf, y_test_tfidf_rf = train_test_split(
    X_tfidf_features, y_tfidf, test_size=0.2, random_state=42
)

In [None]:
def objective_tfidf_rf(trial):
    """Optuna objective: accuracy of a random forest on the TF-IDF features.

    Samples ``n_estimators`` (50..300), ``max_depth`` (5..50) and
    ``min_samples_split`` (2..10), fits on ``X_train_tfidf_rf`` and returns
    the accuracy on ``X_test_tfidf_rf``.
    """
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }

    forest = RandomForestClassifier(random_state=42, **sampled)
    forest.fit(X_train_tfidf_rf, y_train_tfidf_rf)

    predictions = forest.predict(X_test_tfidf_rf)
    return accuracy_score(y_test_tfidf_rf, predictions)

In [None]:
study_tfidf_rf = optuna.create_study(direction="maximize")
study_tfidf_rf.optimize(objective_tfidf_rf, n_trials=5)

In [None]:
# Ensure the output directory exists so the finished study is not lost to a
# FileNotFoundError when the folder has not been created yet.
os.makedirs("../models/rf", exist_ok=True)
joblib.dump(study_tfidf_rf, "../models/rf/tfidf_rf_optuna_study.pkl")

In [None]:
print("Best TF-IDF RF hyperparameters:", study_tfidf_rf.best_params)
print("Best TF-IDF RF Accuracy (Validation): {:.4f}".format(study_tfidf_rf.best_value))

In [None]:
best_params_tfidf_rf = study_tfidf_rf.best_params

In [None]:
best_model_tfidf_rf = RandomForestClassifier(n_estimators=best_params_tfidf_rf["n_estimators"],
                                             max_depth=best_params_tfidf_rf["max_depth"],
                                             min_samples_split=best_params_tfidf_rf["min_samples_split"],
                                             random_state=42)

In [None]:
best_model_tfidf_rf.fit(X_train_tfidf_rf, y_train_tfidf_rf)

In [None]:
y_train_pred_tfidf_rf = best_model_tfidf_rf.predict(X_train_tfidf_rf)
y_test_pred_tfidf_rf = best_model_tfidf_rf.predict(X_test_tfidf_rf)

In [None]:
train_acc_tfidf_rf = accuracy_score(y_train_tfidf_rf, y_train_pred_tfidf_rf)
test_acc_tfidf_rf = accuracy_score(y_test_tfidf_rf, y_test_pred_tfidf_rf)

In [None]:
print("TF-IDF + Random Forest Training Accuracy: {:.4f}".format(train_acc_tfidf_rf))
print("TF-IDF + Random Forest Testing Accuracy:  {:.4f}".format(test_acc_tfidf_rf))

In [None]:
print(classification_report(y_test_tfidf_rf, y_test_pred_tfidf_rf))

In [None]:
joblib.dump(best_model_tfidf_rf, "../models/rf/best_tfidf_rf_model.pkl")

## RF with FastText

In [None]:
X_train_fasttext_rf, X_test_fasttext_rf, y_train_fasttext_rf, y_test_fasttext_rf = train_test_split(
    X_fasttext, y_fasttext, test_size=0.2, random_state=42
)

In [None]:
def objective_fasttext_rf(trial):
    """Optuna objective: accuracy of a random forest on the FastText features.

    Samples ``n_estimators`` (50..300), ``max_depth`` (5..50) and
    ``min_samples_split`` (2..10), fits on ``X_train_fasttext_rf`` and returns
    the accuracy on ``X_test_fasttext_rf``.
    """
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }

    forest = RandomForestClassifier(random_state=42, **sampled)
    forest.fit(X_train_fasttext_rf, y_train_fasttext_rf)

    predictions = forest.predict(X_test_fasttext_rf)
    return accuracy_score(y_test_fasttext_rf, predictions)

In [None]:
study_fasttext_rf = optuna.create_study(direction="maximize")
study_fasttext_rf.optimize(objective_fasttext_rf, n_trials=5)

In [None]:
joblib.dump(study_fasttext_rf, "../models/rf/fasttext_rf_optuna_study.pkl")

In [None]:
print("Best FastText RF hyperparameters:", study_fasttext_rf.best_params)
print("Best FastText RF Accuracy (Validation): {:.4f}".format(study_fasttext_rf.best_value))

In [None]:
best_params_fasttext_rf = study_fasttext_rf.best_params

In [None]:
best_model_fasttext_rf = RandomForestClassifier(n_estimators=best_params_fasttext_rf["n_estimators"],
                                                max_depth=best_params_fasttext_rf["max_depth"],
                                                min_samples_split=best_params_fasttext_rf["min_samples_split"],
                                                random_state=42)

In [None]:
best_model_fasttext_rf.fit(X_train_fasttext_rf, y_train_fasttext_rf)

In [None]:
y_train_pred_fasttext_rf = best_model_fasttext_rf.predict(X_train_fasttext_rf)
y_test_pred_fasttext_rf = best_model_fasttext_rf.predict(X_test_fasttext_rf)

In [None]:
train_acc_fasttext_rf = accuracy_score(y_train_fasttext_rf, y_train_pred_fasttext_rf)
test_acc_fasttext_rf = accuracy_score(y_test_fasttext_rf, y_test_pred_fasttext_rf)

In [None]:
print("FastText + Random Forest Training Accuracy: {:.4f}".format(train_acc_fasttext_rf))
print("FastText + Random Forest Testing Accuracy:  {:.4f}".format(test_acc_fasttext_rf))

In [None]:
print(classification_report(y_test_fasttext_rf, y_test_pred_fasttext_rf))

In [None]:
joblib.dump(best_model_fasttext_rf, "../models/rf/best_fasttext_rf_model.pkl")

## CNN

In [None]:
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_padded, y_labels, test_size=0.2, random_state=42)

In [None]:
def objective_cnn(trial):
    """Optuna objective: accuracy of a 1-D CNN text classifier.

    Samples the number of convolution filters, the Adam learning rate, the
    dropout rate and the L2 regularisation strength, trains for up to 10
    epochs with early stopping on ``val_loss`` and returns the accuracy on
    ``(X_test_cnn, y_test_cnn)``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy reported by ``model.evaluate`` on the held-out split.

    NOTE(review): the held-out split is used for early stopping, for the
    Optuna score and later as the reported test set, so the final test
    accuracy is optimistically biased; consider a separate validation split.
    """
    filters = trial.suggest_categorical("filters", [128, 256])
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    # Upper bound capped at 0.5: the previous bound of 1.0 allowed rates that
    # drop (almost) every activation, wasting trials on models that cannot
    # learn, and a rate of exactly 1.0 drops every unit.
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    l2_lambda = trial.suggest_float("l2_lambda", 1e-4, 1e-2, log=True)

    model = Sequential([
        Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LENGTH),
        Conv1D(filters=filters, kernel_size=5, activation='relu', kernel_regularizer=L2(l2_lambda)),
        GlobalMaxPooling1D(),
        Dropout(dropout_rate),
        Dense(64, activation='relu', kernel_regularizer=L2(l2_lambda)),
        Dropout(dropout_rate),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    optimizer = Adam(learning_rate=learning_rate)
    # Labels are integer class ids, hence the sparse categorical loss.
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=64, validation_data=(X_test_cnn, y_test_cnn),
              verbose=0, callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)])

    _, accuracy = model.evaluate(X_test_cnn, y_test_cnn, verbose=0)
    return accuracy

In [None]:
study_cnn = optuna.create_study(direction="maximize")
study_cnn.optimize(objective_cnn, n_trials=5)

In [None]:
# Ensure the output directory exists so the finished study is not lost to a
# FileNotFoundError when the folder has not been created yet.
os.makedirs("../models/cnn", exist_ok=True)
joblib.dump(study_cnn, "../models/cnn/cnn_optuna_study.pkl")

In [None]:
print(f"Best CNN Hyperparameters: {study_cnn.best_params}")
print(f"Best CNN Accuracy: {study_cnn.best_value:.4f}")

In [None]:
best_params_cnn = study_cnn.best_params

In [None]:
cnn_best = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LENGTH),
    Conv1D(filters=best_params_cnn['filters'], kernel_size=5, activation='relu', kernel_regularizer=L2(best_params_cnn['l2_lambda'])),
    GlobalMaxPooling1D(),
    Dropout(best_params_cnn['dropout']),
    Dense(64, activation='relu', kernel_regularizer=L2(best_params_cnn['l2_lambda'])),
    Dropout(best_params_cnn['dropout']),
    Dense(NUM_CLASSES, activation='softmax')
])

In [None]:
cnn_best.compile(optimizer=Adam(learning_rate=best_params_cnn['learning_rate']), 
                 loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
history_cnn = cnn_best.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=64, 
                           validation_data=(X_test_cnn, y_test_cnn), 
                           callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)], verbose=1)

In [None]:
train_loss_cnn, train_acc_cnn = cnn_best.evaluate(X_train_cnn, y_train_cnn, verbose=0)
test_loss_cnn, test_acc_cnn = cnn_best.evaluate(X_test_cnn, y_test_cnn, verbose=0)

In [None]:
print(f"CNN Training Accuracy: {train_acc_cnn:.4f}")
print(f"CNN Testing Accuracy: {test_acc_cnn:.4f}")

In [None]:
cnn_best.save("../models/cnn/best_cnn_model.h5")

In [None]:
plot_history(history_cnn, "CNN")

## LSTM

In [None]:
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_padded, y_labels, test_size=0.2, random_state=42)

In [None]:
def objective_lstm(trial):
    """Optuna objective: train a stacked bidirectional LSTM and return its accuracy.

    Samples ``lstm_units``, ``dropout``, ``learning_rate`` and ``l2_lambda``
    from ``trial``, trains for at most 10 epochs with early stopping on
    ``val_loss`` and scores the model on ``X_test_lstm`` / ``y_test_lstm``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The accuracy reported by ``model.evaluate`` (the value the study maximizes).
    """
    # Drop Keras state left behind by earlier trials so memory use does not
    # keep growing over the course of the study.
    tf.keras.backend.clear_session()

    lstm_units = trial.suggest_categorical("lstm_units", [128, 256])
    # Capped at 0.5: the previous upper bound of 1.0 let trials sample rates
    # that drop (nearly) every unit, which cannot train and wastes the small
    # trial budget.
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 5e-3, log=True)
    l2_lambda = trial.suggest_float("l2_lambda", 1e-4, 1e-2, log=True)

    model = Sequential([
        Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LENGTH),
        # First recurrent layer returns the full sequence so the second one can consume it.
        Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout_rate, kernel_regularizer=L2(l2_lambda))),
        Bidirectional(LSTM(lstm_units // 2, dropout=dropout_rate, kernel_regularizer=L2(l2_lambda))),
        Dense(64, activation='relu', kernel_regularizer=L2(l2_lambda)),
        Dropout(dropout_rate),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): early stopping and the returned score both use the
    # X_test_lstm split, so the search is tuned on the data it is scored on;
    # consider carving a separate validation split out of the training data.
    model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, validation_data=(X_test_lstm, y_test_lstm),
              verbose=0, callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)])

    _, accuracy = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
    return accuracy

In [None]:
# Hyperparameter search for the LSTM: the study maximizes the value returned by
# objective_lstm over 5 trials.
study_lstm = optuna.create_study(direction="maximize")
study_lstm.optimize(func=objective_lstm, n_trials=5)

In [None]:
# Persist the finished Optuna study so the search results can be reloaded later.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists first (no-op when it is already there).
os.makedirs("../models/lstm", exist_ok=True)
joblib.dump(study_lstm, "../models/lstm/lstm_optuna_study.pkl")

In [None]:
# Report the best trial: its sampled hyperparameters and the objective value it reached.
print(
    f"Best LSTM Hyperparameters: {study_lstm.best_params}",
    f"Best LSTM Accuracy: {study_lstm.best_value:.4f}",
    sep="\n",
)

In [None]:
# Hyperparameters of the best LSTM trial, kept under a short name for the cells that follow.
best_params_lstm = study_lstm.best_params

In [None]:
# Final LSTM, built from the best trial's hyperparameters:
# embedding -> two stacked bidirectional LSTM layers -> dense head with dropout.
lstm_best = Sequential(
    [
        Embedding(
            input_dim=MAX_WORDS,
            output_dim=128,
            input_length=MAX_LENGTH,
        ),
        # Full sequence is passed on so the second recurrent layer can read it.
        Bidirectional(
            LSTM(
                best_params_lstm['lstm_units'],
                return_sequences=True,
                dropout=best_params_lstm['dropout'],
                kernel_regularizer=L2(best_params_lstm['l2_lambda']),
            )
        ),
        # Second recurrent layer uses half as many units.
        Bidirectional(
            LSTM(
                best_params_lstm['lstm_units'] // 2,
                dropout=best_params_lstm['dropout'],
                kernel_regularizer=L2(best_params_lstm['l2_lambda']),
            )
        ),
        Dense(
            64,
            activation='relu',
            kernel_regularizer=L2(best_params_lstm['l2_lambda']),
        ),
        Dropout(best_params_lstm['dropout']),
        # One softmax output per class.
        Dense(NUM_CLASSES, activation='softmax'),
    ]
)

In [None]:
# Compile with Adam at the tuned learning rate; labels are integer class ids,
# hence the sparse categorical cross-entropy loss.
lstm_best.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=best_params_lstm['learning_rate']),
    metrics=['accuracy'],
)

In [None]:
# Train the tuned LSTM for at most 15 epochs. Early stopping monitors val_loss
# with patience 2 and restores the weights of the best epoch.
history_lstm = lstm_best.fit(
    x=X_train_lstm,
    y=y_train_lstm,
    batch_size=64,
    epochs=15,
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
    validation_data=(X_test_lstm, y_test_lstm),
)

In [None]:
# Score the trained LSTM on both splits; evaluate() yields (loss, accuracy)
# because the model was compiled with a single 'accuracy' metric.
train_loss_lstm, train_acc_lstm = lstm_best.evaluate(x=X_train_lstm, y=y_train_lstm, verbose=0)
test_loss_lstm, test_acc_lstm = lstm_best.evaluate(x=X_test_lstm, y=y_test_lstm, verbose=0)

In [None]:
# Print train and test accuracy side by side to make over-fitting easy to spot.
print(
    f"LSTM Training Accuracy: {train_acc_lstm:.4f}",
    f"LSTM Testing Accuracy: {test_acc_lstm:.4f}",
    sep="\n",
)

In [None]:
# Write the trained LSTM to disk.
# NOTE(review): the ".h5" suffix presumably selects Keras' HDF5 format — confirm for the installed Keras version.
lstm_best.save("../models/lstm/best_lstm_model.h5")

In [None]:
# Visualize the LSTM training run. plot_history is defined outside this section;
# presumably it draws the per-epoch curves stored in the History object — confirm.
plot_history(history_lstm, "LSTM")

## GRU

In [None]:
# Hold out 20% of X_padded / y_labels for the GRU; the fixed random_state keeps the split repeatable.
X_train_gru, X_test_gru, y_train_gru, y_test_gru = train_test_split(X_padded, y_labels, test_size=0.2, random_state=42)

In [None]:
def objective_gru(trial):
    """Optuna objective: train a stacked bidirectional GRU and return its accuracy.

    Samples ``gru_units``, ``dropout``, ``learning_rate`` and ``l2_lambda``
    from ``trial``, trains for at most 10 epochs with early stopping on
    ``val_loss`` and scores the model on ``X_test_gru`` / ``y_test_gru``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The accuracy reported by ``model.evaluate`` (the value the study maximizes).
    """
    # Drop Keras state left behind by earlier trials so memory use does not
    # keep growing over the course of the study.
    tf.keras.backend.clear_session()

    gru_units = trial.suggest_categorical("gru_units", [128, 256])
    # Capped at 0.5: the previous upper bound of 1.0 let trials sample rates
    # that drop (nearly) every unit, which cannot train and wastes the small
    # trial budget.
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 5e-3, log=True)
    l2_lambda = trial.suggest_float("l2_lambda", 1e-4, 1e-2, log=True)

    model = Sequential([
        Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LENGTH),
        # First recurrent layer returns the full sequence so the second one can consume it.
        Bidirectional(GRU(gru_units, return_sequences=True, dropout=dropout_rate, kernel_regularizer=L2(l2_lambda))),
        Bidirectional(GRU(gru_units // 2, dropout=dropout_rate, kernel_regularizer=L2(l2_lambda))),
        Dense(64, activation='relu', kernel_regularizer=L2(l2_lambda)),
        Dropout(dropout_rate),
        Dense(NUM_CLASSES, activation='softmax')
    ])

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): early stopping and the returned score both use the
    # X_test_gru split, so the search is tuned on the data it is scored on;
    # consider carving a separate validation split out of the training data.
    model.fit(X_train_gru, y_train_gru, epochs=10, batch_size=64, validation_data=(X_test_gru, y_test_gru),
              verbose=0, callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)])

    _, accuracy = model.evaluate(X_test_gru, y_test_gru, verbose=0)
    return accuracy

In [None]:
# Hyperparameter search for the GRU: the study maximizes the value returned by
# objective_gru over 5 trials.
study_gru = optuna.create_study(direction="maximize")
study_gru.optimize(func=objective_gru, n_trials=5)

In [None]:
# Persist the finished Optuna study so the search results can be reloaded later.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists first (no-op when it is already there).
os.makedirs("../models/gru", exist_ok=True)
joblib.dump(study_gru, "../models/gru/gru_optuna_study.pkl")

In [None]:
# Report the best trial: its sampled hyperparameters and the objective value it reached.
print(
    f"Best GRU Hyperparameters: {study_gru.best_params}",
    f"Best GRU Accuracy: {study_gru.best_value:.4f}",
    sep="\n",
)

In [None]:
# Hyperparameters of the best GRU trial, kept under a short name for the cells that follow.
best_params_gru = study_gru.best_params

In [None]:
# Final GRU, built from the best trial's hyperparameters:
# embedding -> two stacked bidirectional GRU layers -> dense head with dropout.
gru_best = Sequential(
    [
        Embedding(
            input_dim=MAX_WORDS,
            output_dim=128,
            input_length=MAX_LENGTH,
        ),
        # Full sequence is passed on so the second recurrent layer can read it.
        Bidirectional(
            GRU(
                best_params_gru['gru_units'],
                return_sequences=True,
                dropout=best_params_gru['dropout'],
                kernel_regularizer=L2(best_params_gru['l2_lambda']),
            )
        ),
        # Second recurrent layer uses half as many units.
        Bidirectional(
            GRU(
                best_params_gru['gru_units'] // 2,
                dropout=best_params_gru['dropout'],
                kernel_regularizer=L2(best_params_gru['l2_lambda']),
            )
        ),
        Dense(
            64,
            activation='relu',
            kernel_regularizer=L2(best_params_gru['l2_lambda']),
        ),
        Dropout(best_params_gru['dropout']),
        # One softmax output per class.
        Dense(NUM_CLASSES, activation='softmax'),
    ]
)

In [None]:
# Compile with Adam at the tuned learning rate; labels are integer class ids,
# hence the sparse categorical cross-entropy loss.
gru_best.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=best_params_gru['learning_rate']),
    metrics=['accuracy'],
)

In [None]:
# Train the tuned GRU for at most 15 epochs. Early stopping monitors val_loss
# with patience 2 and restores the weights of the best epoch.
history_gru = gru_best.fit(
    x=X_train_gru,
    y=y_train_gru,
    batch_size=64,
    epochs=15,
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
    validation_data=(X_test_gru, y_test_gru),
)

In [None]:
# Score the trained GRU on both splits; evaluate() yields (loss, accuracy)
# because the model was compiled with a single 'accuracy' metric.
train_loss_gru, train_acc_gru = gru_best.evaluate(x=X_train_gru, y=y_train_gru, verbose=0)
test_loss_gru, test_acc_gru = gru_best.evaluate(x=X_test_gru, y=y_test_gru, verbose=0)

In [None]:
# Print train and test accuracy side by side to make over-fitting easy to spot.
print(
    f"GRU Training Accuracy: {train_acc_gru:.4f}",
    f"GRU Testing Accuracy: {test_acc_gru:.4f}",
    sep="\n",
)

In [None]:
# Write the trained GRU to disk.
# NOTE(review): the ".h5" suffix presumably selects Keras' HDF5 format — confirm for the installed Keras version.
gru_best.save("../models/gru/best_gru_model.h5")

In [None]:
# Visualize the GRU training run. plot_history is defined outside this section;
# presumably it draws the per-epoch curves stored in the History object — confirm.
plot_history(history_gru, "GRU")

# Evaluation

## Accuracy Testing & Training

In [None]:
# Long-format source for the comparison chart: one row per (model, split) pair.
# Every model contributes a "Train" row immediately followed by a "Test" row,
# so the three lists stay aligned position by position.
data_acc = {
    "Model": [
        model_name
        for model_name in (
            "SVM TF-IDF",
            "SVM FastText",
            "RF TF-IDF",
            "RF FastText",
            "CNN",
            "LSTM",
            "GRU",
        )
        for _ in ("Train", "Test")
    ],
    "Dataset": ["Train", "Test"] * 7,
    "Accuracy": [
        # Classical models
        train_acc_tfidf_svm,
        test_acc_tfidf_svm,
        train_acc_fasttext_svm,
        test_acc_fasttext_svm,
        train_acc_tfidf_rf,
        test_acc_tfidf_rf,
        train_acc_fasttext_rf,
        test_acc_fasttext_rf,
        # Neural models
        train_acc_cnn,
        test_acc_cnn,
        train_acc_lstm,
        test_acc_lstm,
        train_acc_gru,
        test_acc_gru,
    ],
}

In [None]:
# Tabular form of the accuracy records (columns: Model, Dataset, Accuracy).
df_acc = pd.DataFrame(data=data_acc)

In [None]:
# Make "Model" an ordered categorical so the models keep this explicit order
# instead of being sorted alphabetically.
model_order = [
    "SVM TF-IDF",
    "SVM FastText",
    "RF TF-IDF",
    "RF FastText",
    "CNN",
    "LSTM",
    "GRU",
]
df_acc["Model"] = pd.Categorical(
    values=df_acc["Model"],
    categories=model_order,
    ordered=True,
)

In [None]:
# Grouped bar chart: for every model, one bar per dataset split, each labelled
# with its accuracy value.
fig = px.bar(
    data_frame=df_acc,
    x="Model",
    y="Accuracy",
    color="Dataset",
    barmode="group",
    text="Accuracy",
    title="Comparison of Training and Testing Accuracy",
)

# Labels are formatted to two decimals and placed outside the bars.
fig.update_traces(
    texttemplate='%{text:.2f}',
    textposition='outside',
)
# Uniform label sizing: minimum font size 8, mode 'hide'.
fig.update_layout(
    uniformtext_minsize=8,
    uniformtext_mode='hide',
)
fig.show()