In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import data

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Train/test splits plus `label_dict` are produced by the project-local `data` module.
(X_train, X_test, y_train, y_test, label_dict) = data.load_data()

# Both text vectorizers share the same n-gram window (unigrams and bigrams).
count_vectorizer, tfidf_vectorizer = (
    vectorizer_cls(ngram_range=(1, 2))
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
)

# The three resamplers are all seeded with the same value.
smote, under_sampler, smote_tomek = (
    sampler_cls(random_state=42)
    for sampler_cls in (SMOTE, RandomUnderSampler, SMOTETomek)
)

In [3]:
# Candidate models, keyed by the display name used in logs, result rows and plot titles.
# Every estimator gets the same explicit seed so the comparison stays reproducible
# even if stochastic options (subsampling, a randomized solver, ...) are enabled later.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
    # confirm against the installed version before dropping it.
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

In [4]:
# Accumulates one metrics entry per train_and_evaluate() call.
results = []

def train_and_evaluate(vectorizer, resampling_method, classifier, vectorizer_name, resampling_name, classifier_name):
    """Fit one vectorizer / resampler / classifier combination and record its test metrics.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test`` and
    ``label_dict``; appends one entry to the notebook-level ``results`` list; and
    draws the confusion matrix of the test-set predictions.

    Parameters
    ----------
    vectorizer : object exposing ``fit_transform`` and ``transform``
        Re-fitted on ``X_train`` on every call, then applied to ``X_test``.
    resampling_method : object exposing ``fit_resample``, or a falsy value
        Applied to the vectorized training data only. A falsy value (the
        notebook passes ``None``) skips resampling.
    classifier : object exposing ``fit`` and ``predict``
        Fitted in place on the (optionally resampled) training matrix.
    vectorizer_name, resampling_name, classifier_name : str
        Labels stored in the result entry and shown in the plot title.
        ``classifier_name == 'LightGBM'`` also triggers a float32 cast of the
        feature matrices.

    Returns
    -------
    None
        Metrics are stored in ``results`` rather than returned.
    """
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Only the training split is resampled; the test split stays untouched.
    if resampling_method:
        X_resampled, y_resampled = resampling_method.fit_resample(X_train_vec, y_train)
    else:
        X_resampled, y_resampled = X_train_vec, y_train

    if classifier_name == 'LightGBM':
        # NOTE(review): presumably LightGBM needs floating-point features here — confirm.
        X_resampled = X_resampled.astype('float32')
        X_test_vec = X_test_vec.astype('float32')

    classifier.fit(X_resampled, y_resampled)
    y_pred = classifier.predict(X_test_vec)

    report = classification_report(y_test, y_pred, output_dict=True)
    acc = accuracy_score(y_test, y_pred)
    results.append({
        'Vectorizer': vectorizer_name,
        'Resampling': resampling_name,
        'Classifier': classifier_name,
        'Accuracy': acc,
        'Classification Report': report
    })

    # Plot confusion matrix on an explicit figure/axes pair so the figure can be
    # released afterwards; this function runs once per classifier/resampler pair
    # and would otherwise leave every figure open on non-inline backends.
    cm = confusion_matrix(y_test, y_pred)
    tick_labels = list(label_dict.values())  # concrete list instead of a dict view
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_title(f'Confusion Matrix: {vectorizer_name} + {classifier_name} ({resampling_name})')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()
    plt.close(fig)

## CountVectorizer

In [5]:
# Vectorizer (and its display name) used by the training cells in this section.
vectorizer_name, vectorizer = 'CountVectorizer', count_vectorizer

In [None]:
# Baseline sweep: every classifier is trained on the vectorized data with no resampling.
resampling_name, resampling_method = 'Original', None
for classifier_name, classifier in classifiers.items():
    print(f'Training: {vectorizer_name} + {classifier_name} ({resampling_name})')
    train_and_evaluate(
        vectorizer=vectorizer,
        resampling_method=resampling_method,
        classifier=classifier,
        vectorizer_name=vectorizer_name,
        resampling_name=resampling_name,
        classifier_name=classifier_name,
    )

In [None]:
# SMOTE sweep: every classifier is trained on training data resampled by `smote`.
resampling_name, resampling_method = 'SMOTE', smote
for classifier_name, classifier in classifiers.items():
    print(f'Training: {vectorizer_name} + {classifier_name} ({resampling_name})')
    train_and_evaluate(
        vectorizer=vectorizer,
        resampling_method=resampling_method,
        classifier=classifier,
        vectorizer_name=vectorizer_name,
        resampling_name=resampling_name,
        classifier_name=classifier_name,
    )

In [None]:
# Undersampling sweep: every classifier is trained on training data resampled by `under_sampler`.
resampling_name, resampling_method = 'Undersample', under_sampler
for classifier_name, classifier in classifiers.items():
    print(f'Training: {vectorizer_name} + {classifier_name} ({resampling_name})')
    train_and_evaluate(
        vectorizer=vectorizer,
        resampling_method=resampling_method,
        classifier=classifier,
        vectorizer_name=vectorizer_name,
        resampling_name=resampling_name,
        classifier_name=classifier_name,
    )

In [None]:
# SMOTE+Tomek sweep: every classifier is trained on training data resampled by `smote_tomek`.
resampling_name, resampling_method = 'SMOTE+Tomek', smote_tomek
for classifier_name, classifier in classifiers.items():
    print(f'Training: {vectorizer_name} + {classifier_name} ({resampling_name})')
    train_and_evaluate(
        vectorizer=vectorizer,
        resampling_method=resampling_method,
        classifier=classifier,
        vectorizer_name=vectorizer_name,
        resampling_name=resampling_name,
        classifier_name=classifier_name,
    )

## TfidfVectorizer