In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *

# Load the cleaned dataset from disk.
df = pd.read_csv("../data/clean/dataset.csv")

# Fail fast on unlabeled rows: the blanket fillna('') below would otherwise
# rewrite a missing "funny" label as an empty string, silently creating a
# bogus extra class that only surfaces later as an obscure training error.
missing_labels = int(df["funny"].isna().sum())
if missing_labels:
    raise ValueError(
        f"{missing_labels} row(s) in dataset.csv have no 'funny' label; "
        "drop or label them before training."
    )

# read_csv parses empty fields as NaN; replace them with '' so the text
# column contains only strings.
df = df.fillna('')
X = df["lemmes"]  # text feature (presumably lemmatized text -- confirm against the cleaning step)
y = df["funny"]   # target label

# Hold out 20% for testing. Stratifying on y keeps the class ratio equal in
# both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

2024-11-19 23:54:05.585471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732056845.605098   40280 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732056845.610687   40280 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-19 23:54:05.628506: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Bag-of-words features shared by every model cell below.
# min_df drops terms found in fewer than 50 documents; max_features caps the
# vocabulary at the 10,000 most frequent remaining terms.
vectorizer = CountVectorizer(
    max_features=10_000,
    min_df=50,
)

In [3]:
# Registry of (name, model) pairs. Each model cell below adds its tuned model
# here, and the stacking cell passes the whole list as its base estimators.
best_models = list()

In [4]:
# Random Forest
#
# Optuna search space. The "classifier__" prefix targets the estimator step;
# presumably train_model_with_optuna (from the local `functions` module) wraps
# vectorizer + model in a pipeline with a step named "classifier" -- confirm there.
rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 15),
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 10),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 5),
}

# NOTE(review): in the recorded run this model reaches only 0.01 recall on the
# positive class (F1 ~ 0.011). Consider class_weight="balanced" or a deeper
# max_depth range if it should contribute more to the stack.
rf_model, rf_acc = train_model_with_optuna(
    vectorizer=vectorizer,
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Keep this cell idempotent: re-running it must not register a second
# "random_forest" entry, because StackingClassifier rejects duplicate
# estimator names. Slice assignment updates the existing list in place.
best_models[:] = [(name, est) for name, est in best_models if name != "random_forest"]
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-19 23:54:08,359] A new study created in memory with name: no-name-84838af4-f41e-44e3-a6a6-bb0f9eaf3623
[I 2024-11-19 23:54:20,706] Trial 0 finished with value: 0.0 and parameters: {'classifier__n_estimators': 162, 'classifier__max_depth': 7, 'classifier__min_samples_split': 3, 'classifier__min_samples_leaf': 5}. Best is trial 0 with value: 0.0.
[I 2024-11-19 23:54:35,308] Trial 1 finished with value: 0.00012642225031605565 and parameters: {'classifier__n_estimators': 193, 'classifier__max_depth': 7, 'classifier__min_samples_split': 3, 'classifier__min_samples_leaf': 1}. Best is trial 1 with value: 0.00012642225031605565.
[I 2024-11-19 23:54:42,932] Trial 2 finished with value: 0.0006317918018005787 and parameters: {'classifier__n_estimators': 58, 'classifier__max_depth': 8, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2}. Best is trial 2 with value: 0.0006317918018005787.
[I 2024-11-19 23:54:58,287] Trial 3 finished with valu

Best parameters: {'classifier__n_estimators': 122, 'classifier__max_depth': 15, 'classifier__min_samples_split': 6, 'classifier__min_samples_leaf': 2}
Test Accuracy: 0.8155116595505091
F1 Score: 0.011066398390342052
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     17359
           1       1.00      0.01      0.01      3954

    accuracy                           0.82     21313
   macro avg       0.91      0.50      0.45     21313
weighted avg       0.85      0.82      0.73     21313



In [5]:
# XGBoost
#
# Optuna search space. The "classifier__" prefix targets the estimator step;
# presumably train_model_with_optuna (from the local `functions` module) wraps
# vectorizer + model in a pipeline with a step named "classifier" -- confirm there.
xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 10),
}

xgb_model, xgb_acc = train_model_with_optuna(
    vectorizer=vectorizer,
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Keep this cell idempotent: re-running it must not register a second
# "xgboost" entry, because StackingClassifier rejects duplicate estimator
# names. Slice assignment updates the existing list in place.
best_models[:] = [(name, est) for name, est in best_models if name != "xgboost"]
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-19 23:58:21,891] A new study created in memory with name: no-name-bfa0b7ba-7582-4aca-a83a-178adc07869d
[I 2024-11-19 23:58:28,570] Trial 0 finished with value: 0.3958171587303308 and parameters: {'classifier__n_estimators': 162, 'classifier__learning_rate': 0.10223368484312673, 'classifier__max_depth': 4}. Best is trial 0 with value: 0.3958171587303308.
[I 2024-11-19 23:58:39,245] Trial 1 finished with value: 0.5276948268838851 and parameters: {'classifier__n_estimators': 199, 'classifier__learning_rate': 0.2849341173889311, 'classifier__max_depth': 5}. Best is trial 1 with value: 0.5276948268838851.
[I 2024-11-19 23:58:45,591] Trial 2 finished with value: 0.29100310557835163 and parameters: {'classifier__n_estimators': 77, 'classifier__learning_rate': 0.025252317480855793, 'classifier__max_depth': 3}. Best is trial 1 with value: 0.5276948268838851.
[I 2024-11-19 23:58:53,001] Trial 3 finished with value: 0.4023389598835852 and parameters: {'classi

Best parameters: {'classifier__n_estimators': 200, 'classifier__learning_rate': 0.29934142188815765, 'classifier__max_depth': 8}
Test Accuracy: 0.8815277060948716
F1 Score: 0.5870809484873263
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.83      0.45      0.59      3954

    accuracy                           0.88     21313
   macro avg       0.86      0.72      0.76     21313
weighted avg       0.88      0.88      0.87     21313



In [6]:
# CatBoost
#
# Optuna search space. The "classifier__" prefix targets the estimator step;
# presumably train_model_with_optuna (from the local `functions` module) wraps
# vectorizer + model in a pipeline with a step named "classifier" -- confirm there.
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__depth": optuna.distributions.IntDistribution(3, 10),
}

# verbose=0 silences CatBoost's per-iteration training log.
catboost_model, catboost_acc = train_model_with_optuna(
    vectorizer=vectorizer,
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Keep this cell idempotent: re-running it must not register a second
# "catboost" entry, because StackingClassifier rejects duplicate estimator
# names. Slice assignment updates the existing list in place.
best_models[:] = [(name, est) for name, est in best_models if name != "catboost"]
best_models.append(("catboost", catboost_model))

  search = OptunaSearchCV(
[I 2024-11-20 00:01:13,230] A new study created in memory with name: no-name-aba4b270-af1c-498a-9bc9-56d568e7fde4
[I 2024-11-20 00:01:27,938] Trial 0 finished with value: 0.4084859149080394 and parameters: {'classifier__iterations': 162, 'classifier__learning_rate': 0.10223368484312673, 'classifier__depth': 4}. Best is trial 0 with value: 0.4084859149080394.
[I 2024-11-20 00:01:46,195] Trial 1 finished with value: 0.5418654086136236 and parameters: {'classifier__iterations': 199, 'classifier__learning_rate': 0.2849341173889311, 'classifier__depth': 5}. Best is trial 1 with value: 0.5418654086136236.
[I 2024-11-20 00:01:55,276] Trial 2 finished with value: 0.29821935558121015 and parameters: {'classifier__iterations': 77, 'classifier__learning_rate': 0.025252317480855793, 'classifier__depth': 3}. Best is trial 1 with value: 0.5418654086136236.
[I 2024-11-20 00:02:08,494] Trial 3 finished with value: 0.4145688576351135 and parameters: {'classifier__iterations':

Best parameters: {'classifier__iterations': 200, 'classifier__learning_rate': 0.29934142188815765, 'classifier__depth': 8}
Test Accuracy: 0.8822315019002487
F1 Score: 0.5990415335463258
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.81      0.47      0.60      3954

    accuracy                           0.88     21313
   macro avg       0.85      0.72      0.77     21313
weighted avg       0.88      0.88      0.87     21313



In [7]:
# Stack the tuned models: a gradient-boosting meta-learner is trained on the
# base models' 5-fold cross-validated predictions.
# The stack is fitted directly on the raw text in X_train, so every entry of
# best_models must accept raw text itself (presumably each is a
# vectorizer + classifier pipeline -- confirm in `functions`).
stacked_classifier = StackingClassifier(
    cv=5,
    final_estimator=GradientBoostingClassifier(random_state=314),
    estimators=best_models,
).fit(X_train, y_train)

# Score on the held-out split.
y_pred = stacked_classifier.predict(X_test)
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))


Stacking Classifier Test Accuracy: 0.8886125838690001
Stacking Classifier F1 Score: 0.6562409498986389
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     17359
           1       0.77      0.57      0.66      3954

    accuracy                           0.89     21313
   macro avg       0.84      0.77      0.79     21313
weighted avg       0.88      0.89      0.88     21313

