In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *

# Load your dataset
df = pd.read_csv("../data/clean/dataset.csv").fillna('')
X = df["lemmes"]
y = df["funny"]

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

2024-11-21 00:40:36.128584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732146036.147995   62216 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732146036.153697   62216 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 00:40:36.171782: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
vectorizer = CountVectorizer(min_df=50, max_features=10000)

In [3]:
X_train_embeddings = vectorizer.fit_transform(X_train)
X_test_embeddings = vectorizer.transform(X_test)

In [4]:
# Train models
best_models = []

In [5]:
# Random Forest

rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 15),
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 10),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 5),
}

rf_model, rf_acc = train_model_with_optuna(
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-21 00:40:39,943] A new study created in memory with name: no-name-fe5fef4b-b8a6-40e9-a11e-807aca4fcbdf
[I 2024-11-21 00:40:53,679] Trial 2 finished with value: 0.0 and parameters: {'classifier__n_estimators': 70, 'classifier__max_depth': 8, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 3}. Best is trial 2 with value: 0.0.
[I 2024-11-21 00:41:05,246] Trial 1 finished with value: 0.0 and parameters: {'classifier__n_estimators': 129, 'classifier__max_depth': 8, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 4}. Best is trial 2 with value: 0.0.
[I 2024-11-21 00:41:07,959] Trial 5 finished with value: 0.0 and parameters: {'classifier__n_estimators': 142, 'classifier__max_depth': 8, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 3}. Best is trial 2 with value: 0.0.
[I 2024-11-21 00:41:08,167] Trial 7 finished with value: 0.004037976754872902 and parameters: {'classifier__n_estimators': 128,

Best parameters: {'classifier__n_estimators': 105, 'classifier__max_depth': 15, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1}
Test Accuracy: 0.8158870173133768
F1 Score: 0.015060240963855422
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     17359
           1       1.00      0.01      0.02      3954

    accuracy                           0.82     21313
   macro avg       0.91      0.50      0.46     21313
weighted avg       0.85      0.82      0.73     21313



In [6]:
# XGBoost

xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 10),
}

xgb_model, xgb_acc = train_model_with_optuna(
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-21 00:42:05,394] A new study created in memory with name: no-name-bffad89f-77bc-4076-a39b-129bfd9c2180
[I 2024-11-21 00:42:13,477] Trial 5 finished with value: 0.3358280471501819 and parameters: {'classifier__n_estimators': 81, 'classifier__learning_rate': 0.12026040974662963, 'classifier__max_depth': 3}. Best is trial 5 with value: 0.3358280471501819.
[I 2024-11-21 00:42:17,710] Trial 1 finished with value: 0.46150465050533696 and parameters: {'classifier__n_estimators': 50, 'classifier__learning_rate': 0.2803619948688601, 'classifier__max_depth': 8}. Best is trial 1 with value: 0.46150465050533696.
[I 2024-11-21 00:42:23,232] Trial 0 finished with value: 0.422773120709152 and parameters: {'classifier__n_estimators': 95, 'classifier__learning_rate': 0.12792836440140917, 'classifier__max_depth': 7}. Best is trial 1 with value: 0.46150465050533696.
[I 2024-11-21 00:42:24,049] Trial 7 finished with value: 0.4772388156501151 and parameters: {'classifi

Best parameters: {'classifier__n_estimators': 192, 'classifier__learning_rate': 0.27946163509335914, 'classifier__max_depth': 9}
Test Accuracy: 0.8823253413409656
F1 Score: 0.5901960784313726
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.83      0.46      0.59      3954

    accuracy                           0.88     21313
   macro avg       0.86      0.72      0.76     21313
weighted avg       0.88      0.88      0.87     21313



In [7]:
# CatBoost
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__depth": optuna.distributions.IntDistribution(3, 10),
}

catboost_model, catboost_acc = train_model_with_optuna(
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    n_jobs=1,
)
best_models.append(("catboost", catboost_model))

  search = OptunaSearchCV(
[I 2024-11-21 00:43:00,826] A new study created in memory with name: no-name-5d8a8a8d-8e25-4ab1-b699-8a1d62f80762
[I 2024-11-21 00:43:11,576] Trial 0 finished with value: 0.4093662591346961 and parameters: {'classifier__iterations': 162, 'classifier__learning_rate': 0.10223368484312673, 'classifier__depth': 4}. Best is trial 0 with value: 0.4093662591346961.
[I 2024-11-21 00:43:26,102] Trial 1 finished with value: 0.5539762274032631 and parameters: {'classifier__iterations': 199, 'classifier__learning_rate': 0.2849341173889311, 'classifier__depth': 5}. Best is trial 1 with value: 0.5539762274032631.
[I 2024-11-21 00:43:31,514] Trial 2 finished with value: 0.2986521241238643 and parameters: {'classifier__iterations': 77, 'classifier__learning_rate': 0.025252317480855793, 'classifier__depth': 3}. Best is trial 1 with value: 0.5539762274032631.
[I 2024-11-21 00:43:40,136] Trial 3 finished with value: 0.4140299818767231 and parameters: {'classifier__iterations': 

Best parameters: {'classifier__iterations': 200, 'classifier__learning_rate': 0.29934142188815765, 'classifier__depth': 8}
Test Accuracy: 0.8822315019002487
F1 Score: 0.5990415335463258
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.81      0.47      0.60      3954

    accuracy                           0.88     21313
   macro avg       0.85      0.72      0.77     21313
weighted avg       0.88      0.88      0.87     21313



In [8]:
# Stacking the best models
stacked_classifier = StackingClassifier(
    estimators=best_models,
    final_estimator=GradientBoostingClassifier(random_state=314),
    cv=5,
)


stacked_classifier.fit(X_train_embeddings, y_train)
y_pred = stacked_classifier.predict(X_test_embeddings)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")

print(classification_report(y_test, y_pred))


Stacking Classifier Test Accuracy: 0.888894102191151
Stacking Classifier F1 Score: 0.6576055523423945
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     17359
           1       0.77      0.58      0.66      3954

    accuracy                           0.89     21313
   macro avg       0.84      0.77      0.80     21313
weighted avg       0.88      0.89      0.88     21313

