In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *
from word2vec_vectorizer import Word2VecVectorizer

# Read the cleaned corpus; every missing cell in the frame is replaced by an empty string.
df = pd.read_csv("../data/clean/dataset.csv")
df = df.fillna('')

# Model input is the "lemmes" column (presumably lemmatised text — confirm against the
# cleaning step); the target is the "funny" column.
X, y = df["lemmes"], df["funny"]

# Hold out 20% of the rows for evaluation. The split is stratified on the label so both
# partitions keep the same class ratio, and the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

2024-11-21 01:39:15.841132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732149555.859753   72935 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732149555.865232   72935 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 01:39:15.882455: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Project-local embedding vectorizer (imported from word2vec_vectorizer), created with
# its default constructor arguments.
# NOTE(review): the defaults (embedding size, tokenisation, seeding) live in that module — confirm there.
vectorizer = Word2VecVectorizer()

In [3]:
# Fit the vectorizer on the training texts only and embed them, then embed the held-out
# texts with the already-fitted vectorizer (assuming transform() does not refit — confirm
# in word2vec_vectorizer), so the test split does not influence what is learned.
# NOTE(review): the logged output suggests this step runs on TensorFlow/GPU — confirm.
X_train_embeddings = vectorizer.fit_transform(X_train)
X_test_embeddings = vectorizer.transform(X_test)

I0000 00:00:1732149617.683513   72935 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
I0000 00:00:1732149620.259888   74053 service.cc:148] XLA service 0x7f07b4002b40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732149620.259910   74053 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce MX150, Compute Capability 6.1
I0000 00:00:1732149620.266607   74053 cuda_dnn.cc:529] Loaded cuDNN version 90300
2024-11-21 01:40:20.278664: W external/local_xla/xla/service/gpu/nvptx_compiler.cc:930] The NVIDIA driver's CUDA version is 12.4 which is older than the PTX compiler version 12.5.82. Because the driver is older than the PTX compiler version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forwar

In [4]:
# Registry of (name, estimator) tuples: each tuning cell below appends its result, and
# the final cell passes this list to StackingClassifier as the base estimators.
# Re-running this cell empties the registry.
best_models = []

In [5]:
# Random Forest
# Search space handed to train_model_with_optuna (helper from functions.py). The
# "classifier__" prefix presumably targets a pipeline step named "classifier" built
# inside that helper — confirm in functions.py.
rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 150),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 10),
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 8),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 4),
}

# The helper returns a (model, score) pair; per the cell output it also prints the best
# parameters, test accuracy, F1 score and a classification report.
rf_model, rf_acc = train_model_with_optuna(
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)

# Keep this cell idempotent: drop any entry left by a previous run before registering
# the new model. StackingClassifier requires unique estimator names, so a duplicate
# "random_forest" tuple would make the stacking cell fail.
best_models[:] = [entry for entry in best_models if entry[0] != "random_forest"]
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-21 01:40:25,442] A new study created in memory with name: no-name-039213ef-db98-45c7-8406-38f099481baf
[I 2024-11-21 01:43:36,835] Trial 1 finished with value: 0.017033298284591693 and parameters: {'classifier__n_estimators': 52, 'classifier__max_depth': 4, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 2}. Best is trial 1 with value: 0.017033298284591693.
[I 2024-11-21 01:44:58,253] Trial 2 finished with value: 0.0 and parameters: {'classifier__n_estimators': 98, 'classifier__max_depth': 3, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 2}. Best is trial 1 with value: 0.017033298284591693.
[I 2024-11-21 01:45:16,404] Trial 6 finished with value: 0.14807085620411503 and parameters: {'classifier__n_estimators': 66, 'classifier__max_depth': 5, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 3}. Best is trial 6 with value: 0.14807085620411503.
[I 2024-11-21 01:45:30,281] Trial 7 finished wi

Best parameters: {'classifier__n_estimators': 123, 'classifier__max_depth': 10, 'classifier__min_samples_split': 8, 'classifier__min_samples_leaf': 4}
Test Accuracy: 0.8769764932201004
F1 Score: 0.5460526315789473
              precision    recall  f1-score   support

           0       0.88      0.99      0.93     17359
           1       0.87      0.40      0.55      3954

    accuracy                           0.88     21313
   macro avg       0.87      0.69      0.74     21313
weighted avg       0.88      0.88      0.86     21313



In [6]:
# XGBoost
# Search space handed to train_model_with_optuna (helper from functions.py). The
# "classifier__" prefix presumably targets a pipeline step named "classifier" built
# inside that helper — confirm in functions.py.
xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 150),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.15),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 7),
}

# The helper returns a (model, score) pair; per the cell output it also prints the best
# parameters, test accuracy, F1 score and a classification report.
xgb_model, xgb_acc = train_model_with_optuna(
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)

# Keep this cell idempotent: drop any entry left by a previous run before registering
# the new model. StackingClassifier requires unique estimator names, so a duplicate
# "xgboost" tuple would make the stacking cell fail.
best_models[:] = [entry for entry in best_models if entry[0] != "xgboost"]
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-21 02:07:30,297] A new study created in memory with name: no-name-e18c55f9-86e1-45df-8228-e716cfc62e3d
[I 2024-11-21 02:08:01,987] Trial 4 finished with value: 0.5073900189075599 and parameters: {'classifier__n_estimators': 55, 'classifier__learning_rate': 0.06008337116580667, 'classifier__max_depth': 3}. Best is trial 4 with value: 0.5073900189075599.
[I 2024-11-21 02:08:06,329] Trial 2 finished with value: 0.15281916092528355 and parameters: {'classifier__n_estimators': 62, 'classifier__learning_rate': 0.022912653901423247, 'classifier__max_depth': 3}. Best is trial 4 with value: 0.5073900189075599.
[I 2024-11-21 02:08:22,694] Trial 7 finished with value: 0.562056145299376 and parameters: {'classifier__n_estimators': 54, 'classifier__learning_rate': 0.04491454830663207, 'classifier__max_depth': 5}. Best is trial 7 with value: 0.562056145299376.
[I 2024-11-21 02:08:32,217] Trial 3 finished with value: 0.6831539846649578 and parameters: {'classifie

Best parameters: {'classifier__n_estimators': 90, 'classifier__learning_rate': 0.14919327915370714, 'classifier__max_depth': 7}
Test Accuracy: 0.9063482381645005
F1 Score: 0.7286568787384448
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     17359
           1       0.79      0.68      0.73      3954

    accuracy                           0.91     21313
   macro avg       0.86      0.82      0.84     21313
weighted avg       0.90      0.91      0.90     21313



In [7]:
# CatBoost
# Search space handed to train_model_with_optuna (helper from functions.py). The
# "classifier__" prefix presumably targets a pipeline step named "classifier" built
# inside that helper — confirm in functions.py.
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(50, 150),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.15),
    "classifier__depth": optuna.distributions.IntDistribution(3, 7),
}

# The helper returns a (model, score) pair; per the cell output it also prints the best
# parameters, test accuracy, F1 score and a classification report.
# NOTE(review): n_jobs=1 is only passed for CatBoost — presumably to run trials
# sequentially because CatBoost manages its own threads; confirm the intent.
catboost_model, catboost_acc = train_model_with_optuna(
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    n_jobs=1,
)

# Keep this cell idempotent: drop any entry left by a previous run before registering
# the new model. StackingClassifier requires unique estimator names, so a duplicate
# "catboost" tuple would make the stacking cell fail.
best_models[:] = [entry for entry in best_models if entry[0] != "catboost"]
best_models.append(("catboost", catboost_model))


  search = OptunaSearchCV(
[I 2024-11-21 02:11:12,820] A new study created in memory with name: no-name-4c2a729c-7f75-49f1-b267-734b61e5a2c7
[I 2024-11-21 02:11:22,969] Trial 0 finished with value: 0.6346540424409578 and parameters: {'classifier__iterations': 125, 'classifier__learning_rate': 0.054526606475992216, 'classifier__depth': 4}. Best is trial 0 with value: 0.6346540424409578.
[I 2024-11-21 02:11:34,750] Trial 1 finished with value: 0.7000941721778149 and parameters: {'classifier__iterations': 150, 'classifier__learning_rate': 0.14272681529120812, 'classifier__depth': 4}. Best is trial 1 with value: 0.7000941721778149.
[I 2024-11-21 02:11:40,327] Trial 2 finished with value: 0.25600851078446485 and parameters: {'classifier__iterations': 68, 'classifier__learning_rate': 0.01736318774937866, 'classifier__depth': 3}. Best is trial 1 with value: 0.7000941721778149.
[I 2024-11-21 02:11:48,162] Trial 3 finished with value: 0.6227580683388386 and parameters: {'classifier__iterations'

Best parameters: {'classifier__iterations': 150, 'classifier__learning_rate': 0.14968206573911058, 'classifier__depth': 7}
Test Accuracy: 0.9051283254351804
F1 Score: 0.7232411716397481
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     17359
           1       0.79      0.67      0.72      3954

    accuracy                           0.91     21313
   macro avg       0.86      0.81      0.83     21313
weighted avg       0.90      0.91      0.90     21313



In [8]:
# Stack the tuned base models: a gradient-boosting meta-learner is trained on their
# 5-fold cross-validated predictions.
meta_learner = GradientBoostingClassifier(random_state=314)
stacked_classifier = StackingClassifier(
    estimators=best_models,
    final_estimator=meta_learner,
    cv=5,
)

# Fit the whole ensemble on the training embeddings, then score it on the held-out split.
stacked_classifier.fit(X_train_embeddings, y_train)
y_pred = stacked_classifier.predict(X_test_embeddings)

acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

# Headline metrics first, then the per-class breakdown.
print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")
print(classification_report(y_test, y_pred))

Stacking Classifier Test Accuracy: 0.90789658893633
Stacking Classifier F1 Score: 0.7352663519892111
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     17359
           1       0.79      0.69      0.74      3954

    accuracy                           0.91     21313
   macro avg       0.86      0.82      0.84     21313
weighted avg       0.90      0.91      0.91     21313

