In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *
from bert_vectorizer import BertVectorizer

import os
# Ask TensorFlow to use the asynchronous CUDA allocator for GPU memory.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Read the cleaned dataset; every missing value in the frame becomes an empty string.
df = pd.read_csv("../data/clean/dataset.csv")
df = df.fillna('')

# Features are the "lemmes" column, the target is the "funny" column.
X, y = df["lemmes"], df["funny"]

# Hold out 20% of the rows for testing. The split is seeded for reproducibility
# and stratified on the target so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

2024-11-23 20:35:07.050833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732390507.069873   87832 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732390507.076462   87832 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 20:35:07.094676: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Instantiate the project-local BertVectorizer (imported from bert_vectorizer)
# with its default settings. It is fitted and applied to the text in the next cell.
# NOTE(review): which pretrained model it wraps is decided inside bert_vectorizer.py;
# the captured logs mention TFCamembertModel — confirm there.
vectorizer = BertVectorizer()

In [3]:
# Materialise the pandas Series as plain Python lists before vectorising.
train_texts = [text for text in X_train]
test_texts = [text for text in X_test]

# Fit on the training texts only, then reuse the fitted vectorizer on the test texts.
X_train_embeddings = vectorizer.fit_transform(train_texts)
X_test_embeddings = vectorizer.transform(test_texts)

I0000 00:00:1732390510.869617   87832 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1732390510.869794   87832 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFCamembertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassi

In [4]:
# Train models
# Accumulator for the tuned models: each model cell below appends a
# (name, fitted_model) tuple, and the final cell passes the whole list to
# StackingClassifier as its `estimators`.
# Re-run this cell before re-running any model cell, otherwise the list keeps
# the entries from the previous run as well.
best_models = []

In [5]:
# Random Forest
# Optuna search space. Keys carry a "classifier__" prefix.
# NOTE(review): presumably this targets a pipeline step named "classifier" built
# inside train_model_with_optuna — confirm in functions.py.
rf_param_distributions = dict(
    # Fewer trees, as BERT embeddings are rich
    classifier__n_estimators=optuna.distributions.IntDistribution(low=10, high=100),
    # Shallower trees due to high-dimensional data
    classifier__max_depth=optuna.distributions.IntDistribution(low=2, high=8),
    classifier__min_samples_split=optuna.distributions.IntDistribution(low=2, high=6),
    classifier__min_samples_leaf=optuna.distributions.IntDistribution(low=1, high=3),
)

# Tune on the training embeddings and evaluate on the held-out embeddings;
# the helper returns the tuned model together with a score.
rf_model, rf_acc = train_model_with_optuna(
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
)

# Register the tuned model under its stacking name.
best_models += [("random_forest", rf_model)]

  search = OptunaSearchCV(
[I 2024-11-23 20:52:48,809] A new study created in memory with name: no-name-873a67e1-94f4-4247-aae9-ca819cb70e5d
[I 2024-11-23 20:56:31,341] Trial 7 finished with value: 0.11481926902053194 and parameters: {'classifier__n_estimators': 12, 'classifier__max_depth': 8, 'classifier__min_samples_split': 6, 'classifier__min_samples_leaf': 1}. Best is trial 7 with value: 0.11481926902053194.
[I 2024-11-23 20:58:17,829] Trial 0 finished with value: 0.0 and parameters: {'classifier__n_estimators': 33, 'classifier__max_depth': 4, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 1}. Best is trial 7 with value: 0.11481926902053194.
[I 2024-11-23 20:58:33,047] Trial 4 finished with value: 0.06254138603729431 and parameters: {'classifier__n_estimators': 21, 'classifier__max_depth': 7, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 1}. Best is trial 7 with value: 0.11481926902053194.
[I 2024-11-23 20:59:13,676] Trial 5 finished with 

Best parameters: {'classifier__n_estimators': 12, 'classifier__max_depth': 8, 'classifier__min_samples_split': 6, 'classifier__min_samples_leaf': 1}
Test Accuracy: 0.8246140853000516
F1 Score: 0.11589403973509933
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     17359
           1       0.89      0.06      0.12      3954

    accuracy                           0.82     21313
   macro avg       0.86      0.53      0.51     21313
weighted avg       0.84      0.82      0.76     21313



In [6]:
# XGBoost
# Optuna search space. Keys carry a "classifier__" prefix.
# NOTE(review): presumably this targets a pipeline step named "classifier" built
# inside train_model_with_optuna — confirm in functions.py.
xgb_param_distributions = dict(
    # Fewer iterations needed
    classifier__n_estimators=optuna.distributions.IntDistribution(low=20, high=100),
    # Lower range for stable learning
    classifier__learning_rate=optuna.distributions.FloatDistribution(low=0.005, high=0.1),
    # Shallower trees
    classifier__max_depth=optuna.distributions.IntDistribution(low=2, high=6),
    # Feature subsampling
    classifier__colsample_bytree=optuna.distributions.FloatDistribution(low=0.5, high=1.0),
    # Data subsampling
    classifier__subsample=optuna.distributions.FloatDistribution(low=0.6, high=1.0),
)

# Tune on the training embeddings and evaluate on the held-out embeddings;
# the helper returns the tuned model together with a score.
xgb_model, xgb_acc = train_model_with_optuna(
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
)

# Register the tuned model under its stacking name.
best_models += [("xgboost", xgb_model)]

  search = OptunaSearchCV(
[I 2024-11-23 21:21:23,154] A new study created in memory with name: no-name-7ee120cf-12ce-4231-8b32-3fafcd2991d1
[I 2024-11-23 21:25:12,781] Trial 5 finished with value: 0.0 and parameters: {'classifier__n_estimators': 36, 'classifier__learning_rate': 0.03023514612054874, 'classifier__max_depth': 2, 'classifier__colsample_bytree': 0.9147813859093016, 'classifier__subsample': 0.8513104449884434}. Best is trial 5 with value: 0.0.
[I 2024-11-23 21:28:27,985] Trial 1 finished with value: 0.2574936982788675 and parameters: {'classifier__n_estimators': 27, 'classifier__learning_rate': 0.06011312358127173, 'classifier__max_depth': 6, 'classifier__colsample_bytree': 0.9087404424905497, 'classifier__subsample': 0.641903183543874}. Best is trial 1 with value: 0.2574936982788675.
[I 2024-11-23 21:28:55,358] Trial 2 finished with value: 0.3531095113708773 and parameters: {'classifier__n_estimators': 51, 'classifier__learning_rate': 0.07106992674172187, 'classifier__max_

Best parameters: {'classifier__n_estimators': 78, 'classifier__learning_rate': 0.0792971615767501, 'classifier__max_depth': 6, 'classifier__colsample_bytree': 0.8982245261346242, 'classifier__subsample': 0.6470931857429583}
Test Accuracy: 0.8643081687233144
F1 Score: 0.5176784523015343
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     17359
           1       0.76      0.39      0.52      3954

    accuracy                           0.86     21313
   macro avg       0.82      0.68      0.72     21313
weighted avg       0.85      0.86      0.85     21313



In [7]:
# CatBoost
# Optuna search space. Keys carry a "classifier__" prefix.
# NOTE(review): presumably this targets a pipeline step named "classifier" built
# inside train_model_with_optuna — confirm in functions.py.
catboost_param_distributions = dict(
    # Fewer iterations
    classifier__iterations=optuna.distributions.IntDistribution(low=30, high=100),
    # Smaller learning rates
    classifier__learning_rate=optuna.distributions.FloatDistribution(low=0.005, high=0.1),
    # Shallower trees
    classifier__depth=optuna.distributions.IntDistribution(low=2, high=6),
    # Regularization strength
    classifier__l2_leaf_reg=optuna.distributions.FloatDistribution(low=1, high=5),
)

# Tune on the training embeddings and evaluate on the held-out embeddings.
# Unlike the other model cells, this call passes n_jobs=1 explicitly.
catboost_model, catboost_acc = train_model_with_optuna(
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    n_jobs=1,
)

# Register the tuned model under its stacking name.
best_models += [("catboost", catboost_model)]


  search = OptunaSearchCV(
[I 2024-11-23 21:46:31,749] A new study created in memory with name: no-name-9a390985-4573-4ed0-9a3c-06fce11c9c44
[I 2024-11-23 21:47:06,599] Trial 0 finished with value: 0.16452283357240993 and parameters: {'classifier__iterations': 83, 'classifier__learning_rate': 0.03521448296585186, 'classifier__depth': 3, 'classifier__l2_leaf_reg': 4.972089758671521}. Best is trial 0 with value: 0.16452283357240993.
[I 2024-11-23 21:47:35,984] Trial 1 finished with value: 0.12730055383823205 and parameters: {'classifier__iterations': 97, 'classifier__learning_rate': 0.03930713913624876, 'classifier__depth': 2, 'classifier__l2_leaf_reg': 1.2103767928393903}. Best is trial 0 with value: 0.16452283357240993.
[I 2024-11-23 21:48:00,006] Trial 2 finished with value: 0.12488524256369551 and parameters: {'classifier__iterations': 34, 'classifier__learning_rate': 0.04445319978181584, 'classifier__depth': 4, 'classifier__l2_leaf_reg': 2.252734192383126}. Best is trial 0 with valu

Best parameters: {'classifier__iterations': 69, 'classifier__learning_rate': 0.06750278443777788, 'classifier__depth': 6, 'classifier__l2_leaf_reg': 3.6964400458893834}
Test Accuracy: 0.8492469384882466
F1 Score: 0.3948012808438501
              precision    recall  f1-score   support

           0       0.85      0.98      0.91     17359
           1       0.77      0.27      0.39      3954

    accuracy                           0.85     21313
   macro avg       0.81      0.62      0.65     21313
weighted avg       0.84      0.85      0.82     21313



In [8]:
# Stacking the best models
# `best_models` is a global list that the model cells append to, so re-running
# any of those cells leaves several entries under the same name, and
# StackingClassifier rejects non-unique estimator names. Collapse the list by
# name first: dict() keeps the most recently appended model for each name while
# preserving the order in which names first appeared. On a clean top-to-bottom
# run this is exactly the original list.
stacking_estimators = list(dict(best_models).items())

# Base learners are the tuned models; a gradient-boosting meta-learner combines
# their predictions, using 5-fold cross-validation to build its training features.
stacked_classifier = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=GradientBoostingClassifier(random_state=314),
    cv=5,
)

# Fit the whole stack on the training embeddings and predict the held-out set.
stacked_classifier.fit(X_train_embeddings, y_train)
y_pred = stacked_classifier.predict(X_test_embeddings)

# Report accuracy, F1 and the per-class breakdown on the test set.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")

print(classification_report(y_test, y_pred))

Stacking Classifier Test Accuracy: 0.8685309435555764
Stacking Classifier F1 Score: 0.5757116898849183
              precision    recall  f1-score   support

           0       0.89      0.96      0.92     17359
           1       0.72      0.48      0.58      3954

    accuracy                           0.87     21313
   macro avg       0.80      0.72      0.75     21313
weighted avg       0.86      0.87      0.86     21313

