In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *
from bert_vectorizer import BertVectorizer

import os
# Select the CUDA async GPU allocator for TensorFlow in this process.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Read the cleaned dataset; every missing value is replaced by an empty string.
df = (
    pd.read_csv("../data/clean/dataset.csv")
    .fillna('')
)
X, y = df["title"], df["funny"]

# Hold out 20% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

2024-11-24 10:54:57.051416: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732442097.070527  188910 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732442097.076008  188910 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 10:54:57.094677: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build the project's BERT-based text vectorizer (bert_vectorizer.BertVectorizer)
# with its default arguments.
# NOTE(review): the log printed when the embeddings are computed mentions a
# TFCamembertModel being initialised — presumably by this class; confirm.
vectorizer = BertVectorizer()

In [3]:
# Fit the vectorizer on the training titles only, then apply it unchanged to
# the test titles; both are handed over as plain Python lists.
X_train_embeddings = vectorizer.fit_transform(X_train.tolist())
X_test_embeddings = vectorizer.transform(X_test.tolist())

I0000 00:00:1732442103.942215  188910 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1732442103.942390  188910 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing TFCamembertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassi

In [4]:
# Train models
# Accumulates (name, tuned model) tuples: each per-model cell below appends its
# result, and the list is finally passed as `estimators` to StackingClassifier.
best_models = []

In [5]:
# Random Forest
# Optuna search space for the random-forest hyper-parameters.
# NOTE(review): the "classifier__" prefix presumably addresses a Pipeline step
# named "classifier" that train_model_with_optuna wraps around `model` —
# confirm in functions.py.
rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(10, 100),  # Fewer trees, as BERT embeddings are rich
    "classifier__max_depth": optuna.distributions.IntDistribution(2, 8),       # Shallower trees due to high-dimensional data
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 6),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 3),
}

# Run the search and evaluation; the helper prints the best parameters and the
# test report shown in this cell's output.
rf_model, rf_acc = train_model_with_optuna(
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)

# Register the tuned model for stacking. Drop any entry left by an earlier run
# of this cell first: a plain append would add a second "random_forest"
# estimator on re-execution, and StackingClassifier rejects duplicate names.
best_models[:] = [(name, est) for name, est in best_models if name != "random_forest"]
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-24 11:16:58,588] A new study created in memory with name: no-name-18102c59-91b1-4b1a-aba6-0c59547c953b
[I 2024-11-24 11:20:11,724] Trial 3 finished with value: 0.2427933814796181 and parameters: {'classifier__n_estimators': 13, 'classifier__max_depth': 6, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1}. Best is trial 3 with value: 0.2427933814796181.
[I 2024-11-24 11:21:06,005] Trial 1 finished with value: 0.0 and parameters: {'classifier__n_estimators': 47, 'classifier__max_depth': 2, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2}. Best is trial 3 with value: 0.2427933814796181.
[I 2024-11-24 11:25:30,457] Trial 7 finished with value: 0.31686730965211224 and parameters: {'classifier__n_estimators': 31, 'classifier__max_depth': 7, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1}. Best is trial 7 with value: 0.31686730965211224.
[I 2024-11-24 11:25:55,239] Trial 8 finished with val

Best parameters: {'classifier__n_estimators': 29, 'classifier__max_depth': 8, 'classifier__min_samples_split': 6, 'classifier__min_samples_leaf': 2}
Test Accuracy: 0.8595692769671093
F1 Score: 0.4010406243746248
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     17359
           1       0.96      0.25      0.40      3954

    accuracy                           0.86     21313
   macro avg       0.91      0.63      0.66     21313
weighted avg       0.87      0.86      0.82     21313



In [6]:
# XGBoost
# Optuna search space for the XGBoost hyper-parameters.
# NOTE(review): the "classifier__" prefix presumably addresses a Pipeline step
# named "classifier" that train_model_with_optuna wraps around `model` —
# confirm in functions.py.
xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(20, 100),  # Fewer iterations needed
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.005, 0.1),  # Lower range for stable learning
    "classifier__max_depth": optuna.distributions.IntDistribution(2, 6),        # Shallower trees
    "classifier__colsample_bytree": optuna.distributions.FloatDistribution(0.5, 1.0),  # Feature subsampling
    "classifier__subsample": optuna.distributions.FloatDistribution(0.6, 1.0),  # Data subsampling
}

# Run the search and evaluation; the helper prints the best parameters and the
# test report shown in this cell's output.
xgb_model, xgb_acc = train_model_with_optuna(
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
)

# Register the tuned model for stacking. Drop any entry left by an earlier run
# of this cell first: a plain append would add a second "xgboost" estimator on
# re-execution, and StackingClassifier rejects duplicate names.
best_models[:] = [(name, est) for name, est in best_models if name != "xgboost"]
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-24 11:55:31,513] A new study created in memory with name: no-name-d04841c6-cc8f-41a7-bbb3-04796a57317e
[I 2024-11-24 11:58:24,918] Trial 6 finished with value: 0.04934040891365688 and parameters: {'classifier__n_estimators': 20, 'classifier__learning_rate': 0.07770109911706814, 'classifier__max_depth': 2, 'classifier__colsample_bytree': 0.7066701192503249, 'classifier__subsample': 0.8866915768004306}. Best is trial 6 with value: 0.04934040891365688.
[I 2024-11-24 11:58:57,233] Trial 5 finished with value: 0.05267433991582621 and parameters: {'classifier__n_estimators': 21, 'classifier__learning_rate': 0.050942496335135, 'classifier__max_depth': 3, 'classifier__colsample_bytree': 0.5679596345597413, 'classifier__subsample': 0.8233998846384597}. Best is trial 5 with value: 0.05267433991582621.
[I 2024-11-24 11:59:34,495] Trial 0 finished with value: 0.30758642506911305 and parameters: {'classifier__n_estimators': 20, 'classifier__learning_rate': 0.06

Best parameters: {'classifier__n_estimators': 93, 'classifier__learning_rate': 0.09813426328744247, 'classifier__max_depth': 6, 'classifier__colsample_bytree': 0.7601857095940523, 'classifier__subsample': 0.756442032971816}
Test Accuracy: 0.9306995730305447
F1 Score: 0.7968642552606244
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     17359
           1       0.87      0.73      0.80      3954

    accuracy                           0.93     21313
   macro avg       0.91      0.85      0.88     21313
weighted avg       0.93      0.93      0.93     21313



In [7]:
# CatBoost
# Optuna search space for the CatBoost hyper-parameters.
# NOTE(review): the "classifier__" prefix presumably addresses a Pipeline step
# named "classifier" that train_model_with_optuna wraps around `model` —
# confirm in functions.py.
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(30, 100),    # Fewer iterations
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.005, 0.1),  # Smaller learning rates
    "classifier__depth": optuna.distributions.IntDistribution(2, 6),            # Shallower trees
    "classifier__l2_leaf_reg": optuna.distributions.FloatDistribution(1, 5),    # Regularization strength
}

# Run the search and evaluation; unlike the other model cells, n_jobs=1 is
# passed to the helper explicitly here.
catboost_model, catboost_acc = train_model_with_optuna(
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    X_train_embeddings=X_train_embeddings,
    y_train=y_train,
    X_test_embeddings=X_test_embeddings,
    y_test=y_test,
    n_jobs=1,
)

# Register the tuned model for stacking. Drop any entry left by an earlier run
# of this cell first: a plain append would add a second "catboost" estimator on
# re-execution, and StackingClassifier rejects duplicate names.
best_models[:] = [(name, est) for name, est in best_models if name != "catboost"]
best_models.append(("catboost", catboost_model))


  search = OptunaSearchCV(
[I 2024-11-24 12:18:14,183] A new study created in memory with name: no-name-d103f0a0-7c48-470a-834d-24d9b427f359
[I 2024-11-24 12:18:50,300] Trial 0 finished with value: 0.4418630605747177 and parameters: {'classifier__iterations': 83, 'classifier__learning_rate': 0.03521448296585186, 'classifier__depth': 3, 'classifier__l2_leaf_reg': 4.972089758671521}. Best is trial 0 with value: 0.4418630605747177.
[I 2024-11-24 12:19:23,417] Trial 1 finished with value: 0.3790121111304335 and parameters: {'classifier__iterations': 97, 'classifier__learning_rate': 0.03930713913624876, 'classifier__depth': 2, 'classifier__l2_leaf_reg': 1.2103767928393903}. Best is trial 0 with value: 0.4418630605747177.
[I 2024-11-24 12:19:47,653] Trial 2 finished with value: 0.3776194510499184 and parameters: {'classifier__iterations': 34, 'classifier__learning_rate': 0.04445319978181584, 'classifier__depth': 4, 'classifier__l2_leaf_reg': 2.252734192383126}. Best is trial 0 with value: 0.

Best parameters: {'classifier__iterations': 67, 'classifier__learning_rate': 0.08067913230594767, 'classifier__depth': 5, 'classifier__l2_leaf_reg': 3.355148516980317}
Test Accuracy: 0.9080373480974053
F1 Score: 0.7017650639074863
              precision    recall  f1-score   support

           0       0.91      0.98      0.95     17359
           1       0.88      0.58      0.70      3954

    accuracy                           0.91     21313
   macro avg       0.90      0.78      0.82     21313
weighted avg       0.91      0.91      0.90     21313



In [8]:
# Stack the tuned base models: a gradient-boosting meta-learner is trained on
# their 5-fold cross-validated predictions, then the ensemble is fitted on the
# full training embeddings.
stacked_classifier = StackingClassifier(
    cv=5,
    final_estimator=GradientBoostingClassifier(random_state=314),
    estimators=best_models,
).fit(X_train_embeddings, y_train)

# Score the ensemble on the held-out test embeddings.
y_pred = stacked_classifier.predict(X_test_embeddings)
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")

print(classification_report(y_test, y_pred))

Stacking Classifier Test Accuracy: 0.9322948435227326
Stacking Classifier F1 Score: 0.8099565389174239
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     17359
           1       0.85      0.78      0.81      3954

    accuracy                           0.93     21313
   macro avg       0.90      0.87      0.88     21313
weighted avg       0.93      0.93      0.93     21313

