In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna

from functions import *
from bert_vectorizer import BertVectorizer
from svd_transformer import SVDTransformer

import os
# Ask TensorFlow for the asynchronous CUDA allocator via its environment variable.
os.environ.update({'TF_GPU_ALLOCATOR': 'cuda_malloc_async'})

# Read the cleaned dataset, then replace every missing value with an empty string.
df = pd.read_csv("../data/clean/dataset.csv")
df = df.fillna('')

# Model input is the "title" column; the target is the "funny" column.
X, y = df["title"], df["funny"]

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same label proportions; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

2024-11-24 16:51:29.957750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732463489.977499  242151 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732463489.983092  242151 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 16:51:30.002210: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Instantiate the project-local BertVectorizer (imported from bert_vectorizer)
# with its default arguments; it is fitted on the training titles further down.
# NOTE(review): the pretrained model it loads and its pooling strategy are
# decided inside bert_vectorizer.py — confirm there.
vectorizer = BertVectorizer()

In [3]:
# Fit the vectorizer on the training titles only, then apply the already
# fitted vectorizer to the held-out titles. Both are passed as plain lists.
X_train_embeddings = vectorizer.fit_transform(X_train.tolist())
X_test_embeddings = vectorizer.transform(X_test.tolist())

I0000 00:00:1732463494.178754  242151 gpu_process_state.cc:201] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1732463494.178963  242151 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing TFCamembertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassi

In [4]:
# Dimensionality reduction with the project-local SVDTransformer: fitted on the
# training embeddings, then applied as-is to the test embeddings.
svd_tr = SVDTransformer()
X_train_reduced = svd_tr.fit_transform(X_train_embeddings)
X_test_reduced = svd_tr.transform(X_test_embeddings)

# Show the resulting (rows, components) shapes of both splits on one line.
print(*(reduced.shape for reduced in (X_train_reduced, X_test_reduced)))

Number of components selected: 91
Cumulative explained variance: 0.9002
(85249, 91) (21313, 91)


In [5]:
# Registry of tuned models: each training cell adds a (name, model) pair,
# and the whole list is later handed to the stacking ensemble.
best_models = list()

In [6]:
# Random Forest
# Optuna search space (integer ranges, both bounds inclusive).
# NOTE(review): the "classifier__" prefix presumably targets a pipeline step
# named "classifier" built inside train_model_with_optuna — confirm in functions.py.
rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 15),
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 10),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 5),
}

# Tune on the reduced training embeddings; the helper also receives the test
# split and returns the tuned model together with a score.
rf_model, rf_acc = train_model_with_optuna(
    model=RandomForestClassifier(random_state=314),
    param_distributions=rf_param_distributions,
    X_train_embeddings=X_train_reduced,
    y_train=y_train,
    X_test_embeddings=X_test_reduced,
    y_test=y_test,
)

# Register the model idempotently: drop any entry left by a previous run of
# this cell first, because StackingClassifier rejects duplicate estimator names.
best_models[:] = [
    (name, estimator) for name, estimator in best_models if name != "random_forest"
]
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-24 17:13:58,597] A new study created in memory with name: no-name-256bf808-8ed9-4ecf-bea9-41d82fb396be
[I 2024-11-24 17:18:42,839] Trial 5 finished with value: 0.0 and parameters: {'classifier__n_estimators': 107, 'classifier__max_depth': 3, 'classifier__min_samples_split': 6, 'classifier__min_samples_leaf': 1}. Best is trial 5 with value: 0.0.
[I 2024-11-24 17:21:32,238] Trial 2 finished with value: 0.15307706015420058 and parameters: {'classifier__n_estimators': 92, 'classifier__max_depth': 6, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 3}. Best is trial 2 with value: 0.15307706015420058.
[I 2024-11-24 17:23:19,746] Trial 4 finished with value: 0.03453625378065702 and parameters: {'classifier__n_estimators': 134, 'classifier__max_depth': 5, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4}. Best is trial 2 with value: 0.15307706015420058.
[I 2024-11-24 17:24:52,465] Trial 0 finished with value: 0.00037

Best parameters: {'classifier__n_estimators': 165, 'classifier__max_depth': 15, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 3}
Test Accuracy: 0.8985595645849951
F1 Score: 0.6405054871965414
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     17359
           1       0.93      0.49      0.64      3954

    accuracy                           0.90     21313
   macro avg       0.91      0.74      0.79     21313
weighted avg       0.90      0.90      0.89     21313



In [7]:
# XGBoost
# Optuna search space (integer and float ranges, both bounds inclusive).
# NOTE(review): the "classifier__" prefix presumably targets a pipeline step
# named "classifier" built inside train_model_with_optuna — confirm in functions.py.
xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 150),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.2),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 7),
    "classifier__subsample": optuna.distributions.FloatDistribution(0.7, 1.0),
    "classifier__colsample_bytree": optuna.distributions.FloatDistribution(0.7, 1.0),
}

# Tune on the reduced training embeddings; the helper also receives the test
# split and returns the tuned model together with a score.
xgb_model, xgb_acc = train_model_with_optuna(
    model=XGBClassifier(random_state=314),
    param_distributions=xgb_param_distributions,
    X_train_embeddings=X_train_reduced,
    y_train=y_train,
    X_test_embeddings=X_test_reduced,
    y_test=y_test,
)

# Register the model idempotently: drop any entry left by a previous run of
# this cell first, because StackingClassifier rejects duplicate estimator names.
best_models[:] = [
    (name, estimator) for name, estimator in best_models if name != "xgboost"
]
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-24 17:55:47,615] A new study created in memory with name: no-name-985eaa59-5462-4e2d-8a4e-8280f5525fb2
[I 2024-11-24 17:56:28,855] Trial 6 finished with value: 0.7244516296161081 and parameters: {'classifier__n_estimators': 83, 'classifier__learning_rate': 0.11596050023767428, 'classifier__max_depth': 3, 'classifier__subsample': 0.8021623787481127, 'classifier__colsample_bytree': 0.7240729887464639}. Best is trial 6 with value: 0.7244516296161081.
[I 2024-11-24 17:56:49,500] Trial 0 finished with value: 0.6596035271750883 and parameters: {'classifier__n_estimators': 51, 'classifier__learning_rate': 0.04329057543986922, 'classifier__max_depth': 6, 'classifier__subsample': 0.9223751298852272, 'classifier__colsample_bytree': 0.9098817339005139}. Best is trial 6 with value: 0.7244516296161081.
[I 2024-11-24 17:56:51,195] Trial 5 finished with value: 0.7788717830753035 and parameters: {'classifier__n_estimators': 136, 'classifier__learning_rate': 0.1765

Best parameters: {'classifier__n_estimators': 121, 'classifier__learning_rate': 0.1686411399077262, 'classifier__max_depth': 7, 'classifier__subsample': 0.7060002034262184, 'classifier__colsample_bytree': 0.8014489853048463}
Test Accuracy: 0.9310749307934125
F1 Score: 0.8011371328008664
              precision    recall  f1-score   support

           0       0.94      0.97      0.96     17359
           1       0.86      0.75      0.80      3954

    accuracy                           0.93     21313
   macro avg       0.90      0.86      0.88     21313
weighted avg       0.93      0.93      0.93     21313



In [8]:
# CatBoost
# Optuna search space (integer and float ranges, both bounds inclusive).
# NOTE(review): the "classifier__" prefix presumably targets a pipeline step
# named "classifier" built inside train_model_with_optuna — confirm in functions.py.
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.2),
    "classifier__depth": optuna.distributions.IntDistribution(3, 10),
    "classifier__l2_leaf_reg": optuna.distributions.FloatDistribution(1.0, 10.0),
}

# Tune on the reduced training embeddings. verbose=0 silences CatBoost's own
# per-iteration logging.
# NOTE(review): n_jobs=1 is passed only for this model — presumably to keep the
# search sequential for CatBoost; confirm the reason in functions.py.
catboost_model, catboost_acc = train_model_with_optuna(
    model=CatBoostClassifier(verbose=0, random_state=314),
    param_distributions=catboost_param_distributions,
    X_train_embeddings=X_train_reduced,
    y_train=y_train,
    X_test_embeddings=X_test_reduced,
    y_test=y_test,
    n_jobs=1,
)

# Register the model idempotently: drop any entry left by a previous run of
# this cell first, because StackingClassifier rejects duplicate estimator names.
best_models[:] = [
    (name, estimator) for name, estimator in best_models if name != "catboost"
]
best_models.append(("catboost", catboost_model))


  search = OptunaSearchCV(
[I 2024-11-24 18:00:19,149] A new study created in memory with name: no-name-a7b6d5a4-081e-4b77-bbc0-82d93c9eb1bd
[I 2024-11-24 18:00:33,020] Trial 0 finished with value: 0.7363581971705161 and parameters: {'classifier__iterations': 162, 'classifier__learning_rate': 0.07042896593170372, 'classifier__depth': 4, 'classifier__l2_leaf_reg': 9.937201957010922}. Best is trial 0 with value: 0.7363581971705161.
[I 2024-11-24 18:00:49,569] Trial 1 finished with value: 0.754167515080419 and parameters: {'classifier__iterations': 193, 'classifier__learning_rate': 0.07861427827249752, 'classifier__depth': 4, 'classifier__l2_leaf_reg': 1.4733477838886282}. Best is trial 1 with value: 0.754167515080419.
[I 2024-11-24 18:00:57,696] Trial 2 finished with value: 0.7128490138647778 and parameters: {'classifier__iterations': 58, 'classifier__learning_rate': 0.08890639956363168, 'classifier__depth': 6, 'classifier__l2_leaf_reg': 3.818651932862034}. Best is trial 1 with value: 0.

Best parameters: {'classifier__iterations': 200, 'classifier__learning_rate': 0.1202665319623726, 'classifier__depth': 9, 'classifier__l2_leaf_reg': 6.779238303733177}
Test Accuracy: 0.9330924787688265
F1 Score: 0.8074534161490683
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     17359
           1       0.87      0.76      0.81      3954

    accuracy                           0.93     21313
   macro avg       0.91      0.86      0.88     21313
weighted avg       0.93      0.93      0.93     21313



In [9]:
# Stack the tuned models: a gradient-boosting meta-learner is trained on the
# base models' 5-fold cross-validated predictions. fit() returns the estimator
# itself, so construction and training are chained.
stacked_classifier = StackingClassifier(
    best_models,
    final_estimator=GradientBoostingClassifier(random_state=314),
    cv=5,
).fit(X_train_reduced, y_train)

# Evaluate once on the held-out split.
y_pred = stacked_classifier.predict(X_test_reduced)
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

print(f"Stacking Classifier Test Accuracy: {acc}")
print(f"Stacking Classifier F1 Score: {f1}")

print(classification_report(y_test, y_pred))

Stacking Classifier Test Accuracy: 0.9328578801670342
Stacking Classifier F1 Score: 0.8111389732083938
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     17359
           1       0.85      0.78      0.81      3954

    accuracy                           0.93     21313
   macro avg       0.90      0.87      0.89     21313
weighted avg       0.93      0.93      0.93     21313



In [17]:
# Error analysis for the stacking ensemble on the held-out split.
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.

# Column 1 of predict_proba is the probability assigned to the positive class (label 1).
probas = stacked_classifier.predict_proba(X_test_reduced)

# One row per test title. y_test and X_test are Series sharing the same index,
# so the frame keeps the original dataset row labels; y_pred and probas are
# positional arrays in that same order.
results_df = pd.DataFrame({
    "y_true": y_test,
    "y_pred": y_pred,
    "proba": probas[:, 1],
    "title": X_test,
})

# Rows the ensemble labelled as positive, split by whether the label was correct.
predicted_positive = results_df["y_pred"] == 1
false_positives = results_df[(results_df["y_true"] == 0) & predicted_positive]
true_positives = results_df[(results_df["y_true"] == 1) & predicted_positive]

# Most confident predictions first.
sorted_false_positives = false_positives.sort_values(by="proba", ascending=False)
sorted_true_positives = true_positives.sort_values(by="proba", ascending=False)

print(sorted_false_positives.head())
print(sorted_true_positives.head())


       y_true  y_pred     proba  \
48964       0       1  1.000000   
69848       0       1  1.000000   
1211        0       1  1.000000   
95199       0       1  0.993732   
85248       0       1  0.993732   

                                                   title  
48964  JO 2024 : résultats et décryptage du mardi 30 ...  
69848  Audiences TV : France 3 domine la concurrence ...  
1211   Karim Kattan, écrivain palestinien : « Pourquo...  
95199  VIDÉO. L'homme qui murmure à l'oreille des c...  
85248  Un tableau déniché dans une cave par un brocan...  
       y_true  y_pred     proba  \
42164       1       1  0.993732   
31631       1       1  0.993732   
50844       1       1  0.993732   
25540       1       1  0.993732   
39165       1       1  0.993732   

                                                   title  
42164  Il surfe un cadavre de baleine entouré de requins  
31631  VIDÉO. Un parachutiste saute d'un avion (et r...  
50844  Cette boîte de nuit offre des verres aux

In [18]:
# List the 100 most confident false positives as "<title> <probability>".
top_false_positives = sorted_false_positives.head(100)
for title, proba in zip(top_false_positives['title'], top_false_positives['proba']):
    print(title, proba)

JO 2024 : résultats et décryptage du mardi 30 juillet 0.9999999990389898
Audiences TV : France 3 domine la concurrence avec « Mémoires à vif » 0.9999999929761592
Karim Kattan, écrivain palestinien : « Pourquoi Gaza a-t-elle disparu derrière des sophismes, des approximations, des murmures désolés ? » 0.9999999443281937
VIDÉO. L'homme qui murmure à l'oreille des cachalots 0.9937319782617635
Un tableau déniché dans une cave par un brocanteur italien en 1962 serait un original de Picasso 0.9937319782617635
Jean et Johnny, deux drôles d’oiseaux qui imitent à la perfection le chant de 500 volatiles 0.9922289416770422
Une passagère refuse de ranger son sac de luxe sous son siège et se fait expulser de l’avion 0.9863924540739821
Sur Internet, il vend sa BMW gagée et falsifie le contrôle technique 0.9857872985540334
EN IMAGES. Une étonnante exposition de briques Lego à découvrir 0.9829917121135836
La photographe Lee Miller au cinéma : "J'étais douée pour le sexe, l'alcool et les ph

In [19]:
# List the 100 most confident true positives as "<title> <probability>".
top_true_positives = sorted_true_positives.head(100)
for title, proba in zip(top_true_positives['title'], top_true_positives['proba']):
    print(title, proba)

Il surfe un cadavre de baleine entouré de requins 0.9937319782617635
VIDÉO. Un parachutiste saute d'un avion (et reste accroché) 0.9937319782617635
Cette boîte de nuit offre des verres aux clientes qui retirent leur soutien-gorge et scandalise les internautes 0.9937319782617635
Elle détourne 200 000 euros… et convie ses ex-patrons escroqués à son mariage ! 0.9937319782617635
Aux États-Unis, des gens se sont mis à rouler des spliffs avec des pétales de rose 0.9937319782617635
Il joue à la roulette russe pour épater une femme et meurt 0.9937319782617635
VIDÉO. Ce présentateur météo hurle en direct, effrayé par une araignée 0.9937319782617635
VIDÉO. Un homme essaie de sabrer en deux une pastèque et c'est une mauvaise idée 0.9937319782617635
Donald Trump trop "malhonnête" pour avoir le droit de vendre de l'alcool dans son hôtel de Washington? 0.9937319782617635
VIDÉO - Une baleine saute sur un kayak 0.9937319782617635
Quand Thomas Pesquet fait un selfie lors de sa sortie 