In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, f1_score, average_precision_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import optuna
from optuna.integration import OptunaSearchCV
import mlflow
import mlflow.sklearn

# Read the cleaned dataset; missing values are replaced by empty strings
df = pd.read_csv("../data/clean/dataset.csv").fillna('')
X, y = df["lemmes"], df["funny"]

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Select the MLflow tracking server and the experiment that runs are logged to
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Classification BoW")


<Experiment: artifact_location='mlflow-artifacts:/696437413843260424', creation_time=1731871318459, experiment_id='696437413843260424', last_update_time=1731871318459, lifecycle_stage='active', name='Classification BoW', tags={}>

In [2]:
# F1 scorer that treats label 1 as the positive class
f1_class1 = make_scorer(
    f1_score,
    pos_label=1,
)

In [3]:
# Average precision (area under the precision-recall curve) has to be computed
# from continuous scores. A plain make_scorer(...) would feed it the hard labels
# returned by predict(), collapsing the curve to a single operating point, so the
# scorer explicitly requests decision_function / predict_proba output instead.
# NOTE: `response_method` requires scikit-learn >= 1.4.
pr_auc_score = make_scorer(
    average_precision_score,
    response_method=("decision_function", "predict_proba"),
)

In [4]:
def train_model_with_optuna(model, model_name, param_distributions, n_trials=20, scoring="f1"):
    """Tune a bag-of-words + classifier pipeline with Optuna and log the run to MLflow.

    The pipeline is ``CountVectorizer(min_df=50, max_features=10000)`` followed by
    ``model``. Hyperparameters are searched with ``OptunaSearchCV`` (3-fold CV,
    ``random_state=42``) on the module-level ``X_train`` / ``y_train``. The best
    pipeline (``search.best_estimator_``) is then evaluated on the module-level
    ``X_test`` / ``y_test`` and its parameters, metrics and fitted model are
    logged inside a single MLflow run.

    Parameters
    ----------
    model : estimator
        Classifier used as the final ``"classifier"`` step of the pipeline.
    model_name : str
        MLflow run name; also logged as the ``model_name`` parameter and used as
        the artifact path of the logged model.
    param_distributions : dict
        Maps pipeline parameter names (e.g. ``"classifier__max_depth"``) to
        Optuna distributions.
    n_trials : int, default=20
        Number of Optuna trials.
    scoring : str or callable, default="f1"
        Metric optimised by the search; also logged as the ``scoring`` parameter.

    Returns
    -------
    tuple
        ``(best_model, acc)``: the best fitted pipeline and its test accuracy.
    """
    with mlflow.start_run(run_name=model_name):
        # Bag-of-words features followed by the classifier under study
        pipeline = Pipeline([
            ("vectorizer", CountVectorizer(min_df=50, max_features=10000)),
            ("classifier", model),
        ])

        # OptunaSearchCV for hyperparameter tuning
        search = OptunaSearchCV(
            pipeline,
            param_distributions=param_distributions,
            cv=3,
            n_trials=n_trials,
            scoring=scoring,
            random_state=42,
            verbose=1,
        )
        search.fit(X_train, y_train)

        # Evaluate on test set
        best_model = search.best_estimator_
        y_pred = best_model.predict(X_test)

        # Average precision needs a continuous ranking score; hard labels would
        # reduce the precision-recall curve to a single point. Column 1 of
        # predict_proba is taken as the positive class, which assumes binary
        # labels whose larger value is the positive one (as the default
        # pos_label=1 of f1_score below already does).
        if hasattr(best_model, "predict_proba"):
            y_score = best_model.predict_proba(X_test)[:, 1]
        elif hasattr(best_model, "decision_function"):
            y_score = best_model.decision_function(X_test)
        else:
            # Last resort for models exposing neither method
            y_score = y_pred

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        pr_auc = average_precision_score(y_test, y_score)
        # f1_score defaults to pos_label=1, so "f1_1" equals "f1"; both names
        # are still logged so existing MLflow comparisons keep working.
        f1_1 = f1

        # Log results in MLFlow
        mlflow.log_param("scoring", scoring)
        mlflow.log_param("feature_extraction", "BoW")
        mlflow.log_param("model_name", model_name)
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("pr_auc", pr_auc)
        mlflow.log_metric("f1_1", f1_1)
        mlflow.sklearn.log_model(best_model, model_name)

        print(f"Best parameters for {model_name}: {search.best_params_}")
        print(f"Test Accuracy: {acc}")
        print(classification_report(y_test, y_pred))

        return best_model, acc


In [5]:
# Accumulates (name, fitted pipeline) pairs as each model is trained
best_models = list()

In [6]:
# Random Forest

# Optuna search space for the forest's size and tree-shape hyperparameters
rf_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 15),
    "classifier__min_samples_split": optuna.distributions.IntDistribution(2, 10),
    "classifier__min_samples_leaf": optuna.distributions.IntDistribution(1, 5),
}

rf_model, rf_acc = train_model_with_optuna(
    model=RandomForestClassifier(random_state=42),
    model_name="Random Forest",
    param_distributions=rf_param_distributions
)

# Drop any entry left by an earlier execution of this cell before registering
# the new model: StackingClassifier rejects duplicate estimator names, so a
# plain append would break the stacking step after a re-run.
best_models[:] = [entry for entry in best_models if entry[0] != "random_forest"]
best_models.append(("random_forest", rf_model))

  search = OptunaSearchCV(
[I 2024-11-17 23:24:11,209] A new study created in memory with name: no-name-c2c2c959-c2e4-4fc3-8a47-ab01848921c7
[I 2024-11-17 23:24:17,612] Trial 0 finished with value: 0.014556175093762438 and parameters: {'classifier__n_estimators': 118, 'classifier__max_depth': 13, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2}. Best is trial 0 with value: 0.014556175093762438.
[I 2024-11-17 23:24:23,140] Trial 1 finished with value: 0.0 and parameters: {'classifier__n_estimators': 123, 'classifier__max_depth': 6, 'classifier__min_samples_split': 3, 'classifier__min_samples_leaf': 1}. Best is trial 0 with value: 0.014556175093762438.
[I 2024-11-17 23:24:27,493] Trial 2 finished with value: 0.00012645422357106728 and parameters: {'classifier__n_estimators': 76, 'classifier__max_depth': 7, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 5}. Best is trial 0 with value: 0.014556175093762438.
[I 2024-11-17 23:24:32,936] Trial 3 fini

Best parameters for Random Forest: {'classifier__n_estimators': 52, 'classifier__max_depth': 15, 'classifier__min_samples_split': 8, 'classifier__min_samples_leaf': 1}
Test Accuracy: 0.8175761272462816
              precision    recall  f1-score   support

           0       0.82      1.00      0.90     17359
           1       1.00      0.02      0.03      3954

    accuracy                           0.82     21313
   macro avg       0.91      0.51      0.47     21313
weighted avg       0.85      0.82      0.74     21313



In [7]:
# XGBoost

# Optuna search space for the number of boosting rounds, step size and tree depth
xgb_param_distributions = {
    "classifier__n_estimators": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__max_depth": optuna.distributions.IntDistribution(3, 10),
}

xgb_model, xgb_acc = train_model_with_optuna(
    model=XGBClassifier(random_state=42),
    model_name="XGBoost",
    param_distributions=xgb_param_distributions
)

# Drop any entry left by an earlier execution of this cell before registering
# the new model: StackingClassifier rejects duplicate estimator names, so a
# plain append would break the stacking step after a re-run.
best_models[:] = [entry for entry in best_models if entry[0] != "xgboost"]
best_models.append(("xgboost", xgb_model))

  search = OptunaSearchCV(
[I 2024-11-17 23:26:23,934] A new study created in memory with name: no-name-51a5f197-0f70-4ad0-b03e-898762d0f498
[I 2024-11-17 23:26:27,288] Trial 0 finished with value: 0.4220418712758765 and parameters: {'classifier__n_estimators': 118, 'classifier__learning_rate': 0.2514632893713383, 'classifier__max_depth': 3}. Best is trial 0 with value: 0.4220418712758765.
[I 2024-11-17 23:26:30,895] Trial 1 finished with value: 0.4074313589561946 and parameters: {'classifier__n_estimators': 98, 'classifier__learning_rate': 0.15070428596302937, 'classifier__max_depth': 5}. Best is trial 0 with value: 0.4220418712758765.
[I 2024-11-17 23:26:34,450] Trial 2 finished with value: 0.312539552387054 and parameters: {'classifier__n_estimators': 82, 'classifier__learning_rate': 0.058356583011244335, 'classifier__max_depth': 4}. Best is trial 0 with value: 0.4220418712758765.
[I 2024-11-17 23:26:38,420] Trial 3 finished with value: 0.44391532567599495 and parameters: {'classifi

Best parameters for XGBoost: {'classifier__n_estimators': 199, 'classifier__learning_rate': 0.2968379050471063, 'classifier__max_depth': 7}
Test Accuracy: 0.8793224792380238
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.83      0.44      0.58      3954

    accuracy                           0.88     21313
   macro avg       0.86      0.71      0.75     21313
weighted avg       0.87      0.88      0.86     21313



In [8]:
# CatBoost

# Optuna search space for the number of iterations, step size and tree depth
catboost_param_distributions = {
    "classifier__iterations": optuna.distributions.IntDistribution(50, 200),
    "classifier__learning_rate": optuna.distributions.FloatDistribution(0.01, 0.3),
    "classifier__depth": optuna.distributions.IntDistribution(3, 10),
}

catboost_model, catboost_acc = train_model_with_optuna(
    model=CatBoostClassifier(verbose=0, random_state=42),
    model_name="CatBoost",
    param_distributions=catboost_param_distributions
)

# Drop any entry left by an earlier execution of this cell before registering
# the new model: StackingClassifier rejects duplicate estimator names, so a
# plain append would break the stacking step after a re-run.
best_models[:] = [entry for entry in best_models if entry[0] != "catboost"]
best_models.append(("catboost", catboost_model))

  search = OptunaSearchCV(
[I 2024-11-17 23:27:59,324] A new study created in memory with name: no-name-d24322bb-e4f6-44da-81e2-858b2e46d4d2
[I 2024-11-17 23:28:05,664] Trial 0 finished with value: 0.4415919369883356 and parameters: {'classifier__iterations': 118, 'classifier__learning_rate': 0.2514632893713383, 'classifier__depth': 3}. Best is trial 0 with value: 0.4415919369883356.
[I 2024-11-17 23:28:13,301] Trial 1 finished with value: 0.4165651630728729 and parameters: {'classifier__iterations': 98, 'classifier__learning_rate': 0.15070428596302937, 'classifier__depth': 5}. Best is trial 0 with value: 0.4415919369883356.
[I 2024-11-17 23:28:18,918] Trial 2 finished with value: 0.3263236967698304 and parameters: {'classifier__iterations': 82, 'classifier__learning_rate': 0.058356583011244335, 'classifier__depth': 4}. Best is trial 0 with value: 0.4415919369883356.
[I 2024-11-17 23:28:40,055] Trial 3 finished with value: 0.4386829066474393 and parameters: {'classifier__iterations': 9

Best parameters for CatBoost: {'classifier__iterations': 199, 'classifier__learning_rate': 0.2968379050471063, 'classifier__depth': 7}
Test Accuracy: 0.8802139539248346
              precision    recall  f1-score   support

           0       0.89      0.98      0.93     17359
           1       0.81      0.46      0.59      3954

    accuracy                           0.88     21313
   macro avg       0.85      0.72      0.76     21313
weighted avg       0.87      0.88      0.87     21313



In [9]:
# Stacking the best models: the tuned pipelines collected in `best_models` are
# the base estimators and a gradient-boosting model is the final estimator,
# trained with 3-fold cross-validation (cv=3).
stacked_classifier = StackingClassifier(
    estimators=best_models,
    final_estimator=GradientBoostingClassifier(random_state=42),
    cv=3
)

# Train the stacked classifier and log the run to MLflow
with mlflow.start_run(run_name="Stacking Classifier"):
    stacked_classifier.fit(X_train, y_train)
    y_pred = stacked_classifier.predict(X_test)
    # Average precision needs a continuous ranking score; hard labels would
    # reduce the precision-recall curve to a single point. Column 1 is taken as
    # the positive class (binary target, consistent with pos_label=1 below).
    y_score = stacked_classifier.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    pr_auc = average_precision_score(y_test, y_score)
    # f1_score defaults to pos_label=1, so "f1_1" equals "f1"; both names are
    # still logged so existing MLflow comparisons keep working.
    f1_1 = f1

    # Log results
    mlflow.log_param("scoring", "f1")
    mlflow.log_param("feature_extraction", "BoW")
    mlflow.log_param("model_name", "Stacking Classifier")
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("pr_auc", pr_auc)
    mlflow.log_metric("f1_1", f1_1)
    mlflow.sklearn.log_model(stacked_classifier, "Stacking Classifier")

    print(f"Stacking Classifier Test Accuracy: {acc}")
    print(classification_report(y_test, y_pred))


2024/11/17 23:32:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Stacking Classifier at: http://127.0.0.1:5000/#/experiments/696437413843260424/runs/9dba0b7ae0cc44a2878713c76654214b.
2024/11/17 23:32:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/696437413843260424.


Stacking Classifier Test Accuracy: 0.8856566414864168
              precision    recall  f1-score   support

           0       0.91      0.96      0.93     17359
           1       0.76      0.56      0.65      3954

    accuracy                           0.89     21313
   macro avg       0.83      0.76      0.79     21313
weighted avg       0.88      0.89      0.88     21313

