### Configure the environment

In [5]:
%load_ext autoreload
%autoreload 2

import sys
# Expose ../src to the import system (presumably where `helper` lives).
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
src_dir = '../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Compare basic classifiers

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from helper import load_data, score_model

# Lift pandas' column-count display limit so wide frames are shown in full.
pd.options.display.max_columns = None


def create_model(classifier) -> Pipeline:
    """Wrap ``classifier`` in a TF-IDF + probability-calibration pipeline.

    The returned pipeline has two steps:

    * ``"tfidf"`` - a ``FeatureUnion`` of two ``TfidfVectorizer`` instances,
      one with default settings (``"word"``) and one with
      ``analyzer="char"`` (``"char"``).
    * ``"cls"`` - ``classifier`` wrapped in ``CalibratedClassifierCV`` with
      ``cv=5`` and ``method="isotonic"``.

    Args:
        classifier: Estimator instance to be calibrated and placed at the
            end of the pipeline.

    Returns:
        The assembled, unfitted ``Pipeline``.
    """
    calibrated = CalibratedClassifierCV(classifier, cv=5, method="isotonic")

    # Concatenate the features produced by both vectorizers.
    text_features = FeatureUnion(
        [
            ("word", TfidfVectorizer()),
            ("char", TfidfVectorizer(analyzer="char")),
        ]
    )

    return Pipeline([("tfidf", text_features), ("cls", calibrated)])


def compare(
    data_path: str = "../data/train.parquet",
    random_state: int = 42,
) -> pd.DataFrame:
    """Score a fixed set of baseline classifiers and tabulate the mean metrics.

    Every classifier is wrapped with ``create_model`` and handed to
    ``score_model`` together with the training data. Each entry of the
    mapping returned by ``score_model`` is reduced with ``.mean()`` and
    stored as one column of the result.

    Parameters
    ----------
    data_path : str, optional
        Path passed to ``load_data``. The default is the location that was
        previously hard-coded, so ``compare()`` still reads the same file.
    random_state : int, optional
        Seed given to every classifier that accepts ``random_state`` so that
        repeated runs train the same models. ``MultinomialNB`` has no such
        parameter and is left as is.
        NOTE(review): any randomness inside ``score_model`` itself is not
        controlled from here - confirm it is seeded separately.

    Returns
    -------
    pd.DataFrame
        One row per classifier, indexed by the classifier's class name
        (index name ``"classifier"``), one column per key returned by
        ``score_model``.
    """
    classifiers = [
        LinearSVC(dual="auto", random_state=random_state),
        LogisticRegression(random_state=random_state),
        XGBClassifier(random_state=random_state),
        LGBMClassifier(verbose=-1, random_state=random_state),
        RandomForestClassifier(random_state=random_state),
        MultinomialNB(),
    ]

    X_train, y_train = load_data(data_path)

    metrics_table = []

    for classifier in classifiers:
        classifier_name = type(classifier).__name__
        print(f"Experiment '{classifier_name}' in progress...")

        model = create_model(classifier)
        scores = score_model(model, X_train, y_train)

        # Collapse each collection of scores into a single mean value.
        metrics = {"classifier": classifier_name}
        metrics.update({name: values.mean() for name, values in scores.items()})
        metrics_table.append(metrics)

    print('Training done.')

    # Build the frame once from the collected records instead of growing it.
    return pd.DataFrame.from_records(metrics_table).set_index("classifier")


# Run the comparison for every classifier and keep the resulting metrics table.
df_metrics = compare()
# Bare last expression: the notebook renders the frame as this cell's output.
df_metrics

Experiment 'LinearSVC' in progress...
Experiment 'LogisticRegression' in progress...
Experiment 'XGBClassifier' in progress...
Experiment 'LGBMClassifier' in progress...
Experiment 'RandomForestClassifier' in progress...
Experiment 'MultinomialNB' in progress...
Training done.


Unnamed: 0_level_0,fit_time,score_time,test_recall,test_precision,test_f1,test_accuracy,test_roc_auc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LinearSVC,0.16163,0.045998,0.898928,0.932243,0.915157,0.91669,0.965994
LogisticRegression,0.711493,0.065455,0.879083,0.890518,0.884721,0.88548,0.950208
XGBClassifier,3.578575,0.084658,0.883261,0.898034,0.89058,0.891487,0.956427
LGBMClassifier,2.776865,0.092644,0.888223,0.901225,0.894669,0.895404,0.959198
RandomForestClassifier,16.347184,0.322608,0.844868,0.903014,0.87288,0.876993,0.953671
MultinomialNB,0.102047,0.048088,0.91016,0.891352,0.90052,0.899453,0.963431


### Add a little color to make it easier to read

In [9]:
# Best F1 first.
df_metrics = df_metrics.sort_values("test_f1", ascending=False)

# Positional split of the columns: the first two get bars, the remaining
# ones get their per-column maximum highlighted.
bar_columns = df_metrics.columns[:2]
highlight_columns = df_metrics.columns[2:]

# Styler methods return the Styler itself, so the calls can be chained.
df_style = (
    df_metrics.style
    .highlight_max(
        subset=highlight_columns,
        props="background-color:lightblue;color:black",
    )
    .bar(subset=bar_columns, color='LightSalmon', width=50, height=50)
)
df_style

Unnamed: 0_level_0,fit_time,score_time,test_recall,test_precision,test_f1,test_accuracy,test_roc_auc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LinearSVC,0.16163,0.045998,0.898928,0.932243,0.915157,0.91669,0.965994
MultinomialNB,0.102047,0.048088,0.91016,0.891352,0.90052,0.899453,0.963431
LGBMClassifier,2.776865,0.092644,0.888223,0.901225,0.894669,0.895404,0.959198
XGBClassifier,3.578575,0.084658,0.883261,0.898034,0.89058,0.891487,0.956427
LogisticRegression,0.711493,0.065455,0.879083,0.890518,0.884721,0.88548,0.950208
RandomForestClassifier,16.347184,0.322608,0.844868,0.903014,0.87288,0.876993,0.953671
