In [None]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import plotly.express as px

from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# from tabpfn import TabPFNClassifier


In [None]:
# Fetch the imbalanced-learn benchmark collection; the result is indexed by
# dataset name further down (e.g. datasets["wine_quality"]).
datasets = fetch_datasets()

# Show only the available dataset names. Echoing the whole mapping would dump
# every dataset's contents into the cell output and bury the rest of the
# notebook.
list(datasets.keys())

In [None]:
# Select the wine_quality entry and split it into its "data" and "target"
# fields, then echo both for a quick look.
wine_quality = datasets["wine_quality"]
data = wine_quality["data"]
target = wine_quality["target"]
(data, target)

In [None]:
# Re-encode the labels as integers: 1 where the current label equals 1,
# 0 everywhere else. Re-running this cell leaves `target` unchanged.
is_positive = target == 1
target = is_positive.astype(int)
target

In [None]:
# Show the dimensions of `data` and `target` side by side.
data.shape, target.shape

In [None]:
# Tally how many samples carry each label value in `target`.
Counter(target)

In [None]:
# Class-imbalance ratio: size of the largest class divided by the size of the
# smallest one. Computed from the labels rather than from hand-typed counts
# (previously the literal 4715 / 183, presumably copied from the Counter
# output) so the figure cannot go stale if the data or encoding changes.
class_counts = Counter(target)
max(class_counts.values()) / min(class_counts.values())

In [None]:
# Names given to the 11 columns when `data` is wrapped in a DataFrame.
# NOTE(review): assumed to match the column order of the raw array — confirm
# against the dataset description.
columns = """
    fixed_acidity
    volatile_acidity
    citric_acid
    residual_sugar
    chlorides
    free_sulfur_dioxide
    total_sulfur_dioxide
    density
    pH
    sulphates
    alcohol
""".split()

In [None]:
# Wrap the raw array in a DataFrame labelled with `columns`, then preview the
# first five rows.
df = pd.DataFrame(data=data, columns=columns)
df.head(5)

In [None]:
# Hold out 30% of the rows for evaluation, with a fixed seed for
# reproducibility. The labels are heavily imbalanced (see the class counts
# above), so the split is stratified on `target`: both partitions keep the
# same class proportions instead of leaving the minority share to chance.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=target,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Keyword arguments passed unchanged to every estimator below.
params = {
    "random_state": 42,
    "n_jobs": -1,
}

# Candidate classifiers to compare; each one is fitted and scored in turn
# by the cells that follow.
models = [
    LogisticRegression(
        max_iter=10_000,
        **params
    ),
    RandomForestClassifier(
        n_estimators=50,
        **params
    ),
    XGBClassifier(
        **params
    ),
    # TabPFNClassifier(),
]

# Display names, derived from the estimator classes so this list can never
# drift out of step with `models`. A hand-maintained parallel list would make
# the later zip() silently drop a model whenever one is added without a name.
model_names = [type(model).__name__ for model in models]

In [None]:
# Fit each candidate on the training split and collect its test-set
# predictions, in the same order as `models`.
predictions = []

for model in tqdm(models):
    model.fit(X_train, y_train)
    predictions.append(model.predict(X_test))

predictions

In [None]:
# Scoring functions applied to every model; each is called as
# metric(y_true, y_pred) and reported under its __name__.
metrics = [accuracy_score, precision_score, recall_score, f1_score]

In [None]:
# Score every model's predictions with every metric, printing the results as
# they are computed. `summary` maps "<model name> <predicted label counts>"
# to a {metric name: score} dict.
summary = {}

for model_name, prediction in zip(model_names, predictions):
    predicted_counts = Counter(prediction)
    print(model_name)
    print(predicted_counts)

    model_summary = {}
    for metric in metrics:
        metric_name = metric.__name__
        print(metric_name)
        score = metric(y_test, prediction)
        model_summary[metric_name] = score
        print(score)

    # The predicted label distribution is embedded in the key so that it
    # appears alongside the model name in the table and chart built from
    # `summary`.
    summary[f"{model_name} {predicted_counts}"] = model_summary
    print()

summary

In [None]:
# Turn the nested dict into a table with one row per model and one column per
# metric (the DataFrame is built with models as columns, then transposed).
metrics_by_model = pd.DataFrame(summary)
df_summary = metrics_by_model.transpose()
df_summary

In [None]:
# Grouped bar chart of the metric table; the figure is also saved as a
# standalone HTML file before being shown inline.
fig = px.bar(
    df_summary,
    barmode="group",
    title="Comparison of metrics on wine_quality dataset",
)
fig.write_html("metrics.html")
fig.show()