In [1]:
from typing import Dict
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from src.data import load_dataset
import pandas as pd
from pandas.io.formats import style
from tqdm import tqdm

RANDOM_STATE = 42

X, y = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

models : Dict[str, BaseEstimator] = {
    "GaussianNB": GaussianNB(),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "SVC": SVC(kernel='rbf', random_state=RANDOM_STATE),
    "MLP": Pipeline([('scaler', MinMaxScaler()), ("MLP", MLPClassifier(random_state=RANDOM_STATE, max_iter=300))]),
    "KNN": KNeighborsClassifier(),
}

In [2]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, recall_score

results = []
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for model_name, model in tqdm(models.items()):
    accuracy_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
    precision_scores = cross_val_score(model, X, y, cv=kf, scoring='precision_macro')
    recall_scores = cross_val_score(model, X, y, cv=kf, scoring='recall_macro')
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_scores.mean(),
        'Accuracy std': accuracy_scores.std(),
        'Precision': precision_scores.mean(),
        'Recall': recall_scores.mean()
    })


100%|██████████| 6/6 [42:00<00:00, 420.00s/it]


In [3]:
results_df = pd.DataFrame(results)
results_df = results_df.drop(columns=["Accuracy std"])

In [4]:
results_df = results_df.set_index('Model').sort_values(by='Accuracy', ascending=False)
results_df.columns.name = results_df.index.name
results_df.index.name = ""
# Find the maximum value for each column
max_values = results_df.astype(float).max()

def highlight_max(val, col):
    try:
        val_float = float(val)
        if val_float == max_values[col]:
            return f"\\textbf{{{val:.2f}}}"
        else:
            return f"{val:.2f}"
    except:
        pass
    return val

results_df = results_df.apply(lambda col: [highlight_max(val, col.name) for val in col], axis=0)
results_df

Model,Accuracy,Precision,Recall
,,,
GradientBoosting,\textbf{0.87},\textbf{0.83},0.78
MLP,0.85,0.80,0.78
RandomForest,0.85,0.80,0.77
KNN,0.84,0.79,0.78
GaussianNB,0.82,0.76,\textbf{0.80}
SVC,0.80,0.77,0.62


In [5]:
style.Styler(results_df).to_latex(
    "../tables/base_results.tex",
    column_format="lccc",
    caption="Wyniki wybranych klasyfikatorów na zbiorze testowym przy zastosowaniu domyślnych parametrów.",
    label="tab:base-results",
    environment="table",
    position="t",
    position_float="centering",
    multicol_align="c",
    hrules=True,
)