# Modele sklearn

## Importowanie bibliotek

In [1]:
import pandas as pd

from mlxtend.data import loadlocal_mnist

from sklearn import model_selection
from sklearn import metrics

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import neural_network

## Inicjalizacja konfiguracji

In [2]:
class config:
    """Central, notebook-wide configuration (file paths and cross-validation settings)."""

    # File paths
    DATA_PATH = "data/train.csv"  # CSV file with the training data

    # General project settings
    RANDOMIZE_DATA = True  # whether rows are shuffled before being split into folds
    FOLDS_CNT = 5          # number of cross-validation folds

    # Seed intended to be passed as `random_state` to stochastic steps
    # (fold shuffling, estimator initialisation) so that a
    # "Restart Kernel -> Run All" can reproduce the same numbers.
    RANDOM_STATE = 42

## Wczytanie danych

In [3]:
# Read the MNIST training table from the configured path and discard the
# "id" column in a single chained expression.
df = pd.read_csv(config.DATA_PATH).drop(columns=["id"])

# Show a preview of the loaded frame as the cell output
df

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


## Podział danych na foldy

<img src="./images/image1.png" alt="image1" width="1300"/>
<!-- ![image1](./images/image1.png) -->
<!-- ![image1](https://towardsdatascience.com/wp-content/uploads/2023/12/1N45hocCMP0u4nXLe0WuSvw.png) -->

In [4]:
# Create the "kfold" column with a sentinel value (-1 = not assigned yet)
df["kfold"] = -1

# Seed for the shuffle. config.RANDOM_STATE is used when the config defines
# it; otherwise the shuffle stays unseeded. StratifiedKFold rejects a
# random_state when shuffle=False, so the seed is only passed when shuffling.
fold_seed = getattr(config, "RANDOM_STATE", None) if config.RANDOMIZE_DATA else None

# Stratified split into folds, optionally shuffling the rows first
folds_model = model_selection.StratifiedKFold(
    n_splits=config.FOLDS_CNT,
    shuffle=config.RANDOMIZE_DATA,
    random_state=fold_seed,
)

# split() yields *positional* row indices, so they must be written back with
# iloc; label-based .loc is only correct while df has a default RangeIndex.
kfold_col = df.columns.get_loc("kfold")
for fold, (train, test) in enumerate(folds_model.split(df, df["label"].values)):
    df.iloc[test, kfold_col] = fold

    print(f"{fold}. {train}, {test}")

# Every row must have landed in exactly one test fold
assert (df["kfold"] >= 0).all(), "some rows were not assigned to a fold"

# Show the data with the new column
df

0. [   1    2    3 ... 9996 9997 9998], [   0   16   17 ... 9990 9991 9999]
1. [   0    1    2 ... 9996 9998 9999], [   5    6    7 ... 9978 9994 9997]
2. [   0    1    4 ... 9996 9997 9999], [   2    3   10 ... 9987 9989 9998]
3. [   0    1    2 ... 9997 9998 9999], [   4   12   34 ... 9977 9980 9996]
4. [   0    2    3 ... 9997 9998 9999], [   1   20   23 ... 9992 9993 9995]


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,label,kfold
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,4
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,2


## Dyspozytor modelu

In [5]:
class model_dispatcher:
    """Registry of the scikit-learn classifiers compared in this notebook.

    ``model_names`` lists the keys to evaluate, in evaluation order.
    ``models`` maps every known key to a ready-to-fit estimator instance.
    A key may be present in ``models`` without being listed in
    ``model_names`` (currently ``"logistic_regression"``); code that iterates
    over ``model_names`` will not touch such an entry.
    """

    # Seed shared by every estimator so repeated runs give the same metrics.
    # Taken from config.RANDOM_STATE when the config defines it; otherwise
    # None, which leaves the estimators unseeded.
    RANDOM_STATE = getattr(config, "RANDOM_STATE", None)

    # Names of the models to evaluate
    model_names = [
        # "logistic_regression",
        "SGDClassifier",
        "decision_tree_gini",
        "decision_tree_entropy",
        "decision_tree_log_loss",
        "random_forest",
        "neural_network_default",
        "neural_network_with_settings"
    ]

    # Mapping: model name -> estimator instance
    models = {
        "logistic_regression": linear_model.LogisticRegression(
            max_iter=6000,
            random_state=RANDOM_STATE
        ),
        "SGDClassifier": linear_model.SGDClassifier(
            random_state=RANDOM_STATE
        ),
        # The three decision trees differ only in the split criterion
        "decision_tree_gini": tree.DecisionTreeClassifier(
            criterion="gini",
            random_state=RANDOM_STATE
        ),
        "decision_tree_entropy": tree.DecisionTreeClassifier(
            criterion="entropy",
            random_state=RANDOM_STATE
        ),
        "decision_tree_log_loss": tree.DecisionTreeClassifier(
            criterion="log_loss",
            random_state=RANDOM_STATE
        ),
        "random_forest": ensemble.RandomForestClassifier(
            random_state=RANDOM_STATE
        ),
        "neural_network_default": neural_network.MLPClassifier(
            random_state=RANDOM_STATE
        ),
        # Larger MLP with explicit settings; verbose=True prints the loss of
        # every training iteration, max_iter=10 caps the number of iterations
        "neural_network_with_settings": neural_network.MLPClassifier(
            hidden_layer_sizes=(256, 256),
            activation="relu",
            solver="adam",
            batch_size=64,
            max_iter=10,
            verbose=True,
            random_state=RANDOM_STATE
        )
    }

## Trenowanie oraz testowanie modelu

In [7]:
def use_model(df_train, df_test, model, metrics_func):
    """Fit ``model`` on the training frame and score it on the test frame.

    Args:
        df_train: DataFrame with a "label" column (target); every other
            column is used as a feature.
        df_test: DataFrame with the same layout, used for evaluation.
        model: Object exposing ``fit(x, y)`` and ``predict(x)``. It is fitted
            in place.
        metrics_func: Callable invoked as
            ``metrics_func(y_true=..., y_pred=...)``.

    Returns:
        Whatever ``metrics_func`` returns for the test-set predictions.
    """

    def split_features_target(frame):
        # Features: all columns except "label"; target: the "label" column
        is_feature = frame.columns != "label"
        return frame.loc[:, is_feature].values, frame["label"].values

    x_train, y_train = split_features_target(df_train)
    x_test, y_test = split_features_target(df_test)

    # Train on the training part, then score the predictions for the test part
    model.fit(x_train, y_train)
    return metrics_func(y_true=y_test, y_pred=model.predict(x_test))

In [8]:
# Per-model list of fold accuracies: {model name: [score of fold 0, fold 1, ...]}
metrics_dict = {}

# Column mask that drops the helper "kfold" column; it does not change
# inside the loops, so it is computed once up front.
not_kfold_column = df.columns != "kfold"

# Evaluate every listed model on every fold
for model_name in model_dispatcher.model_names:
    model = model_dispatcher.models[model_name]

    # Register the (still empty) score list for this model
    fold_scores = metrics_dict[model_name] = []

    for fold in range(config.FOLDS_CNT):

        # Rows of the current fold form the test set, all others the train set
        in_test_fold = df["kfold"] == fold
        df_train = df.loc[~in_test_fold, not_kfold_column]
        df_test = df.loc[in_test_fold, not_kfold_column]

        # Train on the train part and measure accuracy on the test part
        score = use_model(df_train, df_test, model, metrics.accuracy_score)
        fold_scores.append(score)

        # Report the accuracy of this fold
        print(f"{model_name}: {fold} - {score:.3f}")

SGDClassifier: 0 - 0.870
SGDClassifier: 1 - 0.890
SGDClassifier: 2 - 0.864
SGDClassifier: 3 - 0.870
SGDClassifier: 4 - 0.869
decision_tree_gini: 0 - 0.805
decision_tree_gini: 1 - 0.813
decision_tree_gini: 2 - 0.795
decision_tree_gini: 3 - 0.796
decision_tree_gini: 4 - 0.812
decision_tree_entropy: 0 - 0.818
decision_tree_entropy: 1 - 0.815
decision_tree_entropy: 2 - 0.787
decision_tree_entropy: 3 - 0.811
decision_tree_entropy: 4 - 0.818
decision_tree_log_loss: 0 - 0.821
decision_tree_log_loss: 1 - 0.817
decision_tree_log_loss: 2 - 0.794
decision_tree_log_loss: 3 - 0.811
decision_tree_log_loss: 4 - 0.815
random_forest: 0 - 0.952
random_forest: 1 - 0.963
random_forest: 2 - 0.950
random_forest: 3 - 0.953
random_forest: 4 - 0.949
neural_network_default: 0 - 0.927
neural_network_default: 1 - 0.918
neural_network_default: 2 - 0.916
neural_network_default: 3 - 0.907
neural_network_default: 4 - 0.926
Iteration 1, loss = 4.22631752
Iteration 2, loss = 0.96925162
Iteration 3, loss = 0.47707518
It



Iteration 1, loss = 4.36512418
Iteration 2, loss = 1.04860925
Iteration 3, loss = 0.50430545
Iteration 4, loss = 0.37163403
Iteration 5, loss = 0.24715355
Iteration 6, loss = 0.24627106
Iteration 7, loss = 0.25486073
Iteration 8, loss = 0.26761307
Iteration 9, loss = 0.29555412
Iteration 10, loss = 0.27368825
neural_network_with_settings: 1 - 0.936




Iteration 1, loss = 4.08774517
Iteration 2, loss = 1.05877736
Iteration 3, loss = 0.52217971
Iteration 4, loss = 0.33145875
Iteration 5, loss = 0.28641942
Iteration 6, loss = 0.34220672
Iteration 7, loss = 0.25663357
Iteration 8, loss = 0.27255810
Iteration 9, loss = 0.17786080
Iteration 10, loss = 0.20823292
neural_network_with_settings: 2 - 0.934




Iteration 1, loss = 4.13837696
Iteration 2, loss = 0.86283929
Iteration 3, loss = 0.43556867
Iteration 4, loss = 0.28935748
Iteration 5, loss = 0.28830811
Iteration 6, loss = 0.19822635
Iteration 7, loss = 0.27110531
Iteration 8, loss = 0.24381670
Iteration 9, loss = 0.20016194
Iteration 10, loss = 0.25035428
neural_network_with_settings: 3 - 0.931




Iteration 1, loss = 4.23062932
Iteration 2, loss = 1.02896152
Iteration 3, loss = 0.50023507
Iteration 4, loss = 0.37189043
Iteration 5, loss = 0.32339605
Iteration 6, loss = 0.32549594
Iteration 7, loss = 0.24857176
Iteration 8, loss = 0.19222000
Iteration 9, loss = 0.17853001
Iteration 10, loss = 0.24146153
neural_network_with_settings: 4 - 0.942




## Metryki

In [10]:
# Raw results: model name -> list of per-fold accuracy scores
metrics_dict

{'SGDClassifier': [0.87, 0.89, 0.864, 0.8695, 0.869],
 'decision_tree_gini': [0.805, 0.813, 0.7955, 0.796, 0.8125],
 'decision_tree_entropy': [0.818, 0.815, 0.787, 0.811, 0.8175],
 'decision_tree_log_loss': [0.8205, 0.8165, 0.794, 0.8115, 0.8145],
 'random_forest': [0.9515, 0.9625, 0.95, 0.9525, 0.949],
 'neural_network_default': [0.927, 0.918, 0.9165, 0.9075, 0.926],
 'neural_network_with_settings': [0.9385, 0.9365, 0.934, 0.931, 0.942]}

## Metryki w DataFrame-ach

In [11]:
# One column per model, one row per fold
metrics_df = pd.DataFrame(metrics_dict)

# Single-row frame labelled "avg" holding the mean accuracy of each model
metrics_avg_row = metrics_df.mean().to_frame(name="avg").T

display(metrics_df)
display(metrics_avg_row)

Unnamed: 0,SGDClassifier,decision_tree_gini,decision_tree_entropy,decision_tree_log_loss,random_forest,neural_network_default,neural_network_with_settings
0,0.87,0.805,0.818,0.8205,0.9515,0.927,0.9385
1,0.89,0.813,0.815,0.8165,0.9625,0.918,0.9365
2,0.864,0.7955,0.787,0.794,0.95,0.9165,0.934
3,0.8695,0.796,0.811,0.8115,0.9525,0.9075,0.931
4,0.869,0.8125,0.8175,0.8145,0.949,0.926,0.942


Unnamed: 0,SGDClassifier,decision_tree_gini,decision_tree_entropy,decision_tree_log_loss,random_forest,neural_network_default,neural_network_with_settings
avg,0.8725,0.8044,0.8097,0.8114,0.9531,0.919,0.9364


In [12]:
# Transposed view: one row per model, one column per fold
metrics_df_tran = pd.DataFrame(metrics_dict).T

# Single-column frame "avg" holding the mean accuracy of each model
metrics_avg_col = metrics_df_tran.mean(axis=1).to_frame(name="avg")

display(metrics_df_tran)
display(metrics_avg_col)

Unnamed: 0,0,1,2,3,4
SGDClassifier,0.87,0.89,0.864,0.8695,0.869
decision_tree_gini,0.805,0.813,0.7955,0.796,0.8125
decision_tree_entropy,0.818,0.815,0.787,0.811,0.8175
decision_tree_log_loss,0.8205,0.8165,0.794,0.8115,0.8145
random_forest,0.9515,0.9625,0.95,0.9525,0.949
neural_network_default,0.927,0.918,0.9165,0.9075,0.926
neural_network_with_settings,0.9385,0.9365,0.934,0.931,0.942


Unnamed: 0,avg
SGDClassifier,0.8725
decision_tree_gini,0.8044
decision_tree_entropy,0.8097
decision_tree_log_loss,0.8114
random_forest,0.9531
neural_network_default,0.919
neural_network_with_settings,0.9364
