# Załadowanie pakietów

In [1]:
from copy import deepcopy
from typing import Generator, List, Tuple

import numpy as np
import pandas as pd

from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, recall_score as recall

# Funkcje pomocnicze

In [2]:
def get_data(path: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Load a CSV file and integer-encode every one of its columns.

    Each column is cast to the pandas ``category`` dtype and then replaced
    by its category codes.

    Args:
        path: Location of the CSV file; it must contain a ``class`` column.

    Returns:
        A ``(features, target)`` pair: every encoded column except ``class``,
        and the encoded ``class`` column.
    """
    categorical = pd.read_csv(path).astype('category')

    # Swap each categorical column for its integer category codes.
    encoded = pd.DataFrame(
        {name: categorical[name].cat.codes for name in categorical.columns}
    )

    return encoded.drop(columns='class'), encoded['class']


def validation(features: pd.DataFrame, target: pd.Series,
               n_splits: int = 5, random_state: int = 42
               ) -> Generator[Tuple[Tuple[pd.DataFrame, pd.Series],
                                    Tuple[pd.DataFrame, pd.Series]], None, None]:
    """Yield stratified, shuffled train/validation splits of the data.

    Args:
        features: Feature table, one row per sample.
        target: Labels aligned row-by-row with ``features``.
        n_splits: Number of folds handed to ``StratifiedKFold``.
        random_state: Seed handed to ``StratifiedKFold`` for the shuffle.

    Yields:
        ``((X_train, y_train), (X_valid, y_valid))`` for every fold.
    """
    # Because our model is parametric, we want the class distribution
    # in every split to stay close to the overall one.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, valid_index in splitter.split(features, target):
        # The splitter yields integer positions, so both objects must be sliced
        # positionally; plain ``target[...]`` would look the numbers up as index
        # labels and break (or pick wrong rows) for any non-default index.
        X_train, y_train = features.iloc[train_index], target.iloc[train_index]
        X_valid, y_valid = features.iloc[valid_index], target.iloc[valid_index]

        yield (X_train, y_train), (X_valid, y_valid)


def run_pipeline(model, features: pd.DataFrame, target: pd.Series) -> Tuple[List, List]:
    """Cross-validate ``model`` and report recall and F_2 for every fold.

    For each split produced by ``validation`` an independent copy of ``model``
    is fitted on the training part and scored on the validation part; the
    scores are printed.

    Args:
        model: Estimator template exposing ``fit`` and ``predict_proba``.
            It is copied for every fold and is itself left untouched.
        features: Feature table.
        target: Labels aligned with ``features``.

    Returns:
        ``(models, folds)``: the estimator fitted on each fold, and the
        training features each of them was fitted on.
    """
    models = []
    folds = []
    for fold_id, (train_data, valid_data) in enumerate(validation(features, target)):
        X_train, y_train = train_data
        X_valid, y_valid = valid_data

        # scikit-learn's fit() returns the estimator itself, so fitting the same
        # object in every fold would fill `models` with references to a single
        # estimator holding only the last fit. Fit a fresh copy per fold instead.
        fitted_model = deepcopy(model).fit(X_train, y_train)
        # Column 1 is taken as the positive-class probability
        # (assumes a binary target whose second class is the positive one).
        y_pred = fitted_model.predict_proba(X_valid)[:, 1] > 0.5

        # When detecting whether a mushroom is poisonous it is important
        # to limit the number of type II errors.
        # That is why we want to maximize recall and/or F_2.
        recall_score = recall(y_valid, y_pred)
        f_two_score = fbeta_score(y_valid, y_pred, beta=2)

        print(f"Fold {fold_id}:\tRecall: {round(recall_score, 4)}\t\tF_2: {round(f_two_score, 4)}")

        models.append(fitted_model)
        folds.append(X_train)

    return models, folds

# Pipeline

In [3]:
# Load the mushroom dataset as integer-encoded features and target
features, target = get_data('data/mushrooms.csv')

In [4]:
# Preview the first rows of the encoded feature table
features.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1


In [5]:
# We take advantage of the fact that all variables are categorical
# and pick a Bayesian model for this task.
# alpha is CategoricalNB's additive smoothing parameter; 1e-9 keeps
# the smoothing negligible.
model = CategoricalNB(alpha=1e-9)

In [6]:
# As the output below shows, this model
# performs exceptionally well, apart from
# isolated cases
models, folds = run_pipeline(model, features, target)

Fold 0:	Recall: 1.0		F_2: 0.9987
Fold 1:	Recall: 1.0		F_2: 0.9982
Fold 2:	Recall: 1.0		F_2: 0.9982
Fold 3:	Recall: 0.9974		F_2: 0.9972
Fold 4:	Recall: 0.9987		F_2: 0.9985
