_1671780 - Pau Domínguez Ruiz_

_1671197 - Gerard Souto Eslava_

# Cas Kaggle | Mushroom Classification


## Imports i càrrega del dataset

In [15]:
import random
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches

from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, StandardScaler, 
                                    OneHotEncoder)

from sklearn.linear_model import (LinearRegression, Lasso, Ridge, LogisticRegression)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, BaggingClassifier, 
                               StackingClassifier)

from sklearn.model_selection import (train_test_split, GridSearchCV, 
                                     cross_val_score, learning_curve)

from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, ConfusionMatrixDisplay, 
                             roc_curve, auc, precision_recall_curve, classification_report)

## Funcions d'utilitat

In [16]:
def load_dataset(path, split='Train'):
    """Read a CSV file and split it into a feature matrix and a label vector.

    The file must have a header row and a column named ``label``; every other
    column is treated as a feature. A one-line summary (sample count, feature
    count and the distinct label values) is printed as a side effect.

    Args:
        path: Path or file-like object accepted by ``pandas.read_csv``.
        split: Name of the split, used only in the printed summary.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the features and the labels.
    """
    frame = pd.read_csv(path, header=0, delimiter=',')

    # Separate the target column from the remaining feature columns.
    targets = frame["label"].to_numpy()
    features = frame.drop(columns="label").to_numpy()

    # Report what was just loaded.
    categories = np.unique(targets).tolist()
    print(f'>>> El conjunt de {split} consta de {features.shape[0]} mostres amb {features.shape[1]} característiques, distribuides en les següents categories: {categories}')

    return features, targets

def show_performance(x, y, model, average='binary', plot_title='confusion matrix'):
    """Print performance metrics for a model on one split and plot its confusion matrix.

    Reported metrics: accuracy, precision, recall (sensitivity / true positive
    rate) and F1.

    Args:
        x: Samples passed to ``model.predict``.
        y: Ground-truth labels for ``x``.
        model: Fitted estimator exposing a ``predict`` method.
        average: Averaging strategy forwarded to precision, recall and F1.
        plot_title: Title shown above the confusion-matrix plot.
    """
    predictions = model.predict(x)
    acc = accuracy_score(y, predictions)
    prec = precision_score(y, predictions, average=average)
    rec = recall_score(y, predictions, average=average)
    # F1 must use the same averaging as precision and recall; without it the
    # default 'binary' is applied regardless of what the caller requested.
    f1 = f1_score(y, predictions, average=average)
    conf_mat = confusion_matrix(y, predictions)

    print('-' * 25)
    print('>>> Rendiment del model:')
    print(f'- Accuracy: {round(acc, 3)}')
    print(f'- Precision: {round(prec, 3)}')
    print(f'- Recall (Sensitivity, TPR): {round(rec, 3)}')
    print(f'- f1: {round(f1, 3)}')
    print('-' * 25)

    disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat)
    disp.plot()
    plt.title(plot_title)
    plt.show()
    
def plot_learning_curve(train_sizes, train_scores, val_scores, scoring, ylim=None):
    """Plot training and validation learning curves.

    For each training-set size the mean score (taken along axis 1) is drawn as
    a line, surrounded by a shaded band of one standard deviation.

    Args:
        train_sizes: Training-set sizes used on the x axis.
        train_scores: 2-D array of training scores, one row per size
            (presumably the output of sklearn's ``learning_curve``).
        val_scores: 2-D array of validation scores, same layout.
        scoring: Metric name, used only for the y-axis label.
        ylim: Optional ``(low, high)`` pair limiting the y axis.
    """
    # (scores, colour, legend label) for each curve, in drawing order.
    curves = (
        (train_scores, 'r', 'Training score'),
        (val_scores, 'g', 'Validation score'),
    )
    summaries = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in curves
    ]

    plt.figure()
    plt.title('Corbes d\'Aprenentatge')
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Nombre d\'elements en el Training set')
    plt.ylabel(f'{scoring} Score')
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _ in summaries:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in summaries:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc='best')
    plt.show()

def random_infer_and_visualize_on_set(model, x, y):
    """Show a 10x10 grid of randomly chosen samples framed by prediction outcome.

    The model predicts the whole split once; 100 samples are then drawn at
    random (with replacement), each reshaped to a 28x28 grayscale image and
    framed in green when the prediction matches the label, red otherwise.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        x: Array of samples; each row must reshape to 28x28.
        y: Ground-truth labels aligned with ``x``.
    """
    side = 10
    predicted = model.predict(x)
    plt.ion()
    plt.figure(figsize=(10, 7))

    last_index = x.shape[0] - 1
    for cell in range(1, side * side + 1):
        plt.subplot(side, side, cell)
        pick = random.randint(0, last_index)
        plt.imshow(x[pick].reshape((28, 28)), cmap='gray')
        # Frame colour encodes whether the prediction was correct.
        frame_colour = 'red' if predicted[pick] != y[pick] else 'green'
        plt.gca().add_patch(patches.Rectangle((0, 0), 27, 27, lw=6, color=frame_colour, ls='-', fill=False))
        plt.axis('off')

    plt.subplots_adjust(wspace=0, hspace=0.1)

# Funció per mostrar la corba de Precisió-Recall
def plot_precision_recall_curve(y_true, y_probs, plot_title='Precision-Recall Curve'):
    """Plot the precision-recall curve for a set of scores.

    Args:
        y_true: Ground-truth labels.
        y_probs: Scores or probabilities passed to ``precision_recall_curve``.
        plot_title: Title shown above the plot.
    """
    precision_values, recall_values, _thresholds = precision_recall_curve(y_true, y_probs)

    # Recall goes on the x axis, precision on the y axis.
    plt.plot(recall_values, precision_values, marker='.', color='b')
    plt.title(plot_title)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()

def plot_roc_curve(y_true, y_probs, plot_title='ROC Curve'):
    """Plot the ROC curve with its AUC in the legend.

    Args:
        y_true: Ground-truth labels.
        y_probs: Scores or probabilities passed to ``roc_curve``.
        plot_title: Title shown above the plot.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_probs)
    area = auc(false_pos_rate, true_pos_rate)
    auc_label = f'AUC = {area:.2f}'

    plt.plot(false_pos_rate, true_pos_rate, marker='.', color='r', label=auc_label)
    # Dashed diagonal: the no-discrimination reference line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title(plot_title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()

In [17]:
# Load the mushroom CSV into a DataFrame (path is relative to the working directory).
data = pd.read_csv('Data/mushrooms.csv')

In [18]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()
# Class balance: number of samples per value of the 'class' column.
print(data['class'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  