In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# NOTE: the cell moves one directory up each time it is executed, so run it only once per kernel.
import os
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as err:
    # Best effort: keep working from the current directory, but report the
    # problem instead of hiding every possible error behind a bare except.
    print(f"Could not change the working directory: {err}")


# Safe to eat or deadly poisonous?
## An analysis on mushroom classification by Lorenzo Santolini

### Code snippet for google colab

In [0]:
# Little code snippet to import on Google Colab the dataset
'''
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

# Insert here your kaggle API key
from google.colab import files
files.upload()

!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d uciml/mushroom-classification
!unzip mushroom-classification.zip
!ls
'''


In [0]:
# Constants shared by the whole notebook

# --- Plot appearance -------------------------------------------------------
PLOTLY_COLORS = ['#140DFF', '#FF0DE2']
COLOR_PALETTE = [
    '#140DFF',
    '#FF0DE2',
    '#CAFFD0',
    '#C9E4E7',
    '#B4A0E5',
    '#904C77',
]
# [position in 0..1, colour] pairs, ten evenly spaced stops
COLORSCALE_HEATMAP = [
    [0.0, 'rgb(70,0,252)'],
    [0.1111111111111111, 'rgb(78,0,252)'],
    [0.2222222222222222, 'rgb(90,0,252)'],
    [0.3333333333333333, 'rgb(110,0,248)'],
    [0.4444444444444444, 'rgb(130,0,238)'],
    [0.5555555555555556, 'rgb(145,0,228)'],
    [0.6666666666666666, 'rgb(166,0,218)'],
    [0.7777777777777778, 'rgb(187,0,213)'],
    [0.8888888888888888, 'rgb(200,0,202)'],
    [1.0, 'rgb(210,0,191)'],
]
PLOTLY_OPACITY = 0.7

# --- Reproducibility -------------------------------------------------------
RANDOM_SEED = 11

# --- Hyper-parameter grids --------------------------------------------------
# Every key carries the 'clf__' prefix, i.e. it addresses the pipeline step named 'clf'.
LOGISTIC_REGRESSION_PARAMS = dict(
    clf__solver=['liblinear'],             # best for small datasets
    clf__C=[0.01, 0.1, 1, 10, 100],        # smaller value, stronger regularization, like svm
    clf__penalty=['l2', 'l1'],
)

# One grid per kernel
SVM_PARAMS = [
    dict(
        clf__kernel=['linear'],
        clf__C=[0.1, 1, 10, 100],
    ),
    dict(
        clf__kernel=['rbf'],
        clf__C=[0.01, 0.1, 1, 10, 100],
        clf__gamma=[0.01, 0.1, 1, 10, 100],
    ),
]

RANDOM_FOREST_PARAMS = dict(
    clf__max_depth=[25, 50, 75],
    clf__max_features=["sqrt", "log2"],    # sqrt is the same as auto
    clf__criterion=['gini', 'entropy'],
    clf__n_estimators=[100, 300, 500, 1000],
)

KNN_PARAMS = dict(
    clf__n_neighbors=[5, 15, 25, 35, 45, 55, 65],
    clf__weights=['uniform', 'distance'],
    clf__p=[1, 2, 10],
)



# Introduction

In [0]:
# Import all the libraries

import os
import time
from functools import wraps

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import plotly
import plotly.plotly as py
from plotly.plotly import plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff

from scipy.cluster import hierarchy as hc
import scipy.spatial as scs

from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE

from prettytable import PrettyTable

# Plotly credentials are read from the environment so that the API key is never
# stored in the notebook. Export PLOTLY_API_KEY (and optionally PLOTLY_USERNAME)
# before starting the kernel. The key that used to be written here in clear text
# has been exposed and should be revoked.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'modusV'),
    api_key=os.environ['PLOTLY_API_KEY'],
)


In [0]:

# Wrapper to calculate functions speed

def watcher(func):
    """
    Timing decorator.

    Wraps ``func`` so that every call prints the elapsed time (measured with
    ``time.perf_counter``) and then hands back whatever ``func`` returned.
    The metadata of ``func`` (name, docstring, ...) is copied onto the wrapper.
    """
    def timed(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        finished = time.perf_counter()
        print(f" ===> took {finished-started} seconds")
        return outcome
    return wraps(func)(timed)


In [0]:

# Define classes 

class Dataset:
    """
    Small container pairing a pandas DataFrame with a seed and a name.

    Attributes:
        dataset: the wrapped DataFrame
        seed: seed value given at construction
        name: name given at construction
        n_classes: number of distinct values in the 'class' column
            (only set after ``count_classes`` has been called)
    """

    def __init__(self, data, seed, name):
        self.dataset = data
        self.seed = seed
        self.name = name

    # The methods below were declared without ``self`` and therefore could not
    # be called on an instance; they are now regular instance methods.

    def set_name(self, name):
        """Replace the stored name."""
        self.name = name

    def get_name(self):
        """Return the stored name."""
        return self.name

    def import_data(self, path):
        """Replace the wrapped DataFrame with the CSV read from ``path``."""
        self.dataset = pd.read_csv(path)

    def count_classes(self):
        """Store the number of distinct 'class' values in ``n_classes`` and print them."""
        classes = self.dataset['class'].unique()
        self.n_classes = classes.size
        print(f"There are {self.n_classes} different classes:"
              f"\n {classes.tolist()}")



class Classifier:
    """Plain record that keeps its five constructor arguments as attributes."""

    def __init__(self, classifier, params, dataset, seed, name):
        # Every argument is stored under an attribute of the same name.
        (self.classifier,
         self.params,
         self.dataset,
         self.seed,
         self.name) = classifier, params, dataset, seed, name



# Dataset load and overall view

In [0]:
# Load the dataset
dataset = pd.read_csv("./Input/mushrooms.csv")
# dataset = pd.read_csv("./mushrooms.csv")


In [0]:
# Shape of the dataset
print("The dataset has %d rows and %d columns." % dataset.shape)


In [0]:
# Count number of classes for classification
print(f"There are {dataset['class'].unique().size} different classes:"
      f"\n {dataset['class'].unique().tolist()}")

# Count number of unique data for every column
print(f"Unique values for every field: \n{dataset.nunique()}")


# Preprocessing

## 1 - Check data types

In [0]:
# See data types (the column dtypes, not the first rows)
print(f"Data types: \n{dataset.dtypes}")


## 2 - Remove any not significant column

In [0]:
n_columns_original = len(dataset.columns)
# A column holding a single distinct value cannot help to tell the classes apart.
to_drop = [col for col in dataset.columns if dataset[col].nunique() == 1]
dataset.drop(to_drop, axis=1, inplace=True)

# Only announce removed columns when there are some (e.g. not when the cell is re-run).
if to_drop:
    print(" ".join(str(d) for d in to_drop) + " have been removed because zero variance")
print(f"{n_columns_original - len(dataset.columns)} not significant columns have been removed")


## 3 - Handling missing values

In [0]:
# Check if any field is null
if dataset.isnull().any().any():
    print("There are some null values")
else:
    print("There are no null values")

In [0]:
print("There are " + str((dataset['stalk-root'] == "?").sum()) + " missing values in stalk-root column")
# df_drop = dataset[dataset['stalk-root'] != "?"]


## 4 - Encode string values

In [0]:
def encode_values(dataset):
    """
    Label-encode every column of ``dataset``.

    :param (DataFrame) dataset: Frame whose columns are all encoded
    :return: tuple ``(encoded, encoder, mapping)``: ``encoded`` is a copy of
        ``dataset`` with each column replaced by its integer codes, ``encoder``
        is the single LabelEncoder that is re-fitted on each column in turn (so
        it ends up fitted on the last column), ``mapping`` maps each column name
        to a ``{original value: code}`` dictionary
    """
    encoder = LabelEncoder()
    encoded = dataset.copy()
    mapping = {}
    for name in dataset.columns:
        # fit_transform is fit followed by transform on the same column
        encoded[name] = encoder.fit_transform(dataset[name])
        known = encoder.classes_
        mapping[name] = dict(zip(known, encoder.transform(known)))

    return encoded, encoder, mapping

def print_encoding(mapping):
    """
    Print a table with one row per column of the encoded data.

    Each row starts with the column name followed by its original values, in
    the iteration order of the column's dictionary; shorter rows are padded
    with '-' up to the length of the longest one.

    :param (dictionary) mapping: ``{column name: {original value: code}}``
    """
    table = PrettyTable()
    rows = [[column] + list(values) for column, values in mapping.items()]
    width = max((len(row) for row in rows), default=0)

    for row in rows:
        table.add_row(row + ['-'] * (width - len(row)))
    table.field_names = ["Columns / Values"] + list(range(0, width - 1))
    print(table)



le = 0
pre_data, l_encoder, le_mapping = encode_values(dataset)

# Check mapping
print_encoding(le_mapping)

# Check new data
pre_data.head(5)


## 5/6 - Check class and data distribution

### 5 - Check classes distribution

In [0]:
y = dataset["class"].value_counts()
print(y)
# Name every count after its own index entry instead of relying on the order in
# which value_counts() happens to return the classes.
# Assumes the raw labels are 'e' / 'p' (UCI mushroom data) - any other label is shown unchanged.
class_dict = [{"e": "edible", "p": "poisonous"}.get(label, label) for label in y.index]


In [0]:
# Get insights on the dataset
pre_data.describe()


In [0]:
# One bar per class, coloured with the two notebook colours
data = [
    go.Bar(
        x=class_dict,
        y=y,
        marker={'color': PLOTLY_COLORS},
        opacity=PLOTLY_OPACITY,
    )
]

layout = go.Layout(
    title="Class distribution",
    autosize=True,
    yaxis={'title': 'N. samples'},
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='distribution-bar')


### 6 - Box plot

In [0]:

def create_box(type, data, col, visible=False):
    """
    Build the box trace of one feature for one mushroom class.

    :param (str) type: Trace name; "edible" gets the first PLOTLY_COLORS entry, anything else the second
    :param data: Object indexed with ``col`` to obtain the y values
    :param col: Key of the feature to plot
    :param (bool) visible: Whether the trace is visible
    :return: ``go.Box`` trace
    """
    colour = PLOTLY_COLORS[0] if type == "edible" else PLOTLY_COLORS[1]
    return go.Box(
        y=data[col],
        name=type,
        marker={'color': colour},
        visible=visible,
        opacity=PLOTLY_OPACITY,
    )

# Split the encoded data by class code
edible = pre_data[pre_data["class"] == 0]
poisonous = pre_data[pre_data["class"] == 1]
# Box plots are drawn only for features with more than five distinct raw values
box_features = [col for col in pre_data.columns
                if col != 'class' and dataset[col].nunique() > 5]

# Only the traces of the feature at active_index start visible
active_index = 0
box_edible = [create_box("edible", edible, col, i == active_index)
              for i, col in enumerate(box_features)]
box_poisonous = [create_box("poisonous", poisonous, col, i == active_index)
                 for i, col in enumerate(box_features)]

# Trace layout: all edible traces first, then all poisonous traces
data = box_edible + box_poisonous
n_features = len(box_features)
steps = []

for i, feature in enumerate(box_features):
    # Slider step i shows the edible and the poisonous trace of feature i only
    visibility = [False] * len(data)
    visibility[i] = True
    visibility[i + n_features] = True
    steps.append({
        'method': 'restyle',
        'args': ['visible', visibility],
        'label': feature,
    })

sliders = [{
    'active': active_index,
    'currentvalue': {'prefix': "Feature: ", 'xanchor': 'center'},
    'pad': {"t": 50},
    'steps': steps,
    'len': 1,
}]

layout = {
    'sliders': sliders,
    'autosize': True,
    'yaxis': {'title': 'value', 'automargin': True},
    'legend': {'x': 0, 'y': 1},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='box_slider')


### 6 - Bar graph

In [0]:

def create_bar(type, data, col, visible=False):
    """
    Build the histogram trace of one feature for one mushroom class.

    :param (str) type: Trace name; "edible" gets the first PLOTLY_COLORS entry, anything else the second
    :param data: Object indexed with ``col`` to obtain the x values
    :param col: Key of the feature to plot
    :param (bool) visible: Whether the trace is visible
    :return: ``go.Histogram`` trace
    """
    colour = PLOTLY_COLORS[0] if type == "edible" else PLOTLY_COLORS[1]
    return go.Histogram(
        x=data[col],
        name=type,
        marker={'color': colour},
        visible=visible,
        opacity=PLOTLY_OPACITY,
    )

hist_features = [col for col in pre_data.columns if (col != 'class')]

# Only the traces of the feature at active_index start visible
active_index = 0
hist_edible = [create_bar("edible", edible, col, i == active_index)
               for i, col in enumerate(hist_features)]
hist_poisonous = [create_bar("poisonous", poisonous, col, i == active_index)
                  for i, col in enumerate(hist_features)]

# Trace layout: all edible traces first, then all poisonous traces
total_data = hist_edible + hist_poisonous
n_features = len(hist_features)
steps = []

for i in range(n_features):
    step = dict(
        method = 'restyle',
        args = ['visible', [False] * len(total_data)],
        label = hist_features[i],
    )
    step['args'][1][i] = True # Show the edible trace of feature i
    step['args'][1][i + n_features] = True # Show the poisonous trace of feature i
    steps.append(step)

sliders = [dict(
    active = active_index,
    currentvalue = dict(
        prefix = "Feature: ",
        xanchor= 'center',
    ),
    pad = {"t": 50},
    steps = steps,
)]

layout = dict(
    sliders=sliders,
    autosize=True,
    # The traces are histograms: x carries the encoded feature value and y the
    # number of samples, so the axis titles are set accordingly.
    xaxis=dict(
        title='value',
        automargin=True,
    ),
    yaxis=dict(
        title='N. samples',
        automargin=True,
    ),
    legend=dict(
        x=0,
        y=1,
    ),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)

fig = dict(data=total_data, layout=layout)
py.iplot(fig, filename='bar_slider')


## 7 - Correlation matrix

In [0]:
correlation_matrix = pre_data.corr(method='pearson')

# The colour range is pinned to [-1, 1], the range of a correlation coefficient
trace = go.Heatmap(
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    z=correlation_matrix.values.tolist(),
    zmin=-1,
    zmax=1,
    colorscale=COLORSCALE_HEATMAP,
    opacity=0.95,
)

data = [trace]

layout = go.Layout(
    title='Heatmap of columns correlation',
    autosize=False,
    width=850,
    height=700,
    xaxis={'tickangle': 40},
    yaxis=go.layout.YAxis(automargin=True),
    margin=go.layout.Margin(l=0, r=200, b=200, t=80),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap4')


In [0]:
A dendrogram is a diagram representing a tree. This diagrammatic representation is frequently used 
in different contexts, but we will see the case representing hierarchical clustering. 
It illustrates the arrangement of the clusters; here its purpose is to check whether 
we have any duplicate features.
In order to reduce the dimensionality of our dataset, we can identify and remove duplicate features
according to their pairwise correlation with others.

The linkage criterion determines the distance between sets of observations as a function 
of the pairwise distances between observations.
We will use the between-group average linkage (UPGMA). Proximity between two clusters 
is the arithmetic mean of all the proximities between the objects of one, on one side, 
and the objects of the other, on the other side.
This method is frequently the default in hierarchical clustering packages.


In [0]:

# Feature names, used as leaf labels of the dendrogram
names = pre_data.columns
inverse_correlation = 1 - abs(pre_data.corr()) # This is the 'dissimilarity' method

# NOTE(review): the dissimilarity matrix is passed as the data argument of
# create_dendrogram and no distfun is given - confirm whether it is meant to be
# treated as observations or as precomputed pairwise distances.
fig = ff.create_dendrogram(inverse_correlation.values, 
                           labels=names, 
                           colorscale=COLOR_PALETTE, 
                           linkagefun=lambda x: hc.linkage(x, 'average'))  # average linkage

fig['layout'].update(dict(
    title="Dendrogram of correlation among features",
    width=800, 
    height=600,
    xaxis=dict(
        title='Features',
    ),
    yaxis=dict(
        title='Distance',
        
    ),
))
iplot(fig, filename='dendrogram_corr_clustering')



## 8/9 - Scale and divide data

In [0]:

def dataframe_to_array(data):
    """
    Separate the 'class' column from the remaining columns.

    :param (DataFrame) data: Frame containing a 'class' column
    :return: tuple ``(X_data, y_data)`` - the frame without 'class', and the 'class' column
    """
    return data.drop(['class'], axis=1), data['class']

def scale_data(X_data):
    """
    Standardise ``X_data`` with a StandardScaler fitted on ``X_data`` itself
    (centering and scaling both enabled).

    :param (array-like) X_data: Data to scale
    :return: Output of ``StandardScaler.fit_transform``
    """
    return StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(X_data)


In [0]:

# Variant of the encoded data without the rows whose stalk-root is the "?" placeholder
drop_data = pre_data[pre_data['stalk-root'] != le_mapping['stalk-root']['?']]

# Full data: split features / target, then standardise the features
X_pre_data, y_data = dataframe_to_array(pre_data)
X_scaled_data = scale_data(X_pre_data)

# Same two steps for the variant with dropped rows.
# Each scaler is fitted on the whole variant, i.e. before any train/test split.
X_drop_data, y_drop_data = dataframe_to_array(drop_data)
X_scaled_drop_data = scale_data(X_drop_data)


# Principal component analysis

In [0]:

# PCA keeping all components; projected_data is the scaled data expressed in component space
pca = PCA(random_state=RANDOM_SEED)
projected_data = pca.fit_transform(X_scaled_data)

# Explained variance of each component as a percentage of the total, largest first,
# and its running sum
tot_var = np.sum(pca.explained_variance_)
ex_var = [(i / tot_var) * 100 for i in sorted(pca.explained_variance_, reverse=True)]
cum_ex_var = np.cumsum(ex_var)


In [0]:
# Bars: variance explained by each single component (components numbered from 1)
cum_var_bar = go.Bar(
    x=list(range(1, len(cum_ex_var) + 1)),
    y=ex_var,
    name="Variance of each component",
    marker={'color': PLOTLY_COLORS[0]},
    opacity=PLOTLY_OPACITY,
)
# Step line: cumulative explained variance
variance_line = go.Scatter(
    x=list(range(1, len(cum_ex_var) + 1)),
    y=cum_ex_var,
    mode='lines+markers',
    name="Cumulative variance",
    marker={'color': PLOTLY_COLORS[1]},
    opacity=PLOTLY_OPACITY,
    line={'shape': 'hv'},
)
data = [cum_var_bar, variance_line]
layout = go.Layout(
    title='Individual and Cumulative Explained Variance',
    autosize=True,
    yaxis={'title': 'Explained variance (%)'},
    xaxis={'title': "Principal components", 'dtick': 1},
    legend={'x': 0, 'y': 1},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='basic-bar')


In [0]:

n_comp = 9
# projected_data (output of pca.fit_transform) already holds the coordinates of
# every sample on all principal components, ordered by decreasing explained
# variance. Its first n_comp columns therefore ARE the projection on the first
# n_comp components. Multiplying them by pca.components_ again would mix feature
# space with component space, and truncating pca.components_ in place would
# leave the fitted PCA object inconsistent, so neither is done here.
reduced_data = projected_data[:, :n_comp]
X_df_reduced = pd.DataFrame(reduced_data, columns=["PC#%d" % (x + 1) for x in range(n_comp)])
X_df_reduced.head(4)


In [0]:
'''
N=pre_data.values
pca = PCA(n_components=2)
x = pca.fit_transform(N)

kmeans = KMeans(n_clusters=2, random_state=RANDOM_SEED)
X_clustered = kmeans.fit_predict(N)
print(len(np.where(X_clustered == 0)[0]))
print(len(np.where(X_clustered == 1)[0]))

ed_idx = np.where(X_clustered == 0)
po_idx = np.where(X_clustered == 1)

p1 = go.Scatter(
    x=np.take(x[:,0], indices=ed_idx)[0],
    y=np.take(x[:,1], indices=ed_idx)[0],
    mode='markers',
    name="Edible",
    marker=dict(
        color=PLOTLY_COLORS[0],
    ),
    opacity=PLOTLY_OPACITY)

p2 = go.Scatter(
    x=np.take(x[:,0], indices=po_idx)[0],
    y=np.take(x[:,1], indices=po_idx)[0],
    mode='markers',
    name="Poisonous",
    marker=dict(
        color=PLOTLY_COLORS[1],
    ),
    opacity=PLOTLY_OPACITY)
    

data = [p1, p2]

layout = go.Layout(
    title='Data clustered using first two components',
    autosize=True,
    yaxis=dict(
        title='Second component',
    ),
    xaxis=dict(
        title="First component",
        dtick=1,
    ),
    legend=dict(
        x=0,
        y=1,
    ),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='clusters-scatter')
'''

# Classification

In [0]:

# 80/20 split, same seed, for each of the three variants of the data:
# full scaled data
X_train, X_test, y_train, y_test = train_test_split(X_scaled_data, y_data, test_size=0.2, random_state=RANDOM_SEED)
# first principal components (kept as a DataFrame, so the splits have .columns)
X_train_pc, X_test_pc, y_train_pc, y_test_pc = train_test_split(X_df_reduced, y_data, test_size=0.2, random_state=RANDOM_SEED)
# scaled data without the rows that have a missing stalk-root
X_train_drop, X_test_drop, y_train_drop, y_test_drop = train_test_split(X_scaled_drop_data, y_drop_data, test_size=0.2, random_state=RANDOM_SEED)


In [0]:

def print_gridcv_scores(grid_search, n=5):
    """
    Prints the n best mean validation scores of a fitted grid search, together
    with the parameters that produced them

    :param (GridSearchCV) grid_search: Fitted grid search object
    :param (int) n: Number of best scores to show
    :raises KeyError: if grid_search has no ``best_score_`` attribute (not fitted)
    """

    if not hasattr(grid_search, 'best_score_'):
        raise KeyError('grid_search is not fitted.')

    results = grid_search.cv_results_
    best = np.argsort(results['mean_test_score'])[::-1][:n]
    means = results['mean_test_score'][best]
    stds = results['std_test_score'][best]
    candidates = [results['params'][i] for i in best]

    # With a list of grids (e.g. one per SVM kernel) the candidates do not all
    # have the same parameters, so the header is the union of their names in
    # first-seen order and a missing parameter is shown as '-'.
    columns = []
    for candidate in candidates:
        for key in candidate:
            if key not in columns:
                columns.append(key)

    t = PrettyTable()
    t.field_names = ['Score'] + columns

    print("Best grid scores on validation set:")
    for mean, std, candidate in zip(means, stds, candidates):
        row = ["%0.3f (+/-%0.03f)" % (mean, std * 2)] + [candidate.get(key, '-') for key in columns]
        t.add_row(row)
    print(t)
               
@watcher
def param_tune_grid_cv(clf, params, X_train, y_train, cv):
    """
    Runs an f1-scored grid search for ``clf`` and returns the fitted search object

    The classifier is wrapped in a one-step pipeline whose step is named 'clf',
    which is why the parameter names in ``params`` carry the 'clf__' prefix.

    :param (estimator) clf: Classifier object
    :param (dictionary) params: Parameters to be tested in grid search
    :param (array-like) X_train: Data to fit on
    :param (array-like) y_train: Targets relative to X_train
    :param (cross-validation generator) cv: Determines the cross-validation splitting strategy
    :return (GridSearchCV): The fitted grid search
    """
    search = GridSearchCV(
        estimator=Pipeline([('clf', clf)]),
        param_grid=params,
        scoring='f1',              # f1 metric for evaluation
        cv=cv,
        n_jobs=-1,                 # all processors
        return_train_score=True,
    )
    search.fit(X_train, y_train)
    return search
   

def score(clfs, datasets):
    """
    Scores each classifier on its own test data

    Classifiers and datasets are paired by position; pairing stops at the
    shorter of the two sequences.

    :param (array of estimator) clfs: Objects exposing ``score(X, y)``
    :param (array of tuple) datasets: Test data, passed like [(X_test, y_test)]
    :return (list): One ``score`` result per pair
    """
    return [clf.score(X_test, y_test) for clf, (X_test, y_test) in zip(clfs, datasets)]


def hexToRGBA(hex, alpha):

    """
    Function that returns an rgba (or rgb) string from a hex colour and an opacity value

    :param (String) hex: Six-digit hex colour such as '#140DFF'; the leading '#' is optional
    :param (float) alpha: Value between 0 and 1 indicating opacity, or None for an opaque colour
    :return (String): "rgba(r, g, b, alpha)", or "rgb(r, g, b)" when alpha is None
    :raises ValueError: if hex does not consist of exactly six hexadecimal digits

    """

    digits = hex[1:] if hex.startswith('#') else hex
    if len(digits) != 6:
        raise ValueError("Expected a colour like '#RRGGBB', got %r" % (hex,))
    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))

    # Compare with None: an alpha of 0 is a valid (fully transparent) opacity
    # and must not fall through to the opaque rgb() form.
    if alpha is not None:
        return "rgba(" + str(r) + ", " + str(g) + ", " + str(b) + ", " + str(alpha) + ")"
    return "rgb(" + str(r) + ", " + str(g) + ", " + str(b) + ")"


def _score_band_traces(sizes, mean, std, color, label):
    """Return the [lower edge, upper edge, mean line] traces of one score curve."""
    def edge(values, **extra):
        # Thin, semi-transparent border line of the +/- one standard deviation band
        return go.Scatter(
            x=sizes,
            y=values,
            showlegend=False,
            mode="lines",
            name="",
            hoverlabel=dict(namelength=20),
            line=dict(width=0.1, color=hexToRGBA(color, 0.4)),
            **extra
        )

    lower = edge(mean - std)
    # 'tonexty' fills the area between this trace and the previous one (the lower edge)
    upper = edge(mean + std, fill="tonexty")
    centre = go.Scatter(
        x=sizes,
        y=mean,
        showlegend=True,
        name=label,
        line=dict(color=color),
    )
    return [lower, upper, centre]


def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=-1, train_sizes=np.linspace(.008, 1.0, 5)):
    """
    Plot the F1 learning curve (train and test score) of an estimator.

    For both the train and the test scores the mean over the cross-validation
    folds is drawn as a line, surrounded by a band of one standard deviation.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart; also used as the plotly filename.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples)
        Target relative to X.

    cv : int, cross-validation generator or an iterable, optional (default=None)
        Determines the cross-validation splitting strategy; passed unchanged
        to ``sklearn.model_selection.learning_curve``.

    n_jobs : int or None, optional (default=-1)
        Number of jobs to run in parallel; passed unchanged to
        ``learning_curve``. ``-1`` means using all processors.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve; passed unchanged to ``learning_curve``.
        (default: np.linspace(.008, 1.0, 5))

    Returns
    -------
    The value returned by ``iplot`` for the figure.
    """

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring="f1", random_state=RANDOM_SEED)

    # Scores have one row per training size and one column per fold
    train_traces = _score_band_traces(
        train_sizes,
        np.mean(train_scores, axis=1),
        np.std(train_scores, axis=1),
        PLOTLY_COLORS[0],
        "Train score",
    )
    test_traces = _score_band_traces(
        train_sizes,
        np.mean(test_scores, axis=1),
        np.std(test_scores, axis=1),
        PLOTLY_COLORS[1],
        "Test score",
    )

    # The order matters: each upper edge must directly follow its lower edge
    data = train_traces + test_traces
    layout = go.Layout(
        title=title,
        autosize=True,
        yaxis=dict(
            title='F1 Score',
        ),
        xaxis=dict(
            title="#Training samples",
        ),
        legend=dict(
            x=0.8,
            y=0,
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    return iplot(fig, filename=title)


def print_confusion_matrix(gs, X_test, y_test):

    """
    Function that prints the confusion matrix of a classifier and returns its
    row-normalised version

    :param (estimator) gs: Fitted object exposing ``predict`` (classifier or grid search)
    :param (array-like) X_test: List of data to be tested with
    :param (array-like) y_test: List of labels for test (0 = edible, 1 = poisonous)
    :return (DataFrame): Confusion matrix divided by the number of true samples
        of each row, rounded to 3 decimals
    """

    y_pred = gs.predict(X_test)

    # labels pins the 2x2 layout even when one class is absent from the data
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    t = PrettyTable()
    t.field_names = [" ", "Predicted Edible", "Predicted Poisonous"]
    t.add_row(["True Edible", cm[0][0], cm[0][1]])
    t.add_row(["True Poisonous", cm[1][0], cm[1][1]])
    print(t)

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] # normalize the confusion matrix
    cm_df = pd.DataFrame(cm.round(3), index=["True edible", "True Poisonous"], columns=["Predicted edible", "Predicted poisonous"])
    # Returned (instead of being discarded) so that a cell ending with this call displays it
    return cm_df


def print_raw_score(clf, X_test, y_test):
    """
    Prints the value of ``clf.score`` on the given test data

    The printed label always reads "NB", whatever classifier is passed.

    :param (estimator) clf: Fitted classifier
    :param (array-like) X_test: List of data to be tested with
    :param (array-like) y_test: List of labels for test

    """
    achieved = clf.score(X_test, y_test)
    print("Score achieved by NB: %0.3f" % achieved)


def plot_feature_importance(feature_importance, title):
    """
    Function that draws a bar chart of feature importances

    :param (2-d array) feature_importance: One row per feature; column 0 is used
        for the x axis (feature) and column 1 for the bar height (importance)
    :param (str) title: Title of the plot, also used as plotly filename
    :return: The value returned by ``iplot``

    """
    bars = go.Bar(
        x=feature_importance[:, 0],
        y=feature_importance[:, 1],
        name='Feature importance',
        marker={'color': PLOTLY_COLORS[0]},
        opacity=PLOTLY_OPACITY,
    )
    layout = go.Layout(
        title=title,
        autosize=True,
        margin=go.layout.Margin(l=50, r=100, b=150),
        xaxis={'title': 'feature', 'tickangle': 30},
        yaxis={'title': 'feature importance', 'automargin': True},
    )
    return iplot(go.Figure(data=[bars], layout=layout), filename=title)


def print_performances(classifiers, classifier_names, auc_scores, X_test, y_test):

    """
    Function that builds a table of test metrics, one row per classifier

    Classifiers, names and AUC scores are paired by position. All classifiers
    are evaluated on the same X_test / y_test.

    :param (array of estimator) classifiers: Fitted classifiers exposing ``predict``
    :param (array-like) classifier_names: Row label of each classifier
    :param (array-like) auc_scores: AUC score of each classifier, copied into the table
    :param (array-like) X_test: List of data to be tested with
    :param (array-like) y_test: List of labels for test
    :return (DataFrame): Columns accuracy, precision, recall, f1 and auc; every
        value is a string formatted with three decimals

    """

    results_table = pd.DataFrame(columns=["accuracy", "precision", "recall", "f1", "auc"])
    # auc_score (not auc) so that the imported sklearn auc function is not shadowed
    for clf, name, auc_score in zip(classifiers, classifier_names, auc_scores):
        y_pred = clf.predict(X_test)
        metrics = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            auc_score,
        ]
        results_table.loc[name] = ["%.3f" % m for m in metrics]
    return results_table



In [0]:
# random_state only has an effect when the folds are shuffled; recent
# scikit-learn releases reject a random_state given with the default shuffle=False.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
clf_lr = LogisticRegression(random_state=RANDOM_SEED)


In [0]:

# Tune logistic regression on each variant of the data; every call prints its
# duration right after the label printed before it.
print("Full dataset cv:")
gs_full = param_tune_grid_cv(clf_lr, LOGISTIC_REGRESSION_PARAMS, X_train, y_train, kf)
print("\nDataset projected on first 9 pc cv:")
gs_pc = param_tune_grid_cv(clf_lr, LOGISTIC_REGRESSION_PARAMS, X_train_pc, y_train_pc, kf)
print("\nFull dataset with dropped values took:")
gs_drop = param_tune_grid_cv(clf_lr, LOGISTIC_REGRESSION_PARAMS, X_train_drop, y_train_drop, kf)
gss = [gs_full, gs_pc, gs_drop]

# Score every tuned model on the test split of the variant it was fitted on
test_results = score(gss, [(X_test, y_test), (X_test_pc, y_test_pc), (X_test_drop, y_test_drop)])


In [0]:
'''
print("Full dataset cv:")
gs_full_balanced = param_tune_grid_cv(clf_lr_balanced, LOGISTIC_REGRESSION_PARAMS, X_train, y_train, kf)
print("\nDataset projected on first 9 pc cv:")
gs_pc_balanced = param_tune_grid_cv(clf_lr_balanced, LOGISTIC_REGRESSION_PARAMS, X_train_pc, y_train_pc, kf)
print("\nFull dataset with dropped values took:")
gs_drop_balanced = param_tune_grid_cv(clf_lr_balanced, LOGISTIC_REGRESSION_PARAMS, X_train_drop, y_train_drop, kf)
gss_balanced = [gs_full_balanced, gs_pc_balanced, gs_drop_balanced]

test_results_balanced = score(gss_balanced, [(X_test, y_test), (X_test_pc, y_test_pc), (X_test_drop, y_test_drop)])
'''

In [0]:
X_train.shape


In [0]:
dataset_strings = ["full dataset", "dataset with first 9 principal components", "dataset with dropped missing values"]
method_strings = ["without any balancing"]
# One list of results per entry of method_strings. The balanced runs are
# disabled, so test_results_balanced does not exist and must not be referenced.
method_results = [test_results]

t = PrettyTable()
t.field_names = ["Score", "Dataset", "Type"]

result_row = []
for ms, results in zip(method_strings, method_results):
    for ds, res in zip(dataset_strings, results):
        result_row.append(["%.3f" % res, ds, ms])

# Best score first
result_row = sorted(result_row, key=lambda kv: kv[0], reverse=True)

for k in result_row:
    t.add_row(k)

t.title = "F1 score  dataset and method"
print(t)


In [0]:
print_gridcv_scores(gs_drop)


In [0]:
print_confusion_matrix(gs_drop, X_test_drop, y_test_drop)


In [0]:
plot_learning_curve(gs_drop.best_estimator_, "Learning Curve of Logistic Regression", 
                    np.concatenate((X_train_drop, X_test_drop)),
                    np.concatenate((y_train_drop, y_test_drop)), 
                    cv=5)


## Support vector machine

In [0]:
clf_svm = SVC(probability=True, random_state=RANDOM_SEED)
gs_pc_svm = param_tune_grid_cv(clf_svm, SVM_PARAMS, X_train_pc, y_train_pc, kf)
print_gridcv_scores(gs_pc_svm, n=5)


In [0]:
plot_learning_curve(gs_pc_svm.best_estimator_, "Learning curve of SVM", 
                    np.concatenate((X_train_pc, X_test_pc)),
                    np.concatenate((y_train_pc, y_test_pc)),
                    cv=5)


In [0]:
print_confusion_matrix(gs_pc_svm, X_test_pc, y_test_pc)

In [0]:
clf_nb = GaussianNB()
clf_nb.fit(X_train, y_train)
print_raw_score(clf_nb, X_test, y_test)
print_confusion_matrix(clf_nb, X_test, y_test)



In [0]:
plot_learning_curve(clf_nb, "Learning curve of GaussianNB", 
                    np.concatenate((X_train, X_test), axis=0), 
                    np.concatenate((y_train, y_test), axis=0), 
                    cv=5)


In [0]:

clf_pc_rf = RandomForestClassifier(random_state=RANDOM_SEED)
gs_pc_rf = param_tune_grid_cv(clf_pc_rf, RANDOM_FOREST_PARAMS, X_train_pc, y_train_pc, kf)
print_gridcv_scores(gs_pc_rf, n = 5)


In [0]:
print_confusion_matrix(gs_pc_rf, X_test_pc, y_test_pc)


In [0]:
plot_learning_curve(gs_pc_rf.best_estimator_, "Learning curve of Random Forest Classifier", 
                    np.concatenate((X_train_pc, X_test_pc)),
                    np.concatenate((y_train_pc, y_test_pc)), 
                    cv=5)


In [0]:
# (name, importance) pairs, most important first. dtype=object keeps the
# importances numeric; without it np.array would turn every entry of the mixed
# str/float pairs into a string.
feature_importance = np.array(
    sorted(zip(X_train_pc.columns,
               gs_pc_rf.best_estimator_.named_steps['clf'].feature_importances_),
           key=lambda x: x[1], reverse=True),
    dtype=object)
plot_feature_importance(feature_importance, "Feature importance in the random forest")



In [0]:
'''
print("Full dataset cv:")
gs_full = param_tune_grid_cv(clf_svm, SVM_PARAMS, X_train, y_train, kf)
print("\nDataset projected on first 9 pc cv:")
gs_pc = param_tune_grid_cv(clf_svm, SVM_PARAMS, X_train_pc, y_train_pc, kf)
print("\nFull dataset with dropped values took:")
gs_drop = param_tune_grid_cv(clf_svm, SVM_PARAMS, X_train_drop, y_train_drop, kf)
gss = [gs_full, gs_pc, gs_drop]

test_results = score(gss, [(X_test, y_test), (X_test_pc, y_test_pc), (X_test_drop, y_test_drop)])
'''

In [0]:

# KNeighborsClassifier is provided by sklearn.neighbors (see the import cell)
clf_knn = KNeighborsClassifier()
gs_knn = param_tune_grid_cv(clf_knn, KNN_PARAMS, X_train_pc, y_train_pc, kf)
print_gridcv_scores(gs_knn, n=5)

# To tune on the full feature set instead, pass X_train, y_train to param_tune_grid_cv.

In [0]:
# Evaluate on the held-out split, like the other models (not on the training data)
print_confusion_matrix(gs_knn, X_test_pc, y_test_pc)


In [0]:

# The title is also the plotly filename, so it must not reuse the random forest one
plot_learning_curve(gs_knn.best_estimator_, "Learning curve of KNN", 
                    np.concatenate((X_train_pc, X_test_pc)),
                    np.concatenate((y_train_pc, y_test_pc)), 
                    cv=5)



In [0]:

def plot_roc_curve(classifiers, legend, title, X_test, y_test):
    """Plot the micro-averaged ROC curve of several fitted binary classifiers.

    Parameters
    ----------
    classifiers : iterable
        Fitted estimators exposing ``predict_proba``.
    legend : iterable of str
        Display name of each classifier, paired positionally with ``classifiers``.
    title : str
        Figure title; also passed to ``iplot`` as ``filename``.
    X_test : array-like
        Samples handed to each classifier's ``predict_proba``.
    y_test : iterable
        True labels. Each label is tested for truthiness: truthy labels are encoded
        as ``[0, 1]`` and falsy ones as ``[1, 0]``. This assumes the second
        ``predict_proba`` column is the truthy class — TODO confirm for the
        estimators passed in.

    Returns
    -------
    tuple
        ``(aucs, plot)``: ``aucs`` lists the micro-averaged AUC of every classifier
        in input order, ``plot`` is whatever ``iplot`` returns.
    """
    from itertools import cycle

    # Diagonal reference line from (0, 0) to (1, 1).
    diagonal = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        showlegend=False,
        mode="lines",
        name="",
        line=dict(
            color=COLOR_PALETTE[0],
        ),
    )

    data = [diagonal]
    aucs = []

    # The two-column encoding of the labels does not depend on the classifier,
    # so it is built once instead of once per loop iteration.
    y_test_roc = np.array([([0, 1] if y else [1, 0]) for y in y_test])

    # Colours are cycled so classifiers beyond the palette length still get a curve
    # (a plain zip against the palette would silently drop them).
    for clf, string, c in zip(classifiers, legend, cycle(COLOR_PALETTE[1:])):
        y_score = clf.predict_proba(X_test)

        # Micro-average: flatten the encoded labels and the scores, then compute
        # a single ROC curve and its area over all (sample, class) pairs.
        fpr_micro, tpr_micro, _ = roc_curve(y_test_roc.ravel(), y_score.ravel())
        auc_micro = auc(fpr_micro, tpr_micro)
        aucs.append(auc_micro)

        trace = go.Scatter(
            x=fpr_micro,
            y=tpr_micro,
            showlegend=True,
            mode="lines",
            name=string + " (area = %0.2f)" % auc_micro,
            hoverlabel=dict(
                namelength=30
            ),
            line=dict(
                color=c,
            ),
        )
        data.append(trace)

    layout = go.Layout(
        title=title,
        autosize=False,
        width=550,
        height=550,
        yaxis=dict(
            title='True Positive Rate',
        ),
        xaxis=dict(
            title="False Positive Rate",
        ),
        legend=dict(
            x=0.4,
            y=0.06,
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    return aucs, iplot(fig, filename=title)


In [0]:

# Compare every tuned model on a single ROC plot.
# gs_pc_rf and gs_knn are fit on X_train_pc in this notebook, and the performance
# table built from this same list is scored on X_test_pc / y_test_pc, so the ROC
# curves use that split too (they previously used X_test / y_test).
# NOTE(review): gs_pc_svm is (re)assigned in a later cell of this notebook; make
# sure the intended fit has run before this cell on a fresh kernel.
# NOTE(review): gs_drop is labelled "Logistic Regression" here — verify that it
# holds the logistic-regression search and was fit on features matching X_test_pc.
classifiers = [gs_drop, gs_pc_svm, clf_nb, gs_pc_rf, gs_knn]
classifier_names = ["Logistic Regression", "SVM", "GaussianNB", "Random Forest", "KNN"]
auc_scores, roc_plot = plot_roc_curve(classifiers, classifier_names, "ROC curve", X_test_pc, y_test_pc)
roc_plot


In [0]:
# Report the models compared above together with their AUC scores, using the
# *_pc test split (print_performances is a helper defined elsewhere in the notebook).
print_performances(classifiers, classifier_names, auc_scores, X_test_pc, y_test_pc)


In [0]:
# Display the column labels of pre_data as the cell output.
pre_data.columns



In [0]:
# Alternative feature set: drop the 'odor' and 'spore-print-color' columns and every
# row whose 'stalk-root' equals the encoded value of '?', then reduce with PCA.
data_vis = pre_data.drop(['odor', 'spore-print-color'], axis=1)
data_vis = data_vis[data_vis['stalk-root'] != le_mapping['stalk-root']['?']]

# dataframe_to_array / scale_data are notebook helpers defined elsewhere.
X_data_vis, y_data_vis = dataframe_to_array(data_vis)
X_data_vis = scale_data(X_data_vis)

# Fit PCA with all components; proj_data holds the sample scores on every component.
pca = PCA(random_state=RANDOM_SEED)
proj_data = pca.fit_transform(X_data_vis)

# Percentage of variance explained per component, and its running total.
tot_var = np.sum(pca.explained_variance_)
ex_var = [(i / tot_var) * 100 for i in sorted(pca.explained_variance_, reverse=True)]
cum_ex_var = np.cumsum(ex_var)

n_comp = 9
# Truncation kept so later code reading pca.components_ sees the same matrix as before.
pca.components_ = pca.components_[:n_comp]
# proj_data is already expressed in principal-component coordinates, so the reduced
# data is simply its first n_comp columns. Multiplying it by components_.T again
# (as was done before) projects the scores a second time and does not yield the
# leading principal components.
reduced_data = np.asarray(proj_data)[:, :n_comp]
X_vis_reduced = pd.DataFrame(reduced_data, columns=["PC#%d" % (x + 1) for x in range(n_comp)])


In [0]:
# Hold out 20% of the reduced *_vis data, seeded for reproducibility.
X_train_vis, X_test_vis, y_train_vis, y_test_vis = train_test_split(X_vis_reduced, y_data_vis, test_size=0.2, random_state=RANDOM_SEED)

# SVM with probability estimates enabled, tuned over SVM_PARAMS on the *_vis split.
# NOTE(review): this rebinds clf_svm and gs_pc_svm; gs_pc_svm is also read by the
# ROC comparison cell earlier in the notebook — confirm which fit that cell expects.
clf_svm = SVC(probability=True, random_state=RANDOM_SEED)
gs_pc_svm = param_tune_grid_cv(clf_svm, SVM_PARAMS, X_train_vis, y_train_vis, kf)
print_gridcv_scores(gs_pc_svm, n=5)


In [0]:
# Learning curve for the best SVM estimator. The *_vis train and test splits are
# concatenated so the helper receives every sample; cv=5 is forwarded to it.
plot_learning_curve(gs_pc_svm.best_estimator_, "Learning curve of SVM", 
                    np.concatenate((X_train_vis, X_test_vis)),
                    np.concatenate((y_train_vis, y_test_vis)),
                    cv=5)


In [0]:
# Confusion matrix of the tuned SVM, evaluated on the *_vis test split.
print_confusion_matrix(gs_pc_svm, X_test_vis, y_test_vis)
