In [2]:
import itertools
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import yaml
import gdown
import os
import pprint
import zipfile
from sklearn.metrics import make_scorer 
from typing import List, Text
import matplotlib.colors
# https://drive.google.com/file/d/1v5PlKhhafsmWEvRUBj4Zpo2kkNDDR4HY/view?usp=sharing

In [4]:
%cd ..

c:\Users\Paulo\Documents\Projects\poisonous_mushroom


In [4]:
# Load the pipeline parameters from params.yaml (resolved against the
# current working directory) into a plain dict.
with open('params.yaml') as conf_file:
    config = yaml.safe_load(conf_file)

# Pretty-print the parsed parameters so the reader can see which values
# the following steps will use.
pprint.pprint(config)

{'base': {'n_jobs': -1, 'random_state': 42},
 'data_decompress': {'processed_path': 'data/processed'},
 'data_load': {'compact_name': 'mushroom.zip',
               'file_id': '1v5PlKhhafsmWEvRUBj4Zpo2kkNDDR4HY',
               'raw_path': 'data/raw'},
 'evaluate': None,
 'metadata': {'id_col': 'id', 'target_col': 'class'},
 'preprocessing': {'original_test': 'data/processed/test.csv',
                   'original_train': 'data/processed/train.csv',
                   'test_size': 0.2,
                   'testset_path': 'data/interim/test.csv',
                   'trainset_path': 'data/interim/train.csv',
                   'valset_path': 'data/interim/val.csv'},
 'train': {'model_path': 'models/model.joblib',
           'random_forest': {'max_depth': 5, 'n_estimators': 20}}}


In [5]:
# raw_data_path = config['data_load']['raw_path']
# filename = config['data_load']['compact_name']
# file_id = config['data_load']['file_id']


# def download_compact_file(config_path: str) -> None:
#     with open(config_path) as conf_file:
#         config = yaml.safe_load(conf_file)
#     raw_data_path = config['data_load']['raw_path']
#     filename = config['data_load']['compact_name']
#     file_id = config['data_load']['file_id']
#     download_url = f'https://drive.google.com/uc?id={file_id}'
#     output = os.path.join(raw_data_path, filename)
#     print(f'Downloading to {output}')
#     gdown.download(download_url, output, quiet=False)



# download_compact_file('params.yaml')

# Importing and preparing data

In [6]:
def extract_datasets(config_path: str) -> None:
    """Download the compressed dataset from Google Drive and unpack it.

    The YAML file at ``config_path`` must provide:

    * ``data_load.raw_path``      - folder the archive is downloaded into
    * ``data_load.compact_name``  - file name of the archive
    * ``data_load.file_id``       - Google Drive file id of the archive
    * ``data_decompress.processed_path`` - folder the archive is extracted to

    Both folders are created if they do not exist yet.

    Args:
        config_path: Path to the YAML parameters file.

    Raises:
        RuntimeError: If ``gdown.download`` returns ``None`` instead of the
            path of the downloaded file.
    """
    with open(config_path) as conf_file:
        config = yaml.safe_load(conf_file)

    load_cfg = config['data_load']
    raw_data_path = load_cfg['raw_path']
    processed_path = config['data_decompress']['processed_path']

    download_url = f"https://drive.google.com/uc?id={load_cfg['file_id']}"
    archive_path = os.path.join(raw_data_path, load_cfg['compact_name'])

    # Make sure the download folder exists so the step also works on a
    # fresh checkout where data/ has not been created yet.
    os.makedirs(raw_data_path, exist_ok=True)

    print(f'Downloading to {archive_path}')
    downloaded = gdown.download(download_url, archive_path, quiet=False)
    # NOTE(review): some gdown versions signal a failed download by returning
    # None rather than raising - fail here instead of on a missing zip below.
    if downloaded is None:
        raise RuntimeError(f'Download of {download_url} failed')

    print('Extracting datasets')
    os.makedirs(processed_path, exist_ok=True)
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(processed_path)


# Run the download-and-extract step using the project's parameter file.
extract_datasets('params.yaml')

Downloading to data/raw\mushroom.zip


Downloading...
From (original): https://drive.google.com/uc?id=1v5PlKhhafsmWEvRUBj4Zpo2kkNDDR4HY
From (redirected): https://drive.google.com/uc?id=1v5PlKhhafsmWEvRUBj4Zpo2kkNDDR4HY&confirm=t&uuid=a83d7b9c-2717-4d7e-8707-c7832bdc3c9d
To: c:\Users\Paulo\Documents\Projects\poisonous_mushroom\data\raw\mushroom.zip
100%|██████████| 86.3M/86.3M [00:16<00:00, 5.14MB/s]


Extracting datasets


# Preprocessing

In [27]:
def split_and_preprocess_data(config: Text) -> None:
    """Split the original training data and write model-ready CSV files.

    Steps performed:

    1. Read the original train/test CSVs (indexed by ``metadata.id_col``) and
       drop the ``stem-height``, ``stem-width`` and ``cap-diameter`` columns.
    2. Encode the target column as ``'e' -> 0.0`` and ``'p' -> 1.0``.
    3. Split the training data into train/validation parts using
       ``preprocessing.test_size`` and ``base.random_state``.
    4. Fill missing values with the most frequent value per column and
       ordinal-encode every feature. Both transformers are fitted on the
       training split only; categories not seen during fitting are encoded
       as ``-1``.
    5. Write the train, validation and test sets to the paths given by
       ``preprocessing.trainset_path``, ``valset_path`` and ``testset_path``,
       creating their parent folders when needed.

    Args:
        config: Path to the YAML parameters file.

    Raises:
        ValueError: If the target column contains a label other than
            ``'e'`` or ``'p'`` (missing values are left untouched).
    """
    with open(config) as conf_file:
        params = yaml.safe_load(conf_file)

    prep_cfg = params['preprocessing']
    id_column = params['metadata']['id_col']
    target_column = params['metadata']['target_col']
    dropped_columns = ['stem-height', 'stem-width', 'cap-diameter']

    train_data = pd.read_csv(
        prep_cfg['original_train'], index_col=id_column
    ).drop(columns=dropped_columns)
    test_data = pd.read_csv(
        prep_cfg['original_test'], index_col=id_column
    ).drop(columns=dropped_columns)

    # Encode the labels explicitly and refuse unknown ones: a silent
    # pass-through would leave strings mixed into a numeric target.
    label_codes = {'e': 0.0, 'p': 1.0}
    unexpected = set(train_data[target_column].dropna().unique()) - set(label_codes)
    if unexpected:
        raise ValueError(
            f'Unexpected labels in column {target_column!r}: {sorted(unexpected)}'
        )
    target = train_data[target_column].map(label_codes)

    x_train, x_val, y_train, y_val = train_test_split(
        train_data.drop(columns=[target_column]), target,
        test_size=prep_cfg['test_size'],
        random_state=params['base']['random_state']
    )

    # The transformers below return bare arrays, so remember the labels
    # needed to rebuild the data frames afterwards.
    columns = x_train.columns.to_list()
    idx_train = x_train.index
    idx_val = x_val.index
    idx_test = test_data.index

    # Fit on the training split only; validation and test are transformed
    # with the statistics learned from it.
    imputer = SimpleImputer(strategy='most_frequent')
    x_train = imputer.fit_transform(x_train)
    x_val = imputer.transform(x_val)
    x_test = imputer.transform(test_data)

    ordinal_encoding = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    x_train = ordinal_encoding.fit_transform(x_train)
    x_val = ordinal_encoding.transform(x_val)
    x_test = ordinal_encoding.transform(x_test)

    x_train = pd.DataFrame(x_train, columns=columns, index=idx_train)
    x_val = pd.DataFrame(x_val, columns=columns, index=idx_val)
    x_test = pd.DataFrame(x_test, columns=columns, index=idx_test)

    outputs = (
        (pd.concat([x_train, y_train], axis=1), prep_cfg['trainset_path']),
        (pd.concat([x_val, y_val], axis=1), prep_cfg['valset_path']),
        (x_test, prep_cfg['testset_path']),
    )
    for frame, out_path in outputs:
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # The output folder may not exist on a fresh checkout.
            os.makedirs(out_dir, exist_ok=True)
        frame.to_csv(out_path)

# Run the split/preprocessing step using the project's parameter file.
split_and_preprocess_data('params.yaml')



        cap-shape cap-surface cap-color does-bruise-or-bleed gill-attachment  \
id                                                                             
1252551         s           w         n                    t               d   
1799166         b           g         o                    f             NaN   
1936146         x           i         o                    f               e   
1464811         f         NaN         y                    f               s   
767639          f         NaN         n                    f               d   

        gill-spacing gill-color stem-root stem-surface stem-color veil-type  \
id                                                                            
1252551            c          n       NaN          NaN          n       NaN   
1799166            c          n       NaN          NaN          n       NaN   
1936146          NaN          y       NaN          NaN          k       NaN   
1464811            d          y       NaN   

# Train

In [29]:
import joblib

def train_model(config: Text):
    """Fit a random forest on the preprocessed training set and save it.

    Reads the training CSV from ``preprocessing.trainset_path`` (indexed by
    ``metadata.id_col``), fits a ``RandomForestClassifier`` configured from
    ``train.random_forest`` and ``base``, and dumps the fitted model with
    joblib to ``train.model_path``. The folder of ``train.model_path`` is
    created if it does not exist.

    Args:
        config: Path to the YAML parameters file.
    """
    with open(config) as conf_file:
        params = yaml.safe_load(conf_file)

    metadata = params['metadata']
    # Take the target column from the config, like the other pipeline steps;
    # fall back to the previously hard-coded name when the key is absent.
    target_column = metadata.get('target_col', 'class')

    train_data = pd.read_csv(params['preprocessing']['trainset_path'],
                             index_col=metadata['id_col'])
    x_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]

    forest_cfg = params['train']['random_forest']
    rfc = RandomForestClassifier(
        n_estimators=forest_cfg['n_estimators'],
        max_depth=forest_cfg['max_depth'],
        random_state=params['base']['random_state'],
        n_jobs=params['base']['n_jobs']
    )
    rfc.fit(x_train, y_train)

    model_path = params['train']['model_path']
    model_dir = os.path.dirname(model_path)
    if model_dir:
        # The models folder may not exist on a fresh checkout.
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(rfc, model_path)

# Run the training step using the project's parameter file.
train_model('params.yaml')

# Evaluating

In [5]:
def _f1_for_split(model, csv_path: Text, id_column: Text, target_column: Text) -> float:
    """Return the F1 score of ``model`` on the labelled CSV at ``csv_path``."""
    split = pd.read_csv(csv_path, index_col=id_column)
    features = split.drop(target_column, axis=1)
    labels = split[target_column].values
    return float(f1_score(labels, model.predict(features)))


def evaluate(config: Text) -> None:
    """Score the saved model on the train and validation sets.

    Loads the model from ``train.model_path``, computes the F1 score on the
    CSVs at ``preprocessing.trainset_path`` and ``preprocessing.valset_path``
    and writes ``{"train": {"f1": ...}, "val": {"f1": ...}}`` as JSON.

    The report location comes from ``evaluate.reports_dir`` and
    ``evaluate.metrics_file``. When the ``evaluate`` section is missing or
    empty (it parses to ``None`` when the key has no children), the report is
    written to ``reports/metrics.json``. The reports folder is created if it
    does not exist.

    Args:
        config: Path to the YAML parameters file.
    """
    with open(config) as conf_file:
        params = yaml.safe_load(conf_file)

    id_column = params['metadata']['id_col']
    target_column = params['metadata']['target_col']
    prep_cfg = params['preprocessing']

    # NOTE: joblib.load unpickles the file - only load models produced by
    # this project's own training step.
    model = joblib.load(params['train']['model_path'])

    report = {
        'train': {
            'f1': _f1_for_split(model, prep_cfg['trainset_path'], id_column, target_column),
        },
        'val': {
            'f1': _f1_for_split(model, prep_cfg['valset_path'], id_column, target_column),
        },
    }

    # An empty `evaluate:` section parses to None; indexing it would raise
    # TypeError, so fall back to default report locations instead.
    evaluate_cfg = params.get('evaluate') or {}
    reports_dir = evaluate_cfg.get('reports_dir', 'reports')
    metrics_file = evaluate_cfg.get('metrics_file', 'metrics.json')

    os.makedirs(reports_dir, exist_ok=True)
    report_path = os.path.join(reports_dir, metrics_file)
    with open(report_path, 'w') as file:
        json.dump(report, file)


# Run the evaluation step using the project's parameter file.
evaluate('params.yaml')

In [None]:
def plot_confusion_matrix(cm: np.ndarray,
                          target_names: List[Text],
                          title: Text = 'Confusion matrix',
                          cmap: matplotlib.colors.LinearSegmentedColormap = None,
                          normalize: bool = True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (rows are true labels, columns are predicted labels)

    target_names: the class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep the default
                  numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues; defaults to 'Blues'

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions of each true-label row
                  (a row without any samples is shown as zeros)

    Returns
    -------
    The matplotlib figure containing the plot.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    cm = np.asarray(cm)

    # Accuracy is always derived from the raw counts, never from proportions.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where a row has samples; empty rows stay at zero
        # instead of turning into NaN.
        shown = np.divide(cm, row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0)
        cell_format = "{:0.4f}"
        thresh = shown.max() / 1.5
    else:
        shown = cm
        cell_format = "{:,}"
        thresh = shown.max() / 2

    fig, ax = plt.subplots(figsize=(8, 6))
    # Draw the same matrix that is printed in the cells, so the colours, the
    # colour bar and the white/black text threshold all agree.
    image = ax.imshow(shown, interpolation='nearest', cmap=cmap)
    ax.set_title(title)
    fig.colorbar(image, ax=ax)

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(target_names, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(target_names)

    for i, j in itertools.product(range(shown.shape[0]), range(shown.shape[1])):
        # White text on dark cells, black text on light cells.
        ax.text(j, i, cell_format.format(shown[i, j]),
                horizontalalignment="center",
                color="white" if shown[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the labels exist so they are accounted for.
    fig.tight_layout()

    return fig