In [1]:

def model_learning_curve (estimator, X, y, n_splits, name):
    '''
        Draw, save and display the learning curve of a classifier.

        The estimator is scored with scikit-learn's ``learning_curve`` on 20
        evenly spaced training-set fractions going from 10% to 100% of the
        data. For every training size the scores are averaged along axis 1
        (one column per cross-validation fold) and the resulting training and
        validation curves are drawn on a logarithmic x axis.

        Inputs:
            estimator: scikit-learn classifier
            X, y: dataset
            n_splits: # of folds for k-fold cross validation (passed as ``cv``)
            name: str: name of the classifier, used in the plot title and in
                  the name of the saved figure

        Returns:
            None. The figure is written to 'LearningCurve <name>test.png' in
            the current working directory and then shown.
    '''
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve

    fractions = np.linspace(0.1, 1.0, 20)
    sizes, fold_train, fold_valid = learning_curve(
        estimator=estimator, X=X, y=y, train_sizes=fractions, cv=n_splits)

    # One entry per curve: fold-averaged scores plus the styling of its line.
    curves = (
        (np.mean(fold_train, axis=1),
         dict(color='red', marker='*', markersize=2,
              label='Mean training accuracy')),
        (np.mean(fold_valid, axis=1),
         dict(color='blue', marker='s', markersize=3,
              label='Mean validation accuracy')),
    )
    for averaged, look in curves:
        plt.plot(sizes, averaged, **look)

    # Axes decoration.
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')
    plt.xlabel('Nº of Data points')
    plt.title('Learning Curve ' + name)
    plt.ylim([0.0, 1.5])
    plt.ylabel('Accuracy')

    # Save before showing: the figure is written to disk first, then displayed.
    target = 'LearningCurve ' + name + 'test.png'
    plt.savefig(target)
    plt.show()

def model_validation (estimator, X, y, param_name, param, n_splits, name):
    '''
        Plots the validation curve for the given classifier, using a k-fold cross validation
        with n_splits over the parameter 'param' range.

        Scores returned by scikit-learn's ``validation_curve`` are averaged
        along axis 1 (one column per cross-validation fold) for every value
        in ``param`` and drawn on a logarithmic x axis.

        Inputs:
            estimator: scikit-learn classifier
            param_name, param: estimator parameter to iterate over
                               (its name and the sequence of values to try)
            X, y: dataset
            n_splits: # of folds for k-fold cross validation (passed as ``cv``)
            name: str: name of the classifier, used in the plot title and in
                  the name of the saved figure. The known names
                  'Logistic Regression', 'SVM', 'Decision Tree',
                  'Random Forest' and 'SGD' get a friendly x-axis label; any
                  other name is labelled with ``param_name``.

        Returns:
            None. The figure is written to 'ValidationCurve<name>test.png' in
            the current working directory and then shown.
    '''

    import matplotlib.pyplot as plt
    from sklearn.model_selection import validation_curve

    train_scores, test_scores = validation_curve(estimator = estimator,
                                                X = X,
                                                y = y,
                                                param_name = param_name,
                                                param_range = param,
                                                cv = n_splits)

    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    plt.plot(param, train_mean,
            color='red', marker='*',
            markersize = 2, label='Mean training accuracy')

    plt.plot(param, test_mean,
            color='blue', marker='s',
            markersize = 3, label='Mean validation accuracy')

    plt.grid()
    plt.xscale('log')
    plt.legend(loc='lower right')

    # Friendly x-axis label for the classifiers this notebook knows about.
    # A plain dict with a fallback is used so that an unlisted classifier
    # name is labelled with the raw parameter name instead of failing.
    xlabel_by_classifier = {
        'Logistic Regression': 'C',
        'SVM': 'C',
        'Decision Tree': 'Máx. Depth',
        'Random Forest': 'Nº Estimators',
        'SGD': 'Learning Rate',
    }
    plt.xlabel(xlabel_by_classifier.get(name, str(param_name)))

    plt.ylim([0.0, 1.2])
    plt.title('Validation Curve '+name)
    plt.ylabel('Accuracy')
    # Save before showing: the figure is written to disk first, then displayed.
    plt.savefig('ValidationCurve'+name+'test.png')
    plt.show()


#Loading modules 

# Preprocessing and splitting modules
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
train_test_split, KFold, GridSearchCV, StratifiedShuffleSplit)

# Learning Algorithms Modules
from sklearn.linear_model import (
LogisticRegression, SGDClassifier, SGDRegressor)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline


import pandas as pd
import numpy as np


In [2]:
# Folder path (relative to the notebook's working directory)
path =  "../data/"

# Strings that must be parsed as NaN when reading the CSV files
na_vls = ['#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
               '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan',
                'null', '...']

# Load datasets.
# `na_values` is passed exactly once per call: repeating a keyword argument
# in a call is a SyntaxError and would stop the whole cell from running.
df_products = pd.read_csv(path+"New_ProdutosVarejos.csv", na_values = na_vls)
df_clts = pd.read_csv(path+"Coletas.csv", na_values = na_vls)

NameError: name 'path' is not defined

In [None]:
# Dataset dimensions:
# NOTE(review): X and y are not defined in the cells visible here; the
# positional indexing below assumes they are NumPy arrays -- confirm.
n_data, n_features = X.shape

# Model Selection and Validation Parameters
n_splits = 10   # --> # of folds for K-Fold Cross Validation step
random_state = 42

#nPCA = np.array(np.arange(0, n_features,2))
nPCA = np.array([3])

# Splitting and Standardization of training and test datasets
std = StandardScaler()

# Train test splitting
test_size = 0.25
train_size = 0.75

# Stratified data splitting into Train Dataset and Test Dataset.
# split() yields one (train_indices, test_indices) pair per split, so a
# single split is requested and its pair is unpacked directly. Unpacking the
# generator of a 2-split splitter instead would bind each name to a whole
# (train, test) pair and make the "test" set the training part of split #2.
sss = StratifiedShuffleSplit(n_splits = 1, test_size = test_size, random_state=random_state)
train_index, test_index = next(sss.split(X, y))

X_train = X[train_index]
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = random_state)

# Data Standardization.
# The scaler statistics are learned on the training data only and then
# re-used on the test data, so both sets share the same scale and no
# information from the test set leaks into the preprocessing.
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

## 3. Chosen estimators
# Every estimator is a pipeline whose first step is a StandardScaler.
# The learners that accept a seed all receive `random_state`, so repeated
# runs of the grid searches below give the same results.
LR_estimator = make_pipeline(StandardScaler(), LogisticRegression(
                    max_iter = 10000,
                    random_state = random_state))
SVC_estimator = make_pipeline(StandardScaler(), SVC())
DT_estimator = make_pipeline(StandardScaler(), DecisionTreeClassifier(
                    random_state = random_state))
RF_estimator = make_pipeline(StandardScaler(), RandomForestClassifier(
                    random_state = random_state))
SGD_estimator = make_pipeline(StandardScaler(), SGDClassifier(
                    random_state = random_state))

# Grids for the estimators.
# Keys use the `<step name>__<parameter>` form; make_pipeline names each step
# after its lowercased class name.
param = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Logistic Regression: every solver crossed with the six smallest C values
solvers = ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']
lr_grid = [{'logisticregression__solver': solvers, 'logisticregression__C':param[0:6]}]

# SVM: one sub-grid per kernel, so only the parameters listed for that
# kernel are varied with it
svc_grid = [{'svc__C': param, 'svc__kernel': ['linear']},
            {'svc__C': param, 'svc__gamma': param, 'svc__kernel': ['rbf']},
            {'svc__C': param, 'svc__kernel': ['poly'], 'svc__degree': [1,2,3,4]}]

# Decision Tree and Random Forest
max_depth = [1, 5, 10, 15, 20, 25, 30]
n_estimators = [1, 5, 10, 15, 20, 25, 30]
dt_grid = [{'decisiontreeclassifier__max_depth': max_depth}]
rf_grid = {'randomforestclassifier__max_depth': max_depth,
           'randomforestclassifier__n_estimators': n_estimators}

# SGD parameters: learning-rate schedule crossed with the five smallest
# initial learning rates
sgd_grid = {'sgdclassifier__learning_rate': ['constant',
            'optimal','invscaling', 'adaptive'],
            'sgdclassifier__eta0': param[0:5]}


## 4-7. Hyperparameter search for every estimator
def _fit_grid_search(pipeline, grid):
    '''
        Run an exhaustive, accuracy-scored grid search for `pipeline` over
        `grid`, cross-validated with `n_splits` folds on the standardized
        training data, and return the fitted GridSearchCV object.
    '''
    search = GridSearchCV(estimator = pipeline,
                          param_grid = grid,
                          scoring = 'accuracy',
                          cv = n_splits)
    return search.fit(X_train_std, y_train)

# Searches run in the same order as the sections of this notebook.
lr_gs = _fit_grid_search(LR_estimator, lr_grid)      # 4. Logistic Regression
svc_gs = _fit_grid_search(SVC_estimator, svc_grid)   # 5. Support Vector Machine
dt_gs = _fit_grid_search(DT_estimator, dt_grid)      # 6. Decision Tree
rf_gs = _fit_grid_search(RF_estimator, rf_grid)      # 7. Random Forest
sgd_gs = _fit_grid_search(SGD_estimator, sgd_grid)   # 7. SGD

## 8. Classifier and hyperparameters choice
# Best cross-validated score and winning parameter set of each search,
# emitted through a single print call.
report_sections = (
    ("Logistic Regression:\nScore: ", lr_gs),
    ("\nSupport Vector Machine:\nScore: ", svc_gs),
    ("\nDecision Tree:\nScore: ", dt_gs),
    ("\nRandom Forest:\nScore: ", rf_gs),
    ("\nSGD:\nScore: ", sgd_gs),
)
report_fields = []
for header, fitted_search in report_sections:
    report_fields.extend((header, fitted_search.best_score_,
                          "\nBest Param: ", fitted_search.best_params_))
print(*report_fields)


## 9. Final Accuracy
# Replace each untuned pipeline with the best estimator found by its grid
# search.
LR_estimator = lr_gs.best_estimator_
SVC_estimator = svc_gs.best_estimator_
DT_estimator = dt_gs.best_estimator_
dt_estimator = DT_estimator   # lowercase alias kept for cells that may still use it
RF_estimator = rf_gs.best_estimator_
SGD_estimator = sgd_gs.best_estimator_

# Learning curves of the tuned models.
# Every curve is computed on the training data with the same number of
# folds, so the five plots are comparable and the test set stays untouched
# until the final scoring below.
tuned_estimators = [
    ('Logistic Regression', LR_estimator),
    ('SVM', SVC_estimator),
    ('Decision Tree', DT_estimator),
    ('Random Forest', RF_estimator),
    ('SGD', SGD_estimator),
]
for label, tuned in tuned_estimators:
    model_learning_curve(tuned, X_train_std, y_train, n_splits, label)

# Accuracy of each tuned model on the held-out test set.
print('Scores:',
'\nLR:', LR_estimator.score(X_test_std, y_test),
'\nSVM:', SVC_estimator.score(X_test_std, y_test),
'\nDT:', DT_estimator.score(X_test_std, y_test),
'\nRF:', RF_estimator.score(X_test_std, y_test),
'\nSGD:', SGD_estimator.score(X_test_std, y_test))