# <center> AN INTRODUCTION TO MACHINE LEARNING
## <center> CSCAR WORKSHOP <br/><br/> 03/22/2018
## <center> Marcio Mourao and Michael Clark

# <center> Setup for Anaconda / Jupyter Notebook

<ul>
    <li>Go to the page https://marcio-mourao.github.io/</li>
    <li>Download the materials (first two docs) under "" to your "username/Documents"</li><br/>    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3"</li>
    <li>Click "Jupyter Notebook" </li>
    <li>Click "Workshop.ipynb" (this should open a new tab in the browser)</li>
</ul>

# <center> References

<ul>
  <li>https://www.continuum.io/anaconda-overview</li>
  <li>http://www.numpy.org/</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/10min.html</li>
  <li>http://matplotlib.org/</li>
  <li>http://scikit-learn.org/stable/documentation.html</li>
  <li>https://pypi.python.org/pypi/patsy</li>
</ul>

##### Note 1 : This document does not do any cleaning of the data (the data is assumed to be completely clean)
##### Note 2 : This document assumes binary classification tasks 

##  Import relevant general modules

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import sys
#Report the Python, numpy and pandas versions in use (one per line)
for version in (sys.version, np.__version__, pd.__version__):
    print(version)

## Choice of dataset

In [None]:
#Available datasets and the dependent variable of each one
#(parallel lists: dv[i] is the dependent variable column of dataset_names[i])
dataset_names = ['wine', 'adult']
dv = ['good', 'income']

#Position in dataset_names of the dataset to analyse - choose one (0 -> 'wine', 1 -> 'adult')
choice_dataset = 1

## Load and describe the data

In [None]:
#Creates a dataframe from the csv file named after the chosen dataset (e.g. 'adult.csv'),
#given as a relative path; '?' entries in the file are parsed as missing values
data_df = pd.read_csv(dataset_names[choice_dataset] + '.csv', na_values=['?'])
#Displays the first rows of the dataframe as the output of the cell
data_df.head()

In [None]:
#Obtains the shape of the dataframe, i.e. its number of rows and its number of columns
data_df.shape

In [None]:
#Obtains the data type (dtype) of each column of the dataframe
data_df.dtypes

In [None]:
#Provides a statistical summary of the data; include='all' asks for every column,
#not only the numerical ones
data_df.describe(include='all')

## Setup modules, functions and data for Machine Learning

In [None]:
#Import scikit-learn and patsy modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from patsy import dmatrices

In [None]:
#Import plotly modules
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

#Initializes plotly's notebook mode, which the iplot call in plot_table relies on
#NOTE(review): connected=True presumably loads plotly.js from the web instead of embedding it - confirm against the plotly docs
init_notebook_mode(connected=True)

In [None]:
#This function returns model predictions
def get_model_predictions(model, X=None):
    """Return the predicted probabilities and classes of a fitted model.

    model: fitted estimator exposing predict_proba and predict
    X: observations to predict; when omitted, the notebook-level X_test is used

    Returns a dataframe with one row per observation and the columns
    'Prob Class A', 'Prob Class B' and 'Predicted Class'. The two probability
    columns mean that a binary classification task is assumed.
    """
    if X is None:
        X = X_test #Default to the test set created by the train/test split cell

    y_pred_prob = model.predict_proba(X) #Obtain probability predictions
    output_df = pd.DataFrame(y_pred_prob, columns=['Prob Class A', 'Prob Class B'])
    output_df['Predicted Class'] = model.predict(X) #Obtain class predictions

    return output_df

In [None]:
#This function plots values of a dataframe
def plot_table(df, width, height):
    """Display a dataframe as a styled plotly table in the notebook.

    df: dataframe to display; its column names become the table header and
        its columns become the table cells
    width, height: size of the figure, passed to the plotly layout
    """
    #One alignment entry per column, so the table is centered whatever its number of columns
    centered = ['center'] * len(df.columns)

    trace = go.Table(
        header=dict(values = df.columns,
                    line = dict(color='#7D7F80'),
                    fill = dict(color='#a1c3d1'),
                    align = centered),
        cells=dict(values = [df[col] for col in df.columns],
                   line = dict(color='#7D7F80'),
                   fill = dict(color='#EDFAFF'),
                   align = centered))

    layout = dict(width=width, height=height)
    fig = dict(data=[trace], layout = layout)
    iplot(fig, filename = 'styled_table')

In [None]:
#This function obtains the formula to construct design matrices
def get_formula(dataset_name):
    """Return the formula string used to build the design matrices of a dataset.

    dataset_name: 'adult' or 'wine'

    The left-hand side of '~' is the dependent variable and the right-hand side
    lists the predictors.
    NOTE(review): the '-1' term presumably asks patsy to leave out the intercept
    column - confirm against the patsy docs.

    Raises ValueError when dataset_name is not one of the supported datasets.
    """
    formulas = {
        'adult': 'income ~ -1 + age + workclass + educationnum + maritalstatus + '
                 'occupation + relationship + race + sex + '
                 'capitalgain + capitalloss + hoursperweek + nativecountry',
        'wine': 'good ~ -1 + fixedacidity + volatileacidity + citricacid + residualsugar + '
                'chlorides + totalsulfurdioxide + pH + sulphates + alcohol',
    }

    #Fail with a clear message instead of an UnboundLocalError on an unknown name
    if dataset_name not in formulas:
        raise ValueError('Unknown dataset {0!r}; expected one of {1}'
                         .format(dataset_name, sorted(formulas)))

    return formulas[dataset_name]

In [None]:
#This function plots a roc curve
def plot_roc_curve(fpr, tpr, roc_auc):
    """Plot the ROC curve of every model in one figure.

    fpr, tpr: false and true positive rates of each model, indexed by 0, 1, ...
    roc_auc: area under the curve of each model, indexed the same way

    Each curve is labelled with its model index and area; the dashed diagonal
    joins (0, 0) and (1, 1) for reference.
    """
    from itertools import cycle #Local import so the function does not depend on another cell

    plt.figure()
    colors = ['r', 'g', 'b', 'm', 'c']
    #Cycle through the colors so that curves beyond the fifth are still drawn
    for i, color in zip(range(len(fpr)), cycle(colors)):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label='ROC curve of model {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], color='k', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
#Strip the dots from the column names so that they can be used as variable names in patsy formulas
data_df.columns = data_df.columns.map(lambda col: col.replace('.', ''))

In [None]:
#Converts the dependent variable column of the chosen dataset (dv[choice_dataset], e.g. 'income') into integer codes
#NOTE(review): pd.factorize presumably numbers the labels by order of first appearance, so which class becomes 0 depends on the row order - confirm
data_df[dv[choice_dataset]] = pd.factorize(data_df[dv[choice_dataset]])[0]

In [None]:
#Select numerical (non-object) columns for later scaling
#The selection is narrowed later to the columns that are also part of the design matrix X
numericalColumns = data_df.select_dtypes(exclude=['object']).columns

In [None]:
#Obtain the design matrices from the formula of the chosen dataset, as pandas dataframes:
#y for the dependent variable (left of '~' in the formula) and X for the predictors
#NOTE(review): patsy presumably dummy-codes categorical columns and drops rows with missing values by default - confirm
y, X = dmatrices(get_formula(dataset_names[choice_dataset]), data_df, return_type = 'dataframe')

In [None]:
#The dependent variable needs to be a unidimensional vector rather than a dataframe:
#keep only the dependent variable column and take its underlying array of values
y = y[dv[choice_dataset]].values

In [None]:
#Obtain the data for the fitting: 20% of the observations are held out as a test set,
#the split is stratified on y and uses a fixed seed so that it can be reproduced
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.2,
    stratify=y,
    random_state=1234,
)

In [None]:
#Standardize the numerical predictors of both X_train and X_test
#The scaler is fitted on the training set only and then applied to both sets, so the test
#set is transformed with the same means and standard deviations the models were trained on
#and no information from the test set leaks into the pre-processing
scaleColumns = numericalColumns.intersection(X.columns) #Explicit set intersection of the two column indexes
X_train = X_train.copy() #Copies avoid modifying slices of X
X_test = X_test.copy()
scaler = StandardScaler().fit(X_train[scaleColumns])
X_train[scaleColumns] = scaler.transform(X_train[scaleColumns])
X_test[scaleColumns] = scaler.transform(X_test[scaleColumns])

## Choice of models

In [None]:
#Select one or more of the following models -> 'KNN', 'LR', 'RF', 'SVM', 'NN'
#Each name refers to a model variable and to its '<name>Opts' grid of options
modelsStr = ['RF']

## Machine Learning

In [None]:
#Models: one estimator per name that can be listed in modelsStr
KNN = KNeighborsClassifier()
LR = LogisticRegression()
RF = RandomForestClassifier(n_estimators=10, criterion='gini')
#probability=True enables predict_proba, which get_model_predictions calls
SVM = SVC(kernel='rbf', probability=True)
NN = MLPClassifier(activation='logistic', learning_rate='constant')

In [None]:
#Options for the models: the grid of parameter values explored for each model (one dictionary per model)
KNNOpts = {'n_neighbors': np.square(np.arange(2,6))} #4, 9, 16 and 25 neighbors
LROpts = {'C': np.arange(1,5)} #C = 1, 2, 3 and 4
RFOpts = {'max_features': np.arange(2,7)} #2 to 6 features
SVMOpts = {'C': [.25, .5, 1, 2, 4]}
#4 layer sizes and 10 evenly spaced alpha values between .0001 and 1
NNOpts = {'hidden_layer_sizes': [(3,), (5,), (7,), (9,)], 'alpha': np.linspace(.0001, 1, 10)}

In [None]:
#Performs a cross-validated (cv=10) search on the grid of options of every selected model
#An explicit lookup table replaces eval(): a misspelled name in modelsStr fails with a
#KeyError on that name and no string is ever executed as code
modelsAndOpts = {'KNN': (KNN, KNNOpts), 'LR': (LR, LROpts), 'RF': (RF, RFOpts),
                 'SVM': (SVM, SVMOpts), 'NN': (NN, NNOpts)}
gridsCV = [GridSearchCV(modelsAndOpts[modelStr][0], cv=10, param_grid=modelsAndOpts[modelStr][1],
                        return_train_score=True) for modelStr in modelsStr]
#One fitted search per selected model, in the order of modelsStr
modelsGrid = [gridCV.fit(X_train, y_train) for gridCV in gridsCV]

In [None]:
#Displays the best cross-validation score of all grid points, one line per model
#A plain loop is used because printing is a side effect; a list comprehension would
#also echo a list of None values as the output of the cell
for modelGrid in modelsGrid:
    print(np.round(modelGrid.best_score_, 2))

In [None]:
#Obtain the predictions of every fitted model: one dataframe per model,
#holding both the probabilities and the predicted classes
y_preds = list(map(get_model_predictions, modelsGrid))

In [None]:
#Display the predictions for the first 10 observations or cases of each model, rounded to 2 decimals
#A plain loop is used because plotting is a side effect; a list comprehension would
#also echo a list of None values as the output of the cell
for y_pred in y_preds:
    plot_table(np.round(y_pred.head(10), 2), 500, 500)

In [None]:
#Display the accuracy score of each model on the test set, rounded to 2 decimals
#A plain loop is used because printing is a side effect; a list comprehension would
#also echo a list of None values as the output of the cell
for y_pred in y_preds:
    accuracy = metrics.accuracy_score(y_test, y_pred['Predicted Class'])
    print('Model Accuracy Score: ', np.round(accuracy, 2))

In [None]:
#Obtains and displays classification reports (one text report per model)
crs = [metrics.classification_report(y_test, np.array(y_pred['Predicted Class'])) for y_pred in y_preds]
#A plain loop is used because printing is a side effect; a list comprehension would
#also echo a list of None values as the output of the cell
for cr in crs:
    print(cr)

In [None]:
#Obtains and displays the confusion matrix of each model
cms = [metrics.confusion_matrix(y_test, np.array(y_pred['Predicted Class'])) for y_pred in y_preds]
cms_df = [pd.DataFrame(cm, columns = ['Predicted Class A','Predicted Class B']) for cm in cms]
#Express every cell as a fraction of all the test observations (the cells of a matrix add up to 1)
cms_df = [cm_df/cm_df.sum().sum() for cm_df in cms_df]
#Prepend a column with the real class labels so that the rows are identified in the table
cms_df = [pd.concat([pd.DataFrame(['Real Class A', 'Real Class B']), cm_df], axis=1) for cm_df in cms_df]
#A plain loop is used because plotting is a side effect; a list comprehension would
#also echo a list of None values as the output of the cell
for cm_df in cms_df:
    plot_table(np.round(cm_df,2), 500, 500)

In [None]:
#Plot roc curves for the models
#The probability of the second class ('Prob Class B') is the score of each observation
y_scores = [y_pred['Prob Class B'].values for y_pred in y_preds]

#One entry per model, keyed by the position of the model in y_preds
fpr, tpr, roc_auc = {}, {}, {}
for i, y_score in enumerate(y_scores):
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

plot_roc_curve(fpr, tpr, roc_auc)