# <center> AN INTRODUCTION TO MACHINE LEARNING
## <center> CSCAR WORKSHOP <br/><br/> 02/23/2018
## <center> Marcio Mourao and Michael Clark

# <center> Setup for Anaconda / Jupyter Notebook

<ul>
    <li>Go to the page https://marcio-mourao.github.io/</li>
    <li>Download the workshop materials (the first two documents listed on that page) to your "username/Documents" folder</li><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3"</li>
    <li>Click "Anaconda Prompt" </li>
    <ul>
        <li>Enter "conda update pandas"</li>
        <li>Enter "conda update scikit-learn"</li>
    </ul><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3"</li>
    <li>Click "Jupyter Notebook" </li>
    <li>Click "Workshop.ipynb" (this should open a new tab in the browser)</li>
</ul>

# <center> References

<ul>
  <li>https://www.continuum.io/anaconda-overview</li>
  <li>http://www.numpy.org/</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/10min.html</li>
  <li>http://matplotlib.org/</li>
  <li>http://scikit-learn.org/stable/documentation.html</li>
  <li>https://pypi.python.org/pypi/patsy</li>
</ul>

## <center> Import relevant general modules

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import sys
#Report the Python, numpy and pandas versions used to run this notebook (one per line)
print(sys.version, np.__version__, pd.__version__, sep='\n')

3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
1.13.3
0.22.0


## Load and describe the data

In [3]:
choice_dataset = 0 # Index of choice for the dataset names below
dataset_names = ['adult', 'wine'] # List of available datasets
dv = ['income', 'good'] # Dependent variable (column name) of each dataset, in the same order as dataset_names

In [4]:
#Creates a dataframe from the csv file of the chosen dataset
#Entries equal to '?' are read as missing values
data_df = pd.read_csv('data/' + dataset_names[choice_dataset] + '.csv', na_values=['?'])
#Displays the first rows of the dataframe
data_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
#Obtains the number of rows and columns of the dataframe
data_df.shape

(32561, 15)

In [6]:
#Obtains the data type of each column of the dataframe
data_df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [7]:
#Provides a statistical summary of the data
#include='all' summarizes every column, not only the numeric ones
data_df.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32561.0,30725,32561.0,32561,32561.0,32561,30718,32561,32561,32561,32561.0,32561.0,32561.0,31978,32561
unique,,8,,16,,7,14,6,5,2,,,,41,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


## Set up the data for Machine Learning

In [8]:
#Import scikit-learn and patsy modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from patsy import dmatrices

In [9]:
#Import plotly modules
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

#Enable plotly rendering inside the notebook
#NOTE(review): connected=True presumably loads plotly.js from a CDN, which needs internet access - confirm
init_notebook_mode(connected=True)

In [10]:
#This function returns model predictions
def get_model_predictions(model, X=None):
    """Return the class probabilities and the predicted classes of a fitted model.

    Parameters
    ----------
    model : fitted binary classifier
        Must expose predict_proba and predict.
    X : array-like, optional
        Observations to score. When omitted, the notebook-level X_test is used.

    Returns
    -------
    pandas.DataFrame
        One row per observation, with the columns 'Prob Class A' and
        'Prob Class B' (in the column order returned by predict_proba)
        and 'Predicted Class'.
    """
    if X is None:
        X = X_test #Default to the test set defined at notebook level

    y_pred_prob = model.predict_proba(X) #Obtain probability predictions
    y_pred_class = model.predict(X) #Obtain class predictions

    #The two hard-coded column names assume a binary classification problem
    output_df = pd.DataFrame(y_pred_prob, columns=['Prob Class A', 'Prob Class B'])
    output_df['Predicted Class'] = y_pred_class

    return output_df

In [11]:
#This function plots values of a dataframe
def plot_table(df, width, height):
    """Render a dataframe as a styled plotly table inside the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to display; each column of df becomes a column of the table.
    width, height : int
        Size of the figure layout.

    Returns
    -------
    None
        The table is displayed through iplot.
    """
    #A scalar alignment applies to every column, whatever the number of columns of df
    trace = go.Table(
        header=dict(values = df.columns,
                    line = dict(color='#7D7F80'),
                    fill = dict(color='#a1c3d1'),
                    align = 'center'),
        cells=dict(values = [df[col] for col in df.columns],
                   line = dict(color='#7D7F80'),
                   fill = dict(color='#EDFAFF'),
                   align = 'center'))

    layout = dict(width=width, height=height)
    fig = dict(data=[trace], layout = layout)
    iplot(fig, filename = 'styled_table')

In [12]:
#This function obtains the formula to construct design matrices
def get_formula(dataset_name):
    """Return the patsy formula used to build the design matrices of a dataset.

    Parameters
    ----------
    dataset_name : str
        Either 'adult' or 'wine'.

    Returns
    -------
    str
        Formula of the form 'dependent ~ -1 + predictors'. Column names are
        written without dots, so the dataframe columns must be renamed
        accordingly before the formula is used.

    Raises
    ------
    ValueError
        If dataset_name is not one of the supported datasets.
    """
    if dataset_name=='adult':
        #Set formula to use in dmatrices
        formula = ('income ~ -1 + age + workclass + educationnum + maritalstatus + '
                   'occupation + relationship + race + sex + '
                   'capitalgain + capitalloss + hoursperweek + nativecountry')
    elif dataset_name=='wine':
        #Set formula to use in dmatrices
        formula = ('good ~ -1 + fixedacidity + volatileacidity + citricacid + residualsugar + '
                   'chlorides + totalsulfurdioxide + pH + sulphates + alcohol')
    else:
        #Fail with a clear message instead of an UnboundLocalError on the return below
        raise ValueError("Unknown dataset name: %r (expected 'adult' or 'wine')" % (dataset_name,))

    return formula

In [13]:
#Strip the dots from the column names of the dataframe, for compatibility with patsy formulas
data_df.columns = data_df.columns.map(lambda name: name.replace('.', ''))

In [14]:
#Converts the dependent variable of the chosen dataset into integer codes
#sort=True assigns the codes following the sorted order of the distinct values,
#so that the encoding does not depend on the order of the rows in the file
data_df[dv[choice_dataset]] = pd.factorize(data_df[dv[choice_dataset]], sort=True)[0]

In [15]:
#Obtain the design matrices (y: dependent variable, X: predictors) as dataframes,
#for use in the logistic regression and random forests modeling approaches
#NOTE(review): patsy presumably drops the rows with missing values by default - confirm against the row count of X
y, X = dmatrices(get_formula(dataset_names[choice_dataset]), data_df, return_type = 'dataframe')

In [16]:
#The dependent variable needs to be a unidimensional vector rather than a dataframe,
#so keep only the values of its single column
y = y[dv[choice_dataset]].values

In [17]:
#Split the data into a training set (used for the fitting) and a test set (used for the evaluation)
#stratify=y preserves the class proportions of y in both sets; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13, stratify = y)

## Machine Learning

In [18]:
#Creates the models and fits them to the training data
#random_state fixes the randomness of the random forest, so that the results are reproducible across runs
LR_model = LogisticRegression().fit(X_train, y_train)
RF_model = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=13).fit(X_train, y_train)
models = [LR_model, RF_model]

In [19]:
#One predictions dataframe per model (probabilities and classes together), in the same order as models
y_preds = list(map(get_model_predictions, models))

In [20]:
#Display the predictions of each model for the first 10 test observations, rounded to 2 decimals
#A plain loop is used because plot_table is called for its side effect only
for y_pred in y_preds:
    plot_table(np.round(y_pred.head(10),2), 500, 500)

[None, None]

In [21]:
#Display the accuracy score of each model on the test set
#A plain loop is used because print is called for its side effect only
for y_pred in y_preds:
    print('Model Accuracy Score: ', metrics.accuracy_score(y_test, y_pred['Predicted Class']))

Model Accuracy Score:  0.847632939928
Model Accuracy Score:  0.836361225302


[None, None]

In [22]:
#Obtains the confusion matrix of each model on the test set
cms = [metrics.confusion_matrix(y_test, np.array(y_pred['Predicted Class'])) for y_pred in y_preds]
#Labels the columns with the predicted classes
cms_df = [pd.DataFrame(cm, columns = ['Predicted Class A','Predicted Class B']) for cm in cms]
#Prepends a column holding the real class of each row
cms_df = [pd.concat([pd.DataFrame(['Real Class A', 'Real Class B']), cm_df], axis=1) for cm_df in cms_df]
#Displays the confusion matrices; a plain loop is used because plot_table is called for its side effect only
for cm_df in cms_df:
    plot_table(cm_df,650,500)

[None, None]

In [23]:
#Builds 10 shuffled folds and displays the mean cross validation score of each model
#random_state fixes the shuffling of the folds, so that the scores are reproducible across runs
kf = KFold(n_splits=10, shuffle=True, random_state=13)
for model in models:
    print('Cross Validation Score: ', cross_val_score(model, X, y, cv=kf).mean())

Cross Validation Score:  0.846793875378
Cross Validation Score:  0.840925834506


[None, None]