# <center> CSCAR WORKSHOP - Data Science Skills Series
## <center> Supervised Machine Learning: Support Vector Machines (SVMs)
### <center> Marcio Mourao - 11/29/2017


# <center> Setup for Anaconda / Jupyter Notebook

<ul>
    <li>Go to the page https://marcio-mourao.github.io/</li>
    <li>Download the materials under "Supervised Machine Learning in Python using Scikit-Learn (SVMs)" to your "username/Documents"</li><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3 (64-bit)"</li>
    <li>Click "Anaconda Prompt" </li>
    <li>Enter "conda update scikit-learn"</li><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3 (64-bit)"</li>
    <li>Click "Jupyter Notebook" </li><br/>
    
    <li>Click "Workshop.ipynb" (this should open a new tab in the browser)</li>
</ul>

# <center> Introduction

<ul>
  <li>Please sign the sign-up sheet!</li>
  <li>Don't forget to visit http://cscar.research.umich.edu/ to see what we're offering!</li>
  <li>For any questions or feedback, send an email to <a href="mailto:mdam@umich.edu" target="_top">Marcio</a>.</li>
</ul>

# <center> Summary of this workshop

<ul>
  <li>SVM (use handwritten digits dataset) </li>
  <ul>
     <li>Brief description of the dataset</li>
     <li>Load and describe the data</li>
     <li>Machine Learning</li>
  </ul><br>
  <li>SVM (use the 1994 census dataset)</li>
  <ul>
     <li>Brief description of the dataset</li>
     <li>Load and describe the data (using Pandas dataframes)</li>
     <li>Machine Learning</li>
  </ul><br>
</ul>


# <center> References

<ul>
  <li>https://www.continuum.io/anaconda-overview</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/10min.html</li>
  <li>http://scikit-learn.org/stable/</li>
  <li>http://scikit-learn.org/stable/modules/svm.html</li>
  <li>http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html</li>
</ul>

## Import relevant general modules

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#Print the Python, NumPy and pandas versions in use (helps when reproducing results)
import sys

for version_text in (sys.version, np.__version__, pd.__version__):
    print(version_text)

# <center> Handwritten Digits Dataset

## Some info about the dataset

The handwritten digits dataset is made up of 1797 8x8 images 

Each image is of a hand-written digit

The goal is to recognize handwritten digits

In [None]:
#Import modules
from sklearn import datasets

## Load and describe the data

In [None]:
#Load the handwritten digits dataset bundled with scikit-learn
#The returned object exposes .data, .images and .target, which the next cells inspect
digits = datasets.load_digits()
#Display the whole returned object as the cell output
digits

In [None]:
#Print the feature matrix (used as X in the Machine Learning section) and then its shape
print(digits.data, digits.data.shape, sep='\n')

In [None]:
#Print the image arrays and then their shape
print(digits.images, digits.images.shape, sep='\n')

In [None]:
#Print the labels (used as y in the Machine Learning section) and then their shape
print(digits.target, digits.target.shape, sep='\n')

In [None]:
#Inspect a single sample: its label, its feature vector and the corresponding image
sample_index = -2  #second-to-last record
print(digits.target[sample_index])
print(digits.data[sample_index])
#'gray_r' is the reversed grayscale colormap
plt.imshow(digits.images[sample_index], cmap='gray_r')
plt.show()

## Machine Learning

In [None]:
from sklearn import metrics, svm
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [None]:
#Create predictors and target sets
X, y = digits.data, digits.target

#Hold out a test set for the final evaluation
#stratify=y keeps the class proportions the same in both splits;
#random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13, stratify = y)

#Summary of the training split
print('Total number of records: ', digits.data.shape[0])
print('Type of X_train: ', type(X_train))
print('Number of records in X_train: ', len(X_train))
print('Fraction on X_train: ', len(X_train)/digits.data.shape[0])
print('Number of records in y_train: ', len(y_train))
#end='\n\n' leaves one blank line between the train and test summaries
#(previously the newlines were printed before the value, splitting this line in two)
print('Type of y_train: ', type(y_train), end='\n\n')

#Summary of the test split
print('Type of X_test: ', type(X_test))
print('Number of records in X_test: ', len(X_test))
print('Fraction on X_test: ', len(X_test)/digits.data.shape[0])
print('Number of records in y_test: ', len(y_test))
print('Type of y_test: ', type(y_test))

In [None]:
#Check SVC specification: show the documentation of svm.SVC
#(the leading "?" is IPython/Jupyter help syntax, not plain Python)
?svm.SVC

In [None]:
#Create an SVC with default settings and train it on the training split
#(fit returns the fitted estimator itself, so the calls can be chained)
SVM_model = svm.SVC().fit(X_train, y_train)

#Display the fitted estimator as the cell output
SVM_model

In [None]:
#Decision-function scores of the fitted SVM for the first two test samples
dec = SVM_model.decision_function(X_test[:2])
dec

In [None]:
#Predicted classes for the same first two test samples
SVM_model.predict(X_test[:2])

In [None]:
#Optional: probability predictions. Left disabled because predict_proba
#only works if probability = True was passed when creating the SVM object
#y_pred_SVM_prob = SVM_model.predict_proba(X_test)
#print('Predicted probabilities: \n', y_pred_SVM_prob)

#Obtain class predictions for the whole test set (used by the metrics cells below)
y_pred_SVM_class = SVM_model.predict(X_test)
print('Predicted classes: \n', y_pred_SVM_class)

In [None]:
#Confusion matrix on the test set (rows: true classes, columns: predicted classes)
SVM_cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVM_class)
SVM_cm

In [None]:
#Accuracy of the class predictions on the held-out test set
test_accuracy = metrics.accuracy_score(y_test, y_pred_SVM_class)
print('SVM Score: ', test_accuracy)

In [None]:
#10-fold cross-validation on the full dataset; folds are shuffled with a fixed seed
kf = KFold(n_splits = 10, shuffle = True, random_state = 13)
fold_scores = cross_val_score(SVM_model, X, y, cv=kf)

#Report the mean score across the folds
print('Cross validation score: ', fold_scores.mean())

In [None]:
#Obtain optimal SVM for both parameters C and Gamma via an exhaustive grid search
from sklearn.model_selection import GridSearchCV

#5 log-spaced candidates per parameter, from 1e-6 to 1e6 (25 combinations in total)
C_range = np.logspace(-6, 6, 5)
gamma_range = np.logspace(-6, 6, 5)
param_grid = dict(C = C_range, gamma = gamma_range)

#shuffle=True is needed for random_state to have any effect: without it the seed is
#ignored by older scikit-learn versions and rejected with a ValueError by recent ones.
#It also makes this splitter consistent with the shuffled KFold used for cross_val_score.
cv = KFold(n_splits=3, shuffle=True, random_state = 13)

#Each parameter combination is scored with the 3-fold splitter on the full dataset
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

# <center> 1994 Census Dataset

## Some info about the dataset

This data was extracted from the 1994 Census bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1) && (HRSWK>0)). 

<b>The prediction task is to determine whether a person makes over $50K a year!</b>

<b>Attributes:</b>

income: >50K, <=50K

age: continuous

workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked

fnlwgt: continuous

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool

education-num: continuous

marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black

sex: Female, Male

capital-gain: continuous

capital-loss: continuous

hours-per-week: continuous

native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands

## Load and describe the data

In [None]:
#Creates a dataframe named "adults" from reading the file "adult.csv"
#na_values=['?'] makes pandas treat "?" entries as missing values (NaN)
adults = pd.read_csv('adult.csv', na_values=['?'])
#Preview the first rows
adults.head()

In [None]:
#Shape of the freshly loaded dataframe as a (rows, columns) tuple
adults.shape

In [None]:
#Data type of each dataframe column
adults.dtypes

In [None]:
#Summary statistics for every column; include='all' covers non-numeric columns too
adults.describe(include='all')

In [None]:
#For each column, report whether it contains at least one missing value
adults.isnull().any()

In [None]:
#Count the number of missing values in each column of the dataframe
#(vectorized: isnull() flags the missing cells and sum() counts them per column)
adults.isnull().sum()

In [None]:
#Total number of missing cells in the dataframe:
#per-column counts of missing values, summed over all columns
adults.isnull().sum().sum()

In [None]:
#Count number of lines with NaNs
#(any(axis=1) flags rows holding at least one missing value; avoids a slow row-by-row apply)
adults.isnull().any(axis=1).sum()

In [None]:
#Fraction of observations with NaNs (potentially for removal)
#The row count is computed from the data rather than hard-coded,
#so the result stays correct if the input file changes
adults.isnull().any(axis=1).sum() / adults.shape[0]

In [None]:
#Drop every row that contains at least one missing value
#(be careful about what you decide to do with missing values)
adults = adults.dropna()

#Preview the cleaned dataframe
adults.head()

In [None]:
#Shape (rows, columns) of the dataframe, now that rows with missing values were dropped
adults.shape

In [None]:
#Displays the first 10 rows of the dataframe
adults.head(10)

## Machine Learning

In [None]:
#A reminder of the variables (column names) we have
adults.columns

In [None]:
#Drop features considered potentially irrelevant before modelling
#Feel free to come back here and drop or keep different features
adults2 = adults.drop(
    columns=[
        'workclass',
        'fnlwgt',
        'education',
        'capital.gain',
        'capital.loss',
        'native.country',
    ]
)

In [None]:
#Check the data types of the remaining columns
adults2.dtypes

In [None]:
#Replace the listed categorical columns with dummy (indicator) columns for scikit-learn use
categorical_columns = ['marital.status', 'occupation', 'relationship', 'race', 'sex']
adults2 = pd.get_dummies(adults2, columns = categorical_columns)

In [None]:
#Check the data types again now that the dummy columns were created
adults2.dtypes

In [None]:
#Covariates X: every column except 'income'; dependent variable y: the 'income' column
X = adults2.drop(columns='income')
y = adults2['income']

#Print both shapes
print(X.shape, y.shape, sep='\n')

In [None]:
#Obtain the data for the fitting
#stratify=y keeps the class proportions the same in both splits;
#random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13, stratify = y)

#Summary of the training split
print('Total number of records: ', adults2.shape[0])
print('Type of X_train: ', type(X_train))
print('Number of records in X_train: ', len(X_train))
print('Fraction on X_train: ', len(X_train)/adults2.shape[0])
print('Number of records in y_train: ', len(y_train))
#end='\n\n' leaves one blank line between the train and test summaries
#(previously the newlines were printed before the value, splitting this line in two)
print('Type of y_train: ', type(y_train), end='\n\n')

#Summary of the test split
print('Type of X_test: ', type(X_test))
print('Number of records in X_test: ', len(X_test))
print('Fraction on X_test: ', len(X_test)/adults2.shape[0])
print('Number of records in y_test: ', len(y_test))
print('Type of y_test: ', type(y_test))

In [None]:
#Creates the SVC object with default settings (this reuses, and overwrites, the name SVM_model)
SVM_model = svm.SVC()

#Fit to the census training data
#NOTE(review): no feature scaling step is visible before this fit - the numeric columns are
#passed in their original units next to the 0/1 dummy columns; consider standardizing - TODO confirm
SVM_model.fit(X_train,y_train)

In [None]:
#Obtain class predictions for the census test set (used by the metrics cells below)
y_pred_SVM_class = SVM_model.predict(X_test)
print('Predicted classes: \n', y_pred_SVM_class)

In [None]:
#Confusion matrix on the census test set (rows: true classes, columns: predicted classes)
SVM_cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_SVM_class)
SVM_cm

In [None]:
#Accuracy of the class predictions on the census test set
test_accuracy = metrics.accuracy_score(y_test, y_pred_SVM_class)
print('SVM Score: ', test_accuracy)