# Review of the process

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()

### Get Iris data

In [None]:
# Load the Iris CSV, assigning explicit names to its five columns
iris_columns = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"]
iris_data = pd.read_csv("iris.csv", names=iris_columns)

### Summarize the data

In [None]:
# Summary of the loaded Iris frame; as the last expression it is rendered as the cell output
iris_data.describe()

### Explore the data

In [None]:
# Pairwise plots of the Iris columns, colored by the 'class' column
pair_plot = sns.pairplot(data=iris_data, hue="class")

### Clean, impute, transform data

In [None]:
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its replacement
# is `sklearn.impute.SimpleImputer`, which always imputes column-wise, so the
# old `axis=0` argument is no longer passed.
from sklearn.impute import SimpleImputer

# Imputer that fills NaN entries with the mean of their column.
# It is only configured here; call fill_nan.fit_transform(...) to apply it.
fill_nan = SimpleImputer(missing_values=np.nan, strategy="mean")

### Feature selection, feature engineering

In [None]:
# drop returns a copy
# df = iris_data.drop(['some_feature'], axis=1) 

# feature engineering
# df["new_feature"] = df["feature_1"] * df["feature_2"]

# lambda function
# f = lambda x: x**2            
# df["new_feature"].map(f)  # for element-wise application

### Train/Test split

In [None]:
# Get X and y
# X holds the four measurement columns as an array (via .values)
X = iris_data[["sepal_l", "sepal_w", "petal_l", "petal_w"]].values

# Transform 'Iris-versicolor' to be the positive class (binary classification);
# 'Iris-setosa' and 'Iris-virginica' are both mapped to the negative class 0
y = iris_data['class'].map({"Iris-setosa":0, 'Iris-versicolor':1, 'Iris-virginica': 0}).values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows (all features) as the test set; the seed is fixed
# so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


### Scale/Standardize

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both the training and the test features
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

# Peek at the first five standardized training rows
X_train_std[:5]

### Models

In [None]:
# Logistic Regression
# --- Key Parameters ---
# C: the INVERSE of the regularization strength (smaller values mean stronger regularization); default is 1.0
# penalty: used to specify the penalization used for regularization; default is l2
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(C = 1)

# Support Vector Machine
# --- Key Parameters ---
# C: how much penalty there is for misclassification (LARGER values mean a greater penalty,
#    i.e. less regularization); default is 1.0
# kernel: specifies the kernel type to be used (often 'rbf' or 'linear'); default is 'rbf'
# gamma: the coefficient for non-linear kernels (not used by the 'linear' kernel chosen here)
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1)

# Random Forest (ensemble of Decision Trees)
# --- Key Parameters ---
# n_estimators: the number of trees in the forest; default is 100 (it was 10 before scikit-learn 0.22)
# max_depth: the depth of the tree; default is None, full expansion
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, random_state=0)

# k-Nearest Neighbor
# --- Key Parameters ---
# n_neighbors: number of neighbors to use; default is 5
# weights: weight function used; default is 'uniform'
        # 'uniform' means all points are weighted equally;
        # 'distance' means closer points have greater influence;
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

# Neural Network
# --- Key Parameters ---
# hidden_layer_sizes: a tuple representing the number of nodes for respective layer; default is (100,)
# activation: the activation function for the hidden layer; default is 'relu'
        # usually 'relu', 'tanh', or 'logistic' (scikit-learn's name for the sigmoid)
# NOTE(review): no random_state is set here, so results may vary between runs — confirm this is intended
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the logistic regression model
# on the standardized training data
scores = cross_val_score(
    estimator=log_reg,
    X=X_train_std,
    y=y_train,
    cv=10,
    scoring='accuracy',
)

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array such as
    the one returned by `cross_val_score`). Nothing is returned.
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),               # accuracy measure
        ("Standard deviation:", scores.std()),  # spread of the scores across folds
    )
    for label, value in report:
        print(label, value)

In [None]:
# Print the per-fold scores, their mean and their standard deviation
display_scores(scores)

In [None]:
# Candidate models, in the order used for every later table and plot
classifiers = [log_reg, knn, svm, forest, nn]

# One array of 10 cross-validated accuracy scores per classifier
model_scores = [
    cross_val_score(clf, X_train_std, y_train, scoring='accuracy', cv=10)
    for clf in classifiers
]

In [None]:
# One row per model (same order as `classifiers`), one column per CV fold (1..10)
fold_columns = list(range(1, 11))
model_names = ["LR", "KNN", "SVM", "Forest", "NN"]
models_df = pd.DataFrame(model_scores, index=model_names, columns=fold_columns)
models_df

In [None]:
# Append each model's row-wise mean as a "Mean" column
models_df["Mean"] = models_df.mean(axis="columns")
models_df

# Boxplot and Model Selection

In [None]:
# BOXPLOT comparing the cross-validated accuracy of the candidate models
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(18, 8))

# Filled (patch_artist) box plot with one box per entry of model_scores.
# Boxes are vertical by default, so the redundant `vert=True` argument is omitted.
bplot_models = axes.boxplot(model_scores, patch_artist=True)

# fill with colors - Models
colors_d = ["lightgreen", "lightyellow", "lime", "yellow", "yellowgreen"]
for patch, color in zip(bplot_models['boxes'], colors_d):
    patch.set_facecolor(color)

# grid, ticks and axes labels
axes.yaxis.grid(True)
axes.set_xticks([y+1 for y in range(len(model_scores))])
# Label each box with its model name instead of the bare positions 1..5;
# models_df rows were built from model_scores, so the order matches
axes.set_xticklabels(models_df.index)
axes.set_xlabel('Classification Models', fontsize=18)
axes.set_ylabel('Accuracy', fontsize=18)
axes.set_ylim((.4, 1.1))
axes.set_title('Classification Accuracy using All Features', fontsize = 18)

# Hyperparameter Tuning (on selected models)


In [None]:
# hyperparameter tuning can be done manually or using Grid Search with Cross-validation

# `sklearn.grid_search` was removed from scikit-learn; GridSearchCV lives in
# `sklearn.model_selection`, the module this notebook already uses for
# train_test_split and cross_val_score.
from sklearn.model_selection import GridSearchCV

# Grid Search over the logistic regression `C` parameter, using 3-fold CV
# inside the search to pick the best value
param_range = [0.0001, 0.001, .005, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
gs = GridSearchCV(estimator=log_reg, param_grid=[{'C': param_range}], scoring='accuracy', cv=3)

# Cross Validation, evaluates the returned model: the grid search is re-run
# on each of the 10 outer folds and scored on that fold's held-out part
cross_val_score(gs, X_train_std, y_train, scoring='accuracy', cv=10)

### Retrain tuned model using ALL Training data

In [None]:
# Refit the grid search on ALL of the training data. The model was selected
# and cross-validated on the standardized features, so it is fit and scored
# on the standardized arrays too (not on the raw X_train / X_test).
gs.fit(X_train_std, y_train)
train_score = gs.score(X_train_std, y_train)
test_score = gs.score(X_test_std, y_test)
print("Train score: {} \nTest score: {}".format(train_score, test_score))

# Confusion Matrix

In [None]:
# Random Forest Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

# Fit the forest on the (unscaled) training split and predict the test split
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the matrix as a shaded grid with the count written in every cell
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    ax.text(x=col, y=row, s=count, va='center', ha='center')
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')

# Precision, Recall, and F1 scores for the positive class
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

#print('Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}'.format(precision, recall, f1))
print(classification_report(y_test, y_pred, target_names=["other", "versicolor"]))

### Train final model on FULL dataset (ALL Training and Test data)

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn; joblib is imported
# directly instead (it is installed as a scikit-learn dependency).
import joblib

# scale/standardize X (the original full dataset);
# note that this re-fits `stdsc` on ALL rows, replacing the train-only fit
X_std = stdsc.fit_transform(X)

# train on the full standardized dataset (`y` is the full label vector;
# the previous `y_` was an undefined name)
forest.fit(X_std, y)

# save the model to disk
joblib.dump(forest, 'final_forest_model.sav')

# load the model from disk
# NOTE: joblib/pickle files can execute arbitrary code on load — only load files you trust
forest_final = joblib.load('final_forest_model.sav')

# use the model as before: inputs must go through the same scaler the model
# was trained with. These rows were part of the final training data, so this
# only demonstrates usage and is not an unbiased accuracy estimate.
accuracy = forest_final.score(stdsc.transform(X_test), y_test)

# Lab Homework #2

Using the Pima Indian dataset, build the best machine learning model that you can to predict whether or not the women in the dataset have diabetes.

• Clean and transform the data as you desire

• Summarize and/or visualize the data 

• Standardize the data

• Choose 2-5 algorithms and perform 10-fold cross validation

• Display a boxplot and select the best performing model

• Tune its hyperparameters (manually or using grid search)

• Train the same algorithm on your full training set (no cross validation)   ( model.fit(X_std_train, y_train))

• Test the model on your standardized test set, scaled with the scaler fitted on the training set ( model.score(X_std_test, y_test) )

• Display the Precision, Recall, and F1 score metrics along with a confusion matrix 

• Be able to explain what the scores and confusion matrix mean pertaining to your data 

** Things to possibly try to improve your model's performance:
				try different algorithms, tune the hyperparameters, Grid Search, 
				try a different scaler (standardization vs. normalization), imputation, feature engineering
