# Model Evaluation

In [1]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Generate a seeded toy dataset so the numbers are repeatable.
X, y = make_blobs(random_state=0)

# Hold part of the data back for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build the classifier first, then train it on the training portion only.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Score the fitted model on the held-out portion and report it.
test_set_score = logreg.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_set_score))

Test set score: 0.88


# Cross Validation

![alt-text](images/cv.png)
* The train/test split can still introduce issues
* Cross validation - same data, broken up more times / sliced up different ways
* What does this do for us?
* Tunable via hyperparameters (e.g. the number of folds, `cv`)

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the iris data and create an unfitted classifier.
iris = load_iris()
logreg = LogisticRegression()

# No cv argument is passed, so the library default decides the number of folds.
scores = cross_val_score(estimator=logreg, X=iris.data, y=iris.target)
print("Cross-validation scores: {}".format(scores))

Cross-validation scores: [0.96078431 0.92156863 0.95833333]


* How many folds were there by default?
* Let's increase that to see if we get better...

In [3]:
# Same estimator and data as before, but ask for five folds explicitly.
n_folds = 5
scores = cross_val_score(logreg, iris.data, iris.target, cv=n_folds)
print("Cross-validation scores: {}".format(scores))

Cross-validation scores: [1.         0.96666667 0.93333333 0.9        1.        ]


In [4]:
# Collapse the per-fold scores into a single summary figure.
mean_cv_score = scores.mean()
print("Average cross-validation score: {:.2f}".format(mean_cv_score))

Average cross-validation score: 0.96


# Uh-Oh!

In [5]:
from sklearn.datasets import load_iris

# Reload the dataset and print the raw label vector to inspect its ordering.
iris = load_iris()
iris_labels = iris.target
print("Iris labels:\n{}".format(iris_labels))

Iris labels:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


* If the data is structured in a certain way (like this) we can have major variation between the folds.
* Even with the folding model, the data structure itself can cause issues.
* What is the average value of the 2nd fold of this data?

# Stratified Cross Validation

![alt-text](images/cv-2.png)
* Splits the splits, handling our Uh-Oh scenario
* Commonly used in Salesforce data analysis (web search data).
* Shuffle also deals with this (more later on that).
* What's the tradeoff?

# KFold Validation

In [6]:
from sklearn.model_selection import KFold

# Splitter object with five folds; no shuffling is requested here.
kfold_n_splits = 5
kfold = KFold(n_splits=kfold_n_splits)

In [7]:
# Hand the splitter object (rather than a fold count) to cross_val_score.
kfold_scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print("Cross-validation scores:\n{}".format(kfold_scores))

Cross-validation scores:
[1.         0.93333333 0.43333333 0.96666667 0.43333333]


In [8]:
# Rebuild the splitter with three folds (still unshuffled) and score again.
kfold = KFold(n_splits=3)
kfold3_scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print("Cross-validation scores:\n{}".format(kfold3_scores))

Cross-validation scores:
[0. 0. 0.]


* What happened between 3 and 5 splits?

In [9]:
# Three folds again, but this time the samples are shuffled (seeded) before
# being divided into folds.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
shuffled_scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print("Cross-validation scores:\n{}".format(shuffled_scores))

Cross-validation scores:
[0.9  0.96 0.96]


* What did we do differently? Did it help?

# Leave One Out Validation

In [None]:
from sklearn.model_selection import LeaveOneOut

# Use a leave-one-out splitter as the cross-validation strategy.
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)

# One score is returned per cv iteration, so the length is the iteration count.
n_iterations = len(scores)
mean_accuracy = scores.mean()
print("Number of cv iterations: ", n_iterations)
print("Mean accuracy: {:.2f}".format(mean_accuracy))

* Another type of cross-validation
* Train on everything but one sample, rotate and do it again, leaving out a different sample.
* What is the tradeoff?

# Shuffle Split Cross Validation

![alt-text](images/cv-3.png)
* Combine shuffle with split with cross validation
* The data blender is on "High" now.

In [None]:
from sklearn.model_selection import ShuffleSplit

# Ten random 50/50 train/test splits. The splitter is seeded so that the
# scores are the same on every run, matching the seeded make_blobs,
# train_test_split and shuffled KFold calls elsewhere in this notebook.
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10,
                             random_state=0)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print('Cross-validation scores:\n', scores)

# Group Cross Validation

![alt-text](images/cv-4.png)
* Assume we are doing training on faces for face recognition.
* Which would be more difficult to train:
    * If we train on one person making faces and test on the same person making faces?
    * or
    * train on one person making faces and test on a different person making faces?

* To avoid the "easy" case, we create groups in the data (faces from same person), then train on one group / test on a different group.
* Goal is to recognize the condition across samples (the smile), not just the individual sample (person A).

# GroupKFold Validation

In [None]:
from sklearn.model_selection import GroupKFold
# create synthetic dataset
X, y = make_blobs(n_samples=12, random_state=0)
# assume the first three samples belong to the same group,
# then the next four, etc. (one group id per sample, 12 in total)
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
# `groups` must be passed by keyword: every argument after `y` is
# keyword-only in current scikit-learn, so a positional `groups` raises
# a TypeError.
scores = cross_val_score(logreg, X, y, groups=groups,
                         cv=GroupKFold(n_splits=3))
print("Cross-validation scores:\n", scores)

# Model Performance - Linear Classifiers
![alt-text](images/svc.png)

# Regularization Parameters
* Do we care more about generalizing, or getting things right?
* Support Vector Machines hyperparameters C and gamma
* C = _cost_ for making errors
  * A large C gives you low bias and high variance
  * A small C gives you higher bias and lower variance

![alt-text](images/svc-1.png)

In [None]:
# naive grid search implementation
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(
                    iris.data, iris.target, random_state=0)
print("Size of training set: {}   size of test set: {}".format(
         X_train.shape[0], X_test.shape[0]))

best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

## What's the problem with the above?
* what are we doing?
* what should we be doing?

# Train-Validate-Test

![alt-text](images/train-validate-test.png)
### Split the data 3 ways now....
* Training (to train the model on)
* Validation (to refine hyperparameters on)
* Test (to actually test)

In [None]:
from sklearn.svm import SVC

# First cut: set the test set aside; the remainder stays together as the
# train+validation pool that a later cell divides again.
outer_split = train_test_split(iris.data, iris.target, random_state=0)
X_trainval, X_test, y_trainval, y_test = outer_split

In [None]:
# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=1)
len(X_train)

In [None]:
# the same six candidate values are tried for gamma (outer loop) and C (inner)
search_values = [0.001, 0.01, 0.1, 1, 10, 100]

best_score = 0
for gamma in search_values:
    for C in search_values:
        # train on the training set only ...
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # ... and compare candidates on the validation set, not the test set
        score = svm.score(X_valid, y_valid)
        # strict comparison: on ties the earliest pair is kept
        if score > best_score:
            best_score, best_parameters = score, {'C': C, 'gamma': gamma}

print(best_score, best_parameters)

In [None]:
def func(**kwargs):
    """Print the keyword arguments it was called with, collected as a dict."""
    received = dict(kwargs)
    print(received)

In [None]:
# Unpack a dict into keyword arguments with ** (the same syntax used for
# SVC(**best_parameters) in this notebook).
demo_kwargs = {'foo': 'bar', 'debug': True}
func(**demo_kwargs)

In [None]:
split_sizes = (X_train.shape[0], X_valid.shape[0], X_test.shape[0])
print("Size of training set: {} size of validation set: {} size of test set: {}".format(
    *split_sizes))

# Refit with the selected parameters on training + validation data combined,
# then score that model on the test set.
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)

print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))

# Cross Validation and Grid Search

In [None]:
import numpy as np
# reference: manual_grid_search_cv
# Start the search from scratch. Without this reset the loop compares against
# the best_score left over from the earlier validation-set search, so
# best_parameters may never be updated for this cross-validated search.
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters,
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform cross-validation
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
# report the best mean CV score, not the score of the last combination tried
print(best_parameters, best_score)
        

# GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Both hyperparameters are searched over the same six candidate values;
# each key gets its own list.
param_grid = {name: [0.001, 0.01, 0.1, 1, 10, 100] for name in ('C', 'gamma')}

# Five-fold search over the grid that also records training scores.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid,
                           cv=5, return_train_score=True)

In [None]:
# Fresh train/test split; the search below is fitted on the training part only.
iris_split = train_test_split(iris.data, iris.target, random_state=0)
X_train, X_test, y_train, y_test = iris_split
# last expression of the cell -> the fitted search object is shown as output
grid_search.fit(X_train, y_train)

In [None]:
# Score the fitted search object on the held-out test split.
grid_test_score = grid_search.score(X_test, y_test)
print("Test set score: {:.2f}".format(grid_test_score))

In [None]:
# Read the winning parameter combination and its cross-validation score
# from the fitted search object.
chosen_params = grid_search.best_params_
chosen_cv_score = grid_search.best_score_
print("Best parameters: {}".format(chosen_params))
print("Best cross-validation score: {:.2f}".format(chosen_cv_score))

In [None]:
# Show the estimator object exposed by the search as best_estimator_.
chosen_estimator = grid_search.best_estimator_
print("Best estimator:\n{}".format(chosen_estimator))