In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

digits = load_digits()

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data,digits.target,test_size=0.3)

In [4]:
# Logistic Regression
lr = LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train, y_train)

print('Accuracy:',lr.score(X_test, y_test))

Accuracy: 0.9592592592592593


In [5]:
# SVM
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)

print('Accuracy:',svm.score(X_test, y_test))

Accuracy: 0.3277777777777778


In [6]:
# Random Forest
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)

print('Accuracy:',rf.score(X_test, y_test))

Accuracy: 0.9685185185185186


## KFold cross validation

In [7]:
# KFold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [8]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [9]:
# Use KFold for our digits example

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [10]:

from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data,digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
                                       digits.target[train_index], digits.target[test_index]
    scores_logistic.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

In [11]:
scores_logistic

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

In [12]:
scores_svm

[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]

In [13]:
scores_rf

[0.9315525876460768, 0.9482470784641068, 0.9198664440734557]

## cross validation score function

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
# Logistic regression model performance using cross_val_score

cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), digits.data, digits.target,cv=3)

array([0.89482471, 0.95325543, 0.90984975])

In [16]:
# svm model performance using cross_val_score

cross_val_score(SVC(gamma='auto'), digits.data, digits.target,cv=3)

array([0.38063439, 0.41068447, 0.51252087])

In [17]:
# random forest performance using cross_val_score

cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target,cv=3)

array([0.94657763, 0.95158598, 0.93656093])

## Parameter tunning using k fold cross validation

In [18]:
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5),digits.data, digits.target, cv=10)
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20),digits.data, digits.target, cv=10)
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30),digits.data, digits.target, cv=10)
scores4 = cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target, cv=10)

print('score1:',np.average(scores1))
print('score2:',np.average(scores2))
print('score3:',np.average(scores3))
print('score4:',np.average(scores4))

score1: 0.8870297951582868
score2: 0.9415549348230912
score3: 0.9410117939168219
score4: 0.9387802607076349


## Exercise
Use iris flower dataset from sklearn library and use cross_val_score against following models to measure the performance of each. In the end figure out the model with best performance,

1) ogistic Regression

2) SVM

3) Decision Tree

4) Random Forest

In [20]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [21]:
iris = load_iris()

In [22]:
# Logistic Regression
l_scores = cross_val_score(LogisticRegression(), iris.data, iris.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Decision Tree
d_scores = cross_val_score(DecisionTreeClassifier(), iris.data, iris.target)

In [24]:
# Support Vector Machine (SVM)
s_scores = cross_val_score(SVC(), iris.data, iris.target)

In [25]:
# Random Forest
r_scores = cross_val_score(RandomForestClassifier(n_estimators=40), iris.data, iris.target)

In [26]:
print('Logistic Regression Score:',np.average(l_scores))
print('Decision Tree Score:',np.average(d_scores))
print('Support Vector Machine (SVM) Score:',np.average(s_scores))
print('Random Forest Score:',np.average(r_scores))

Logistic Regression Score: 0.9733333333333334
Decision Tree Score: 0.9666666666666668
Support Vector Machine (SVM) Score: 0.9666666666666666
Random Forest Score: 0.96
