# This notebook implements cross validation for Iris dataset

## Remember the Iris dataset? 

In [36]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import datasets
import pandas as pd

In [40]:
np.random.seed(42)

### Load Iris dataset

In [2]:
X, Y = datasets.load_iris(return_X_y=True)
X.shape, Y.shape

((150, 4), (150,))

In [7]:
iris = datasets.load_iris()
X = iris.data
Y = iris.target
X.shape, Y.shape

((150, 4), (150,))

### Set aside test samples

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((120, 4), (120,), (30, 4), (30,))

In [33]:
X_train

array([[6.4, 3.1, 5.5, 1.8],
       [5.4, 3. , 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.2],
       [5.2, 2.7, 3.9, 1.4],
       [5.7, 3.8, 1.7, 0.3],
       [6. , 2.7, 5.1, 1.6],
       [5.9, 3. , 4.2, 1.5],
       [5.8, 2.6, 4. , 1.2],
       [6.8, 3. , 5.5, 2.1],
       [4.7, 3.2, 1.3, 0.2],
       [6.9, 3.1, 5.1, 2.3],
       [5. , 3.5, 1.6, 0.6],
       [5.4, 3.7, 1.5, 0.2],
       [5. , 2. , 3.5, 1. ],
       [6.5, 3. , 5.5, 1.8],
       [6.7, 3.3, 5.7, 2.5],
       [6. , 2.2, 5. , 1.5],
       [6.7, 2.5, 5.8, 1.8],
       [5.6, 2.5, 3.9, 1.1],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.3, 4.7, 1.6],
       [5.5, 2.4, 3.8, 1.1],
       [6.3, 2.7, 4.9, 1.8],
       [6.3, 2.8, 5.1, 1.5],
       [4.9, 2.5, 4.5, 1.7],
       [6.3, 2.5, 5. , 1.9],
       [7. , 3.2, 4.7, 1.4],
       [6.5, 3. , 5.2, 2. ],
       [6. , 3.4, 4.5, 1.6],
       [4.8, 3.1, 1.6, 0.2],
       [5.8, 2.7, 5.1, 1.9],
       [5.6, 2.7, 4.2, 1.3],
       [5.6, 2

In [32]:
Y_train

array([2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0, 0, 1, 2, 2, 2, 2, 1, 2,
       1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 0,
       1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2,
       0, 0, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 1, 2, 1,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2, 2, 0, 1,
       1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [38]:
X_train_df = pd.DataFrame(data=X_train, columns=["SepalLengthCM", "SepalWidthCM", "PentalLengthCM", "PetalWidthCM"])
print(X_train_df)

     SepalLengthCM  SepalWidthCM  PentalLengthCM  PetalWidthCM
0              6.4           3.1             5.5           1.8
1              5.4           3.0             4.5           1.5
2              5.2           3.5             1.5           0.2
3              6.1           3.0             4.9           1.8
4              6.4           2.8             5.6           2.2
..             ...           ...             ...           ...
115            4.9           3.1             1.5           0.1
116            6.3           2.9             5.6           1.8
117            5.8           2.7             4.1           1.0
118            7.7           3.8             6.7           2.2
119            4.6           3.2             1.4           0.2

[120 rows x 4 columns]


In [39]:
X_train_df.describe()

Unnamed: 0,SepalLengthCM,SepalWidthCM,PentalLengthCM,PetalWidthCM
count,120.0,120.0,120.0,120.0
mean,5.880833,3.053333,3.815833,1.231667
std,0.850437,0.431907,1.784904,0.775354
min,4.3,2.0,1.1,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.5,3.3,5.2,1.9
max,7.9,4.4,6.9,2.5


### Break training dataset into 5-fold cross validation sets

In [29]:
kfolds = KFold(n_splits=5)
for train_indices, validation_indices in kfolds.split(X_train):
    print("Training indices: "+str(train_indices))
    print("Validation indices: "+str(validation_indices))
    X_train_set, X_validation_set = X_train[train_indices], X_train[validation_indices]
    Y_train_set, Y_validation_set = Y_train[train_indices], Y_train[validation_indices]
    print("Shapes of train/valiation sets:")
    print(X_train_set.shape, Y_train_set.shape, X_validation_set.shape, Y_validation_set.shape)
    print("**********************************************")
    print("")

Training indices: [ 24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41
  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
Validation indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Shapes of train/valiation sets:
(96, 4) (96,) (24, 4) (24,)
**********************************************

Training indices: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 1

In [30]:
stratkfolds = StratifiedKFold(n_splits=5)
for train_indices, validation_indices in stratkfolds.split(X_train, Y_train):
    print("Training indices: "+str(train_indices))
    print("Validation indices: "+str(validation_indices))
    X_train_set, X_validation_set = X_train[train_indices], X_train[validation_indices]
    Y_train_set, Y_validation_set = Y_train[train_indices], Y_train[validation_indices]
    print("Shapes of train/valiation sets:")
    print(X_train_set.shape, Y_train_set.shape, X_validation_set.shape, Y_validation_set.shape)
    print("**********************************************")
    print("")

Training indices: [ 21  23  24  25  26  27  28  29  30  32  33  34  35  36  37  39  40  41
  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
Validation indices: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22 31 38]
Shapes of train/valiation sets:
(96, 4) (96,) (24, 4) (24,)
**********************************************

Training indices: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  22  31  38  41  44  46  47  49  50  51  53  54  55  56  59
  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 1

### Can include cross validation into model evaluation

In [10]:
model = svm.SVC()
model.fit(X_train, Y_train)
cv_results = cross_validate(model, X_train, Y_train, cv=5)
print(cv_results['test_score'])
print("Mean accuracy: "+str(cv_results['test_score'].mean()))

[0.95833333 0.95833333 1.         1.         0.875     ]
Mean accuracy: 0.9583333333333334
