# 12. Machine learning techniques

In [None]:
# Fetch the course repository and move into the chapter folder;
# later cells read and write their CSV files with paths relative to this directory.
!git clone https://github.com/s7s/machine_learning_1.git
%cd  machine_learning_1/ML_in_practice

Cloning into 'machine_learning_1'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (178/178), done.[K
remote: Compressing objects: 100% (131/131), done.[K
remote: Total 178 (delta 79), reused 143 (delta 44), pack-reused 0[K
Receiving objects: 100% (178/178), 34.11 MiB | 20.58 MiB/s, done.
Resolving deltas: 100% (79/79), done.
/content/machine_learning_1/ML_in_practice


In [None]:
# Seed every random number generator the notebook relies on, so that
# "Restart Kernel -> Run All" reproduces the same models and scores.
import random as rd

import numpy as np

rd.seed(0)         # Python's built-in generator
np.random.seed(0)  # NumPy's global generator, used by scikit-learn estimators created without random_state

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 12.1 Loading and exploring the dataset

First, we use pandas to load the dataset from a csv file.

In [None]:
# Use pandas to read 'titanic.csv' into a DataFrame, then preview its first rows
raw_data = pd.read_csv('titanic.csv')
raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Next, we can explore the dataset.

In [None]:
# Number of rows in the dataset (first entry of the DataFrame's shape)
print(raw_data.shape[0])


891


In [None]:
# Use pandas to list the column names of the dataset
print(raw_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [None]:
# Use pandas to examine the 'Survived' column (the 0/1 labels the models will predict)
print(raw_data['Survived'])


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [None]:
# Use pandas to examine more than one column at the same time: ['Name', 'Age']
print(raw_data[['Name','Age']])

                                                  Name   Age
0                              Braund, Mr. Owen Harris  22.0
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0
2                               Heikkinen, Miss. Laina  26.0
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0
4                             Allen, Mr. William Henry  35.0
..                                                 ...   ...
886                              Montvila, Rev. Juozas  27.0
887                       Graham, Miss. Margaret Edith  19.0
888           Johnston, Miss. Catherine Helen "Carrie"   NaN
889                              Behr, Mr. Karl Howell  26.0
890                                Dooley, Mr. Patrick  32.0

[891 rows x 2 columns]


In [None]:
# Count the survivors: 'Survived' holds 0/1 labels, so the column total is the
# number of passengers who survived. Series.sum() is vectorized, whereas the
# built-in sum() walks through the values one by one in Python.
print(raw_data['Survived'].sum())

342


## 12.2. Cleaning up the data

Now, let's check how many values are missing in each column.

In [None]:
# Use pandas to count the missing (NA, "not available") values in every column
print(raw_data.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


The Cabin column is missing too many values to be useful. Let's drop it altogether.

In [None]:
# Drop the 'Cabin' column. `columns=` already selects the column axis, so no
# extra `axis` argument is needed. The result is stored under a new name.
clean_data = raw_data.drop(columns=['Cabin'])
# Re-check the missing-value counts of the remaining columns
print(clean_data.isna().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64


In [None]:
# Display the dataset after dropping the 'Cabin' column
clean_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


Other columns such as Age or Embarked are missing some values, but they can still be useful.

For the age column, let's fill in the missing values with the median of all ages.

For the Embarked column, let's make a new category called 'U', for Unknown port of embarkation.

In [None]:
# Get the median of the 'Age' column using pandas; it is used below to fill the missing ages
median_age = clean_data['Age'].median()
median_age

28.0

In [None]:
# Use pandas to replace the NA values in the 'Age' column with the median age
clean_data["Age"] = clean_data["Age"].fillna(median_age)

In [None]:
# Use pandas to replace the NA values in the 'Embarked' column with the new category 'U' (unknown)
clean_data["Embarked"] = clean_data["Embarked"].fillna('U')

In [None]:
# Confirm that no column has missing values left after the fills above
clean_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [None]:
# View the first 10 rows of the clean dataset
print(clean_data.head(10))

   PassengerId  Survived  Pclass  ...            Ticket     Fare  Embarked
0            1         0       3  ...         A/5 21171   7.2500         S
1            2         1       1  ...          PC 17599  71.2833         C
2            3         1       3  ...  STON/O2. 3101282   7.9250         S
3            4         1       1  ...            113803  53.1000         S
4            5         0       3  ...            373450   8.0500         S
5            6         0       3  ...            330877   8.4583         Q
6            7         0       1  ...             17463  51.8625         S
7            8         0       3  ...            349909  21.0750         S
8            9         1       3  ...            347742  11.1333         S
9           10         1       2  ...            237736  30.0708         C

[10 rows x 11 columns]


### 12.2.3 Saving our data for the future

In [None]:
# Save the clean dataset to './clean_titanic_data.csv'.
# index=False is the documented boolean for "do not write the row labels";
# it keeps the row index out of the file so it is not read back as an extra column.
clean_data.to_csv('./clean_titanic_data.csv', index=False)

## 12.3 Manipulating the features

- One-hot encoding
- Binning
- Feature selection

### 12.3.1 One-hot encoding

In [None]:
# Reload the cleaned dataset from 'clean_titanic_data.csv' as the starting point for feature manipulation
preprocessed_data = pd.read_csv('clean_titanic_data.csv')
preprocessed_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C


In [None]:
# One-hot encode the categorical columns 'Embarked', 'Pclass' and 'Sex'.
# With `columns=...`, pd.get_dummies() replaces each listed column by its indicator
# columns (e.g. 'Sex' -> 'Sex_female', 'Sex_male'), so no separate drop()/concat() is needed.
# dtype=int stores the indicators as 0/1 integers regardless of the default
# dummy dtype of the installed pandas version.
preprocessed_data = pd.get_dummies(
    preprocessed_data,
    columns=["Embarked", "Pclass", "Sex"],
    dtype=int,
)




In [None]:
# Display the dataset with the one-hot encoded columns
preprocessed_data

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0,0,1,0,0,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,0,0,1,0,0,1,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,0,0,1,0,0,0,1,1,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,0,0,1,0,1,0,0,1,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0,0,1,0,0,1,0,0,1
887,888,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,0,0,1,0,1,0,0,1,0
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,1,2,W./C. 6607,23.4500,0,0,1,0,0,0,1,1,0
889,890,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,1,0,0,0,1,0,0,0,1


### 12.3.2 Binning

In [None]:
# Bin edges for the age groups: (0, 10], (10, 20], ..., (70, 80]
bins = list(range(0, 81, 10))

# pd.cut() maps every age to the interval it falls into
categorized_age = pd.cut(preprocessed_data['Age'], bins)

# Attach the binned column and discard the raw 'Age' column it replaces
preprocessed_data = (
    preprocessed_data
    .assign(Categorized_age=categorized_age)
    .drop(["Age"], axis=1)
)
preprocessed_data

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Categorized_age
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.2500,0,0,1,0,0,0,1,0,1,"(20, 30]"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,1,0,0,0,1,0,0,1,0,"(30, 40]"
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.9250,0,0,1,0,0,0,1,1,0,"(20, 30]"
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1000,0,0,1,0,1,0,0,1,0,"(30, 40]"
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.0500,0,0,1,0,0,0,1,0,1,"(30, 40]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",0,0,211536,13.0000,0,0,1,0,0,1,0,0,1,"(20, 30]"
887,888,1,"Graham, Miss. Margaret Edith",0,0,112053,30.0000,0,0,1,0,1,0,0,1,0,"(10, 20]"
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",1,2,W./C. 6607,23.4500,0,0,1,0,0,0,1,1,0,"(20, 30]"
889,890,1,"Behr, Mr. Karl Howell",0,0,111369,30.0000,1,0,0,0,1,0,0,0,1,"(20, 30]"


In [None]:
# One-hot encode the 'Categorized_age' bins. With `columns=...`, pd.get_dummies()
# replaces the column by one indicator column per bin, so no separate
# drop()/concat() step is required. dtype=int stores the indicators as 0/1
# integers regardless of the default dummy dtype of the installed pandas version.
preprocessed_data = pd.get_dummies(
    preprocessed_data,
    columns=['Categorized_age'],
    dtype=int,
)

preprocessed_data

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.2500,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.9250,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1000,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.0500,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",0,0,211536,13.0000,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0
887,888,1,"Graham, Miss. Margaret Edith",0,0,112053,30.0000,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",1,2,W./C. 6607,23.4500,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0
889,890,1,"Behr, Mr. Karl Howell",0,0,111369,30.0000,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0


### 12.3.4 Feature selection

In [None]:
# Feature selection: drop the columns ['Name', 'Ticket', 'PassengerId'] and preview the result
preprocessed_data = preprocessed_data.drop(columns=['Name', 'Ticket', 'PassengerId'])
preprocessed_data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0
1,1,1,0,71.2833,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
2,1,0,0,7.925,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0
3,1,1,0,53.1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0
4,0,0,0,8.05,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0


### 12.3.5 Saving for future use

In [None]:
# Save the preprocessed dataset to './preprocessed_titanic_data.csv'.
# index=False is the documented boolean for "do not write the row labels";
# it keeps the row index out of the file so it is not read back as an extra column.
preprocessed_data.to_csv('./preprocessed_titanic_data.csv', index=False)

# 12.4 Training models

In [None]:
# Reload the preprocessed dataset saved in section 12.3.5 and preview its first rows
data = pd.read_csv('./preprocessed_titanic_data.csv')
data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0
1,1,1,0,71.2833,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
2,1,0,0,7.925,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0
3,1,1,0,53.1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0
4,0,0,0,8.05,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0


### 12.4.1 Features-labels split and train-validation split

In [None]:
# Features: every column except the 'Survived' label column
features = data.drop(columns=['Survived'])

# Labels: the 'Survived' column on its own
labels = data['Survived']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 60% of the rows for training and hold out the remaining 40% for
# validation + testing; random_state=100 makes the shuffle repeatable.
(
    features_train,
    features_validation_test,
    labels_train,
    labels_validation_test,
) = train_test_split(
    features,
    labels,
    test_size=0.4,
    random_state=100,
)

In [None]:
# Divide the held-out rows evenly into a validation half and a test half;
# random_state=100 makes the shuffle repeatable.
(
    features_validation,
    features_test,
    labels_validation,
    labels_test,
) = train_test_split(
    features_validation_test,
    labels_validation_test,
    test_size=0.5,
    random_state=100,
)

In [None]:
# Report the size of every split: the three feature sets first, then the three label sets
for split in (
    features_train, features_validation, features_test,
    labels_train, labels_validation, labels_test,
):
    print(len(split))

534
178
179
534
178
179


### 12.4.2 Training different models on our dataset

We'll train six models:
- Logistic regression (perceptron)
- Decision tree
- Support vector machine (SVM)
- RandomForestClassifier
- GradientBoostingClassifier
- AdaBoostClassifier

In [None]:
# Train logistic regression model
from sklearn.linear_model import LogisticRegression

# With the default iteration budget the solver stops with
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on this data, so allow it more
# iterations to converge.
lr_model = LogisticRegression(max_iter=1000).fit(features_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Decision tree classifier, fitted on the training split with default settings
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier().fit(
    X=features_train,
    y=labels_train,
)

In [None]:
# Support vector classifier, fitted on the training split with default settings
from sklearn.svm import SVC

svm_model = SVC().fit(
    X=features_train,
    y=labels_train,
)

In [None]:
# Random forest classifier, fitted on the training split with default settings
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier().fit(
    X=features_train,
    y=labels_train,
)

In [None]:
# Gradient boosting classifier, fitted on the training split with default settings
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier().fit(
    X=features_train,
    y=labels_train,
)

In [None]:
# AdaBoost classifier, fitted on the training split with default settings
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier().fit(
    X=features_train,
    y=labels_train,
)

### 12.4.3 Evaluating the models

#### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of each trained model, reported in the order the models were trained
print("Scores of the models")
for model_name, fitted_model in [
    ("Logistic regression:", lr_model),
    ("Decision tree:", dt_model),
    ("SVM:", svm_model),
    ("Random forest:", rf_model),
    ("Gradient boosting:", gb_model),
    ("AdaBoost:", ab_model),
]:
    validation_predictions = fitted_model.predict(features_validation)
    print(model_name, accuracy_score(labels_validation, validation_predictions))

Scores of the models
Logistic regression: 0.7696629213483146
Decision tree: 0.7752808988764045
SVM: 0.6797752808988764
Random forest: 0.7696629213483146
Gradient boosting: 0.8146067415730337
AdaBoost: 0.7640449438202247


#### F1-score

In [None]:
from sklearn.metrics import f1_score

# Validation F1-score of each trained model, reported in the order the models were trained
print("F1-scores of the models:")
for model_name, fitted_model in [
    ("Logistic regression:", lr_model),
    ("Decision tree:", dt_model),
    ("SVM:", svm_model),
    ("Random forest:", rf_model),
    ("Gradient boosting:", gb_model),
    ("AdaBoost:", ab_model),
]:
    validation_predictions = fitted_model.predict(features_validation)
    print(model_name, f1_score(labels_validation, validation_predictions))

F1-scores of the models:
Logistic regression: 0.6870229007633588
Decision tree: 0.6969696969696969
SVM: 0.39999999999999997
Random forest: 0.6917293233082706
Gradient boosting: 0.744186046511628
AdaBoost: 0.6865671641791045


### 12.4.4 Testing the model

Finding the accuracy and the F1-score of the models on the testing set.

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of each trained model on the testing data, in the order the models were trained
print("Scores of the models")
for model_name, fitted_model in [
    ("Logistic regression:", lr_model),
    ("Decision tree:", dt_model),
    ("SVM:", svm_model),
    ("Random forest:", rf_model),
    ("Gradient boosting:", gb_model),
    ("AdaBoost:", ab_model),
]:
    test_predictions = fitted_model.predict(features_test)
    print(model_name, accuracy_score(labels_test, test_predictions))


Scores of the models
Logistic regression: 0.7988826815642458
Decision tree: 0.8044692737430168
SVM: 0.5865921787709497
Random forest: 0.8212290502793296
Gradient boosting: 0.8324022346368715
AdaBoost: 0.7821229050279329


In [None]:
from sklearn.metrics import f1_score

# F1-score of each trained model on the testing data, in the order the models were trained
print("F1-scores of the models:")
for model_name, fitted_model in [
    ("Logistic regression:", lr_model),
    ("Decision tree:", dt_model),
    ("SVM:", svm_model),
    ("Random forest:", rf_model),
    ("Gradient boosting:", gb_model),
    ("AdaBoost:", ab_model),
]:
    test_predictions = fitted_model.predict(features_test)
    print(model_name, f1_score(labels_test, test_predictions))

F1-scores of the models:
Logistic regression: 0.763157894736842
Decision tree: 0.7770700636942676
SVM: 0.26
Random forest: 0.7922077922077921
Gradient boosting: 0.8026315789473685
AdaBoost: 0.7450980392156863


# 12.5 Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for an RBF-kernel SVM: every combination of C and gamma is tried
svm_parameters = dict(
    kernel=['rbf'],
    C=[0.01, 0.1, 1, 10, 100],
    gamma=[0.01, 0.1, 1, 10, 100],
)

# Use grid search on the training split to find the best hyperparameters
svm = SVC()
svm_gs = GridSearchCV(estimator=svm, param_grid=svm_parameters)
svm_gs.fit(features_train, labels_train)

# Get the best model found by the search
svm_winner = svm_gs.best_estimator_

# Score of the winning model on the validation split
svm_winner.score(features_validation, labels_validation)

0.7191011235955056

In [None]:
# Display the winning SVM together with the hyperparameters it was built with
svm_winner

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# 12.6 Cross validation

In [None]:
# Show the k-fold cross-validation results recorded by the grid search
# (fit times, score times and test scores for each parameter combination)
svm_gs.cv_results_

{'mean_fit_time': array([0.01010184, 0.01026096, 0.01083078, 0.01216798, 0.00997124,
        0.01013765, 0.01048307, 0.0112206 , 0.01155272, 0.01002083,
        0.01020565, 0.01176305, 0.01201329, 0.01179972, 0.01054902,
        0.01153975, 0.01296477, 0.01507215, 0.01317773, 0.01235042,
        0.01500473, 0.01856232, 0.01327381, 0.01289897, 0.01189432]),
 'mean_score_time': array([0.00281458, 0.00291781, 0.00316572, 0.00349121, 0.00281725,
        0.00282459, 0.00311041, 0.00326166, 0.00360484, 0.00279822,
        0.00271611, 0.00290446, 0.00331073, 0.00316868, 0.00299649,
        0.00252118, 0.00300927, 0.00325661, 0.0030129 , 0.0032228 ,
        0.00267467, 0.00262218, 0.00299292, 0.00285597, 0.00304675]),
 'mean_test_score': array([0.64607653, 0.64607653, 0.64607653, 0.64607653, 0.64607653,
        0.65730912, 0.64607653, 0.64607653, 0.64607653, 0.64607653,
        0.69851878, 0.76411568, 0.72294128, 0.69289367, 0.69664962,
        0.8071945 , 0.75469935, 0.72294128, 0.6929113 , 0