**Load iris flower dataset**

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('IRIS.csv')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
df.shape

(150, 5)

In [5]:
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [11]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [13]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

<h3 style='color:blue'>Approach 1: Use train_test_split and manually tune parameters by trial and error</h3>

In [16]:
X = df.drop('species', axis=1)
X

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [18]:
y = df['species']
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: species, Length: 150, dtype: object

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [22]:
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
29,4.7,3.2,1.6,0.2
145,6.7,3.0,5.2,2.3
112,6.8,3.0,5.5,2.1
148,6.2,3.4,5.4,2.3
119,6.0,2.2,5.0,1.5
...,...,...,...,...
144,6.7,3.3,5.7,2.5
26,5.0,3.4,1.6,0.4
1,4.9,3.0,1.4,0.2
118,7.7,2.6,6.9,2.3


In [24]:
X_test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
120,6.9,3.2,5.7,2.3
27,5.2,3.5,1.5,0.2
116,6.5,3.0,5.5,1.8
34,4.9,3.1,1.5,0.1
134,6.1,2.6,5.6,1.4
52,6.9,3.1,4.9,1.5
136,6.3,3.4,5.6,2.4
56,6.3,3.3,4.7,1.6
138,6.0,3.0,4.8,1.8
49,5.0,3.3,1.4,0.2


In [26]:
from sklearn.svm import SVC

In [28]:
model = SVC(kernel='rbf', C=1, gamma='auto')

model.fit(X_train,y_train)

model.score(X_test, y_test)

0.9333333333333333

<h3 style='color:blue'>Approach 2: Use K Fold Cross validation</h3>

**Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation**

In [32]:
from sklearn.model_selection import cross_val_score

In [38]:
sv1=cross_val_score(SVC(kernel='rbf',C=1,gamma='auto'),X, y, cv=5)
sv1

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [40]:
sv1.mean()

0.9800000000000001

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
sv4 = cross_val_score(KNeighborsClassifier(n_neighbors=3), X, y, cv=5)
sv4

array([0.96666667, 0.96666667, 0.93333333, 0.96666667, 1.        ])

In [48]:
sv4.mean()

0.9666666666666668

<h3 style='color:blue'>Use GridSearchCV</h3>

**GridSearchCV does exactly same thing as for loop above but in a single line of code**

In [50]:
from sklearn.model_selection import GridSearchCV

In [56]:

clf = GridSearchCV(SVC(gamma='auto'), {
    'C': [1,10,20,30],
    'kernel': ['rbf','linear', 'poly']
}, cv=5)

clf.fit(X, y)

clf.cv_results_


{'mean_fit_time': array([0.00201559, 0.0018239 , 0.00180035, 0.00100031, 0.00139866,
        0.00279975, 0.00119972, 0.00119753, 0.00450406, 0.00119996,
        0.00099998, 0.00460019]),
 'std_fit_time': array([3.19300537e-04, 4.14394783e-04, 3.99616264e-04, 2.42953095e-06,
        4.88544804e-04, 1.16560742e-03, 3.99351817e-04, 3.99552249e-04,
        3.32574607e-03, 3.99708958e-04, 3.50402318e-07, 2.87103635e-03]),
 'mean_score_time': array([0.00097394, 0.00099902, 0.00059824, 0.00079994, 0.00080147,
        0.00119977, 0.00080075, 0.0008019 , 0.00120015, 0.00100012,
        0.00100088, 0.00079961]),
 'std_score_time': array([2.67030384e-04, 3.16800790e-06, 4.88466581e-04, 3.99972913e-04,
        4.00749806e-04, 3.99447469e-04, 4.00384565e-04, 4.00961096e-04,
        3.99735220e-04, 9.46494734e-07, 1.14837595e-06, 3.99804524e-04]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 20, 20, 20, 30, 30, 30],
              mask=[False, False, False, False, False, False, False, False,
 

In [58]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002016,0.0003193005,0.000974,0.0002670304,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001824,0.0004143948,0.000999,3.168008e-06,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.0018,0.0003996163,0.000598,0.0004884666,1,poly,"{'C': 1, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
3,0.001,2.429531e-06,0.0008,0.0003999729,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.001399,0.0004885448,0.000801,0.0004007498,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
5,0.0028,0.001165607,0.0012,0.0003994475,10,poly,"{'C': 10, 'kernel': 'poly'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.0012,0.0003993518,0.000801,0.0004003846,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
7,0.001198,0.0003995522,0.000802,0.0004009611,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
8,0.004504,0.003325746,0.0012,0.0003997352,20,poly,"{'C': 20, 'kernel': 'poly'}",0.966667,0.966667,0.9,0.933333,1.0,0.953333,0.033993,11
9,0.0012,0.000399709,0.001,9.464947e-07,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,9


In [60]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,1,poly,0.966667
3,10,rbf,0.98
4,10,linear,0.973333
5,10,poly,0.966667
6,20,rbf,0.966667
7,20,linear,0.966667
8,20,poly,0.953333
9,30,rbf,0.96


In [62]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [64]:
clf.best_score_

0.9800000000000001