In [39]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

<img src='https://scikit-learn.org/stable/_images/grid_search_workflow.png' width=500 height=500>

In [40]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

### Cross Validation

- split dataset => train set & test set
- split train set => train set & validation set: A, B, C
    - model train AB, validate C => score
    - model train AC, validate B => score
    - model train BC, validate A => score
    - rata-rata score = ?

<img src="https://www.pngitem.com/pimgs/m/106-1062549_images-grid-search-cross-validation-k-fold-cross.png">

### K-Fold Cross Validation

In [41]:
from sklearn.model_selection import KFold

In [42]:
x = [1,2,3,4,5,6,7,8,9]
kf = KFold(n_splits=3)

In [43]:
for i, j in kf.split(x):
    print(i, j)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


### Studi Kasus

In [44]:
from sklearn.datasets import load_iris

In [45]:
iris = load_iris()
dir(iris)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [46]:
df = pd.DataFrame(iris['data'], columns=['SL', 'SW', 'PL', 'PW'])
df['target'] = iris['target']
df.head()

Unnamed: 0,SL,SW,PL,PW,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [47]:
from sklearn.model_selection import train_test_split
xtr, xts, ytr, yts = train_test_split(
    df[['SL', 'SW', 'PL', 'PW']],
    df['target'],
    test_size=.1
)
xtr.head()

Unnamed: 0,SL,SW,PL,PW
41,4.5,2.3,1.3,0.3
97,6.2,2.9,4.3,1.3
50,7.0,3.2,4.7,1.4
88,5.6,3.0,4.1,1.3
145,6.7,3.0,5.2,2.3


In [48]:
dfTr = pd.concat([xtr, ytr], axis=1)
dfTs = pd.concat([xts, yts], axis=1)

#### Cara semimanual

In [49]:
def hitungScore(model, xtr, xts, ytr, yts):
    model.fit(xtr, ytr)
    return model.score(xts, yts)

In [50]:
score = []
kf = KFold(n_splits=5)
for i, j in kf.split(dfTr):
#     print(i, j)  # i = index train, j = index validation
#     print(dfTr.iloc[i])
    xtrcv = dfTr.iloc[i][['SL', 'SW', 'PL', 'PW']]
    ytrcv = dfTr.iloc[i]['target']
    xvalcv = dfTr.iloc[j][['SL', 'SW', 'PL', 'PW']]
    yvalcv = dfTr.iloc[j]['target']
    score.append(
        hitungScore(
            LogisticRegression(),
            xtrcv, xvalcv, ytrcv, yvalcv
        )
    )

print(score)
print(np.mean(score))

[0.9629629629629629, 0.9259259259259259, 0.9629629629629629, 0.9629629629629629, 1.0]
0.962962962962963


#### Cara Otomatis

In [51]:
from sklearn.model_selection import cross_val_score

In [53]:
print(cross_val_score(
    LogisticRegression(),
    dfTr[['SL', 'SW', 'PL', 'PW']],
    dfTr['target'],
    cv=5
))

print(np.mean(cross_val_score(
    LogisticRegression(),
    dfTr[['SL', 'SW', 'PL', 'PW']],
    dfTr['target'],
    cv=5,
    scoring='accuracy'
)))

[0.96296296 0.92592593 1.         0.96296296 0.96296296]
0.962962962962963


In [54]:
from sklearn.neighbors import KNeighborsClassifier

In [55]:
model1 = KNeighborsClassifier(n_neighbors=3)
model2 = KNeighborsClassifier(n_neighbors=4)

In [57]:
score1 = cross_val_score(
    model1,
    dfTr[['SL', 'SW', 'PL', 'PW']],
    dfTr['target'],
    cv=5,
    scoring='balanced_accuracy'
)

print(score1)
print(np.mean(score1))

[1.         0.93333333 1.         0.95833333 0.96296296]
0.9709259259259259


In [59]:
score2 = cross_val_score(
    model2,
    dfTr[['SL', 'SW', 'PL', 'PW']],
    dfTr['target'],
    cv=5,
    scoring='balanced_accuracy'
)

print(score2)
print(np.mean(score2))

[1.         0.93333333 1.         1.         0.96296296]
0.9792592592592593


### training => train & validation:
- K-Fold, Repeated, Stratisfied
- Shuffle Split
- Leave One Out => K-Fold 1 => 1,2,3,4 => 1 2 3 4 => 123  4
- Leave P Out