In [1]:
# Salary_Data.csv

In [3]:
import pandas as pd
import numpy as np

In [9]:
# Load the salary dataset from the working directory and preview the
# first three rows.
df = pd.read_csv('Salary_Data.csv')
df.iloc[:3]

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0


### Holdout method

In [14]:
# Feature matrix: turn the 1-D 'YearsExperience' column into a single-column
# 2-D array. Passing -1 lets NumPy infer the number of rows, so the cell no
# longer breaks when the CSV has a row count other than 30.
x = df['YearsExperience'].values.reshape(-1, 1)
# Target: kept as a pandas Series.
y = df['Salary']

In [16]:
from sklearn.model_selection import train_test_split

# Holdout split: 25% of the rows go to the test set; the fixed seed makes
# the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [18]:
# Fit a linear regression on the holdout training split and report the
# model's score on the test split first, then on the training split.

from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(x_train, y_train)
test_score = lr.score(x_test, y_test)
train_score = lr.score(x_train, y_train)
print('testing:', test_score)
print('training:', train_score)

testing: 0.9779208335417602
training: 0.9395413526983522


## K-fold

In [22]:
from sklearn.model_selection import KFold

In [24]:
# Five shuffled folds; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [26]:
# Show which row positions land in the training and test part of each fold.
for train, test in kf.split(x, y):
    print(train, test, sep=' \n ')

[ 0  1  3  4  5  6  7  8  9 11 12 14 15 16 17 18 19 20 21 22 23 25 27 29] 
 [ 2 10 13 24 26 28]
[ 0  1  2  3  4  6  7  8  9 10 12 13 14 15 18 19 20 21 23 24 25 26 28 29] 
 [ 5 11 16 17 22 27]
[ 0  2  3  4  5  6  7  9 10 11 12 13 15 16 17 18 19 21 22 24 25 26 27 28] 
 [ 1  8 14 20 23 29]
[ 0  1  2  3  5  8 10 11 12 13 14 15 16 17 20 21 22 23 24 25 26 27 28 29] 
 [ 4  6  7  9 18 19]
[ 1  2  4  5  6  7  8  9 10 11 13 14 16 17 18 19 20 22 23 24 26 27 28 29] 
 [ 0  3 12 15 21 25]


In [28]:
# Apply kfold

In [36]:
# Refit the regression on every fold and print its score on the training
# and test part of that fold.
# KFold.split yields positional row numbers. `x` is a NumPy array, so x[...]
# is already positional, but `y` is a pandas Series where y[...] is a label
# lookup that only matches while the index is the default 0..n-1 range.
# Use .iloc so the selection is positional regardless of the index.
# NOTE: this refits the shared `lr` object, so after the loop it holds the
# model from the last fold.
for train, test in kf.split(x, y):
    y_fold_train = y.iloc[train]
    y_fold_test = y.iloc[test]
    lr.fit(x[train], y_fold_train)
    print('training:', lr.score(x[train], y_fold_train))
    print('testing:', lr.score(x[test], y_fold_test), '\n')

training: 0.9411949620562126
testing: 0.988169515729126 

training: 0.9612542631539638
testing: 0.9257917429817739 

training: 0.9635908373578711
testing: 0.9252385867407542 

training: 0.9625173469912476
testing: 0.8785189123539608 

training: 0.9540189535215173
testing: 0.9646120517676184 



# Apply K-fold to classification

In [39]:
#  banknotes.csv

In [41]:
# Load the banknotes dataset. 'Class' is the target; every other column is
# used as a feature.
# NOTE: this rebinds df, x and y, which previously held the salary data.
df = pd.read_csv('banknotes.csv')
y = df['Class']
x = df.drop(columns='Class')

In [43]:
# Dimensions of the banknotes frame as (rows, columns).
df.shape

(1372, 5)

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Decision tree classifier; random_state is fixed so repeated fits on the
# same data give the same tree.
dt = DecisionTreeClassifier(random_state=0)

In [49]:
# K-fold cross-validation of the decision tree: fit on the training part of
# each fold and print accuracy on the held-out part.
# The fold arrays are positional row numbers, so the target Series is sliced
# with .iloc (as the feature frame already is) rather than the label-based
# y[...], which silently depends on the index being the default 0..n-1 range.
for train, test in kf.split(x, y):
    dt.fit(x.iloc[train, :], y.iloc[train])
    y_pred = dt.predict(x.iloc[test, :])
    print('Accuracy:', accuracy_score(y.iloc[test], y_pred))

Accuracy: 0.9927272727272727
Accuracy: 0.9854545454545455
Accuracy: 0.9671532846715328
Accuracy: 0.9963503649635036
Accuracy: 0.9671532846715328


In [51]:
# Number of rows per class label in the target.
y.value_counts()

Class
0    762
1    610
Name: count, dtype: int64

In [59]:
# Share of each class in the full target, in percent (two decimals).
# class_counts is indexed by class label, so [0] / [1] are label lookups.
class_counts = y.value_counts()
n_rows = len(y)
print('0:', round(class_counts[0] / n_rows * 100, 2))
print('1:', round(class_counts[1] / n_rows * 100, 2))

0: 55.54
1: 44.46


# Stratified K-fold

In [62]:
# Stratified holdout split: passing the target as `stratify` asks
# train_test_split to keep the class proportions of `y` in both parts.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=0
)

In [64]:
# Class balance of the stratified training split, in percent.

train_counts = y_train.value_counts()
n_train = len(y_train)
print('0:', round(train_counts[0] / n_train * 100, 2))
print('1:', round(train_counts[1] / n_train * 100, 2))

0: 55.49
1: 44.51


In [68]:
# Class balance of the stratified test split, in percent.
test_counts = y_test.value_counts()
n_test = len(y_test)
print('0:', round(test_counts[0] / n_test * 100, 2))
print('1:', round(test_counts[1] / n_test * 100, 2))

0: 55.69
1: 44.31


In [70]:
# Class balance (in percent) of the training part of each plain K-fold split.
# The fold arrays are positional, so the target is sliced once with .iloc
# instead of the label-based y[train]; the slice and its counts are then
# reused rather than recomputed for every print.
for train, test in kf.split(x, y):
    y_fold = y.iloc[train]
    fold_counts = y_fold.value_counts()  # indexed by class label
    print('0:', round(fold_counts[0]/len(y_fold) * 100, 2))
    print('1:', round(fold_counts[1]/len(y_fold) * 100, 2))
    print()

0: 55.15
1: 44.85

0: 55.79
1: 44.21

0: 54.64
1: 45.36

0: 55.92
1: 44.08

0: 56.19
1: 43.81



In [72]:
# stratified kfold

In [74]:
from sklearn.model_selection import StratifiedKFold

In [76]:
# Five shuffled, stratified folds with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [78]:
# Class balance (in percent) of the training part of each stratified fold.
# The fold arrays are positional, so the target is sliced once with .iloc
# instead of the label-based y[train]; the slice and its counts are then
# reused rather than recomputed for every print.
for train, test in skf.split(x, y):
    y_fold = y.iloc[train]
    fold_counts = y_fold.value_counts()  # indexed by class label
    print('0:', round(fold_counts[0]/len(y_fold) * 100, 2))
    print('1:', round(fold_counts[1]/len(y_fold) * 100, 2))
    print()

0: 55.52
1: 44.48

0: 55.52
1: 44.48

0: 55.56
1: 44.44

0: 55.56
1: 44.44

0: 55.56
1: 44.44



In [80]:
# Stratified K-fold cross-validation of the decision tree: fit on the
# training part of each fold and print accuracy on the held-out part.
# The fold arrays are positional row numbers, so the target Series is sliced
# with .iloc (as the feature frame already is) rather than the label-based
# y[...], which silently depends on the index being the default 0..n-1 range.
for train, test in skf.split(x, y):
    dt.fit(x.iloc[train, :], y.iloc[train])
    y_pred = dt.predict(x.iloc[test, :])
    print('Accuracy:', accuracy_score(y.iloc[test], y_pred))

Accuracy: 0.9854545454545455
Accuracy: 0.9927272727272727
Accuracy: 0.9890510948905109
Accuracy: 0.9854014598540146
Accuracy: 0.9671532846715328
