In [1]:
# Dataset: Salary_Data.csv
# Location: https://mitu.co.in/dataset

In [2]:
import pandas as pd

### Import the dataset

In [7]:
# Load the salary dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('Salary_Data.csv')

# Feature matrix must be 2-D (n_samples, 1). Using -1 lets NumPy infer the row
# count, so the cell keeps working if the CSV does not have exactly 30 rows.
x = df['YearsExperience'].values.reshape(-1, 1)

# Target column, kept as a pandas Series.
y = df['Salary']

### Hold-out method

In [4]:
from sklearn.model_selection import train_test_split
# Hold-out split: test_size=0.25 reserves a quarter of the rows for testing,
# and the fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

### Build Linear Regression model

In [5]:
from sklearn.linear_model import LinearRegression
# Fit a linear model on the training split (fit() returns the estimator itself),
# then report its built-in score on both the training and the held-out rows.
# For scikit-learn regressors, score() is R^2 according to the library docs.
reg = LinearRegression().fit(x_train, y_train)
print('Training:', reg.score(x_train, y_train))
print('Testing :', reg.score(x_test, y_test))

Training: 0.9395413526983522
Testing : 0.9779208335417602


### K-Fold Method

In [9]:
from sklearn.model_selection import KFold

In [10]:
# 5-fold splitter; rows are shuffled before folding and the fixed seed keeps
# the fold assignment the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [14]:
# Show the row indices that each fold assigns to the training and test parts.
for train, test in kf.split(x, y):
    # sep reproduces the "train <space> newline <space> test" layout.
    print(train, test, sep=' \n ')

[ 0  1  3  4  5  6  7  8  9 11 12 14 15 16 17 18 19 20 21 22 23 25 27 29] 
 [ 2 10 13 24 26 28]
[ 0  1  2  3  4  6  7  8  9 10 12 13 14 15 18 19 20 21 23 24 25 26 28 29] 
 [ 5 11 16 17 22 27]
[ 0  2  3  4  5  6  7  9 10 11 12 13 15 16 17 18 19 21 22 24 25 26 27 28] 
 [ 1  8 14 20 23 29]
[ 0  1  2  3  5  8 10 11 12 13 14 15 16 17 20 21 22 23 24 25 26 27 28 29] 
 [ 4  6  7  9 18 19]
[ 1  2  4  5  6  7  8  9 10 11 13 14 16 17 18 19 20 22 23 24 26 27 28 29] 
 [ 0  3 12 15 21 25]


### Extract train and test data

In [22]:
# Map each fold's indices back to the actual feature values.
for train, test in kf.split(x, y):
    # x is a single-column 2-D array here; collapse it to 1-D for compact printing.
    print('Train:', x[train].ravel())
    print('Test :', x[test].ravel())

Train: [ 1.1  1.3  2.   2.2  2.9  3.   3.2  3.2  3.7  4.   4.   4.5  4.9  5.1
  5.3  5.9  6.   6.8  7.1  7.9  8.2  9.   9.6 10.5]
Test : [ 1.5  3.9  4.1  8.7  9.5 10.3]
Train: [ 1.1  1.3  1.5  2.   2.2  3.   3.2  3.2  3.7  3.9  4.   4.1  4.5  4.9
  5.9  6.   6.8  7.1  8.2  8.7  9.   9.5 10.3 10.5]
Test : [2.9 4.  5.1 5.3 7.9 9.6]
Train: [ 1.1  1.5  2.   2.2  2.9  3.   3.2  3.7  3.9  4.   4.   4.1  4.9  5.1
  5.3  5.9  6.   7.1  7.9  8.7  9.   9.5  9.6 10.3]
Test : [ 1.3  3.2  4.5  6.8  8.2 10.5]
Train: [ 1.1  1.3  1.5  2.   2.9  3.2  3.9  4.   4.   4.1  4.5  4.9  5.1  5.3
  6.8  7.1  7.9  8.2  8.7  9.   9.5  9.6 10.3 10.5]
Test : [2.2 3.  3.2 3.7 5.9 6. ]
Train: [ 1.3  1.5  2.2  2.9  3.   3.2  3.2  3.7  3.9  4.   4.1  4.5  5.1  5.3
  5.9  6.   6.8  7.9  8.2  8.7  9.5  9.6 10.3 10.5]
Test : [1.1 2.  4.  4.9 7.1 9. ]


### Apply K-Fold

In [25]:
# Refit the regressor on each fold's training rows and score it on both the
# training rows and the held-out rows of that fold.
# NOTE: the printed label says "Accuracy", but reg.score() is the regressor's
# built-in score (R^2 per the scikit-learn docs), not a classification accuracy.
for train, test in kf.split(x, y):
    # kf.split yields positional row indices. x is a NumPy array, so x[train]
    # is positional already; y is a pandas Series, where y[train] would be a
    # label lookup that only works with the default RangeIndex. .iloc makes the
    # positional intent explicit and stays correct for any index.
    reg.fit(x[train], y.iloc[train])
    print('Train Accuracy:', reg.score(x[train], y.iloc[train]))
    print('Test Accuracy :', reg.score(x[test],  y.iloc[test]),'\n')

Train Accuracy: 0.9411949620562126
Test Accuracy : 0.988169515729126 

Train Accuracy: 0.9612542631539638
Test Accuracy : 0.925791742981774 

Train Accuracy: 0.9635908373578711
Test Accuracy : 0.9252385867407542 

Train Accuracy: 0.9625173469912476
Test Accuracy : 0.8785189123539611 

Train Accuracy: 0.9540189535215173
Test Accuracy : 0.9646120517676184 



### Apply K-Fold to classification

In [35]:
# Switch to the classification dataset. This rebinds df, x and y, replacing the
# salary-regression objects defined earlier in the notebook.
df = pd.read_csv('banknotes.csv')

# Every column except the label is a feature; 'Class' is the target.
y = df['Class']
x = df.drop(columns='Class')

# Displayed as the cell output: (rows, columns) of the full frame.
df.shape

(1372, 5)

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Decision-tree classifier reused by the cross-validation loops below; the fixed
# random_state keeps the fitted tree repeatable between runs.
dt = DecisionTreeClassifier(random_state=0)

In [34]:
# Plain K-Fold evaluation of the decision tree: fit on each fold's training
# rows, predict the held-out rows, and print the fold accuracy.
for train, test in kf.split(x, y):
    # train/test are positional indices, so both the feature frame and the
    # target Series are sliced with .iloc. The original y[train] was a label
    # lookup that only matched because of the default RangeIndex.
    dt.fit(x.iloc[train, :], y.iloc[train])
    y_pred = dt.predict(x.iloc[test, :])
    print('Accuracy:', accuracy_score(y.iloc[test], y_pred))

Accuracy: 0.9927272727272727
Accuracy: 0.9854545454545455
Accuracy: 0.9671532846715328
Accuracy: 0.9963503649635036
Accuracy: 0.9671532846715328


In [36]:
# Number of rows per class label in the full target column.
y.value_counts()

0    762
1    610
Name: Class, dtype: int64

In [41]:
# Percentage share of each class in the full dataset, rounded to 2 decimals.
for label in (0, 1):
    print(f'{label}:', round(y.value_counts()[label]/len(y) * 100, 2))

0: 55.54
1: 44.46


In [47]:
from sklearn.model_selection import train_test_split
# Stratified hold-out split on the class labels (stratify=y). No test_size is
# given, so the library default applies. This overwrites the x_train/x_test/
# y_train/y_test variables created for the regression example.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, stratify=y
)

### Training balance

In [48]:
# Percentage share of each class within the training split.
for label in (0, 1):
    print(f'{label}:', round(y_train.value_counts()[label]/len(y_train) * 100, 2))

0: 55.49
1: 44.51


### Testing balance

In [49]:
# Percentage share of each class within the test split.
for label in (0, 1):
    print(f'{label}:', round(y_test.value_counts()[label]/len(y_test) * 100, 2))

0: 55.69
1: 44.31


In [51]:
for train, test in kf.split(x, y):
    print('0:', round(y[train].value_counts()[0]/len(y[train]) * 100, 2))
    print('1:', round(y[train].value_counts()[1]/len(y[train]) * 100, 2))
    print()

0: 55.15
1: 44.85

0: 55.79
1: 44.21

0: 54.64
1: 45.36

0: 55.92
1: 44.08

0: 56.19
1: 43.81



### Stratified K-Fold

In [53]:
from sklearn.model_selection import StratifiedKFold

In [54]:
# 5-fold stratified splitter; shuffled with a fixed seed so the folds are
# repeatable between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [55]:
for train, test in skf.split(x, y):
    print('0:', round(y[train].value_counts()[0]/len(y[train]) * 100, 2))
    print('1:', round(y[train].value_counts()[1]/len(y[train]) * 100, 2))
    print()

0: 55.52
1: 44.48

0: 55.52
1: 44.48

0: 55.56
1: 44.44

0: 55.56
1: 44.44

0: 55.56
1: 44.44



In [56]:
# Stratified K-Fold evaluation of the decision tree: fit on each fold's
# training rows, predict the held-out rows, and print the fold accuracy.
for train, test in skf.split(x, y):
    # train/test are positional indices, so both the feature frame and the
    # target Series are sliced with .iloc. The original y[train] was a label
    # lookup that only matched because of the default RangeIndex.
    dt.fit(x.iloc[train, :], y.iloc[train])
    y_pred = dt.predict(x.iloc[test, :])
    print('Accuracy:', accuracy_score(y.iloc[test], y_pred))

Accuracy: 0.9854545454545455
Accuracy: 0.9927272727272727
Accuracy: 0.9890510948905109
Accuracy: 0.9854014598540146
Accuracy: 0.9671532846715328
