In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, cross_val_score, ShuffleSplit, cross_val_score

In [2]:
# Load the diabetes CSV from the project-relative raw_data folder.
# A forward slash is used as the separator: "\d" is not a valid Python escape
# sequence (newer interpreters emit a warning for it), and a backslash is only
# treated as a path separator on Windows, whereas "/" works on every platform.
pima_df = pd.read_csv("raw_data/diabetes.csv")

#### Four techniques for splitting a dataset to obtain useful estimates of how well a machine learning algorithm performs on unseen data:

- Train and Test Sets.
- k-fold Cross Validation.
- Leave One Out Cross Validation.
- Repeated Random Test-Train Splits.

##### 1. Split into Train and Test Sets

In [3]:
# Separate the label from the predictors: 'Outcome' is the target column and
# every remaining column of the frame is kept as a feature.
y = pima_df['Outcome']
X = pima_df.drop(columns=['Outcome'])

In [5]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.33, random_state=7
)

# Fit on the training portion only, then score on the held-out rows.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)  # accuracy as a fraction

# Report the accuracy as a percentage with three decimals and a percent sign
# instead of printing the raw, unrounded float.
print(f"Accuracy: {result * 100.0:.3f}%")

Accuracy: 78.74015748031496


##### 2. K-fold Cross Validation

In [11]:
# 10-fold cross-validation: the rows are shuffled (seeded for reproducibility)
# and split into 10 folds; the model is refit and scored once per fold.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(model, X, y, cv=kfold)  # one score per fold

# Print "mean% (std%)" with three decimals each, rather than the repr of a
# (mean, std) tuple of unrounded floats.
print(f"Accuracy: {results.mean() * 100.0:.3f}% ({results.std() * 100.0:.3f}%)")

Accuracy: (77.21633629528367, 4.96837651757489)


##### 3. Leave One Out Cross Validation

In [13]:
# Leave-one-out cross-validation: every row is used once as a single-sample
# test set, so the model is refit as many times as there are rows.
loocv = LeaveOneOut()
model = LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(model, X, y, cv=loocv)  # one score per held-out row

# Print "mean% (std%)" with three decimals each, rather than the repr of a
# (mean, std) tuple. Each individual score is either 0 or 1 here, so the
# standard deviation is large by construction and should not be compared
# directly with the spread reported by the fold-based methods.
print(f"Accuracy: {results.mean() * 100.0:.3f}% ({results.std() * 100.0:.3f}%)")

Accuracy: (77.60416666666666, 41.68944689773287)


##### 4. Repeated Random Test-Train Splits

In [15]:
# Repeated random train/test splits: draw 10 independent random splits, each
# holding out 33% of the rows, with a fixed seed for reproducibility.
n_splits = 10
test_size = 0.33
seed = 7

# NOTE: the splitter is bound to the name `kfold` even though it is a
# ShuffleSplit, not a KFold; the name is kept so that any other cell reading
# `kfold` after this one sees the same object as before.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
results = cross_val_score(model, X, y, cv=kfold)  # one score per random split

# Print "mean% (std%)" with three decimals each, rather than the repr of a
# (mean, std) tuple of unrounded floats.
print(f"Accuracy: {results.mean() * 100.0:.3f}% ({results.std() * 100.0:.3f}%)")

Accuracy: (76.53543307086613, 2.235444026232818)
