### Split into Train and Test Sets

In [16]:
# load required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    LeaveOneOut,
    ShuffleSplit,
)
from sklearn.linear_model import LogisticRegression

In [17]:
# Read the CSV, supplying the nine column names explicitly via `names=`.
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
filename = "data/pima-indians-diabetes.data.csv"
data = pd.read_csv(filename, names=names)

# Work on the raw value matrix: the first eight columns are the model inputs,
# the ninth column ("class") is the target.
array = data.values
X, y = array[:, :8], array[:, 8]

In [18]:
# Accumulator for the comparison table: each evaluation section below appends
# one dict (method label, mean accuracy, accuracy std) to this list.
results_summary = list()

In [19]:
# Hold out a fixed fraction of the rows for testing; the seed makes the
# shuffle (and therefore the reported score) repeatable.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, test_size=test_size
)

# Train on the training portion only (fit returns the estimator itself).
model = LogisticRegression(solver="liblinear").fit(X_train, y_train)

# Mean accuracy on the held-out rows.
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7559


In [20]:
# Record the hold-out result. A single split produces one score, so there is
# no spread to report and the std column is left as NaN.
entry = {
    "Method": "Train/Test (33%) Split",
    "Accuracy Mean": accuracy,
    "Accuracy Std": np.nan,
}
# Drop any earlier row for this method (in place, so the list object is kept)
# so that re-running this cell replaces the row instead of duplicating it.
results_summary[:] = [
    row for row in results_summary if row.get("Method") != entry["Method"]
]
results_summary.append(entry)

### K-fold Cross-Validation

In [21]:
# 10-fold cross-validation with a seeded shuffle; `seed` is the value set in
# the train/test split cell. cross_val_score returns one score per fold.
model = LogisticRegression(solver="liblinear")
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
kf_results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(f"Cross-Validation Accuracy: {kf_results.mean():.4f} ({kf_results.std():.4f})")

Cross-Validation Accuracy: 0.7709 (0.0509)


In [22]:
# Record mean and spread of the per-fold K-fold scores.
entry = {
    "Method": "K-Fold (10)",
    "Accuracy Mean": kf_results.mean(),
    "Accuracy Std": kf_results.std(),
}
# Drop any earlier row for this method (in place, so the list object is kept)
# so that re-running this cell replaces the row instead of duplicating it.
results_summary[:] = [
    row for row in results_summary if row.get("Method") != entry["Method"]
]
results_summary.append(entry)

### Leave-One-Out Cross-Validation

In [23]:
# Leave-one-out cross-validation: the model is refit once per row, each time
# scoring only the single held-out row.
# NOTE: because every fold contains one sample, each per-fold score is either
# 0 or 1, so the printed std is large by construction and is not comparable to
# the std reported by the other resampling methods.
model = LogisticRegression(solver="liblinear")
loocv = LeaveOneOut()
loo_results = cross_val_score(estimator=model, X=X, y=y, cv=loocv)
print(
    f"Leave-One-Out Cross-Validation Accuracy: {loo_results.mean():.4f} ({loo_results.std():.4f})"
)

Leave-One-Out Cross-Validation Accuracy: 0.7682 (0.4220)


In [24]:
# Record mean and spread of the leave-one-out scores (one score per row).
entry = {
    "Method": "Leave-One-Out",
    "Accuracy Mean": loo_results.mean(),
    "Accuracy Std": loo_results.std(),
}
# Drop any earlier row for this method (in place, so the list object is kept)
# so that re-running this cell replaces the row instead of duplicating it.
results_summary[:] = [
    row for row in results_summary if row.get("Method") != entry["Method"]
]
results_summary.append(entry)

### Repeated Random Train-Test Splits

In [25]:
# Repeated random train/test splits: draw `n_splits` independent seeded splits,
# each holding out `test_size` of the rows, and score the model on every one.
seed = 7
test_size = 0.33
n_splits = 10
model = LogisticRegression(solver="liblinear")
# NOTE: the name `kfold` is rebound here to a ShuffleSplit splitter; it no
# longer refers to the KFold object created in the K-fold section.
kfold = ShuffleSplit(n_splits=n_splits, random_state=seed, test_size=test_size)
ss_results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(
    f"Shuffle Split Cross-Validation Accuracy: {ss_results.mean():.4f} ({ss_results.std():.4f})"
)

Shuffle Split Cross-Validation Accuracy: 0.7650 (0.0170)


In [26]:
# Record mean and spread of the scores from the repeated random splits.
entry = {
    "Method": "ShuffleSplit (10 x 33%)",
    "Accuracy Mean": ss_results.mean(),
    "Accuracy Std": ss_results.std(),
}
# Drop any earlier row for this method (in place, so the list object is kept)
# so that re-running this cell replaces the row instead of duplicating it.
results_summary[:] = [
    row for row in results_summary if row.get("Method") != entry["Method"]
]
results_summary.append(entry)

In [27]:
# Assemble the collected rows into one frame indexed by evaluation method,
# then show it rounded to four decimals.
results_df = pd.DataFrame(results_summary).set_index("Method")

print(results_df.round(4))

                         Accuracy Mean  Accuracy Std
Method                                              
Train/Test (33%) Split          0.7559           NaN
K-Fold (10)                     0.7709        0.0509
Leave-One-Out                   0.7682        0.4220
ShuffleSplit (10 x 33%)         0.7650        0.0170
