# Cross validation

YT video - https://www.youtube.com/watch?v=fSytzGwwBVw&list=PLblh5JKOoLUICTaGLRoHQDuF_7q2GfuJF&index=2

Cross validation lets us compare different machine learning methods and get a sense of how well they will work in practice.

The dataset is split into 2 parts:
1) One part for training the model
2) One part for testing the model

In [None]:
from sklearn.datasets import load_digits

# Fetch the handwritten-digits dataset that ships with scikit-learn.
digit_data = load_digits()

# Features (pixel values, one row per image) and their digit labels.
pictures, numbers = digit_data.data, digit_data.target

# Work with a small subset: only the first 25 samples.
first_25 = slice(0, 25)
small_pictures, small_numbers = pictures[first_25], numbers[first_25]

# Confirm the subset has the expected dimensions.
print("Pictures shape:", small_pictures.shape)
print("Numbers shape:", small_numbers.shape)

# Preview the first 5 samples: raw pixel rows, then the digits they depict.
first_5 = slice(0, 5)
print("First 5 digit images (pixel data):", small_pictures[first_5])
print("First 5 actual numbers:", small_numbers[first_5])

Pictures shape: (25, 64)
Numbers shape: (25,)
First 5 digit images (pixel data): [[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]
 [ 0.  0.  0. 12. 13.  5.  0.  0.  0.  0.  0. 11. 16.  9.  0.  0.  0.  0.
   3. 15. 16.  6.  0.  0.  0.  7. 15. 16. 16.  2.  0.  0.  0.  0.  1. 16.
  16.  3.  0.  0.  0.  0.  1. 16. 16.  6.  0.  0.  0.  0.  1. 16. 16.  6.
   0.  0.  0.  0.  0. 11. 16. 10.  0.  0.]
 [ 0.  0.  0.  4. 15. 12.  0.  0.  0.  0.  3. 16. 15. 14.  0.  0.  0.  0.
   8. 13.  8. 16.  0.  0.  0.  0.  1.  6. 15. 11.  0.  0.  0.  1.  8. 13.
  15.  1.  0.  0.  0.  9. 16. 16.  5.  0.  0.  0.  0.  3. 13. 16. 16. 11.
   5.  0.  0.  0.  0.  3. 11. 16.  9.  0.]
 [ 0.  0.  7. 15. 13.  1.  0.  0.  0.  8. 13.  6. 15.  4.  0.  0.  0.  2.
   1. 13. 13.  0.  0.  0.  0.  0.  2. 15. 11.  1. 

### K-Fold Cross Validation
Splits data into K parts to train and test the model K times, using a different part for testing each time.

In [2]:
from sklearn.model_selection import KFold
from sklearn.datasets import load_digits
from sklearn.neighbors import KNeighborsClassifier

# Load the digit images and labels, keeping only the first 30 samples of each.
X, y = (values[:30] for values in load_digits(return_X_y=True))

# K-nearest-neighbours classifier with scikit-learn's default settings.
model = KNeighborsClassifier()
# Shuffled 3-fold splitter; the fixed random_state makes the folds reproducible.
cv = KFold(n_splits=3, shuffle=True, random_state=123)

# Each pass trains on two folds and scores on the held-out third.
for fold_no, (train_rows, val_rows) in enumerate(cv.split(X), start=1):
    # Refit the model on this fold's training rows.
    model.fit(X[train_rows], y[train_rows])
    # Accuracy on the rows the model did not see during this fit.
    accuracy = model.score(X[val_rows], y[val_rows])
    print(f"Fold {fold_no}: {accuracy:.3f}")

Fold 1: 0.300
Fold 2: 0.200
Fold 3: 0.500


### Stratified K-Fold

Splits data while preserving the original class proportions in each fold.

In [1]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Toy imbalanced dataset: ten single-feature samples with values 1..10.
X = np.arange(1, 11).reshape(-1, 1)
# Labels: six samples of class 0 followed by four samples of class 1.
y = np.repeat([0, 1], [6, 4])

# Shuffled 2-fold splitter that stratifies on the labels passed to split().
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Show which labels land in the training and validation part of every fold.
for fold_no, (train_rows, val_rows) in enumerate(skf.split(X, y), start=1):
    report = [
        f"Fold {fold_no}:",
        f"  Train labels: {y[train_rows]}",  # class distribution in training
        f"  Val labels: {y[val_rows]}",  # class distribution in validation
    ]
    print("\n".join(report))
    print()  # blank line between folds

Fold 1:
  Train labels: [0 0 0 1 1]
  Val labels: [0 0 0 1 1]

Fold 2:
  Train labels: [0 0 0 1 1]
  Val labels: [0 0 0 1 1]



### Cross-Validation Score Aggregation

Compute the mean and standard deviation of the individual fold scores to get a statistical summary of model performance.

In [3]:
from sklearn.model_selection import cross_val_score  
from sklearn.datasets import load_digits 
from sklearn.neighbors import KNeighborsClassifier  
import numpy as np  

# Load the digits dataset and keep only the first 100 samples and labels.
X, y = (values[:100] for values in load_digits(return_X_y=True))

# Classifier to evaluate.
model = KNeighborsClassifier()

# One score per fold from 5-fold cross-validation.
scores = cross_val_score(model, X, y, cv=5)

# Summarise the folds: the mean is the average performance, and the standard
# deviation shows how much the score varies from fold to fold.
mean_score, std_score = scores.mean(), scores.std()

print(f"CV Score: {mean_score:.3f} ± {std_score:.3f}")

CV Score: 0.950 ± 0.055


### Interpreting Standard Deviation

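As a rough rule of thumb (the exact thresholds depend on the dataset and metric):

Low std (< 0.02): Model is stable and reliable

Moderate std (0.02 to 0.10): Scores vary somewhat between folds; for example, the 0.055 above falls in this range

High std (> 0.10): Model is unstable, results less trustworthy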