## A1. Dataset and Setup

### Import required libraries

In [43]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

### Load dataset

In [44]:
# Load scikit-learn's bundled digits dataset and report the
# (n_samples, n_features) shape of its feature matrix.
digits = load_digits()
print(digits["data"].shape)

(1797, 64)


### Data splitting

In [45]:
# Feature matrix and class labels from the loaded dataset.
X = digits.data
y = digits.target

# Stage 1: stratified 70/30 partition into train and a held-out "temp" pool.
# With n_splits=1 the splitter yields exactly one (train, test) index pair,
# so it is taken directly with next() instead of being looped over.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, temp_index = next(sss1.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_temp, y_temp = X[temp_index], y[temp_index]

# Stage 2: stratified 50/50 partition of the temp pool into validation and
# test (each ends up with half of the 30% hold-out).
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(sss2.split(X_temp, y_temp))
X_validation, y_validation = X_temp[val_index], y_temp[val_index]
X_test, y_test = X_temp[test_index], y_temp[test_index]

#### Check shapes

In [46]:
# Report the (n_samples, n_features) shape of each partition.
print(
    "Train:", X_train.shape,
    "Validation:", X_validation.shape,
    "Test:", X_test.shape,
)

Train: (1257, 64) Validation: (270, 64) Test: (270, 64)


#### Check class distribution

In [47]:
# Per-class sample counts for each partition; similar counts across classes
# within a partition confirm that the stratified split kept them balanced.
for caption, labels in (
    ("Train labels:", y_train),
    ("Validation labels:", y_validation),
    ("Test labels:", y_test),
):
    print(caption, np.bincount(labels))

Train labels: [124 127 124 128 127 127 127 125 122 126]
Validation labels: [27 27 27 27 27 28 27 27 26 27]
Test labels: [27 28 26 28 27 27 27 27 26 27]


#### Data standardization 

In [51]:
# Standardize features to zero mean / unit variance.
#
# The scaler's statistics are estimated on the training split ONLY and that
# same fitted transform is then applied to validation and test. Fitting a
# scaler separately on each split would (a) map the splits into different
# feature spaces, so a model trained on X_train would see inconsistently
# scaled inputs at evaluation time, and (b) leak held-out statistics into
# preprocessing, which invalidates the validation/test estimates.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

# Reuse the train-fitted parameters; never call fit() on held-out data.
X_validation = sc.transform(X_validation)
X_test = sc.transform(X_test)