# Machine Learning
Yorick Juffer - s1993623 \
Leon Koole - s4436563 \
... \
Alejandro Sánchez Roncero - s5279402

## Libraries

In [85]:

from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix 

# new contribution
from sklearn import svm
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA

# Loading data
This dataset contains 2000 grayscale images of handwritten digits, 200 from each class. Each image is 15 × 16 
pixels, which gives n = 240-dimensional image vectors. The data are in the attached text file mfeat-pix.txt, 
one vector per row, sorted such that the first 200 rows are "0" digit examples, the next 200 are "1" digit examples, 
etc. The grayscale values in mfeat-pix.txt are encoded as integer steps from 0 (white) to 6 (black). 

In [5]:
# Load lines from mfeat-pix.txt. The context manager closes the file handle
# as soon as all rows have been read.
with open('mfeat-pix.txt') as pix_file:
  pixel_rows = pix_file.readlines()

# Create labels for each line: classes 0-9, 200 consecutive rows per class
labels = np.repeat(np.arange(10), 200)

# Convert each line to a flat float vector of 16*15 = 240 pixels and
# normalize by 6, the largest grey level in the file, so values fall in [0, 1]
features = np.array([
  np.array(row.split()).astype('float').reshape(16*15) / 6
  for row in pixel_rows
])

print('features: {}, labels: {}'.format(features.shape, labels.shape))

features: (2000, 240), labels: (2000,)


In [38]:
# Load augmented data: every augmented sample is flattened to a 240-element vector
augmented_samples = np.load('X_train_augmented.npy')
features_augmented = np.array([sample.reshape(240) for sample in augmented_samples])
labels_augmented = np.array(np.load('y_train_augmented.npy'))

print(f'feat_augmented: {features_augmented.shape}, labels: {labels_augmented.shape}')

feat_augmented: (6600, 240), labels: (6600,)


## Split

In [58]:
# Fraction of each data set held out for testing
split_train_test = 0.5

# Stratified split of the original data. The label arrays are already 1-D,
# so they need no reshaping.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=split_train_test, random_state=40, stratify=labels)

# Stratified split of the augmented data, using the same seed and test fraction
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
    features_augmented, labels_augmented, test_size=split_train_test,
    random_state=40, stratify=labels_augmented)

print('y_train, y_test: {}, {}'.format(y_train.shape, y_test.shape))
print('y_train: ', np.unique(y_train, return_counts=True))
print('y_train_aug: ', np.unique(y_train_aug, return_counts=True))
print('labels_augmented: ', np.unique(labels_augmented, return_counts=True))

# PCA keeping 99% of the variance, fitted on the training half only and then
# applied to both halves
pca = PCA(n_components = 0.99)
pca.fit(X_train)
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)

# Separate PCA for the augmented data. The projection must use pca_aug (the
# model fitted on X_train_aug), not the pca fitted on the original data.
pca_aug = PCA(n_components = 0.99)
pca_aug.fit(X_train_aug)
X_train_aug_reduced = pca_aug.transform(X_train_aug)
# NOTE: the variable name keeps its existing spelling ("augg") so that any
# cell that already refers to it continues to work.
X_test_augg_reduced = pca_aug.transform(X_test_aug)

y_train, y_test: (1000,), (1000,)
y_train:  (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([100, 100, 100, 100, 100, 100, 100, 100, 100, 100], dtype=int64))
y_train_aug:  (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64), array([333, 372, 311, 335, 343, 365, 326, 334, 269, 312], dtype=int64))
labels_augmented:  (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64), array([667, 743, 621, 671, 685, 731, 653, 668, 537, 624], dtype=int64))


# SVM Model
Training of SVC and NuSVC.
- Both use one-vs-one approach
- C: the default value of 1 is usually fine. A smaller C means stronger regularization (a smoother decision surface), which is mainly useful for noisy observations
- The data has been scaled, since SVMs are not scale-invariant
- kernel options: polynomial, rbf, sigmoid

In [92]:
# Grid search evaluation
# Candidate hyper-parameter values. Every entry must be distinct: a repeated
# value only makes the search fit the same candidates twice.
# NOTE(review): the list previously also contained 0.5e-1, which is the same
# number as 5e-2. If 0.5 was the intended value, add 5e-1 here.
gamma_search = [1e-3, 1e-2, 5e-2, 1e-1]
C_search = [1e-1, 1, 5, 10, 15, 100]
r_search = [0, 1, 10, 100]
degree_search = [2, 3]

# One sub-grid per kernel so that each kernel is only combined with the
# parameters listed for it
param_grid = [
    {"kernel": ["rbf"], "gamma": gamma_search, "C": C_search},
    {"kernel": ["poly"], "degree": degree_search, "coef0": r_search, "gamma": gamma_search, "C": C_search},
    {"kernel": ["sigmoid"], "coef0": r_search, "gamma": gamma_search, "C": C_search}
]

grid_search = GridSearchCV(
    svm.SVC(), param_grid, 
    verbose=3,
    return_train_score=True,
    n_jobs=4
)
grid_search.fit(X_train_reduced, y_train)

print("params: {} with mean test score: {}".format(grid_search.best_params_, grid_search.best_score_))

# Persist the full cross-validation table. The output directory is created
# first so the export also works on a fresh checkout.
grid_search_df = pd.DataFrame(grid_search.cv_results_)
results_path = Path("SVM/grid_search.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
grid_search_df.to_csv(results_path)

Fitting 5 folds for each of 510 candidates, totalling 2550 fits
params: {'C': 1, 'coef0': 1, 'degree': 3, 'gamma': 0.05, 'kernel': 'poly'} with mean test score: 0.9799999999999999


In [95]:
# Fit one SVC on the reduced training set, using the hyper-parameter values
# reported as best by the grid search (written out explicitly here)
best_svc_params = {
    "kernel": "poly",
    "degree": 3,
    "coef0": 1,
    "gamma": 0.05,
    "C": 1,
    "decision_function_shape": "ovo",
}
svc = svm.SVC(**best_svc_params)
svc.fit(X_train_reduced, y_train)

## Model evaluation -> average across several splits of the dataset

In [98]:
# Repeat the split / PCA / SVC pipeline for several random seeds and average
# the test accuracy, so the reported score does not depend on a single split.
random_seeds = [0, 21, 42, 100, 200, 300, 400, 1000]
split_train_test = 0.5  # same test fraction for every seed
scores = []
for rs in random_seeds:

    # Stratified split of the original data; labels are already 1-D
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=split_train_test, random_state=rs, stratify=labels)

    # Stratified split of the augmented data with the same seed
    X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
        features_augmented, labels_augmented, test_size=split_train_test,
        random_state=rs, stratify=labels_augmented)

    # PCA (99% variance) fitted on this seed's training half only
    pca = PCA(n_components = 0.99)
    pca.fit(X_train)
    X_train_reduced = pca.transform(X_train)
    X_test_reduced = pca.transform(X_test)

    # The augmented data gets its own PCA and must be projected with pca_aug,
    # not with the pca fitted on the original data. These reduced arrays are
    # not used for the score below.
    pca_aug = PCA(n_components = 0.99)
    pca_aug.fit(X_train_aug)
    X_train_aug_reduced = pca_aug.transform(X_train_aug)
    X_test_augg_reduced = pca_aug.transform(X_test_aug)

    # Train the model with the best obtained results
    svc = svm.SVC(decision_function_shape='ovo', C=1, coef0=1, degree=3, gamma=0.05, kernel="poly") 
    svc.fit(X_train_reduced, y_train)
    
    # Accuracy on the held-out half of the original data
    scores.append(svc.score(X_test_reduced, y_test))

print("Mean: {:.3f}, Individuals: {}".format(np.mean(scores), scores))


Mean: 0.979, Individuals: [0.982, 0.977, 0.981, 0.976, 0.981, 0.98, 0.979, 0.978]
