# CS3920 - Machine Learning

Assignment 2

## Task 1
### Load the data set into Python. Merge the USPS training and test sets into one dataset.

In [1]:
import numpy as np
from sklearn.datasets import load_wine

wine = load_wine()
X_wine = wine["data"]
y_wine = wine["target"]
print("X size (wine):", X_wine.shape)
print("y size (wine):", y_wine.shape)

zip_train = np.genfromtxt("zip.train", delimiter=' ')
zip_test = np.genfromtxt("zip.test", delimiter=' ')
X_zip = np.concatenate((zip_train[:, 1:], zip_test[:, 1:]))
y_zip = np.concatenate((zip_train[:, 0], zip_test[:, 0]))
print("X size (zip):", X_zip.shape)
print("y size (zip):", y_zip.shape)

X size (wine): (178, 13)
y size (wine): (178,)
X size (zip): (9298, 256)
y size (zip): (9298,)


## Task 2
### Divide the datasets into a training set and a test set.

In [2]:
from sklearn.model_selection import train_test_split

X_wine_train, X_wine_test, y_wine_train, y_wine_test = \
        train_test_split(X_wine, y_wine, test_size=0.2, random_state=2311)
print("X_train size (wine):", X_wine_train.shape)
print("y_train size (wine):", y_wine_train.shape)
print("X_test size (wine):", X_wine_test.shape)
print("y_test size (wine):", y_wine_test.shape)

X_zip_train, X_zip_test, y_zip_train, y_zip_test = \
        train_test_split(X_zip, y_zip, test_size=0.2, random_state=2311)
print("X_train size (zip):", X_zip_train.shape)
print("y_train size (zip):", y_zip_train.shape)
print("X_test size (zip):", X_zip_test.shape)
print("y_test size (zip):", y_zip_test.shape)

X_train size (wine): (142, 13)
y_train size (wine): (142,)
X_test size (wine): (36, 13)
y_test size (wine): (36,)
X_train size (zip): (7438, 256)
y_train size (zip): (7438,)
X_test size (zip): (1860, 256)
y_test size (zip): (1860,)


## Task 3
### Using cross-validation and the training set only, estimate the generalization accuracy of the SVM with the default values of the parameters. You may use the function cross_val_score.

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm = SVC()

wine_cvs = cross_val_score(svm, X_wine_train, y_wine_train)
wine_accuracy = np.mean(wine_cvs)
print("Accuracy (wine):", wine_accuracy * 100, "%")

zip_cvs = cross_val_score(svm, X_zip_train, y_zip_train)
zip_accuracy = np.mean(zip_cvs)
print("Accuracy (zip):", zip_accuracy * 100, "%")

Accuracy (wine): 68.32512315270935 %
Accuracy (zip): 97.12291472330087 %


## Task 4
### Find the test error rate of the SVM with the default values of parameters, compare it with the estimate obtained in the previous task, and write down your observations.

In [4]:
svm.fit(X_wine_train, y_wine_train)
y_wine_prediction = svm.predict(X_wine_test)
wine_incorrect = np.sum(y_wine_prediction != y_wine_test)
wine_error = wine_incorrect / len(y_wine_test)
print("Error Rate (wine):", wine_error * 100, "%")

svm.fit(X_zip_train, y_zip_train)
y_zip_prediction = svm.predict(X_zip_test)
zip_incorrect = np.sum(y_zip_prediction != y_zip_test)
zip_error = zip_incorrect / len(y_zip_test)
print("Error Rate (zip):", zip_error * 100, "%")

Error Rate (wine): 16.666666666666664 %
Error Rate (zip): 2.5806451612903225 %


#### Observations:
- Wine not as close to the predicted cross validation score.
- Zip very close to the predicted cross validation score.

## Task 5 & 6
### Create a pipeline for SVM involving data normalization and SVC, and use grid search and cross-validation to tune parameters C and gamma for the pipeline, avoiding data snooping and data leakage.
### Fit the GridSearchCV object of task 5 to the training set and use it to predict the test labels. Write the resulting test error rate in your Jupyter notebook.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

wine_pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('svc', svm)
])

wine_param_grid = {
    'scalar': [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.01, 0.1, 1]
}

wine_grid = GridSearchCV(wine_pipeline, param_grid=wine_param_grid, cv=5)
wine_grid.fit(X_wine_train, y_wine_train)
print("Best cv accuracy (wine):", wine_grid.best_score_ * 100, '%')
print("Test set score (wine):", wine_grid.score(X_wine_test, y_wine_test) * 100, '%')
print("Best parameters (wine):", wine_grid.best_params_)

zip_pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('svc', svm)
])

zip_param_grid = {
    'scalar': [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]
}

zip_grid = GridSearchCV(zip_pipeline, param_grid=zip_param_grid, cv=5)
zip_grid.fit(X_zip_train, y_zip_train)
print("Best cv accuracy (zip):", zip_grid.best_score_ * 100, '%')
print("Test set score (zip):", zip_grid.score(X_zip_test, y_zip_test) * 100, '%')
print("Best parameters (zip):", zip_grid.best_params_)

Best cv accuracy (wine): 99.28571428571429 %
Test set score (wine): 94.44444444444444 %
Best parameters (wine): {'scalar': StandardScaler(), 'svc__C': 10, 'svc__gamma': 0.01}
Best cv accuracy (zip): 97.13636462242661 %
Test set score (zip): 97.41935483870968 %
Best parameters (zip): {'scalar': MinMaxScaler()}


#### Observations (Task 6):
- Error Rate (wine): 5.56% (2 d.p.)
- Error Rate (zip): 2.95% (2 d.p.)

## Task 7
### Implement a cross-conformal predictor.

In [6]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cross_conformal_predictor(X, y, pipeline, param_grid):
    kfold = KFold(shuffle=True, random_state=2311)
    p_values = []
    
    for rest_index, fold_index in kfold.split(X):
        X_rest, X_fold = X[rest_index], X[fold_index]
        y_rest, y_fold = y[rest_index], y[fold_index]
        
        grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
        grid.fit(X_rest, y_rest)
        
        conformity_scores = grid.decision_function(X_fold)
        
        p_value = np.sum(conformity_scores <= conformity_scores[-1]) / len(conformity_scores)
        p_values.append(p_value)
    
    return np.array(p_values)

## Task 8
### Experiment with a neural network.

In [7]:
from sklearn.neural_network import MLPClassifier

# Task 3 (wine)
wine_mlp = MLPClassifier(random_state=2311, max_iter=10000)
wine_accuracy = cross_val_score(wine_mlp, X_wine_train, y_wine_train)
print("Accuracy (wine):", wine_accuracy.mean() * 100, "%")

# Task 4 (wine)
wine_mlp.fit(X_wine_train, y_wine_train)
y_wine_prediction = wine_mlp.predict(X_wine_test)
wine_incorrect = np.sum(y_wine_prediction != y_wine_test)
wine_error = wine_incorrect / len(y_wine_test)
print("Error Rate (wine):", wine_error * 100, "%")

# Task 5 (wine)
wine_pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('mlp', wine_mlp)
])

# Task 6 (wine)
wine_param_grid = {
    'scalar': [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]
}
wine_grid = GridSearchCV(wine_pipeline, param_grid=wine_param_grid, cv=5)
wine_grid.fit(X_wine_train, y_wine_train)
print("Best cv accuracy (wine):", wine_grid.best_score_ * 100, '%')
print("Test set score (wine):", wine_grid.score(X_wine_test, y_wine_test) * 100, '%')
print("Best parameters (wine):", wine_grid.best_params_)

# Task 3 (zip)
zip_mlp = MLPClassifier(random_state=2311, max_iter=10000)
zip_accuracy = cross_val_score(zip_mlp, X_zip_train, y_zip_train)
print("Accuracy (zip):", zip_accuracy.mean() * 100, "%")

# Task 4 (zip)
zip_mlp.fit(X_zip_train, y_zip_train)
y_zip_prediction = zip_mlp.predict(X_zip_test)
zip_incorrect = np.sum(y_zip_prediction != y_zip_test)
zip_error = zip_incorrect / len(y_zip_test)
print("Error Rate (zip):", zip_error * 100, "%")

# Task 5 (zip)
zip_pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('mlp', wine_mlp)
])

# Task 6 (zip)
zip_param_grid = {
    'scalar': [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]
}
zip_grid = GridSearchCV(zip_pipeline, param_grid=zip_param_grid, cv=5)
zip_grid.fit(X_zip_train, y_zip_train)
print("Best cv accuracy (zip):", zip_grid.best_score_ * 100, '%')
print("Test set score (zip):", zip_grid.score(X_zip_test, y_zip_test) * 100, '%')
print("Best parameters (zip):", zip_grid.best_params_)

Accuracy (wine): 80.76354679802955 %
Error Rate (wine): 5.555555555555555 %
Best cv accuracy (wine): 97.9064039408867 %
Test set score (wine): 100.0 %
Best parameters (wine): {'scalar': MinMaxScaler()}
Accuracy (zip): 96.26241042439494 %
Error Rate (zip): 2.849462365591398 %
Best cv accuracy (zip): 96.34312789697088 %
Test set score (zip): 96.98924731182795 %
Best parameters (zip): {'scalar': MinMaxScaler()}
