# Tasks 1-2

## Loading Wine and USPS datasets

In [1]:
import math
import numpy as np

from sklearn.datasets import load_wine

wine = load_wine()
# loading the train and test zip datasets seperately
zip_Xtrain = np.genfromtxt("zip.train.gz", delimiter=" ",usecols=np.arange(start = 1, stop = 257))
zip_ytrain = np.genfromtxt("zip.train.gz", delimiter=" ",usecols=0)

zip_Xtest = np.genfromtxt("zip.test.gz", delimiter=" ",usecols=np.arange(start = 1, stop = 257))
zip_ytest = np.genfromtxt("zip.test.gz", delimiter=" ",usecols=0)

In [2]:
from sklearn.model_selection import train_test_split

# concatenating the zip train and test datasets before splitting them
X_zip = np.concatenate((zip_Xtrain, zip_Xtest))
y_zip = np.concatenate((zip_ytrain, zip_ytest))

X_train, X_test, y_train, y_test = train_test_split(wine.data, 
                                                   wine.target, random_state = 103)

X_zip_train, X_zip_test, y_zip_train, y_zip_test = train_test_split(X_zip,
y_zip, random_state=103)

# Task 3

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [4]:
svm = SVC()

# cross-val-score for wine

cvs_wine = np.mean(cross_val_score(svm, X_train, y_train))

# cvs for zip

cvs_zip = np.mean(cross_val_score(svm, X_zip_train, y_zip_train))

In [5]:
print("The mean CVS for the wine dataset is: " + str(cvs_wine))
print("The mean CVS for the zip dataset is: " + str(cvs_zip))

The mean CVS for the wine dataset is: 0.6683760683760683
The mean CVS for the zip dataset is: 0.9710321243629894


# Task 4

In [6]:
# Fitting svm with the wine train dataset
fit_wine = svm.fit(X_train, y_train)

# Fitting svm with the zip train dataset
fit_zip = svm.fit(X_zip_train, y_zip_train)

In [7]:
# test error rate of the wine dataset

score_wine = svm.fit(X_train, y_train).score(X_test, y_test)

test_error_wine = 1 - score_wine

# test error rate of the zip dataset

score_zip = svm.fit(X_zip_train, y_zip_train).score(X_zip_test, y_zip_test)

test_error_zip = 1 - score_zip

In [8]:
print("The test error rate of the SVM for the wine dataset is: " + str(test_error_wine))
print("The test error rate of the SVM for the zip dataset is: " + str(test_error_zip))

The test error rate of the SVM for the wine dataset is: 0.3111111111111111
The test error rate of the SVM for the zip dataset is: 0.02580645161290318


*The mean accuracy of the test data taken away from one gives the mean error rate for the test data. The test error rate of the SVC is in line with the mean cross-val-scores found above for default parameters.*  

# Task 5 and 6


In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [10]:
# We first build a pipeline using different normalizers for either dataset
# first, the wine dataset

# a pipeline using minMax scaler
pipe_wine_mm = make_pipeline(MinMaxScaler(), SVC())

# a pipeline using standard scaler
pipe_wine_std = make_pipeline(StandardScaler(), SVC())


# a pipeline using robust scaler
pipe_wine_rbt =make_pipeline(RobustScaler(), SVC())

In [11]:
# we fit different pipelines with the wine training dataset

In [12]:
pipe_wine_mm.fit(X_train, y_train)
pipe_wine_std.fit(X_train, y_train)
pipe_wine_rbt.fit(X_train, y_train)

# to find the best parameters for SVC we create a  grid of best potential suits

param_grid = {"svc__C" : [0.001, 0.01, 0.1, 1, 10, 100],
             "svc__gamma" : [0.001, 0.01, 0.1, 1, 10, 100]}

In [13]:
from sklearn.model_selection import GridSearchCV

# we create a grid using GridSearchCV for each normaliser

# first for MinMax Scaler
grid_wine_mm = GridSearchCV(pipe_wine_mm, param_grid = param_grid)

# then for Standard Scaler
grid_wine_std = GridSearchCV(pipe_wine_std, param_grid = param_grid)

# and for Robust Scaler
grid_wine_rbt = GridSearchCV(pipe_wine_rbt, param_grid = param_grid)

In [14]:
# we fit each normaliser varying grid with the test set of wine dataset

grid_wine_mm.fit(X_train, y_train)
grid_wine_std.fit(X_train, y_train)
grid_wine_rbt.fit(X_train, y_train)

# print each error rate for each grid using a different normaliser 
print("The error rate for MinMaxScaler with the wine dataset is: ")
print(1 - grid_wine_mm.best_score_)
print("The error rate for StandardScaler with the wine dataset is: ")
print(1 - grid_wine_std.best_score_)
print("The error rate for RobustScaler with the wine dataset is: ")
print(1 - grid_wine_rbt.best_score_)

The error rate for MinMaxScaler with the wine dataset is: 
0.01538461538461533
The error rate for StandardScaler with the wine dataset is: 
0.015099715099715083
The error rate for RobustScaler with the wine dataset is: 
0.0225071225071225


### Out of all the error rates stated above, making a pipeline using the StandardScaler normaliser and creating a GridSearchCV object using it, then fitting this grid with the wine dataset's training set gives the lowest error rate for predicting test labels.

### Therefore, StandardScaler is the most appropriate data normaliser for the wine dataset.

## We repeat this process for the zip dataset to find the most appropriate normalisation for it:

In [15]:
pipe_zip_std = make_pipeline(StandardScaler(), SVC())

In [16]:
# we fit different pipelines with the zip training dataset

In [17]:
pipe_zip_std.fit(X_zip_train, y_zip_train)

# to find the best parameters for SVC we create a  grid of best potential suits

param_grid = {"svc__C" : [0.1, 1, 10],
             "svc__gamma" : [0.1, 1, 10]}

In [18]:
# we create a grid using GridSearchCV for each normaliser

grid_zip_std = GridSearchCV(pipe_zip_std, param_grid = param_grid)

In [19]:
grid_zip_std.fit(X_zip_train, y_zip_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('svc', SVC())]),
             param_grid={'svc__C': [0.1, 1, 10], 'svc__gamma': [0.1, 1, 10]})

In [20]:
print("The error rate for StandardScaler with the zip dataset is: ")
print(1 - grid_zip_std.best_score_)

The error rate for StandardScaler with the zip dataset is: 
0.6763221795405809


In [22]:
pipe_zip_rbt =make_pipeline(RobustScaler(), SVC())

pipe_zip_rbt.fit(X_zip_train, y_zip_train)

grid_zip_rbt = GridSearchCV(pipe_zip_rbt, param_grid = param_grid)

In [23]:
grid_zip_rbt.fit(X_zip_train, y_zip_train)

print("The error rate for RobustScaler with the zip dataset is: ")
print(1 - grid_zip_rbt.best_score_)

The error rate for RobustScaler with the zip dataset is: 
0.522444475298643


In [25]:
pipe_zip_mm = make_pipeline(MinMaxScaler(), SVC())

pipe_zip_mm.fit(X_zip_train, y_zip_train)
grid_zip_mm = GridSearchCV(pipe_zip_mm, param_grid = param_grid)

In [26]:
grid_zip_mm.fit(X_zip_train, y_zip_train)
print("The error rate for MinMaxScaler with the zip dataset is: ")
print(1 - grid_zip_mm.best_score_)


The error rate for MinMaxScaler with the zip dataset is: 
0.037286681785223985


### The standard scaler and robust scaler when used in the GridSearchCV with pipelines gives extraordinarily high test error rates.

### Therefore, MinMax is the most appropriate data normaliser for the zip dataset.