# Assignment 3 - CS3920

In [114]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler

import numpy as np

In [115]:
CV: int = 3
RANDOM_STATE: int = 3103

# 1 - Load Datasets

## Loading Wine Dataset

In [116]:
wine: np.ndarray = load_wine()
X_wine: np.ndarray = wine.data
y_wine: np.ndarray = wine.target

## Loading USPS Dataset

In [117]:
DATA_SPLIT: int = 10 # percentage of data to be used 

In [118]:
zip_train: np.ndarray = np.genfromtxt('zip.train', delimiter=' ')
zip_train = zip_train[::DATA_SPLIT]

zip_test: np.ndarray = np.genfromtxt('zip.test', delimiter=' ')
zip_test = zip_test[::DATA_SPLIT]

zip_data: np.ndarray = np.concatenate((zip_train, zip_test), axis=0)
zip_data = zip_data[::DATA_SPLIT]

X_zip: np.ndarray = zip_data[:, 1:]
y_zip: np.ndarray = zip_data[:, 0]

In [119]:
del zip_train, zip_test

# 2 - Split Datasets 

## Splitting Wine Dataset

In [120]:
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(X_wine, y_wine, test_size=0.3, random_state=RANDOM_STATE)

## Splitting Zip Dataset

In [121]:
X_zip_train, X_zip_test, y_zip_train, y_zip_test = train_test_split(X_zip, y_zip, test_size=0.3, random_state=RANDOM_STATE)

# 3 - Cross Validation

## Cross Validate Wine

In [122]:
svm: SVC = SVC()
scores_wine: np.ndarray[float] = cross_val_score(svm, X_wine_train, y_wine_train, cv=CV)
print("Cross-validation scores: ", scores_wine)
print("Average cross-validation score: ", scores_wine.mean())

Cross-validation scores:  [0.64285714 0.63414634 0.68292683]
Average cross-validation score:  0.6533101045296168


## Cross Validate Zip

In [123]:
score_zip: np.ndarray[float] = cross_val_score(svm, X_zip_train, y_zip_train, cv=CV)
print("Cross-validation scores: ", score_zip)
print("Average cross-validation score: ", score_zip.mean())

Cross-validation scores:  [0.63636364 0.72727273 0.61904762]
Average cross-validation score:  0.660894660894661


# 4 - Test Error Rate of `SVM`

## Test Wine

In [124]:
svm.fit(X_wine_train, y_wine_train)
print("Test error rate: ", 1 - svm.score(X_wine_test, y_wine_test))

Test error rate:  0.2777777777777778


## Test Zip

In [125]:
svm.fit(X_zip_train, y_zip_train)
print("Test error rate: ", 1 - svm.score(X_zip_test, y_zip_test))

Test error rate:  0.4482758620689655


# 5 - Pipeline

In [126]:
param_grid: dict[str, list[float]] = {
	'svc__C': [0.001, 0.01, 0.1, 1, 10, 100], 
	'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

## Wine Dataset

### MinMax Scaler

In [127]:
pipe_wine_minmax: Pipeline = make_pipeline(MinMaxScaler(), SVC())
grid_wine_minmax: GridSearchCV = GridSearchCV(pipe_wine_minmax, param_grid=param_grid, cv=CV)
grid_wine_minmax.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_minmax.best_score_)
print("Error Rate: ", 1 - grid_wine_minmax.best_score_)
print("Best parameters: ", grid_wine_minmax.best_params_)

Best cross-validation score:  0.991869918699187
Error Rate:  0.008130081300813052
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}


### Standard Scaler

In [128]:
pipe_wine_standard: Pipeline = make_pipeline(StandardScaler(), SVC())
grid_wine_standard: GridSearchCV = GridSearchCV(pipe_wine_standard, param_grid=param_grid, cv=CV)
grid_wine_standard.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_standard.best_score_)
print("Error Rate: ", 1 - grid_wine_standard.best_score_)
print("Best parameters: ", grid_wine_standard.best_params_)

Best cross-validation score:  0.983933410762679
Error Rate:  0.01606658923732096
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}


### Normalizer

In [129]:
pipe_wine_normalizer: Pipeline = make_pipeline(Normalizer(), SVC())
grid_wine_normalizer: GridSearchCV = GridSearchCV(pipe_wine_normalizer, param_grid=param_grid, cv=CV)
grid_wine_normalizer.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_normalizer.best_score_)
print("Error Rate: ", 1 - grid_wine_normalizer.best_score_)
print("Best parameters: ", grid_wine_normalizer.best_params_)

Best cross-validation score:  0.927022841656988
Error Rate:  0.072977158343012
Best parameters:  {'svc__C': 100, 'svc__gamma': 100}


### Robust Scaler

In [130]:
pipe_wine_robust: Pipeline = make_pipeline(RobustScaler(), SVC())
grid_wine_robust: GridSearchCV = GridSearchCV(pipe_wine_robust, param_grid=param_grid, cv=CV)
grid_wine_robust.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_robust.best_score_)
print("Error Rate: ", 1 - grid_wine_robust.best_score_)
print("Best parameters: ", grid_wine_robust.best_params_)

Best cross-validation score:  0.983739837398374
Error Rate:  0.016260162601625994
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.01}


## Zip Dataset

### MinMax Scaler

In [131]:
pipe_zip_minmax: Pipeline = make_pipeline(MinMaxScaler(), SVC())
grid_zip_minmax: GridSearchCV = GridSearchCV(pipe_zip_minmax, param_grid=param_grid, cv=CV)
grid_zip_minmax.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_minmax.best_score_)
print("Error Rate: ", 1 - grid_zip_minmax.best_score_)
print("Best parameters: ", grid_zip_minmax.best_params_)

Best cross-validation score:  0.7373737373737373
Error Rate:  0.26262626262626265
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}


### Standard Scaler

In [132]:
pipe_zip_standard: Pipeline = make_pipeline(StandardScaler(), SVC())
grid_zip_standard: GridSearchCV = GridSearchCV(pipe_zip_standard, param_grid=param_grid, cv=CV)
grid_zip_standard.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_standard.best_score_)
print("Error Rate: ", 1 - grid_zip_standard.best_score_)
print("Best parameters: ", grid_zip_standard.best_params_) 

Best cross-validation score:  0.7070707070707071
Error Rate:  0.29292929292929293
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.001}


### Normalizer

In [133]:
pipe_zip_normalizer: Pipeline = make_pipeline(Normalizer(), SVC())
grid_zip_normalizer: GridSearchCV = GridSearchCV(pipe_zip_normalizer, param_grid=param_grid, cv=CV)
grid_zip_normalizer.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_normalizer.best_score_)
print("Error Rate: ", 1 - grid_zip_normalizer.best_score_)
print("Best parameters: ", grid_zip_normalizer.best_params_)

Best cross-validation score:  0.7532467532467533
Error Rate:  0.24675324675324672
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.1}


### Robust Scaler

In [134]:
pipe_zip_robust: Pipeline = make_pipeline(RobustScaler(), SVC())
grid_zip_robust: GridSearchCV = GridSearchCV(pipe_zip_robust, param_grid=param_grid, cv=CV)
grid_zip_robust.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_robust.best_score_)
print("Error Rate: ", 1 - grid_zip_robust.best_score_)
print("Best parameters: ", grid_zip_robust.best_params_)

Best cross-validation score:  0.4437229437229437
Error Rate:  0.5562770562770563
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}
