# Assignment 3 - CS3920

In [191]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler
from sklearn.base import BaseEstimator

import numpy as np

In [178]:
CV: int = 3
RANDOM_STATE: int = 3103

# 1 - Load Datasets

## Loading Wine Dataset

In [179]:
wine: np.ndarray = load_wine()
X_wine: np.ndarray = wine.data
y_wine: np.ndarray = wine.target

## Loading USPS Dataset

In [180]:
DATA_SPLIT: int = 10 # percentage of data to be used 

In [188]:
zip_train: np.ndarray = np.genfromtxt('zip.train', delimiter=' ')
zip_train = zip_train[::DATA_SPLIT]

zip_test: np.ndarray = np.genfromtxt('zip.test', delimiter=' ')
zip_test = zip_test[::DATA_SPLIT]

zip_data: np.ndarray = np.concatenate((zip_train, zip_test), axis=0)
zip_data = zip_data[::DATA_SPLIT]

X_zip: np.ndarray = zip_data[:, 1:]
y_zip: np.ndarray = zip_data[:, 0]

In [182]:
del zip_train, zip_test

# 2 - Split Datasets 

In [183]:
def split(X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray[float]]:
	return train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

## Splitting Wine Dataset

In [184]:
# X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(X_wine, y_wine, test_size=0.3, random_state=RANDOM_STATE)
X_wine_train, X_wine_test, y_wine_train, y_wine_test = split(X_wine, y_wine)

## Splitting Zip Dataset

In [176]:
# X_zip_train, X_zip_test, y_zip_train, y_zip_test = train_test_split(X_zip, y_zip, test_size=0.3, random_state=RANDOM_STATE)
X_zip_train, X_zip_test, y_zip_train, y_zip_test = split(X_zip, y_zip)

# 3 - Cross Validation

## Cross Validate Wine

In [147]:
svm: SVC = SVC()
scores_wine: np.ndarray[float] = cross_val_score(svm, X_wine_train, y_wine_train, cv=CV)
print("Cross-validation scores: ", scores_wine)
print("Average cross-validation score: ", scores_wine.mean())

Cross-validation scores:  [0.64285714 0.63414634 0.68292683]
Average cross-validation score:  0.6533101045296168


## Cross Validate Zip

In [148]:
score_zip: np.ndarray[float] = cross_val_score(svm, X_zip_train, y_zip_train, cv=CV)
print("Cross-validation scores: ", score_zip)
print("Average cross-validation score: ", score_zip.mean())

Cross-validation scores:  [0.63636364 0.72727273 0.61904762]
Average cross-validation score:  0.660894660894661


# 4 - Test Error Rate of `SVM`

## Test Wine

In [149]:
svm.fit(X_wine_train, y_wine_train)
print("Test error rate: ", 1 - svm.score(X_wine_test, y_wine_test))

Test error rate:  0.2777777777777778


## Test Zip

In [150]:
svm.fit(X_zip_train, y_zip_train)
print("Test error rate: ", 1 - svm.score(X_zip_test, y_zip_test))

Test error rate:  0.4482758620689655


# 5 - Pipeline

In [151]:
param_grid: dict[str, list[float]] = {
	'svc__C': [0.001, 0.01, 0.1, 1, 10, 100], 
	'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

## Wine Dataset

### MinMax Scaler

In [195]:
class Super_Pipeline:
	def __init__ (self, param_grid: dict[str, list[float]], scaler: BaseEstimator, X_train: np.ndarray[float], y_train: np.ndarray[float], X_test: np.ndarray[float], y_test: np.ndarray[float]):
		self.pipeline: Pipeline = make_pipeline(scaler(), SVC())
		self.grid: GridSearchCV = GridSearchCV(self.pipeline, param_grid, cv=CV)
		self.grid.fit(X_wine_train, y_wine_train)
		self.X_train: np.ndarray[float] = X_train
		self.y_train: np.ndarray[float] = y_train
		self.X_test: np.ndarray[float] = X_test
		self.y_test: np.ndarray[float] = y_test
		
	@property
	def best_validation_score(self) -> float:
		return self.grid.best_score_

	@property
	def best_params(self) -> dict[str, float]:
		return self.grid.best_params_

	@property
	def test_score(self) -> float:
		return self.grid.score(self.X_test, self.y_test)

	@property
	def error_rate(self) -> float:
		return 1 - self.best_validation_score

In [152]:
pipe_wine_minmax: Pipeline = make_pipeline(MinMaxScaler(), SVC())
grid_wine_minmax: GridSearchCV = GridSearchCV(pipe_wine_minmax, param_grid=param_grid, cv=CV)
grid_wine_minmax.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_minmax.best_score_)
print("Error Rate: ", 1 - grid_wine_minmax.best_score_)
print("Best parameters: ", grid_wine_minmax.best_params_)
print("Test error rate: ", 1 - grid_wine_minmax.score(X_wine_test, y_wine_test))

Best cross-validation score:  0.991869918699187
Error Rate:  0.008130081300813052
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}
Test error rate:  0.01851851851851849


In [196]:
wine_super_pipe: Super_Pipeline = Super_Pipeline(param_grid, MinMaxScaler, X_wine_train, y_wine_train, X_wine_test, y_wine_test)
print("Best cross-validation score: ", wine_super_pipe.best_validation_score)
print("Error Rate: ", wine_super_pipe.error_rate)
print("Best parameters: ", wine_super_pipe.best_params)
print("Test error rate: ", wine_super_pipe.error_rate)

Best cross-validation score:  0.991869918699187
Error Rate:  0.008130081300813052
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}
Test error rate:  0.008130081300813052


### Standard Scaler

In [153]:
pipe_wine_standard: Pipeline = make_pipeline(StandardScaler(), SVC())
grid_wine_standard: GridSearchCV = GridSearchCV(pipe_wine_standard, param_grid=param_grid, cv=CV)
grid_wine_standard.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_standard.best_score_)
print("Error Rate: ", 1 - grid_wine_standard.best_score_)
print("Best parameters: ", grid_wine_standard.best_params_)
print("Test error rate: ", 1 - grid_wine_standard.score(X_wine_test, y_wine_test))

Best cross-validation score:  0.983933410762679
Error Rate:  0.01606658923732096
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}
Test error rate:  0.01851851851851849


### Normalizer

In [154]:
pipe_wine_normalizer: Pipeline = make_pipeline(Normalizer(), SVC())
grid_wine_normalizer: GridSearchCV = GridSearchCV(pipe_wine_normalizer, param_grid=param_grid, cv=CV)
grid_wine_normalizer.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_normalizer.best_score_)
print("Error Rate: ", 1 - grid_wine_normalizer.best_score_)
print("Best parameters: ", grid_wine_normalizer.best_params_)
print("Test error rate: ", 1 - grid_wine_normalizer.score(X_wine_test, y_wine_test))

Best cross-validation score:  0.927022841656988
Error Rate:  0.072977158343012
Best parameters:  {'svc__C': 100, 'svc__gamma': 100}
Test error rate:  0.03703703703703709


### Robust Scaler

In [155]:
pipe_wine_robust: Pipeline = make_pipeline(RobustScaler(), SVC())
grid_wine_robust: GridSearchCV = GridSearchCV(pipe_wine_robust, param_grid=param_grid, cv=CV)
grid_wine_robust.fit(X_wine_train, y_wine_train)
print("Best cross-validation score: ", grid_wine_robust.best_score_)
print("Error Rate: ", 1 - grid_wine_robust.best_score_)
print("Best parameters: ", grid_wine_robust.best_params_)
print("Test error rate: ", 1 - grid_wine_robust.score(X_wine_test, y_wine_test))

Best cross-validation score:  0.983739837398374
Error Rate:  0.016260162601625994
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.01}
Test error rate:  0.01851851851851849


## Zip Dataset

### MinMax Scaler

In [156]:
pipe_zip_minmax: Pipeline = make_pipeline(MinMaxScaler(), SVC())
grid_zip_minmax: GridSearchCV = GridSearchCV(pipe_zip_minmax, param_grid=param_grid, cv=CV)
grid_zip_minmax.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_minmax.best_score_)
print("Error Rate: ", 1 - grid_zip_minmax.best_score_)
print("Best parameters: ", grid_zip_minmax.best_params_)
print("Test error rate: ", 1 - grid_zip_minmax.score(X_zip_test, y_zip_test))

Best cross-validation score:  0.7373737373737373
Error Rate:  0.26262626262626265
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}
Test error rate:  0.4137931034482759


### Standard Scaler

In [157]:
pipe_zip_standard: Pipeline = make_pipeline(StandardScaler(), SVC())
grid_zip_standard: GridSearchCV = GridSearchCV(pipe_zip_standard, param_grid=param_grid, cv=CV)
grid_zip_standard.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_standard.best_score_)
print("Error Rate: ", 1 - grid_zip_standard.best_score_)
print("Best parameters: ", grid_zip_standard.best_params_) 
print("Test error rate: ", 1 - grid_zip_standard.score(X_zip_test, y_zip_test))

Best cross-validation score:  0.7070707070707071
Error Rate:  0.29292929292929293
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.001}
Test error rate:  0.4482758620689655


### Normalizer

In [158]:
pipe_zip_normalizer: Pipeline = make_pipeline(Normalizer(), SVC())
grid_zip_normalizer: GridSearchCV = GridSearchCV(pipe_zip_normalizer, param_grid=param_grid, cv=CV)
grid_zip_normalizer.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_normalizer.best_score_)
print("Error Rate: ", 1 - grid_zip_normalizer.best_score_) 
print("Best parameters: ", grid_zip_normalizer.best_params_)
print("Test error rate: ", 1 - grid_zip_normalizer.score(X_zip_test, y_zip_test))

Best cross-validation score:  0.7532467532467533
Error Rate:  0.24675324675324672
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.1}
Test error rate:  0.4137931034482759


### Robust Scaler

In [159]:
pipe_zip_robust: Pipeline = make_pipeline(RobustScaler(), SVC())
grid_zip_robust: GridSearchCV = GridSearchCV(pipe_zip_robust, param_grid=param_grid, cv=CV)
grid_zip_robust.fit(X_zip_train, y_zip_train)
print("Best cross-validation score: ", grid_zip_robust.best_score_)
print("Error Rate: ", 1 - grid_zip_robust.best_score_)
print("Best parameters: ", grid_zip_robust.best_params_)
print("Test error rate: ", 1 - grid_zip_robust.score(X_zip_test, y_zip_test))

Best cross-validation score:  0.4437229437229437
Error Rate:  0.5562770562770563
Best parameters:  {'svc__C': 100, 'svc__gamma': 0.001}
Test error rate:  0.4137931034482759


# 6 - Predict Test Labels

## Wine Dataset

### MinMax Scaler

In [164]:
grid_wine_minmax.fit(X_wine_train, y_wine_train)
prediction_wine_minmax: np.ndarray = grid_wine_minmax.predict(X_wine_test)
print("Test Set Score: ", grid_wine_minmax.score(X_wine_test, y_wine_test))
# print("Test Set Score: ", np.mean(prediction_wine_minmax == y_wine_test))

Test Set Score:  0.9814814814814815


### Standard Scaler

In [166]:
grid_wine_standard.fit(X_wine_train, y_wine_train)
prediction_wine_standard: np.ndarray = grid_wine_standard.predict(X_wine_test)
print("Test Set Score: ", grid_wine_standard.score(X_wine_test, y_wine_test))

Test Set Score:  0.9814814814814815


### Normalizer

In [167]:
grid_wine_normalizer.fit(X_wine_train, y_wine_train)
prediction_wine_normalizer: np.ndarray = grid_wine_normalizer.predict(X_wine_test)
print("Test Set Score: ", grid_wine_normalizer.score(X_wine_test, y_wine_test))

Test Set Score:  0.9629629629629629


### Robust Scalar

In [168]:
grid_wine_robust.fit(X_wine_train, y_wine_train)
prediction_wine_robust: np.ndarray = grid_wine_robust.predict(X_wine_test)
print("Test Set Score: ", grid_wine_robust.score(X_wine_test, y_wine_test))

Test Set Score:  0.9814814814814815


## Zip Dataset

### MinMax Scaler

In [169]:
grid_zip_minmax.fit(X_zip_train, y_zip_train)
prediction_zip_minmax: np.ndarray = grid_zip_minmax.predict(X_zip_test)
print("Test Set Score: ", grid_zip_minmax.score(X_zip_test, y_zip_test))

Test Set Score:  0.5862068965517241


### Standard Scaler

In [170]:
grid_zip_standard.fit(X_zip_train, y_zip_train)
prediction_zip_standard: np.ndarray = grid_zip_standard.predict(X_zip_test)
print("Test Set Score: ", grid_zip_standard.score(X_zip_test, y_zip_test))

Test Set Score:  0.5517241379310345


### Normalizer

In [171]:
grid_zip_normalizer.fit(X_zip_train, y_zip_train)
prediction_zip_normalizer: np.ndarray = grid_zip_normalizer.predict(X_zip_test)
print("Test Set Score: ", grid_zip_normalizer.score(X_zip_test, y_zip_test))

Test Set Score:  0.5862068965517241


### Robust Scaler

In [172]:
grid_zip_robust.fit(X_zip_train, y_zip_train)
prediction_zip_robust: np.ndarray = grid_zip_robust.predict(X_zip_test)
print("Test Set Score: ", grid_zip_robust.score(X_zip_test, y_zip_test))

Test Set Score:  0.5862068965517241
