# Assignment 3 - CS3920

In [188]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler

import numpy as np

# 1 - Load Datasets

## Loading Wine Dataset

In [189]:
# load_wine() returns a scikit-learn Bunch (a dict-like container), not an ndarray,
# so `wine` is deliberately left without the ndarray annotation it used to carry.
wine = load_wine()
X_wine: np.ndarray = wine.data      # feature matrix
y_wine: np.ndarray = wine.target    # class labels

## Loading USPS Dataset

In [190]:
# Subsampling stride for the digit data: it is thinned with `[::DATA_SPLIT]`,
# i.e. only every DATA_SPLIT-th row is kept.
DATA_SPLIT: int = 10

In [191]:
# Read the two space-delimited digit files.
zip_train: np.ndarray = np.genfromtxt('zip.train', delimiter=' ')
zip_test: np.ndarray = np.genfromtxt('zip.test', delimiter=' ')

# Pool both files and keep every DATA_SPLIT-th row to shrink the problem size.
zip_data: np.ndarray = np.concatenate((zip_train, zip_test), axis=0)[::DATA_SPLIT]

# genfromtxt silently converts empty or unparsable fields (e.g. caused by a
# trailing or doubled delimiter) into NaN. Fail here with a clear message
# instead of letting the estimator reject the data much later.
if np.isnan(zip_data).any():
    raise ValueError("zip.train / zip.test produced NaN values; check the delimiter and file format")

# Column 0 is used as the label, all remaining columns as the features.
X_zip: np.ndarray = zip_data[:, 1:]
y_zip: np.ndarray = zip_data[:, 0]

In [192]:
# Drop the per-file arrays; the cells shown after this point only use the
# pooled `zip_data` / `X_zip` / `y_zip`.
del zip_train, zip_test

# 2 - Split Datasets 

## Splitting Wine Dataset

In [193]:
# Hold out 30% of the wine samples as a test set; the fixed seed makes the split repeatable.
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(
    X_wine,
    y_wine,
    test_size=0.3,
    random_state=3103,
)

## Splitting Zip Dataset

In [194]:
# Hold out 30% of the (subsampled) digit rows as a test set; the fixed seed makes the split repeatable.
X_zip_train, X_zip_test, y_zip_train, y_zip_test = train_test_split(
    X_zip,
    y_zip,
    test_size=0.3,
    random_state=3103,
)

# 3 - Cross Validation

## Cross Validate Wine

In [195]:
# Baseline: an SVC with default hyperparameters on the unscaled wine training
# data, evaluated with 10-fold cross-validation.
svm: SVC = SVC()
scores_wine: np.ndarray[float] = cross_val_score(
    estimator=svm,
    X=X_wine_train,
    y=y_wine_train,
    cv=10,
)
print("Cross-validation scores: ", scores_wine)

mean_score_wine = scores_wine.mean()
print("Average cross-validation score: ", mean_score_wine)

Cross-validation scores:  [0.61538462 0.61538462 0.53846154 0.61538462 0.66666667 0.58333333
 0.75       0.66666667 0.66666667 0.75      ]
Average cross-validation score:  0.6467948717948718


## Cross Validate Zip

In [196]:
# Same default-SVC baseline, now on the digit training data (10-fold cross-validation).
score_zip: np.ndarray[float] = cross_val_score(
    estimator=svm,
    X=X_zip_train,
    y=y_zip_train,
    cv=10,
)
print("Cross-validation scores: ", score_zip)

mean_score_zip = score_zip.mean()
print("Average cross-validation score: ", mean_score_zip)

Cross-validation scores:  [0.89473684 1.         0.84210526 0.89473684 1.         0.84210526
 0.94736842 0.89473684 0.78947368 0.94736842]
Average cross-validation score:  0.9052631578947368


# 4 - Test Error Rate of `SVM`

## Test Wine

In [197]:
# Fit the default SVC on the wine training split, then score it on the held-out
# split; fit() returns the estimator itself, so the calls can be chained.
wine_test_accuracy = svm.fit(X_wine_train, y_wine_train).score(X_wine_test, y_wine_test)
print("Test error rate: ", 1 - wine_test_accuracy)

Test error rate:  0.2777777777777778


## Test Zip

In [198]:
# Re-fit the same `svm` object on the digit training split (this replaces the
# previous fit), then score it on the held-out digit split.
zip_test_accuracy = svm.fit(X_zip_train, y_zip_train).score(X_zip_test, y_zip_test)
print("Test error rate: ", 1 - zip_test_accuracy)

Test error rate:  0.1585365853658537


# 5 - Pipeline

In [199]:
# Candidate values tried for both SVC hyperparameters (powers of ten from 1e-3 to 1e2).
_search_values = [0.001, 0.01, 0.1, 1, 10, 100]

# The 'svc__' prefix routes each parameter to the pipeline step named 'svc'.
# Each key gets its own copy of the list so the two entries stay independent.
param_grid: dict[str, list[float]] = {
    'svc__C': list(_search_values),
    'svc__gamma': list(_search_values),
}

## Wine Dataset

### MinMaxScaler

In [200]:
# MinMaxScaler -> SVC on the wine data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_wine_minmax: Pipeline = Pipeline(steps=[
    ('minmaxscaler', MinMaxScaler()),
    ('svc', SVC()),
])
grid_wine_minmax: GridSearchCV = GridSearchCV(estimator=pipe_wine_minmax, param_grid=param_grid, cv=5)
grid_wine_minmax.fit(X_wine_train, y_wine_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_wine_minmax = grid_wine_minmax.best_score_
print("Best cross-validation score: ", cv_score_wine_minmax)
print("Error Rate: ", 1 - cv_score_wine_minmax)
print("Best parameters: ", grid_wine_minmax.best_params_)

Best cross-validation score:  0.984
Error Rate:  0.016000000000000014
Best parameters:  {'svc__C': 1, 'svc__gamma': 1}


### StandardScaler

In [201]:
# StandardScaler -> SVC on the wine data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_wine_standard: Pipeline = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('svc', SVC()),
])
grid_wine_standard: GridSearchCV = GridSearchCV(estimator=pipe_wine_standard, param_grid=param_grid, cv=5)
grid_wine_standard.fit(X_wine_train, y_wine_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_wine_standard = grid_wine_standard.best_score_
print("Best cross-validation score: ", cv_score_wine_standard)
print("Error Rate: ", 1 - cv_score_wine_standard)
print("Best parameters: ", grid_wine_standard.best_params_)

Best cross-validation score:  0.9836666666666666
Error Rate:  0.016333333333333422
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}


### Normalizer

In [206]:
# Normalizer -> SVC on the wine data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_wine_normalizer: Pipeline = Pipeline(steps=[
    ('normalizer', Normalizer()),
    ('svc', SVC()),
])
grid_wine_normalizer: GridSearchCV = GridSearchCV(estimator=pipe_wine_normalizer, param_grid=param_grid, cv=5)
grid_wine_normalizer.fit(X_wine_train, y_wine_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_wine_normalizer = grid_wine_normalizer.best_score_
print("Best cross-validation score: ", cv_score_wine_normalizer)
print("Error Rate: ", 1 - cv_score_wine_normalizer)
print("Best parameters: ", grid_wine_normalizer.best_params_)

Best cross-validation score:  0.9196666666666665
Error Rate:  0.08033333333333348
Best parameters:  {'svc__C': 100, 'svc__gamma': 100}


### RobustScaler

In [207]:
# RobustScaler -> SVC on the wine data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_wine_robust: Pipeline = Pipeline(steps=[
    ('robustscaler', RobustScaler()),
    ('svc', SVC()),
])
grid_wine_robust: GridSearchCV = GridSearchCV(estimator=pipe_wine_robust, param_grid=param_grid, cv=5)
grid_wine_robust.fit(X_wine_train, y_wine_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_wine_robust = grid_wine_robust.best_score_
print("Best cross-validation score: ", cv_score_wine_robust)
print("Error Rate: ", 1 - cv_score_wine_robust)
print("Best parameters: ", grid_wine_robust.best_params_)

Best cross-validation score:  0.9756666666666666
Error Rate:  0.02433333333333343
Best parameters:  {'svc__C': 1, 'svc__gamma': 0.1}


## Zip Dataset

### MinMaxScaler

In [208]:
# MinMaxScaler -> SVC on the digit data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_zip_minmax: Pipeline = Pipeline(steps=[
    ('minmaxscaler', MinMaxScaler()),
    ('svc', SVC()),
])
grid_zip_minmax: GridSearchCV = GridSearchCV(estimator=pipe_zip_minmax, param_grid=param_grid, cv=5)
grid_zip_minmax.fit(X_zip_train, y_zip_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_zip_minmax = grid_zip_minmax.best_score_
print("Best cross-validation score: ", cv_score_zip_minmax)
print("Error Rate: ", 1 - cv_score_zip_minmax)
print("Best parameters: ", grid_zip_minmax.best_params_)

Best cross-validation score:  0.9157894736842105
Error Rate:  0.08421052631578951
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.01}


### StandardScaler

In [205]:
# StandardScaler -> SVC on the digit data, tuned over `param_grid` with 5-fold CV.
# Step names are the ones make_pipeline would generate (lower-cased class names);
# the 'svc' name has to match the 'svc__*' keys in `param_grid`.
pipe_zip_standard: Pipeline = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('svc', SVC()),
])
grid_zip_standard: GridSearchCV = GridSearchCV(estimator=pipe_zip_standard, param_grid=param_grid, cv=5)
grid_zip_standard.fit(X_zip_train, y_zip_train)

# best_score_ is the mean cross-validated score of the best parameter set, so the
# "Error Rate" printed here is a cross-validation error, not a test-set error.
cv_score_zip_standard = grid_zip_standard.best_score_
print("Best cross-validation score: ", cv_score_zip_standard)
print("Error Rate: ", 1 - cv_score_zip_standard)
print("Best parameters: ", grid_zip_standard.best_params_)

Best cross-validation score:  0.9
Error Rate:  0.09999999999999998
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.001}
