# Assignment 3 - CS3920

In [19]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.utils import Bunch

# 1 - Load Datasets

## Loading Wine Dataset

In [20]:
# load_wine() returns a scikit-learn Bunch (a dict-like container), not an
# array; the feature matrix and the class labels are read from its
# `data` and `target` attributes below.
wine: Bunch = load_wine()
X_wine: np.ndarray = wine.data
y_wine: np.ndarray = wine.target

## Loading USPS Dataset

In [21]:
# Read the zip-code digit files (zip.train / zip.test) as space-delimited
# numeric tables.
# NOTE(review): presumably the USPS handwritten-digit data — confirm the source.
zip_train: np.ndarray = np.genfromtxt('zip.train', delimiter=' ')
zip_test: np.ndarray = np.genfromtxt('zip.test', delimiter=' ')
# Stack both files row-wise into a single table; the notebook draws its own
# train/test split from the combined data later on.
zip_data: np.ndarray = np.concatenate((zip_train, zip_test), axis=0)
# Column 0 is taken as the target; every remaining column is a feature.
X_zip: np.ndarray = zip_data[:, 1:]
y_zip: np.ndarray = zip_data[:, 0]

# 2 - Split Datasets 

## Splitting Wine Dataset

In [22]:
# Hold out 30% of the wine samples for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
(
    X_wine_train,
    X_wine_test,
    y_wine_train,
    y_wine_test,
) = train_test_split(
    X_wine,
    y_wine,
    test_size=0.3,
    random_state=3103,
)

## Splitting Zip Dataset

In [23]:
# Hold out 30% of the combined zip samples for testing; the fixed random_state
# makes the shuffle (and therefore the split) repeatable across runs.
(
    X_zip_train,
    X_zip_test,
    y_zip_train,
    y_zip_test,
) = train_test_split(
    X_zip,
    y_zip,
    test_size=0.3,
    random_state=3103,
)

# 3 - Cross Validation

## Cross Validate Wine

In [24]:
# Baseline: 5-fold cross-validation of a default SVC on the unscaled wine
# training split.
svm: SVC = SVC()
# Annotated as plain np.ndarray, like the other arrays in this notebook.
# Module-level annotations are evaluated at runtime, and `np.ndarray[float]`
# raises TypeError on NumPy versions without ndarray subscripting; it also
# puts a dtype where the generic's first parameter (the shape) belongs.
scores_wine: np.ndarray = cross_val_score(svm, X_wine_train, y_wine_train, cv=5)
print("Cross-validation scores: ", scores_wine)
print("Average cross-validation score: ", scores_wine.mean())

Cross-validation scores:  [0.64       0.64       0.6        0.68       0.70833333]
Average cross-validation score:  0.6536666666666667


## Cross Validate Zip

In [25]:
# Baseline: 5-fold cross-validation on the unscaled zip training split, using
# the default `svm` estimator created in the wine cross-validation cell.
# Annotated as plain np.ndarray, like the other arrays in this notebook.
# Module-level annotations are evaluated at runtime, and `np.ndarray[float]`
# raises TypeError on NumPy versions without ndarray subscripting; it also
# puts a dtype where the generic's first parameter (the shape) belongs.
score_zip: np.ndarray = cross_val_score(svm, X_zip_train, y_zip_train, cv=5)
print("Cross-validation scores: ", score_zip)
print("Average cross-validation score: ", score_zip.mean())

Cross-validation scores:  [0.96236559 0.96697389 0.97542243 0.9707917  0.97002306]
Average cross-validation score:  0.9691153325280919


# 4 - Test Error Rate of `SVM`

## Test Wine

In [26]:
# Refit the shared default SVC on the wine training split, then report the
# held-out error rate, i.e. 1 minus the score on the test split.
print(
    "Test error rate: ",
    1 - svm.fit(X_wine_train, y_wine_train).score(X_wine_test, y_wine_test),
)

Test error rate:  0.2777777777777778


## Test Zip

In [32]:
# Refit the shared default SVC on the zip training split, then report the
# held-out error rate, i.e. 1 minus the score on the test split.
print(
    "Test error rate: ",
    1 - svm.fit(X_zip_train, y_zip_train).score(X_zip_test, y_zip_test),
)

Test error rate:  0.024372759856630788


# 5 - Pipeline

In [27]:
# Hyper-parameter grid used by the grid searches in this section. The `svc__`
# prefix routes each entry to the pipeline step named "svc".
param_grid: dict[str, list[float]] = {
    'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# StandardScaler followed by an SVC. The explicit step names are the ones
# make_pipeline would generate (lower-cased class names), so the `svc__*`
# keys in param_grid resolve the same way.
pipe: Pipeline = Pipeline(
    steps=[
        ('standardscaler', StandardScaler()),
        ('svc', SVC()),
    ]
)

In [28]:
# Grid-search an SVC that sees MinMaxScaler-rescaled wine features, using
# 5-fold cross-validation on the training split only.
min_max_pipe_wine: Pipeline = make_pipeline(MinMaxScaler(), SVC())
min_grid_wine: GridSearchCV = GridSearchCV(
    estimator=min_max_pipe_wine,
    param_grid=param_grid,
    cv=5,
)
min_grid_wine.fit(X_wine_train, y_wine_train)
print("Best cross-validation accuracy: ", min_grid_wine.best_score_)

Best cross-validation accuracy:  0.984


In [29]:
# Grid-search the StandardScaler + SVC pipeline (`pipe`) with 5-fold
# cross-validation on the wine training split, then score the best model
# found on the held-out test split.
grid_wine: GridSearchCV = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
)
grid_wine.fit(X_wine_train, y_wine_train)

print("Best cross-validation accuracy: ", grid_wine.best_score_)
print("Best parameters: ", grid_wine.best_params_)
print("Test set score: ", grid_wine.score(X_wine_test, y_wine_test))


Best cross-validation accuracy:  0.9836666666666666
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.1}
Test set score:  0.9814814814814815


In [30]:
# Standardise the wine features by hand: the scaler is fitted on the training
# split only, and those same statistics are applied to both splits so nothing
# from the test split leaks into the fit.
standard_scalar_wine: StandardScaler = StandardScaler()
standard_scalar_wine.fit(X_wine_train)
X_wine_train_scaled: np.ndarray = standard_scalar_wine.transform(X_wine_train)
X_wine_test_scaled: np.ndarray = standard_scalar_wine.transform(X_wine_test)

# Refit the shared default SVC on the standardised training data and report
# the held-out error rate, i.e. 1 minus the score on the scaled test split.
print(
    "Test error rate: ",
    1 - svm.fit(X_wine_train_scaled, y_wine_train).score(X_wine_test_scaled, y_wine_test),
)


Test error rate:  0.01851851851851849
