# Assignment 3 - CS3920

In [1]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.utils import Bunch

## 1 - Load Datasets

- Loading the Wine dataset (from scikit-learn) and the zip dataset (from local text files)

In [2]:
# load_wine() returns a dict-like Bunch rather than an array; the feature
# matrix and the class labels are read from its .data and .target attributes.
wine: Bunch = load_wine()
X_wine: np.ndarray = wine.data
y_wine: np.ndarray = wine.target

In [3]:
# Read the zip train and test arrays from the space-delimited text files
# 'zip.train' and 'zip.test' (the wine data comes from load_wine(), not from here).
zip_train: np.ndarray
zip_test: np.ndarray
zip_train, zip_test = (
    np.genfromtxt(filename, delimiter=' ')
    for filename in ('zip.train', 'zip.test')
)

## 2 - Split Datasets 

In [4]:
# Hold out 30% of the wine samples as a test set; the fixed random_state
# makes the same split come out on every run.
(
    X_wine_train,
    X_wine_test,
    y_wine_train,
    y_wine_test,
) = train_test_split(
    X_wine,
    y_wine,
    test_size=0.3,
    random_state=55,
)

In [5]:
# In both zip arrays, column 0 is taken as the target (y) and every
# remaining column as the features (X).
y_zip_train: np.ndarray = zip_train[:, 0]
X_zip_train: np.ndarray = zip_train[:, 1:]

y_zip_test: np.ndarray = zip_test[:, 0]
X_zip_test: np.ndarray = zip_test[:, 1:]

## 3 - Cross Validation

In [6]:
# Baseline: cross-validated accuracy (cv=5) of a default SVC on the
# unscaled wine training split.
svm: SVC = SVC()
# Annotated as plain np.ndarray, like the other arrays in this notebook.
# ndarray's first type parameter is the shape, not the dtype, so
# np.ndarray[float] was misleading; module-level annotations are also
# evaluated at runtime, so it fails where ndarray is not subscriptable.
scores_wine: np.ndarray = cross_val_score(svm, X_wine_train, y_wine_train, cv=5)
print("Cross-validation scores: ", scores_wine)
print("Average cross-validation score: ", scores_wine.mean())

Cross-validation scores:  [0.76       0.6        0.64       0.76       0.66666667]
Average cross-validation score:  0.6853333333333332


## 4 - Test Error Rate of `SVM`

In [7]:
# Fit the default SVC on the unscaled training split (fit() returns the
# estimator itself, so score() can be chained), then report the test error
# as 1 - accuracy on the held-out split.
svm_wine_test_accuracy = svm.fit(X_wine_train, y_wine_train).score(X_wine_test, y_wine_test)
print("Test error rate: ", 1 - svm_wine_test_accuracy)

Test error rate:  0.35185185185185186


## 5 - Pipeline

In [8]:
# Search grid for the SVC step of the pipelines below: C and gamma each take
# the same six powers of ten, 1e-3 through 1e2. Each key gets its own list.
param_grid: dict[str, list[float]] = {
    key: [0.001, 0.01, 0.1, 1, 10, 100]
    for key in ('svc__C', 'svc__gamma')
}

# Pipeline that standardizes the features and then fits an SVC. The steps are
# named explicitly with the names make_pipeline would generate (lower-cased
# class names), so the 'svc__' keys in param_grid address the SVC step.
pipe: Pipeline = Pipeline(
    steps=[
        ('standardscaler', StandardScaler()),
        ('svc', SVC()),
    ]
)

In [9]:
# Grid-search C and gamma (cv=5) for an SVC whose inputs are rescaled by
# MinMaxScaler; fit() returns the fitted search object itself.
min_max_pipe_wine: Pipeline = make_pipeline(MinMaxScaler(), SVC())
min_grid_wine: GridSearchCV = GridSearchCV(
    estimator=min_max_pipe_wine,
    param_grid=param_grid,
    cv=5,
).fit(X_wine_train, y_wine_train)
print("Best cross-validation accuracy: ", min_grid_wine.best_score_)

Best cross-validation accuracy:  0.992


In [10]:
# Grid-search C and gamma (cv=5) over the StandardScaler + SVC pipeline, then
# report the best cross-validation accuracy, the winning parameters, and the
# accuracy on the held-out wine test split.
grid_wine: GridSearchCV = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
).fit(X_wine_train, y_wine_train)
print("Best cross-validation accuracy: ", grid_wine.best_score_)
print("Best parameters: ", grid_wine.best_params_)
print("Test set score: ", grid_wine.score(X_wine_test, y_wine_test))


Best cross-validation accuracy:  0.992
Best parameters:  {'svc__C': 10, 'svc__gamma': 0.01}
Test set score:  0.9629629629629629


In [11]:
# Manual alternative to the pipeline: learn the scaling statistics from the
# training split only, then apply that same transform to both splits.
standard_scalar_wine: StandardScaler = StandardScaler().fit(X_wine_train)
X_wine_train_scaled: np.ndarray = standard_scalar_wine.transform(X_wine_train)
X_wine_test_scaled: np.ndarray = standard_scalar_wine.transform(X_wine_test)

# Refit the existing default SVC (`svm`) on the scaled data; this replaces
# whatever fit it held before. Test error is 1 - accuracy on the scaled test split.
svm_wine_scaled_accuracy = svm.fit(X_wine_train_scaled, y_wine_train).score(X_wine_test_scaled, y_wine_test)
print("Test error rate: ", 1 - svm_wine_scaled_accuracy)


Test error rate:  0.03703703703703709
