# Code

In [23]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [2]:
cancer: np.ndarray = load_breast_cancer()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [4]:
print(X_train.shape)
print(X_test.shape)

(426, 30)
(143, 30)


## Preprocessing Data

- Makes the data compatible with the models
- `MinMaxScaler` - Transform features by scaling each feature to a given range
  - This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.


In [5]:
scaler: MinMaxScaler = MinMaxScaler()

- `fit()` computes the minimum and maximum value of each feature of the training set

In [6]:
scaler.fit(X_train)

- The `transform()` method applies the transformation

In [7]:
X_train_scaled: np.ndarray = scaler.transform(X_train) # transform training data

In [8]:
print(X_train_scaled.shape)
print(X_train.min(axis=0)) # min value per feature before scaling
print(X_train.max(axis=0)) # max value per feature before scaling
print(X_train_scaled.min(axis=0)) # min value per feature after scaling
print(X_train_scaled.max(axis=0)) # max value per feature after scaling

(426, 30)
[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [9]:
X_test_scaled: np.ndarray = scaler.transform(X_test) # transform test data

In [10]:
print(X_test_scaled.shape)
print(X_test.min(axis=0)) # min value per feature before scaling
print(X_test.max(axis=0)) # max value per feature before scaling
print(X_test_scaled.min(axis=0)) # min value per feature after scaling
print(X_test_scaled.max(axis=0)) # max value per feature after scaling

(143, 30)
[7.729e+00 1.072e+01 4.798e+01 1.788e+02 6.576e-02 3.398e-02 0.000e+00
 0.000e+00 1.203e-01 5.024e-02 1.144e-01 3.602e-01 7.714e-01 6.802e+00
 2.826e-03 3.746e-03 0.000e+00 0.000e+00 1.013e-02 1.217e-03 8.964e+00
 1.249e+01 5.717e+01 2.422e+02 8.409e-02 4.619e-02 0.000e+00 0.000e+00
 1.603e-01 5.865e-02]
[2.321e+01 3.928e+01 1.535e+02 1.670e+03 1.634e-01 3.454e-01 4.264e-01
 1.823e-01 2.906e-01 9.502e-02 1.370e+00 3.647e+00 1.107e+01 1.765e+02
 3.113e-02 1.354e-01 1.438e-01 4.090e-02 7.895e-02 2.193e-02 3.101e+01
 4.487e+01 2.068e+02 2.944e+03 1.902e-01 9.327e-01 1.170e+00 2.910e-01
 5.440e-01 1.446e-01]
[ 0.03540158  0.04190871  0.02895446  0.01497349  0.14260888  0.04999658
  0.          0.          0.07222222  0.00589722  0.00105015 -0.00057494
  0.00067851 -0.0007963   0.05148726  0.01434497  0.          0.
  0.04195752  0.01113138  0.03678406  0.01252665  0.03366702  0.01400904
  0.08531995  0.01833687  0.          0.          0.00749064  0.02367834]
[0.76809125 1.226970

- For the test set, after scaling, the minimum and maximum are not 0 and 1
- `MinMaxScaler` (and all the other scalers) always applies exactly the same transformation to the training and the test set
- This means the transform method always subtracts the training set minimum and divides by the training set range, which might be different from the minimum and range for the test set

## SVM

In [11]:
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [12]:
svm: SVC = SVC(C=100)
svm.fit(X_train_scaled, y_train)
print("Accuracy on training set: {:.3f}".format(svm.score(X_train_scaled, y_train)))

Accuracy on training set: 1.000


In [13]:
X_test_scaled: np.ndarray = scaler.transform(X_test)
svm.fit(X_test_scaled, y_test)
print("Accuracy on test set: {:.3f}".format(svm.score(X_test_scaled, y_test)))

Accuracy on test set: 1.000


## Parameter Selection using Validation & Cross Validation 

In [14]:
from sklearn.datasets import load_iris

In [15]:
iris: np.ndarray = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

- `SVM()` takes 2 arguments 
  - `gamma` - kernel bandwidth 
  - `C` - regularization parameter

In [16]:
best_score: float = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]: # try different values for gamma
	for C in [0.001, 0.01, 0.1, 1, 10, 100]: # try different values for C
		svm: SVC = SVC(gamma=gamma, C=C) # build the model
		svm.fit(X_train, y_train) # train the model
		score: float = svm.score(X_test, y_test) # evaluate the model on the test set
		if score > best_score: # if we got a better score, store the score and parameters
			best_score = score # store the best score
			best_parameters = {'C': C, 'gamma': gamma} # store the best parameters

In [17]:
print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

Best score: 0.97
Best parameters: {'C': 100, 'gamma': 0.001}


### Using a Validation Set

- Repeating same procedure as before but with a validation set
- Training set is split into 2 parts, the training set and the validation set

In [37]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0) # split the data into training and test sets
X_train_pr, X_valid, y_train_pr, y_valid = train_test_split(X_train, y_train, random_state=1) # split training set into training and validation sets

In [38]:
print("Size of training set: {} \nSize of validation set: {} \nSize of test set: {}".format(X_train_pr.shape[0], X_valid.shape[0], X_test.shape[0]))

Size of training set: 84 
Size of validation set: 28 
Size of test set: 38


In [19]:
best_score: float = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]: # try different values for gamma
	for C in [0.001, 0.01, 0.1, 1, 10, 100]: # try different values for C
		svm: SVC = SVC(gamma=gamma, C=C) # build the model 
		svm.fit(X_train_pr, y_train_pr) # train the model on the training set 
		score: float = svm.score(X_valid, y_valid) # evaluate the model on the validation set
		if score > best_score: # if we got a better score, store the score and parameters
			best_score = score # store the best score
			best_parameters = {'C': C, 'gamma': gamma} # store best parameters (will use **kwargs)

In [22]:
svm: SVC = SVC(**best_parameters) # build a model with best parameters (**kwargs)
svm.fit(X_train, y_train) # fit the model using the whole training set
test_score: float = svm.score(X_test, y_test) # evaluate the model on the test set

In [39]:
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

Best score on validation set: 0.97
Best parameters: {'C': 10, 'gamma': 0.001}


### Using Cross Validation

In [24]:
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
	for C in [0.001, 0.01, 0.1, 1, 10, 100]:
		svm: SVC = SVC(gamma=gamma, C=C)
		score: float = np.mean(cross_val_score(svm, X_train, y_train, cv=5))
		if score > best_score:
			best_score = score
			best_C = C
			best_gamma = gamma

In [25]:
svm = SVC(C=best_C, gamma=best_gamma)
svm.fit(X_train, y_train)
test_score = svm.score(X_test, y_test)

In [26]:
print("Best cross-validation score: {:.2f}".format(best_score))
print("Best parameters: C = {}, gamma = {}".format(best_C, best_gamma))
print("Test set score with best parameters: {:.2f}".format(test_score))

Best cross-validation score: 0.97
Best parameters: C = 10, gamma = 0.1
Test set score with best parameters: 0.97


- `GridSearchCV` implements the grid search with-cross validation
- It will perform all the necessary model fits

In [28]:
from sklearn.model_selection import GridSearchCV

- A dictionary is required

In [27]:
param_grid: dict[str, list[float]] = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

- Behaves similarly to a classifier as it can call:
  - `fit`
  - `predict`
  - `score`
- However, when calling `fit`, it will run cross-validation for each combination of parameters which was specified in `param_grid`

In [29]:
grid_search: GridSearchCV = GridSearchCV(SVC(), param_grid, cv=5)

- Data still needs to be split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

In [33]:
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))

Test set score: 0.97


In [34]:
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'C': 10, 'gamma': 0.1}
Best cross-validation score: 0.97


In [35]:
print(grid_search.best_estimator_)

SVC(C=10, gamma=0.1)
