### Hyperparameter
Parameters that influence the model structure and are chosen before training rather than learned from the data.
- C and gamma of SVM
- k of kNN

#### sklearn
https://scikit-learn.org/stable/modules/classes.html  
See the "Hyper-parameter optimizers" section under `sklearn.model_selection`:  
- GridSearchCV
- ParameterGrid
- ParameterSampler
- RandomizedSearchCV
- fit_grid_point (deprecated in scikit-learn 0.23 and no longer available in current releases)

#### Dataset split

##### Validation dataset
Used to find ideal hyperparameters (e.g. 20%).

##### Training dataset
Used to find ideal model parameters (e.g. 60%).

##### Test dataset
Used to compute model performance (e.g. 20%).

In [None]:
# import
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the classification data set from disk and preview its first rows
csv_path = "../res/classification.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Separate the feature matrix (x) from the target vector (y)
feature_columns = ["age", "interest"]
target_column = "success"

x = df[feature_columns].values
y = df[target_column].values

In [None]:
# Assemble the pipeline: a scaling step followed by a kNN classifier
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=1)),
    ]
)

# Override a step's parameter via the "<step name>__<parameter>" syntax
pipeline.set_params(knn__n_neighbors=3)

In [None]:
# Hold out 25% of the samples for testing (fixed seed keeps the split reproducible)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit on the training split, then report the score on the held-out split
pipeline.fit(x_train, y_train).score(x_test, y_test)

In [None]:
# Find best hyperparameter k for KNeighborsClassifier
#
# The search is fitted on the training split only. GridSearchCV cross-validates
# inside the data it is given (train/validation folds), so x_test / y_test stay
# unseen during tuning and can be used afterwards to measure the performance
# of the tuned model.
knn_param_grid = {
    "knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
}
clf = GridSearchCV(pipeline, param_grid=knn_param_grid)

clf.fit(x_train, y_train)

print(clf.best_params_)
print(clf.best_score_)  # mean cross-validated score of the best k

# Score of the best model (refitted on the whole training split) on the
# held-out test split
print("test score:", clf.score(x_test, y_test))

In [None]:
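# Split data into [train, validate]
# Done automatically by GridSearchCV: it cross-validates on the data passed to fit().
# It does NOT set aside a test set - hold one out beforehand (e.g. with train_test_split).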

In [None]:
# Project: Find best hyperparameter C, Gamma for SVM
#
# As with any hyperparameter search, only the training split is passed to
# GridSearchCV (it builds its own train/validation folds from that data), so
# x_test / y_test remain unseen and can be used for the final evaluation.

# Create pipeline
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm_harry", SVC()),
    ]
)

# Create GridSearch: every combination of C and gamma is evaluated
svm_param_grid = {
    "svm_harry__C": [0.01, 0.1, 1.0, 10.0, 100.0],
    "svm_harry__gamma": [0.01, 0.1, 1.0, 10.0, 100.0],
}
clf = GridSearchCV(pipeline, param_grid=svm_param_grid)

# Optimize model in GridSearch
clf.fit(x_train, y_train)

print(clf.best_params_)
print(clf.best_score_)  # mean cross-validated score of the best combination

# Score of the best model (refitted on the whole training split) on the
# held-out test split
print("test score:", clf.score(x_test, y_test))