In [6]:
# import all the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [20]:
# Load the speed-dating data (the `_cat` file variant) and hold out 20% of
# the rows as a test set; the fixed seed keeps the split identical on re-runs.
df = pd.read_csv("../data/speeddating/speeddating_cat.csv")
train, test = train_test_split(df, test_size=0.2, random_state=44)

# Split each partition into features and the 'match' target column.
X_train, y_train = train.drop(columns=['match']), train['match']
X_test = test.drop(columns=['match'])

#### Define Hyperparameters to test

In [21]:
# --- k-nearest neighbours ---
n_neighbour = np.arange(9, 21, 2)  # odd neighbour counts 9, 11, ..., 19
weights = ['uniform', 'distance']

# --- support vector classifier ---
svc_kernel = ['linear', 'poly', 'rbf', 'sigmoid']

# --- trees / random forest ---
tree_max_depth = list(range(6, 16, 3))  # [6, 9, 12, 15]
tree_min_split = list(range(3, 13, 3))  # [3, 6, 9, 12]
# Every integer from 15 to 29, one independent array per forest hyperparameter.
# NOTE: randomForest_grid feeds forest_min_split to 'min_samples_leaf'.
forest_n_estimators, forest_max_depth, forest_min_split = (
    np.arange(15, 30) for _ in range(3)
)

#### KNN

In [22]:
def knn_grid(X_train, y_train, X_test):
    """Grid-search a k-nearest-neighbours classifier and predict on X_test.

    Tries every combination of the module-level ``n_neighbour`` and
    ``weights`` candidates with ``GridSearchCV`` (library defaults for
    cross-validation and scoring), prints the best combination found, and
    returns the fitted search object's predictions for ``X_test``.
    """
    print('KNN')
    search_space = {'n_neighbors': n_neighbour, 'weights': weights}
    search = GridSearchCV(estimator=KNeighborsClassifier(),
                          param_grid=search_space)
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search.predict(X_test)

def knn(X_train, y_train, X_test):
    """Fit KNN with fixed hyperparameters and return predictions for X_test.

    The hyperparameters are the best ones recorded from the grid search in
    ``knn_grid`` for both dataset variants:

        num {'n_neighbors': 19, 'weights': 'distance'}
        cat {'n_neighbors': 19, 'weights': 'distance'}

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.

    Returns:
        The predicted labels for ``X_test``.
    """
    # n_neighbors used to be hard-coded to 9, contradicting the recorded
    # grid-search results above; use the tuned value instead.
    model = KNeighborsClassifier(n_neighbors=19, weights='distance')
    model.fit(X_train, y_train)
    return model.predict(X_test)
    
# Run the KNN grid search and keep its predictions for the held-out features.
y_test_knn = knn_grid(X_train=X_train, y_train=y_train, X_test=X_test)


KNN
{'n_neighbors': 19, 'weights': 'distance'}


#### SVC

In [23]:
def svc_grid(X_train, y_train, X_test):
    """Grid-search the SVC kernel and predict on X_test.

    Evaluates each kernel in the module-level ``svc_kernel`` list with
    ``GridSearchCV`` (library defaults for cross-validation and scoring),
    prints the best kernel found, and returns the fitted search object's
    predictions for ``X_test``.
    """
    print('\nSVC')
    search = GridSearchCV(
        estimator=SVC(random_state=42),
        param_grid={'kernel': svc_kernel},
    )
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search.predict(X_test)

def svc(X_train, y_train, X_test):
    """Fit an SVC with the tuned kernel and return predictions for X_test.

    The kernel is the best one recorded from the grid search in ``svc_grid``
    for both dataset variants:

        num {'kernel': 'linear'}
        cat {'kernel': 'linear'}

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.

    Returns:
        The predicted labels for ``X_test``.
    """
    # The kernel was previously left at the SVC default, which ignored the
    # recorded grid-search result; pass the tuned kernel explicitly.
    model = SVC(kernel='linear', random_state=42)
    model.fit(X_train, y_train)
    return model.predict(X_test)

# Run the SVC grid search and keep its predictions for the held-out features.
y_test_svc = svc_grid(X_train=X_train, y_train=y_train, X_test=X_test)



SVC
{'kernel': 'linear'}


#### Random Forest

In [24]:
def randomForest_grid(X_train, y_train, X_test):
    """Grid-search a random forest and predict on X_test.

    Tries every combination of the module-level ``forest_n_estimators``,
    ``forest_max_depth`` and ``forest_min_split`` candidates with
    ``GridSearchCV`` (library defaults for cross-validation and scoring),
    prints the best combination found, and returns the fitted search object's
    predictions for ``X_test``.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.

    Returns:
        The predicted labels for ``X_test``.
    """
    print('\nRandom Forest')
    # Seed the forest so the search picks the same best_params_ on every run;
    # an unseeded forest makes the comparison between candidates noisy.
    grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                 param_grid={'n_estimators': forest_n_estimators,
                             'max_depth': forest_max_depth,
                             # despite its name, forest_min_split is searched
                             # as min_samples_leaf
                             'min_samples_leaf': forest_min_split})
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    y_pred = grid.predict(X_test)
    return y_pred

def randomForest(X_train, y_train, X_test):
    """Fit a random forest with fixed hyperparameters and predict on X_test.

    Recorded grid-search results from ``randomForest_grid``:

        num {'max_depth': 20, 'min_samples_leaf': 16, 'n_estimators': 27}
        cat {'max_depth': 28, 'min_samples_leaf': 15, 'n_estimators': 18}

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.

    Returns:
        The predicted labels for ``X_test``.
    """
    print('\nRandom Forest')
    # NOTE(review): these values match neither recorded result exactly;
    # presumably a compromise between the two - confirm before changing.
    # random_state is fixed so repeated runs give the same predictions.
    rf = RandomForestClassifier(max_depth=20, min_samples_leaf=15,
                                n_estimators=20, random_state=42)
    rf.fit(X_train, y_train)
    return rf.predict(X_test)

# Run the random-forest grid search and keep its predictions for the held-out features.
y_test_rf = randomForest_grid(X_train=X_train, y_train=y_train, X_test=X_test)



Random Forest
{'max_depth': 28, 'min_samples_leaf': 15, 'n_estimators': 18}
