## KNN Model Notebook

Goals of this notebook:
* goal: find the captures per week in each spot with knn,  calculate the RMSE
* Given: y_true, y_pred
* normalize the data, both X_train and X_test
* fit and evaluate the model (rmse)
* cross validation to find best value of k (# of neighbours) or even GridSearch

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

# to measure the time
from timeit import default_timer as timer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

from sklearn import set_config
set_config(transform_output="pandas")

# Set random seed 
RSEED = 73

warnings.filterwarnings("ignore")

# Reminder: Baseline 3.657 

In [2]:
# importing data and having a look at it
final_data = pd.read_csv("../data/wrangled_data.csv", index_col=0)
final_data.head()

Unnamed: 0.1,Unnamed: 0,date_caught,capture_site,tag_2,ccl_cm,ccw_cm,weight_kg,status,release_site,cm_beached,...,species_4,species_5,species_6,species_7,year,week_of_year,year_woy,cs_category,type,capture_number
0,0,2000-12-22,0,0,64.7,62.6,8.5,429,237,False,...,False,False,True,False,2000,51,200051,2,1,1
1,1,2001-10-28,0,0,35.85,31.35,8.5,429,250,False,...,False,False,True,False,2001,43,200143,2,1,1
2,2,2001-11-01,0,0,51.8,49.2,8.5,429,237,False,...,False,True,False,False,2001,44,200144,2,1,1
3,3,2002-03-11,0,0,60.5,59.0,8.5,429,237,False,...,False,False,True,False,2002,11,200211,2,1,1
4,4,2002-08-08,0,0,34.7,33.0,8.5,429,250,True,...,False,True,False,False,2002,32,200232,2,1,2


In [3]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18062 entries, 0 to 18061
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            18062 non-null  int64  
 1   date_caught           18062 non-null  object 
 2   capture_site          18062 non-null  int64  
 3   tag_2                 18062 non-null  int64  
 4   ccl_cm                18062 non-null  float64
 5   ccw_cm                18062 non-null  float64
 6   weight_kg             18062 non-null  float64
 7   status                18062 non-null  int64  
 8   release_site          18062 non-null  int64  
 9   cm_beached            18062 non-null  bool   
 10  cm_by hand            18062 non-null  bool   
 11  cm_collected floater  18062 non-null  bool   
 12  cm_fish trap          18062 non-null  bool   
 13  cm_jarife             18062 non-null  bool   
 14  cm_longline           18062 non-null  bool   
 15  cm_net                18

In [4]:
# TODO: Solving date formate issue with date_caught
del final_data['date_caught']

In [5]:
# Select X and y features
X = final_data.drop(['capture_number'], axis = 1)
y = final_data['capture_number']


# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RSEED, shuffle=True, stratify=y)

# Check the shape of the data sets
print("X_train:", X_train.shape)  
print("y_train:", y_train.shape)  
print("X_test:", X_test.shape) 
print("y_test:", y_test.shape)  

X_train: (12643, 39)
y_train: (12643,)
X_test: (5419, 39)
y_test: (5419,)


In [6]:
# Implement a basic evaluation function
def evaluate_rmse(y_true, y_pred, ndigits=3):
    """ Prints the RMSE (root mean squared error) of y_pred in relation to y_true"""
    rmse = mean_squared_error(y_true, y_pred, squared=False )
    print("Number of predictions: ", len(y_pred))
    print("RMSE: ", round(rmse, ndigits))
    return rmse

In [7]:
# check the colum names to see which will be scaled
final_data.columns

Index(['Unnamed: 0', 'capture_site', 'tag_2', 'ccl_cm', 'ccw_cm', 'weight_kg',
       'status', 'release_site', 'cm_beached', 'cm_by hand',
       'cm_collected floater', 'cm_fish trap', 'cm_jarife', 'cm_longline',
       'cm_net', 'cm_not_recorded', 'cm_speargun', 'cm_stranded', 'cm_uzio',
       'foraging_ground_0', 'foraging_ground_1', 'cs_category_0',
       'cs_category_1', 'cs_category_2', 'cs_category_3', 'cs_category_4',
       'species_0', 'species_1', 'species_2', 'species_3', 'species_4',
       'species_5', 'species_6', 'species_7', 'year', 'week_of_year',
       'year_woy', 'cs_category', 'type', 'capture_number'],
      dtype='object')

In [8]:
# define the columns to scale:
col_scale = ["ccl_cm", "ccw_cm", "weight_kg", "year", "week_of_year", "year_woy"]


In [9]:
# setting up the scalar
standard_scaler = MinMaxScaler()
X_train_scaled = standard_scaler.fit_transform(X_train[col_scale])

X_test_scaled = standard_scaler.transform(X_test[col_scale])

In [10]:
# fit and evaluate the model (rmse), normalize the data with euclidean as a first shot
knn = KNeighborsClassifier(n_neighbors=5, metric= "euclidean")
knn.fit(X_train_scaled, y_train)

In [11]:
# calculate the predicted target
y_predict = knn.predict(X_test_scaled)

rmse = evaluate_rmse(y_test, y_predict)

Number of predictions:  5419
RMSE:  4.659


In [12]:
# cross validation to find best value of k (# of neighbours) or even GridSearch
scorer_rmse = make_scorer(mean_squared_error, squared = False)

print("CV RSME scores: ", cross_val_score(knn, X_train_scaled, y_train, cv=5, scoring=scorer_rmse, verbose = 5))

[CV] END ................................ score: (test=4.662) total time=   0.1s
[CV] END ................................ score: (test=4.791) total time=   0.0s
[CV] END ................................ score: (test=4.616) total time=   0.0s
[CV] END ................................ score: (test=4.623) total time=   0.0s
[CV] END ................................ score: (test=4.689) total time=   0.0s
CV RSME scores:  [4.66183447 4.79096454 4.61554785 4.6230966  4.68881311]


In [13]:
# Implementing grid search

# checking the parameters
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [14]:
# setting up the grid for the subsequent search to define the model being used
para_grid = {"n_neighbors" : [2, 3, 4, 5, 10],
            "weights" : ["uniform", "distance"],
            "p" : [1, 2, 3],
            "algorithm" : ["ball_tree", "kd_tree", "brute"]
            }

# setting up the grid search: TODO change the scoring to RMSE
gs = GridSearchCV(estimator=KNeighborsClassifier(),
                param_grid = para_grid,
                scoring = scorer_rmse,
                n_jobs = -1,
                cv = 5,
                verbose = 5
                )



In [15]:
# conducting gridsearch and calculating time needed
start_time = timer()
gs.fit(X_train_scaled, y_train)
end_time = timer()

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[CV 1/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.436 total time=   0.1s
[CV 4/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.076 total time=   0.1s
[CV 1/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.463 total time=   0.1s
[CV 2/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.449 total time=   0.1s
[CV 4/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.188 total time=   0.1s
[CV 5/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.431 total time=   0.1s
[CV 2/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.414 total time=   0.1s
[CV 3/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.324 total time=   0.1s
[CV 5/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.448 total time=   0.1s
[CV 3/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=unifor

In [16]:
# outputting the time for the grid search
gs_time = end_time - start_time
gs_time

26.432123874954414

In [17]:
# Best score
print('Best score:', round(gs.best_score_, 3))

# Best parameters
print('Best parameters:', gs.best_params_)

Best score: 4.759
Best parameters: {'algorithm': 'kd_tree', 'n_neighbors': 10, 'p': 3, 'weights': 'uniform'}


In [18]:
# best estimator
knn_best = gs.best_estimator_
print(knn_best)

# Making predictions on the test set TODO also the scaled?
y_pred_test = knn_best.predict(X_test_scaled)

rmse = evaluate_rmse(y_test, y_predict)

rmse

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10, p=3)
Number of predictions:  5419
RMSE:  4.659


4.659171566126135

In [19]:
# Random Search
random_search = RandomizedSearchCV(
                            KNeighborsClassifier(), 
                            para_grid, 
                            n_iter= 500, 
                            scoring=scorer_rmse, 
                            n_jobs = -1, 
                            cv=5, 
                            random_state = RSEED,
                            verbose = 5)


In [20]:
# conducting randomsearch
start_time = timer()
random_search.fit(X_train_scaled, y_train)
end_time = timer()

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.463 total time=   0.1s
[CV 1/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.436 total time=   0.1s
[CV 2/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.414 total time=   0.1s
[CV 2/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.449 total time=   0.1s
[CV 3/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.364 total time=   0.1s
[CV 3/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.324 total time=   0.1s
[CV 4/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.188 total time=   0.1s
[CV 5/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=uniform;, score=4.431 total time=   0.1s
[CV 4/5] END algorithm=ball_tree, n_neighbors=2, p=1, weights=distance;, score=4.076 total time=   0.1s
[CV 5/5

In [21]:
random_search_time = end_time - start_time
random_search_time

28.79492658295203

In [22]:
# Best score
print('Best score:', round(random_search.best_score_, 3))

# Best parameters
print('Best parameters:', random_search.best_params_)

Best score: 4.759
Best parameters: {'weights': 'uniform', 'p': 3, 'n_neighbors': 10, 'algorithm': 'kd_tree'}


In [23]:
# best estimator
knn_best_rs = random_search.best_estimator_
print(knn_best_rs)

# Making predictions on the test set TODO also the scaled?
y_pred_test_rs = knn_best_rs.predict(X_test_scaled)

rmse_rs = evaluate_rmse(y_test, y_pred_test_rs)

rmse_rs

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10, p=3)


Number of predictions:  5419
RMSE:  4.75


4.750287721591574