In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle


from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer


# One seed shared by Python's and NumPy's global random generators, so that
# repeated runs of the notebook draw the same pseudo-random numbers.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [2]:
# Read the training table and discard exact duplicate rows in one step,
# then show its size and the first rows as a sanity check.
df = pd.read_csv('train.csv').drop_duplicates()
print(df.shape)
df.head()

(252123, 91)


Unnamed: 0,Year,S0,S1,S2,S3,S4,S5,S6,S7,S8,...,S80,S81,S82,S83,S84,S85,S86,S87,S88,S89
0,2007,44.76752,114.82099,3.83239,27.99928,1.49153,-15.90853,28.24844,3.6165,-7.24653,...,-1.89619,-471.02844,411.56205,443.01198,19.30254,309.07806,-336.91706,-14.70547,-474.44157,31.3282
1,2004,52.28942,75.73319,11.35941,-6.20582,-27.64559,-30.75995,12.50955,7.47877,9.88498,...,4.5706,1.3611,-6.52977,59.48672,3.6979,-36.92252,44.08077,3.39993,-70.07591,3.86143
2,2005,33.81773,-139.07371,134.19332,17.85216,63.47408,-25.28005,-34.65911,-5.99135,1.27848,...,54.16608,15.0453,39.09107,39.03041,3.68708,-61.88547,45.68115,6.39822,3.24471,35.74749
3,1998,41.60866,3.17811,-3.97174,23.53564,-19.68553,20.74407,18.80866,6.24474,-7.98424,...,28.08591,295.88684,54.02395,102.0288,40.47711,15.10258,-250.32293,2.81288,56.05172,3.60432
4,1987,44.49525,-32.2527,58.08217,3.73684,-32.53274,-18.72885,-15.85665,-3.34607,22.63786,...,31.44988,-136.50457,-85.11989,-74.96342,9.56921,-100.61689,-133.29315,9.19246,-97.37953,30.11015


In [3]:
# Target is the 'Year' column; every remaining column is used as a feature.
y = df['Year']
X = df.drop(columns='Year')

### KNEIGHBORS REGRESSOR GRID SEARCH 

In [4]:
# Hold out 20% of the rows as a test set (no separate validation split is
# made here). The notebook-wide `seed` is reused so the split follows the same
# seed as the rest of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

# Learn each feature's min/max on the training rows only, then apply that same
# scaling to the test rows so no test information reaches the scaler.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Number of train set: ", X_train.shape[0])
print("Number of test set: ", X_test.shape[0])

Number of train set:  201698
Numebr of test set:  50425


In [5]:
###########################
## CELL NOT TO BE RE-RUN ##
###########################
# The search is slow: individual fits in the recorded run took about a minute.

# Candidate hyper-parameters: 3 neighbourhood sizes x 2 weighting schemes.
param_grid = {
    'n_neighbors': [20, 25, 30],
    'weights': ['uniform', 'distance'],
}

knn = KNeighborsRegressor()

# 5-fold cross-validation over every combination, ranked by negative MSE
# (scores closer to zero are better). Training scores are kept as well.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=4,
)
grid_search.fit(X_train, y_train)

best_hyper = grid_search.best_params_
print("Best hyper: ", best_hyper)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END n_neighbors=20, weights=distance;, score=(train=-0.000, test=-77.089) total time=  57.6s
[CV 3/5] END n_neighbors=20, weights=distance;, score=(train=-0.000, test=-77.170) total time=  57.3s
[CV 2/5] END n_neighbors=20, weights=distance;, score=(train=-0.000, test=-77.739) total time=  57.9s
[CV 5/5] END n_neighbors=20, weights=uniform;, score=(train=-70.522, test=-78.707) total time=  57.0s
[CV 2/5] END n_neighbors=20, weights=uniform;, score=(train=-70.551, test=-78.509) total time=  57.9s
[CV 1/5] END n_neighbors=20, weights=uniform;, score=(train=-70.615, test=-77.856) total time=  58.5s
[CV 3/5] END n_neighbors=20, weights=uniform;, score=(train=-70.556, test=-77.919) total time=  58.1s
[CV 4/5] END n_neighbors=20, weights=uniform;, score=(train=-70.618, test=-77.310) total time=  58.5s
[CV 4/5] END n_neighbors=20, weights=distance;, score=(train=-0.000, test=-76.550) total time= 1.1min
[CV 1/5] END n_neighbo

#### Il grid search richiede molto tempo per essere rieseguito: riutilizziamo direttamente il miglior modello KNN (`best_estimator_`) restituito da grid search, senza riaddestrarlo

In [6]:
# Use the estimator selected by the grid search and score it on the held-out
# test rows (already scaled with the training-set scaler).
model_knn = grid_search.best_estimator_
y_test_pred = model_knn.predict(X_test)

# MSE is computed once and reused for RMSE.
mse_test = mean_squared_error(y_test, y_test_pred)

print("R² (Test):", r2_score(y_test, y_test_pred))
print("MSE (Test):", mse_test)
print("MAE (Test):", mean_absolute_error(y_test, y_test_pred))
print("MAPE (Test):", mean_absolute_percentage_error(y_test, y_test_pred))
print("RMSE (Test):", np.sqrt(mse_test))

R² (Test): 0.30962254021099844
MSE (Test): 75.24596539837536
MAE (Test): 6.295440483857536
MAPE (Test): 0.0031585442517285327
RMSE (Test): 8.674443232759977


In [9]:
# Persist the fitted scaler and the selected KNN model so they can be reloaded
# without repeating the grid search. `with` guarantees each file handle is
# closed even if pickling raises part-way through.
# NOTE: pickle files must only be loaded from a trusted source.
with open("modelli/KNN/scaler_knn.save", "wb") as file_scaler:
    pickle.dump(scaler, file_scaler)

with open("modelli/KNN/model_knn.save", "wb") as file_model:
    pickle.dump(model_knn, file_model)