In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from IPython.display import display
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error

# Seed shared by every stochastic estimator below, so that a fresh
# Restart & Run All reproduces the same scores.
SEED = 42

# Load the ratings CSV (path is relative to the notebook's working directory).
data = pd.read_csv('dataset_small/ratings.csv')

# Features: every column except the target ('rating') and 'timestamp'.
X = data.drop(columns=['rating', 'timestamp'])
y = data['rating']

# Fold counts passed as `cv` to cross_validate in the experiment cells.
k_folds = [5, 10]

# Scaling strategies compared in every experiment; None means "no scaling step".
# Insertion order matters: the experiment cells fill result columns in this order.
scalers = {
    'without_scaler': None,
    'min_max': MinMaxScaler(),
    'z_score': StandardScaler(),
}

# Regressors compared against each other; the key is used as the row label.
# NOTE(review): 'decicion_tree' is a typo for 'decision_tree'. The key is kept
# unchanged here because other cells may look it up by name -- confirm and rename.
regressors = {
    'knn': KNeighborsRegressor(n_neighbors=15),
    # Tree, forest and MLP are randomized; pin their seed for reproducibility.
    'decicion_tree': DecisionTreeRegressor(random_state=SEED),
    'linear_regression': LinearRegression(),
    'random_forest': RandomForestRegressor(random_state=SEED),
    'neural_network_mlp': MLPRegressor(random_state=SEED),
}

# Error metrics reported by cross_validate. greater_is_better=False makes the
# scorer return the negated error, so consumers take the absolute value.
scoring = {
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
}

### KNN

In [2]:
# Hyperparameter grid for the KNN experiment.
n_neighbors = [5, 9, 13, 15]
weights = ['distance', 'uniform']

# Row label -> list of mean scores, one entry per scaler (in `scalers` order).
dataframe_mse = {}
dataframe_mae = {}

# Column labels are derived from the scaler names so they always match the
# order in which scores are appended below ('min_max' -> 'min-max', ...).
column_labels = [name.replace('_', '-') for name in scalers]

for k in k_folds:
    for num in n_neighbors:
        for weight in weights:
            knnRegressor = KNeighborsRegressor(n_neighbors=num, weights=weight)
            key = f'{k}_fold-{num}_neighbors-{weight}'

            for scaler, scaler_step in scalers.items():
                # The scaler lives inside the pipeline so it is fitted on the
                # training folds only; a None entry means "no scaling step".
                steps = [('regressor', knnRegressor)]
                if scaler_step is not None:
                    steps.insert(0, (scaler, scaler_step))
                pipe = Pipeline(steps)

                results = cross_validate(pipe, X, y, cv=k, scoring=scoring)

                # Scorers return negated errors; abs() restores the positive
                # error before averaging over the folds.
                mse_scores = np.mean(np.abs(results['test_mean_squared_error']))
                mae_scores = np.mean(np.abs(results['test_mean_absolute_error']))

                dataframe_mse.setdefault(key, []).append(mse_scores)
                dataframe_mae.setdefault(key, []).append(mae_scores)

# One row per (folds, neighbors, weighting) combination, one column per scaler.
mse_df = pd.DataFrame.from_dict(dataframe_mse, orient='index', columns=column_labels)
mae_df = pd.DataFrame.from_dict(dataframe_mae, orient='index', columns=column_labels)

display('MSE', mse_df)
display('MAE', mae_df)

'MSE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-5_neighbors-distance,1.27487,1.463736,1.46172
5_fold-5_neighbors-uniform,1.268933,1.464569,1.462513
5_fold-9_neighbors-distance,1.169467,1.391915,1.391638
5_fold-9_neighbors-uniform,1.163369,1.392775,1.392226
5_fold-13_neighbors-distance,1.126739,1.348024,1.347542
5_fold-13_neighbors-uniform,1.121947,1.348305,1.34743
5_fold-15_neighbors-distance,1.115021,1.341134,1.334807
5_fold-15_neighbors-uniform,1.110959,1.341326,1.334479
10_fold-5_neighbors-distance,1.281893,1.457744,1.451785
10_fold-5_neighbors-uniform,1.271565,1.45794,1.451928


'MAE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-5_neighbors-distance,0.894135,0.975115,0.974552
5_fold-5_neighbors-uniform,0.891473,0.975426,0.974768
5_fold-9_neighbors-distance,0.85644,0.949696,0.949742
5_fold-9_neighbors-uniform,0.853614,0.949904,0.949809
5_fold-13_neighbors-distance,0.840197,0.934201,0.932998
5_fold-13_neighbors-uniform,0.837959,0.934153,0.932771
5_fold-15_neighbors-distance,0.835611,0.931917,0.928844
5_fold-15_neighbors-uniform,0.833582,0.931895,0.928588
10_fold-5_neighbors-distance,0.89213,0.952641,0.950172
10_fold-5_neighbors-uniform,0.888063,0.95264,0.950161


### Comparação entre métodos

In [3]:
# Row label -> list of mean scores, one entry per scaler (in `scalers` order).
dataframe_mse = {}
dataframe_mae = {}

# Column labels are derived from the scaler names so they always match the
# order in which scores are appended below ('min_max' -> 'min-max', ...).
column_labels = [name.replace('_', '-') for name in scalers]

for k in k_folds:
    for regressor, estimator in regressors.items():
        key = f'{k}_fold-{regressor}'

        for scaler, scaler_step in scalers.items():
            # The scaler lives inside the pipeline so it is fitted on the
            # training folds only; a None entry means "no scaling step".
            steps = [('regressor', estimator)]
            if scaler_step is not None:
                steps.insert(0, (scaler, scaler_step))
            pipe = Pipeline(steps)

            results = cross_validate(pipe, X, y, cv=k, scoring=scoring)

            # Scorers return negated errors; abs() restores the positive
            # error before averaging over the folds.
            mse_scores = np.mean(np.abs(results['test_mean_squared_error']))
            mae_scores = np.mean(np.abs(results['test_mean_absolute_error']))

            dataframe_mse.setdefault(key, []).append(mse_scores)
            dataframe_mae.setdefault(key, []).append(mae_scores)

# One row per (folds, regressor) combination, one column per scaler.
mse_df = pd.DataFrame.from_dict(dataframe_mse, orient='index', columns=column_labels)
mae_df = pd.DataFrame.from_dict(dataframe_mae, orient='index', columns=column_labels)

display('MSE', mse_df)
display('MAE', mae_df)
      

'MSE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-knn,1.110959,1.341326,1.334479
5_fold-decicion_tree,1.793137,1.796893,1.797272
5_fold-linear_regression,1.085898,1.085898,1.085898
5_fold-random_forest,1.275484,1.274643,1.281263
5_fold-neural_network_mlp,7.524661,1.181359,1.093392
10_fold-knn,1.134369,1.329752,1.324182
10_fold-decicion_tree,1.921614,1.927277,1.927728
10_fold-linear_regression,1.092097,1.092097,1.092097
10_fold-random_forest,1.310061,1.312288,1.318944
10_fold-neural_network_mlp,398.155513,1.108383,1.120621


'MAE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-knn,0.833582,0.931895,0.928588
5_fold-decicion_tree,1.038479,1.040299,1.039917
5_fold-linear_regression,0.830848,0.830848,0.830848
5_fold-random_forest,0.897938,0.898233,0.899731
5_fold-neural_network_mlp,1.719097,0.873949,0.837424
10_fold-knn,0.841091,0.908348,0.906148
10_fold-decicion_tree,1.074846,1.077087,1.077166
10_fold-linear_regression,0.833547,0.833547,0.833547
10_fold-random_forest,0.898434,0.899163,0.902666
10_fold-neural_network_mlp,6.677868,0.843353,0.845283
