In [20]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from IPython.display import display
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error

# Load the ratings table and the movie metadata table from the small dataset.
ratings = pd.read_csv('dataset_small/ratings.csv')
movies = pd.read_csv('dataset_small/movies.csv')

# Left-join so every rating row is kept and gains its movie's genre string.
movie_genres = movies[['movieId', 'genres']]
merged = ratings.merge(movie_genres, how='left', on='movieId')

# Split the '|'-separated genre string into one 0/1 indicator column per
# genre, then swap those indicators in for the raw 'genres' text column.
genres_encoded = merged['genres'].str.get_dummies(sep='|')
data = pd.concat([merged, genres_encoded], axis=1).drop(columns='genres')

# Target is the rating; features are every other column except 'timestamp'.
# NOTE(review): the identifier column 'movieId' (and any other id columns
# coming from ratings.csv) stays in X as a plain numeric feature — confirm
# that this is intended.
y = data['rating']
X = data.drop(columns=['rating', 'timestamp'])

# Numbers of cross-validation folds evaluated for every experiment.
k_folds = [5, 10]

# Fixed seed for the stochastic estimators below, so that their scores are
# the same on every Restart & Run All instead of drifting between runs.
SEED = 42

# Optional preprocessing step placed in front of each regressor.
# None means "no scaling": the regressor is fitted on the raw features.
scalers = {
    'without_scaler': None,
    'min_max': MinMaxScaler(),
    'z_score': StandardScaler()
}

# Models compared against each other. The keys are used verbatim in the row
# labels of the result tables, so their spelling is kept unchanged.
regressors = {
    'knn': KNeighborsRegressor(n_neighbors=15),
    'decicion_tree': DecisionTreeRegressor(random_state=SEED),
    'linear_regression': LinearRegression(),
    'random_forest': RandomForestRegressor(random_state=SEED),
    'neural_network_mlp': MLPRegressor(random_state=SEED)
}

# Error metrics reported by cross_validate. greater_is_better=False makes
# each scorer return the negated error, so consumers of these scores must
# flip the sign (or take the absolute value) before reporting them.
scoring = {
    'mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
    'mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
}

### KNN

In [18]:
# KNN sweep: for every fold count, neighbour count and weighting scheme,
# cross-validate the regressor once per scaler and tabulate the mean errors.
n_neighbors = [5, 9, 13, 15]
weights = ['distance', 'uniform']

# Row label -> list of scores, one entry per scaler, in `scalers` order.
dataframe_mse = {}
dataframe_mae = {}

for k in k_folds:
    for num in n_neighbors:
        for weight in weights:
            knnRegressor = KNeighborsRegressor(num, weights=weight)
            key = f'{k}_fold-{num}_neighbors-{weight}'

            for scaler, transformer in scalers.items():
                # A None entry means "no scaling": the pipeline is then just
                # the regressor. Use an identity check rather than `== None`.
                steps = [('regressor', knnRegressor)]
                if transformer is not None:
                    steps.insert(0, (scaler, transformer))
                pipe = Pipeline(steps)

                # NOTE(review): an integer `cv` gives unshuffled K-fold
                # splits for regressors (per the scikit-learn docs); if the
                # rows of X are ordered (e.g. grouped by user), consider a
                # shuffled KFold — confirm against the dataset.
                results = cross_validate(pipe, X, y, cv=k, scoring=scoring)

                # The scorers report negated errors; abs() restores the
                # usual positive MSE / MAE before averaging over the folds.
                mse_scores = np.mean(np.abs(results['test_mean_squared_error']))
                mae_scores = np.mean(np.abs(results['test_mean_absolute_error']))

                dataframe_mse.setdefault(key, []).append(mse_scores)
                dataframe_mae.setdefault(key, []).append(mae_scores)

# Derive the column labels from the scaler names so they always match the
# order (and number) of scores appended above, instead of hard-coding them.
scaler_columns = [name.replace('_', '-') for name in scalers]

mse_df = pd.DataFrame.from_dict(dataframe_mse, orient='index', columns=scaler_columns)
mae_df = pd.DataFrame.from_dict(dataframe_mae, orient='index', columns=scaler_columns)

display('MSE', mse_df)
display('MAE', mae_df)

'MSE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-5_neighbors-distance,1.274086,1.223631,1.220114
5_fold-5_neighbors-uniform,1.268754,1.215524,1.21165
5_fold-9_neighbors-distance,1.16873,1.135296,1.129115
5_fold-9_neighbors-uniform,1.163156,1.128572,1.122627
5_fold-13_neighbors-distance,1.126294,1.095484,1.089937
5_fold-13_neighbors-uniform,1.122007,1.090908,1.085206
5_fold-15_neighbors-distance,1.114512,1.083358,1.077373
5_fold-15_neighbors-uniform,1.110874,1.079522,1.073139
10_fold-5_neighbors-distance,1.280384,1.248853,1.239046
10_fold-5_neighbors-uniform,1.270805,1.236411,1.225199


'MAE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-5_neighbors-distance,0.893693,0.874938,0.872706
5_fold-5_neighbors-uniform,0.891279,0.871325,0.86904
5_fold-9_neighbors-distance,0.856212,0.841824,0.83904
5_fold-9_neighbors-uniform,0.853641,0.83878,0.836084
5_fold-13_neighbors-distance,0.84001,0.82657,0.823398
5_fold-13_neighbors-uniform,0.838017,0.824185,0.821047
5_fold-15_neighbors-distance,0.835347,0.821531,0.818368
5_fold-15_neighbors-uniform,0.833502,0.819376,0.816275
10_fold-5_neighbors-distance,0.891536,0.877641,0.874088
10_fold-5_neighbors-uniform,0.887772,0.872563,0.868622


### Comparação entre métodos

In [21]:
# Model comparison: cross-validate every regressor with every scaler for
# each fold count and tabulate the mean errors.
# Row label -> list of scores, one entry per scaler, in `scalers` order.
dataframe_mse = {}
dataframe_mae = {}

for k in k_folds:
    for regressor, model in regressors.items():
        key = f'{k}_fold-{regressor}'

        for scaler, transformer in scalers.items():
            # A None entry means "no scaling": the pipeline is then just the
            # regressor. Use an identity check rather than `== None`.
            steps = [('regressor', model)]
            if transformer is not None:
                steps.insert(0, (scaler, transformer))
            pipe = Pipeline(steps)

            # NOTE(review): an integer `cv` gives unshuffled K-fold splits
            # for regressors (per the scikit-learn docs); if the rows of X
            # are ordered (e.g. grouped by user), consider a shuffled KFold
            # — confirm against the dataset.
            results = cross_validate(pipe, X, y, cv=k, scoring=scoring)

            # The scorers report negated errors; abs() restores the usual
            # positive MSE / MAE before averaging over the folds.
            mse_scores = np.mean(np.abs(results['test_mean_squared_error']))
            mae_scores = np.mean(np.abs(results['test_mean_absolute_error']))

            dataframe_mse.setdefault(key, []).append(mse_scores)
            dataframe_mae.setdefault(key, []).append(mae_scores)

# Derive the column labels from the scaler names so they always match the
# order (and number) of scores appended above, instead of hard-coding them.
scaler_columns = [name.replace('_', '-') for name in scalers]

mse_df = pd.DataFrame.from_dict(dataframe_mse, orient='index', columns=scaler_columns)
mae_df = pd.DataFrame.from_dict(dataframe_mae, orient='index', columns=scaler_columns)

display('MSE', mse_df)
display('MAE', mae_df)
      

'MSE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-knn,1.110874,1.079522,1.073139
5_fold-decicion_tree,1.854833,1.869555,1.850341
5_fold-linear_regression,1.044922,1.044922,1.044922
5_fold-random_forest,1.157199,1.157782,1.156089
5_fold-neural_network_mlp,23212.294727,1.022487,1.028647
10_fold-knn,1.134032,1.098773,1.088921
10_fold-decicion_tree,1.935694,1.895479,1.886381
10_fold-linear_regression,1.051066,1.051066,1.051066
10_fold-random_forest,1.196646,1.198801,1.193588
10_fold-neural_network_mlp,714.002061,1.030498,1.033182


'MAE'

Unnamed: 0,without-scaler,min-max,z-score
5_fold-knn,0.833502,0.819376,0.816275
5_fold-decicion_tree,1.055813,1.057737,1.053403
5_fold-linear_regression,0.813784,0.813784,0.813784
5_fold-random_forest,0.855946,0.856827,0.856267
5_fold-neural_network_mlp,34.360211,0.798801,0.802161
10_fold-knn,0.840955,0.822814,0.818926
10_fold-decicion_tree,1.076564,1.062517,1.060148
10_fold-linear_regression,0.81602,0.81602,0.81602
10_fold-random_forest,0.858239,0.858523,0.856776
10_fold-neural_network_mlp,5.152439,0.799802,0.799392
