In [16]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

scalers = {
    'without_scaler': None,
    'min_max': MinMaxScaler(),
    'z-score': StandardScaler()
}

regressors = {
  'knn': KNeighborsRegressor(n_neighbors=15),
  'decicion_tree': DecisionTreeRegressor(),
  'linear_regression': LinearRegression(),
  'random_forest': RandomForestRegressor(),
  'neural_network_mlp': MLPRegressor()
}

ratings = pd.read_csv('dataset_small/ratings.csv')

display(ratings)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [17]:
X = ratings.drop(columns=['rating', 'timestamp'])
y = ratings['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

random_row = X_test.sample(n=1, random_state=42)
display(ratings.loc[(ratings['movieId'] == random_row.iloc[0]['movieId']) & (ratings['userId'] == random_row.iloc[0]['userId'])])

dataframe_predict = {}

for regressor in regressors:
  for scaler in scalers:
    if(scalers[scaler] == None):
      pipe = Pipeline([('regressor', regressors[regressor])])
    else:
      pipe = Pipeline([(scaler, scalers[scaler]), ('regressor', regressors[regressor])])
      
    pipe.fit(X_train, y_train)
    y_predict = pipe.predict(random_row)

    if regressor in dataframe_predict:
        dataframe_predict[regressor].append(y_predict[0])
    else:
        dataframe_predict[regressor] = [y_predict[0]]
    
predict_df = pd.DataFrame.from_dict(dataframe_predict, orient='index', columns=['without-scaler', 'min-max', 'z-score'])

display(predict_df)

Unnamed: 0,userId,movieId,rating,timestamp
81182,514,1704,4.0,1533871404


Unnamed: 0,without-scaler,min-max,z-score
knn,3.766667,3.466667,3.466667
decicion_tree,2.5,2.5,2.5
linear_regression,3.452323,3.452323,3.452323
random_forest,2.82,2.99,2.96
neural_network_mlp,3.431498,3.5326,3.513094


In [18]:
movies = pd.read_csv('dataset_small/movies.csv')

merged = ratings.merge(movies[['movieId', 'genres']], on='movieId', how='left')

genres_encoded = merged['genres'].str.get_dummies(sep='|')
data = pd.concat([merged, genres_encoded], axis=1)
data.drop('genres', axis=1, inplace=True)

display(data)

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,964981247,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,964982224,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,964983815,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,964982931,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
100832,610,168248,5.0,1493850091,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
100833,610,168250,5.0,1494273047,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
100834,610,168252,5.0,1493846352,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [19]:
X = data.drop(columns=['rating', 'timestamp'])
y = data['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

random_row = X_test.sample(n=1, random_state=42)
display(merged.loc[(data['movieId'] == random_row.iloc[0]['movieId']) & (data['userId'] == random_row.iloc[0]['userId'])])

dataframe_predict = {}

for regressor in regressors:
  for scaler in scalers:
    if(scalers[scaler] == None):
      pipe = Pipeline([('regressor', regressors[regressor])])
    else:
      pipe = Pipeline([(scaler, scalers[scaler]), ('regressor', regressors[regressor])])
      
    pipe.fit(X_train, y_train)
    y_predict = pipe.predict(random_row)

    if regressor in dataframe_predict:
        dataframe_predict[regressor].append(y_predict[0])
    else:
        dataframe_predict[regressor] = [y_predict[0]]
    
predict_df = pd.DataFrame.from_dict(dataframe_predict, orient='index', columns=['without-scaler', 'min-max', 'z-score'])

display(predict_df)

Unnamed: 0,userId,movieId,rating,timestamp,genres
81182,514,1704,4.0,1533871404,Drama|Romance


Unnamed: 0,without-scaler,min-max,z-score
knn,3.766667,3.1,3.433333
decicion_tree,4.0,4.0,4.0
linear_regression,3.581364,3.581364,3.581364
random_forest,3.755,3.825,3.88
neural_network_mlp,4.321766,3.577465,3.525955
