In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import math as m

#Wczytanie, podział i preprocessing danych

col_names = ["user_id","movie_id","rating"]
ratings_data=pd.read_csv("/content/drive/MyDrive/rec/u.data", sep="\t", names=col_names, usecols=['user_id', "movie_id", "rating"])
#print(ratings_data.head)

X=ratings_data.copy()
X=X.drop("rating", axis=1)
Y=ratings_data["rating"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
#print(X_train.shape)

X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values
Y_test = Y_test.values

ratings_df = ratings_data.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
#print(ratings_df)

#Tutaj próba cosine similarity

#cos_sim = cosine_similarity(ratings_df,ratings_df)
#print(cos_sim)
#sim_matrix = pd.DataFrame(cos_sim, index=ratings_df.index, columns=ratings_df.index)
#print(sim_matrix)

#Pearson similarity i obliczanie średniej

ratings_matrix = ratings_df.values
user_sim = np.corrcoef(ratings_matrix)
#print(user_sim)

mean_ratings = np.zeros(943)
for i in range(943):
  if np.count_nonzero(ratings_matrix[:, i]) == 0:
    mean_ratings[i] = 0
  else:             
    mean_ratings[i] = np.sum(ratings_matrix[:, i])/np.count_nonzero(ratings_matrix[:, i])
#print(mean_ratings)

#Funkcje

def get_k_nearest(correlations,k):
    sort_corr = np.sort(correlations)
    k_nearest = sort_corr[-k-1:-1]
    k_nearest = np.array(k_nearest)
    return  k_nearest

def prediction(k):
  predict = []
  for x in range(Y_test.size):
    user = X_test[x][0]-1
    movie_id = X_test[x][1]-1
    top = 0
    bottom = 0
    k_nearest = get_k_nearest(user_sim[user,:], k)
    for y in range(943):
      if ratings_matrix[y, movie_id] != 0:
        if user_sim[user, y] in k_nearest:
          sim = user_sim[user, y]
          means = ratings_matrix[y, movie_id]-mean_ratings[y]
          top += sim * means
          bottom += sim
    if bottom > 0:
      predict.append(mean_ratings[user] + (top/bottom))
    else:
      predict.append(0)
  return predict

def rmse(p):
  sum = 0
  for i in range(len(p)):
    sum += pow(abs(p[i] - Y_test[i]),2)
  rmse = m.sqrt(1/len(p)*sum)
  return rmse

def mae(p):
  sum = 0
  for i in range(len(p)):
    sum += abs(p[i] - Y_test[i])
  mae = 1/len(p)*sum
  return mae

#Obliczanie błędów

for x in range(10, 100, 20):
  e=prediction(x)
  print("Values of errors for k = "+str(x))
  print("Root Mean Square Error: "+str(rmse(e)))
  print("Mean Absolute Error: "+str(mae(e)))

for x in range(100, 1000, 200):
  e=prediction(x)
  print("Values of errors for k = "+str(x))
  print("Root Mean Square Error: "+str(rmse(e)))
  print("Mean Absolute Error: "+str(mae(e)))

In [None]:
#Dodatkowo chciałem znaleźć najbardziej optymalne k
for x in range(400, 570, 10):
  e=prediction(x)
  print("Values of errors for k = "+str(x))
  print("Root Mean Square Error: "+str(rmse(e)))
  print("Mean Absolute Error: "+str(mae(e)))

**Po predykcji uzyskałem takie wartości dla błędów:**

Values of errors for k = 10
* Root Mean Square Error: 1.5242340491924888
* Mean Absolute Error: 1.1605596833780218

Values of errors for k = 30
* Root Mean Square Error: 1.3187188926025402
* Mean Absolute Error: 1.0211834279859358

Values of errors for k = 50
* Root Mean Square Error: 1.2673690316588953
* Mean Absolute Error: 0.9890128890652603

Values of errors for k = 70
* Root Mean Square Error: 1.2393702949146346
* Mean Absolute Error: 0.9706871976565834

Values of errors for k = 90
* Root Mean Square Error: 1.2300634290425976
* Mean Absolute Error: 0.964566342888497

Values of errors for k = 100
* Root Mean Square Error: 1.2280967226539627
* Mean Absolute Error: 0.9636672138467832

Values of errors for k = 300
* Root Mean Square Error: 1.2121299919417168
* Mean Absolute Error: 0.956650452985221

Values of errors for k = 500
* Root Mean Square Error: 1.2101455574973712
* Mean Absolute Error: 0.9562652828687958

Values of errors for k = 700
* Root Mean Square Error: 1.210635791570899
* Mean Absolute Error: 0.9574653046930408

Values of errors for k = 900
* Root Mean Square Error: 1.2111190953469082
* Mean Absolute Error: 0.9576507916364212

Po żmudnej robocie szukania najbardziej optymalnego k wyszło mi:
około k=420 dla najmniejszego RMSE
około k=430 dla najmniejszego MAE


In [88]:
#Naive Recommender
#print(ratings_matrix)
naive = np.mean(ratings_matrix, axis=0)
print("MAE: " + str(mae(naive)))
print("RMSE: " + str(rmse(naive)))

MAE: 3.3199701663045698
RMSE: 3.524081577268136


Porównując CF vs Naive Recommender nie da się zauważyć, że CF deklasuje przeciwnika pod względem obu miar błędów.