In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import time
from scipy import stats
from statistics import mean
from sklearn.metrics import accuracy_score

In [2]:
print("Loading dataframe")
training = pd.read_pickle('training.pkl')
test = pd.read_pickle('sub_test.pkl')
avg_ratings = pd.read_pickle('avg_ratings.pkl')

Loading dataframe


In [3]:
def similar_users(x1, x2):
    similar = np.square(4-(x1['Rating'] - x2['Rating']))
    return np.sqrt(similar)

In [4]:
def calculate_similarity(row, dataset):
    list_ = []
    length = dataset.shape[0]
    for i in range(0, length):
        row2 = dataset.iloc[i,:]
        list_.append(similar_users(row, row2))
    return np.asarray(list_)

In [14]:
def knn_user_similarity(train, test, k):
  unique_ID = test['Customer_ID'].unique()
  test_length = test.shape[0]
  predicted_labels = []
  knn_set = {}

  for id in unique_ID:
    similarity_users = []
    user_train = train[train['Customer_ID']==id]
    others_train = train[train['Customer_ID']!=id]
    user_length = user_train.shape[0]
    for i in range(0, user_length):
      user_row = user_train.iloc[i,:]
      movie_id = user_row['Movie_ID']
      # look only at the movie that other customers has rated
      movie_train = others_train[others_train['Movie_ID']==movie_id]

      if movie_train.shape[0] > 0:
        # find closest k users
        similarity = calculate_similarity(user_row, movie_train)
        sorted_indices = np.argsort(similarity)
        nearest_indices = sorted_indices[::-1][0:k]
        for index, index_of_nn in enumerate(nearest_indices):
          similarity_users.append(movie_train.iloc[index_of_nn].loc['Customer_ID'])
    (unique, counts) = np.unique(similarity_users, return_counts=True)
    final_neighbors = np.argsort(counts)[::-1][0:k]
    knn_set[id] = unique[final_neighbors]
    
  for i in range(0, test_length):
    test_row = test.iloc[i,:]
    movie_id = test_row['Movie_ID']
    customer_id = test_row['Customer_ID']
    # search similar users ratings
    for user in knn_set[customer_id]:
      ratings = []
      condition = (train.Customer_ID == user) & (train.Movie_ID == movie_id)
      rating = train[condition]['Rating'].to_numpy()
      if len(rating) != 0:
        ratings.append(rating[0])
    if len(ratings) > 0:
      predicted_labels.append(mean(ratings))
    else: # if no other users rated the movie, just predict the average rating for the movie across all users
      avg_row = avg_ratings[avg_ratings['Movie_ID'] == movie_id]
      if avg_row.empty:
          predicted_labels[index] = 3
      else:
          avg = avg_row['Avg_Rating'].to_numpy()
          predicted_labels.append(avg[0])

  return np.asarray(predicted_labels)

In [15]:
print("Performing KNN Regression with user similarity metric")

#test = test_df.sample(200, random_state = 42)
start = time.time()
predictions = knn_user_similarity(training, test, 5)
rounded_predictions = np.around(predictions)
end = time.time()
print(end - start, " seconds elapsed")
actual = test['Rating'].to_numpy()
rmse = np.sqrt(np.square(rounded_predictions - actual).mean())
print("RMSE: ", rmse)
cm = confusion_matrix(actual, rounded_predictions)
print(cm)
accuracy = accuracy_score(actual, rounded_predictions)
print("Accuracy: ", accuracy)

Performing KNN Regression with user similarity metric


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  

3465.7393987178802  seconds elapsed
RMSE:  1.2727922061357855
[[ 3 12 15  3  0]
 [ 2 12 17  5  1]
 [ 4  6 30 10  2]
 [ 2  3 37  6  3]
 [ 0  1 13  6  7]]
Accuracy:  0.29


In [16]:
print("Saving predictions")
np.save('knn_user_predictions.npy', predictions)

Saving predictions


In [None]:
test.to_pickle('sub_test.pkl')