<a href="https://colab.research.google.com/github/van26101998/Recommendation-System/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install and import

In [1]:
# install required packages
!pip install -q comet_ml wandb

In [2]:
import wandb
wandb_run = wandb.init(project="knn")

[34m[1mwandb[0m: Currently logged in as: [33mvan26101998[0m (use `wandb login --relogin` to force relogin)


In [3]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
import zipfile
from sklearn.model_selection import train_test_split
from time import time

# Movielens 1M Dataset

In [4]:
!wget -c http://files.grouplens.org/datasets/movielens/ml-1m.zip
import zipfile
with zipfile.ZipFile('ml-1m.zip', 'r') as zip_ref:
    zip_ref.extractall()

--2021-01-07 00:47:36--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [5]:
ratings_df = pd.read_csv('./ml-1m/ratings.dat', sep="::", engine='python', names=['user', 'item', 'rating', 'timestamp'])
users = sorted(ratings_df['user'].unique())
items = sorted(ratings_df['item'].unique())
ratings_df

Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# train validate test split
train_df, test_df = train_test_split(ratings_df, test_size=0.1)
train_df, validate_df = train_test_split(train_df, test_size=0.1)

In [7]:
print("number of train-validate-test:", len(train_df), len(validate_df), len(test_df))

number of train-validate-test: 810169 90019 100021


# Model

In [8]:
class KNNCF:
    """
        K-nearest-neighbor colaborative filtering
        Params:
            ratings_df: DataFrame contains user, item, rating, timestamp
            users: sorted list of users
            items: sorted list of items
            k: number of neighbors
            dist_func: function to calculate similarity
            uuCF: user-user CF if True, otherwise item-item CF
    """
    def __init__(self, ratings_df, users, items, k, dist_func = cosine_similarity, uuCF = True):
        
        self.n_users = len(users)
        self.n_items = len(items)

        self.user_to_id = {user:id for id, user in enumerate(users)}
        self.item_to_id = {item:id for id, item in enumerate(items)}
        
        self.ratings_df = ratings_df.copy()
        self.ratings_df['user'] = self.ratings_df['user'].apply(lambda x: self.user_to_id[x])
        self.ratings_df['item'] = self.ratings_df['item'].apply(lambda x: self.item_to_id[x])
        

        self.Y_data = self.ratings_df.to_numpy() 
        self.k = k 
        self.dist_func = dist_func
        self.uuCF = uuCF
        if not uuCF:
            self.Y_data = self.Y_data[:, [1, 0, 2]]
            self.n_items, self.n_users = self.n_users, self.n_items

        self.Ybar_data = None        

    
    def normalize_Y(self):
        users = self.Y_data[:, 0]
        self.Ybar_data = self.Y_data.copy()
        
        self.mu = np.zeros((self.n_users,))
        for user_id in range(self.n_users):
            # row indices of rating done by user n
            ids = np.where(users == user_id)[0].astype(np.int32)
            
            item_ids = self.Y_data[ids, 1] 
            ratings = self.Y_data[ids, 2]
            
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[user_id] = m
            
            # normalize
            self.Ybar_data[ids, 2] = ratings - np.full((len(ratings),), m)
        
        #store nonzeros only and their locations
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

            

    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)  
    
    def fit(self):
        self.normalize_Y()
        self.similarity()
    

    def __pred(self, u_id, i_id, normalize = 1):
        """ 
        predict the rating of user u for item i (normalized)
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i_id)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u_id, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i_id, users_rated_i[a]]
        if normalize:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u_id]
    
    def pred(self, u_id, i_id, normalize = 1):
        """ 
        predict the rating of user u for item i (normalize)
        """
        if self.uuCF: return self.__pred(u_id, i_id, normalize)
        return self.__pred(i_id, u_id, normalize)
    
    def pred_real(self, user, item, normalize = 1):
        u_id = self.user_to_id[user]
        i_id = self.item_to_id[item]
        
        return self.pred(u_id, i_id, normalize)

# Run

In [9]:
k = 30
uuCF = False

In [10]:
params = {
    "k": k,
    'uuCF': uuCF
}
wandb.config.update(params)

In [11]:
knncf = KNNCF(train_df, users, items, k=k, uuCF=uuCF)

In [None]:
t0 = time()

knncf.fit()

fit_time = time() - t0
print("fit_time:", fit_time)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [None]:
t0 = time()
square_error = 0.0

for user, item, rating, timestamp in test_df.to_numpy():
    pred = knncf.pred_real(user, item, normalize=0)
    square_error += (rating - pred)**2

mse = square_error / len(test_df)
rmse = np.sqrt(mse)
print("rmse:", rmse)

test_time = time() - t0
print("test time:", test_time)

In [None]:
wandb.log({
    "test_rmse": rmse,
    "fit_time": fit_time,
    "test_time": test_time
})

In [None]:
wandb_run.finish()

In [None]:
for user, item, rating, timestamp in test_df.to_numpy()[:10]:
    pred = knncf.pred_real(user, item, normalize=0)
    print("rating:", rating, '\tpred:', pred)