<a href="https://colab.research.google.com/github/menaenahoro/Recommendation_systems/blob/main/Collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Recommendation_systems/u.data", sep='\t', names=column_names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Recommendation_systems/u.item", usecols=[0,1], names=['item_id', 'title'], sep='|', encoding='latin-1')
data.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))

Num. of Users: 943
Num of Movies: 1682


In [6]:
df1 = pd.merge(df,data,on='item_id')
df1.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


**Splitting Our DataSet**

In [7]:
#Creating a train & test dataset
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df1, test_size=0.25)

In [8]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
79724,624,886,4,879792251,"Life Less Ordinary, A (1997)"
62969,496,433,4,876066904,Heathers (1989)
15819,346,685,3,874950383,Executive Decision (1996)
18491,271,317,3,885848863,In the Name of the Father (1993)
88256,295,420,4,879518233,Alice in Wonderland (1951)


In [9]:
test_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
81339,667,192,5,891034947,Raging Bull (1980)
17445,295,151,4,879517635,Willy Wonka and the Chocolate Factory (1971)
62576,734,230,2,891022803,Star Trek IV: The Voyage Home (1986)
47152,720,315,4,891262608,Apt Pupil (1998)
76892,927,411,4,879182939,"Nutty Professor, The (1996)"


## Creating a matrix vector and then using the rating to calculate the similarity

In [10]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [11]:
print(train_data_matrix)

[[0. 0. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]


In [12]:
'''Using pairwise distances to calculate the similarity and create a matrix vector with output 0-1'''
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = pairwise_distances(train_data_matrix, metric='cosine')

item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine') #Using matrix transpose .T


In [13]:
user_similarity

array([[0.        , 0.84232307, 0.95419124, ..., 0.87105478, 0.88004756,
        0.69000669],
       [0.84232307, 0.        , 0.91298734, ..., 0.84153181, 0.91667526,
        0.9040425 ],
       [0.95419124, 0.91298734, 0.        , ..., 0.90144049, 0.92052463,
        0.96620014],
       ...,
       [0.87105478, 0.84153181, 0.90144049, ..., 0.        , 0.89950045,
        0.89887131],
       [0.88004756, 0.91667526, 0.92052463, ..., 0.89950045, 0.        ,
        0.86706288],
       [0.69000669, 0.9040425 , 0.96620014, ..., 0.89887131, 0.86706288,
        0.        ]])

In [14]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred



item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')



In [20]:
item_prediction

array([[0.3559151 , 0.36452439, 0.37831982, ..., 0.42750327, 0.42593694,
        0.42593694],
       [0.0897685 , 0.10203258, 0.09748293, ..., 0.10127517, 0.10291493,
        0.10291493],
       [0.0710531 , 0.07450119, 0.07260166, ..., 0.07039278, 0.07376562,
        0.07376562],
       ...,
       [0.03242118, 0.04111668, 0.03977172, ..., 0.04582536, 0.04580607,
        0.04580607],
       [0.12513181, 0.13363865, 0.14121296, ..., 0.14762059, 0.14753123,
        0.14753123],
       [0.21392926, 0.2021914 , 0.22692134, ..., 0.26357867, 0.26174896,
        0.26174896]])

In [19]:
#Root Mean Squared Error for validation.
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(pred, test):
    pred = pred[test.nonzero()].flatten() 
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, test))

print("Collaborative Filtering RMSE")
print('User-based: ', rmse(user_prediction, test_data_matrix))   # ~3.12584229228
print('Movie-based: ', rmse(item_prediction, test_data_matrix))	  # ~3.45381500808


Collaborative Filtering RMSE
User-based:  3.1296833506501773
Movie-based:  3.4568277589037466
