# MovieLens Dataset Analysis

The MovieLens dataset contains 100,000 movie ratings provided by 943 users for 1,682 movies. It also includes demographic information about the users, such as age, gender, and occupation.

## Task

Suppose you are working for a movie streaming service and your task is to design a recommendation system to suggest movies to users based on their past ratings and demographic information. Your goal is to provide movie recommendations that users will enjoy and keep them engaged with the platform.

## Design

Design a collaborative filtering (item-based with cosine similarity) recommender system that recommends movies based on users’ rating history. 

## Evaluation

Measure the accuracy of your predictions using metrics such as the mean absolute error and the root mean squared error. 

**Note:** Using libraries is not allowed in this section.

In [1]:
import pandas as pd
import numpy as np

In [57]:
# Load the base ratings file (tab-separated, no header row) from the
# DataSet folder that sits next to this notebook.
data = pd.read_csv(
    'DataSet/u1.base',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [58]:
# Load the movie table: pipe-separated, no header row, latin-1 encoded.
# The column names are five descriptive fields followed by one 0/1 flag
# column per genre.
item = pd.read_csv(
    'DataSet/u.item',
    sep='|',
    header=None,
    encoding='latin-1',
    names=[
        'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    ] + [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [59]:
# Load the user demographics table (pipe-separated, no header row).
user = pd.read_csv(
    'DataSet/u.user', sep='|', header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [75]:
# mean-centered cosine similarity (i.e. Pearson correlation) of two vectors
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between `a` and `b` after centering each.

    Each vector has its own mean subtracted before the cosine is taken, which
    makes the result the Pearson correlation coefficient of the two vectors.

    Args:
        a: 1-D numeric vector.
        b: 1-D numeric vector of the same length as `a`.

    Returns:
        The similarity as a float. When either centered vector has zero norm
        (for example a constant vector) the similarity is undefined and 0.0
        is returned instead.
    """
    # center every vector on its own mean (computed once and reused below)
    a_centered = a - np.mean(a)
    b_centered = b - np.mean(b)

    denominator = np.linalg.norm(a_centered) * np.linalg.norm(b_centered)
    if denominator == 0:
        # float, not int 0, so the return type is the same on every path
        return 0.0
    return float(np.dot(a_centered, b_centered) / denominator)

In [61]:
def fill_empty_raiting(item_rates, n_users=None):
    """Expand sparse (user_id, rating) rows into a dense per-user vector.

    Args:
        item_rates: iterable of rows; element 0 of a row is a 1-based user id
            and element 1 is that user's rating.
        n_users: length of the returned vector. Defaults to ``len(user)``,
            the notebook-level user table, when omitted.

    Returns:
        A float array of length ``n_users`` holding each rating at index
        ``user_id - 1`` and 0 for every user without a row in `item_rates`.
    """
    if n_users is None:
        n_users = len(user)

    output = np.zeros(n_users)
    for row in item_rates:
        # Cast the id: rows taken from `.values` of a frame with a float
        # rating column are all-float, and a float cannot be used as an index.
        output[int(row[0]) - 1] = row[1]

    return output

In [76]:
# find the similarity between any two items and save it as a matrix

items_count = item.shape[0]

# Pull the ids out once; building a row Series with item.iloc[i] on every
# iteration is needlessly slow.
movie_ids = item['movie_id'].to_numpy()

# Dense item x user rating matrix: row i holds the ratings of the i-th movie
# in `item`, with 0 wherever a user has no rating for it in `data`.
items_full_rating = np.zeros((items_count, len(user)))

for i in range(items_count):
    item_i_ratings = data.loc[data.item_id == movie_ids[i], ['user_id', 'rating']]
    # No sorting needed: fill_empty_raiting places each rating by its user_id.
    items_full_rating[i] = fill_empty_raiting(item_i_ratings.values)

item_similarity = np.zeros((items_count, items_count))

# The similarity is symmetric, so compute each unordered pair once and
# mirror it into the lower triangle.
for i in range(items_count):
    for j in range(i, items_count):
        similarity = cosine_similarity(items_full_rating[i], items_full_rating[j])
        item_similarity[i][j] = similarity
        item_similarity[j][i] = similarity
                  

In [77]:
# preview the top-left 10x10 corner of the similarity matrix
item_similarity[:10, :10]

array([[ 1.        ,  0.21566273,  0.19527919,  0.16693814,  0.09575089,
         0.00549466,  0.30498394,  0.21370386,  0.17129246,  0.0905195 ],
       [ 0.21566273,  1.        ,  0.15152886,  0.3338697 ,  0.21846946,
         0.04360463,  0.20072639,  0.17817673,  0.04371251,  0.03724215],
       [ 0.19527919,  0.15152886,  1.        ,  0.17718501,  0.08224166,
         0.04489898,  0.20330618,  0.08909625,  0.19101628,  0.08322652],
       [ 0.16693814,  0.3338697 ,  0.17718501,  1.        ,  0.18942963,
         0.04872585,  0.2311173 ,  0.27627529,  0.17308813,  0.0847885 ],
       [ 0.09575089,  0.21846946,  0.08224166,  0.18942963,  1.        ,
        -0.01886633,  0.16419423,  0.08756006,  0.09335111, -0.03236824],
       [ 0.00549466,  0.04360463,  0.04489898,  0.04872585, -0.01886633,
         1.        ,  0.05811167,  0.01523954,  0.08504553,  0.10774117],
       [ 0.30498394,  0.20072639,  0.20330618,  0.2311173 ,  0.16419423,
         0.05811167,  1.        ,  0.14044262

In [112]:
def predict(user_id, item_id, min_similarity=0.3, ratings=None, similarity=None):
    """Predict the rating `user_id` would give `item_id`.

    The prediction is the similarity-weighted average of the user's own
    ratings, taken over the items they rated whose similarity to `item_id`
    is strictly greater than `min_similarity`.

    Args:
        user_id: id of the user, as stored in the ``user_id`` column.
        item_id: 1-based item id; row ``item_id - 1`` of the similarity
            matrix is used.
        min_similarity: neighbours with similarity at or below this value
            are ignored.
        ratings: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns. Defaults to the notebook-level ``data``.
        similarity: square item-item similarity matrix indexed by
            ``item_id - 1``. Defaults to the notebook-level
            ``item_similarity``.

    Returns:
        The predicted rating as a float. If no neighbour passes the
        threshold, the user's mean rating is returned; if the user has no
        ratings at all, the mean of every rating in `ratings` is returned.
    """
    if ratings is None:
        ratings = data
    if similarity is None:
        similarity = item_similarity

    user_ratings = ratings.loc[ratings.user_id == user_id, ['item_id', 'rating']]
    if user_ratings.empty:
        # Unknown user: their own average would be NaN, so use the global mean.
        return float(ratings['rating'].mean())

    rated_items = user_ratings['item_id'].to_numpy()
    rated_values = user_ratings['rating'].to_numpy()

    # Similarity between the target item and every item this user rated,
    # aligned position-by-position with rated_values.
    neighbour_sims = similarity[item_id - 1, rated_items - 1]
    keep = neighbour_sims > min_similarity

    weights = neighbour_sims[keep]
    denominator = np.abs(weights).sum()
    if denominator == 0:
        # No usable neighbour: fall back to the user's average rating.
        return float(rated_values.mean())

    return float(np.dot(weights, rated_values[keep]) / denominator)

In [114]:
# Load the held-out test ratings (same tab-separated layout as the base file).
test_data = pd.read_csv(
    'DataSet/u1.test',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
test_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [117]:
# Pair every actual test rating with the model's prediction for the same
# (user, item): each entry is [actual, predicted].
answer = []

for user_id, item_id, rating in test_data[['user_id', 'item_id', 'rating']].to_numpy():
    answer.append([rating, predict(user_id, item_id)])

In [120]:
# mean absolute error: average of |actual - predicted| over all test pairs
error = sum(abs(actual - predicted) for actual, predicted in answer) / len(answer)

print('Mean Absolute Error:', error)

Mean Absolute Error: 0.7923401037983085


In [121]:
# root mean square error: square root of the average squared difference
squared_error_total = sum((actual - predicted) ** 2 for actual, predicted in answer)
error = np.sqrt(squared_error_total / len(answer))

print('Root Mean Square Error:', error)

Root Mean Square Error: 1.0414774175938437
