# MovieLens Dataset Analysis

The MovieLens dataset contains information about 100,000 movie ratings provided by 943 users for 1682 movies. This dataset also includes demographic information about users, such as age, gender, and occupation.

## Task

Suppose you are working for a movie streaming service and your task is to design a recommendation system to suggest movies to users based on their past ratings and demographic information. Your goal is to provide movie recommendations that users will enjoy and keep them engaged with the platform.

## Design

Design a collaborative filtering (item-based with cosine similarity) recommender system that recommends movies based on users’ rating history. 

## Evaluation

Measure the accuracy of your predictions using metrics such as the mean absolute error and the root mean squared error. 

**Note:** Using libraries is not allowed in this section.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read dataset from DataSet folder next to the file 
data = pd.read_csv('DataSet/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# read item dataset
item = pd.read_csv(
    'DataSet/u.item', 
    sep='|', 
    header=None, 
    encoding='latin-1',
    names=[
        'movie_id', 
        'movie_title',
        'release_date', 
        'video_release_date', 
        'IMDb_URL', 
        'unknown', 
        'Action', 
        'Adventure', 
        'Animation', 
        'Children', 
        'Comedy', 
        'Crime', 
        'Documentary', 
        'Drama', 
        'Fantasy', 
        'Film-Noir', 
        'Horror', 
        'Musical', 
        'Mystery', 
        'Romance', 
        'Sci-Fi', 
        'Thriller', 
        'War', 
        'Western'
    ]
)

item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# read user dataset 
user = pd.read_csv(
    'DataSet/u.user', 
    sep='|', 
    header=None, 
    names=[
        'user_id', 
        'age',
        'gender',
        'occupation',
        'zip_code'
    ]
)

user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# define adjusted cosine similarity function for two vectors
def cosine_similarity(a: np.array, b: np.array) -> float:
    # calculate the average of the two vectors
    a_avg = np.mean(a)
    b_avg = np.mean(b)

    # calculate the adjusted cosine similarity
    numerator = np.dot(a - a_avg, b - b_avg)
    denominator = np.linalg.norm(a - a_avg) * np.linalg.norm(b - b_avg)

    return numerator / denominator

In [16]:
def fill_empty_raiting(item_rates):
    output = np.zeros(len(user))
    for i in item_rates:
        output[i[0]-1] = i[1]
    
    return output

In [26]:
# find the similarity between any two items and save it as a matrix


items_full_rating = np.zeros((item.shape[0], len(user))) 

for i in range(item.shape[0]):
        item_i_ratings = data[data.item_id == item.iloc[i].movie_id][['user_id', 'rating']]
        # sort item_i_ratings by user_id
        item_i_ratings = item_i_ratings.sort_values(by='user_id')

        # fill the empty ratings
        items_full_rating[i] = fill_empty_raiting(item_i_ratings.values)



# test the ith item
# i = 10
# item_i_ratings = data[data.item_id == item.iloc[i].movie_id][['user_id', 'rating', 'item_id']]
# item_i_ratings = item_i_ratings.sort_values(by='user_id')
# print(item_i_ratings)
# print(items_full_rating[i])


item_similarity = np.zeros((item.shape[0], item.shape[0]))

items_count = item.shape[0]

for i in range(items_count):
    for j in range(items_count):
        item_similarity[i][j] = cosine_similarity(items_full_rating[i], items_full_rating[j])
                  




In [33]:
item_similarity[0:10, 0:10]

array([[ 1.        ,  0.23459453,  0.19336208,  0.22621324,  0.1288397 ,
         0.01511263,  0.34735397,  0.25448971,  0.20950172,  0.10465499],
       [ 0.23459453,  1.        ,  0.1906486 ,  0.40904414,  0.24071235,
         0.03006227,  0.22002157,  0.20601978,  0.07789398,  0.07290564],
       [ 0.19336208,  0.1906486 ,  1.        ,  0.22784943,  0.14136814,
         0.06534697,  0.25885456,  0.07863639,  0.14618061,  0.0796085 ],
       [ 0.22621324,  0.40904414,  0.22784943,  1.        ,  0.23729797,
         0.02187772,  0.29548902,  0.35280049,  0.22992182,  0.13821985],
       [ 0.1288397 ,  0.24071235,  0.14136814,  0.23729797,  1.        ,
        -0.00859378,  0.20528937,  0.14586586,  0.14254138, -0.03374614],
       [ 0.01511263,  0.03006227,  0.06534697,  0.02187772, -0.00859378,
         1.        ,  0.0544151 ,  0.01232958,  0.0796193 ,  0.16608407],
       [ 0.34735397,  0.22002157,  0.25885456,  0.29548902,  0.20528937,
         0.0544151 ,  1.        ,  0.1906702 

In [47]:
# I choose thershold to get Neighbors of each item
threshold = 0.4

#least neighbors for each item
minimum_neighbours = 3

# find the neighbors of each item
neighbors = []
for i in range(items_count):

    temp = np.zeros(items_count)
    temp_thershold = threshold

    while np.sum(temp) < minimum_neighbours:
        temp_thershold -= 0.01
        temp = item_similarity[i] > temp_thershold
    

    # get index in temp that are True
    temp = np.where(temp)[0]

    # sort the temp by similarity
    temp = sorted(temp, key=lambda x: item_similarity[i][x], reverse=True)
    neighbors.append(temp)


In [56]:
# check the neighbors of the ith item
i = 6
neighbors[i], item_similarity[i][neighbors[i]]

([6, 99, 116, 55, 10],
 array([1.        , 0.44805931, 0.4235096 , 0.39256695, 0.39140909]))