# Content-based Filtering with Movie Description

Utilize film metadata (such as genre, year, and director) in your recommendation system and create a content-based filtering model that suggests movies to users based on their preference history for specific genres or directors.

In [70]:
import numpy as np
import pandas as pd

In [71]:
# Load the training ratings (tab-separated, no header row) from the
# DataSet folder that sits next to this notebook.
data = pd.read_csv(
    'DataSet/u1.base',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [72]:
# Genre flag columns used as the content features of a movie.
# Note: the 'unknown' flag is read from the file below but is not part of this list.
movie_genre = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Load the movie metadata (pipe-separated, latin-1 encoded, no header row).
# The column names are the descriptive fields, then 'unknown', then the
# genre flags in the same order as movie_genre.
item = pd.read_csv(
    'DataSet/u.item',
    sep='|',
    header=None,
    encoding='latin-1',
    names=[
        'movie_id',
        'movie_title',
        'release_date',
        'video_release_date',
        'IMDb_URL',
        'unknown',
        *movie_genre,
    ],
)

item.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [73]:
# Load the user table (pipe-separated, no header row).
user = pd.read_csv(
    'DataSet/u.user',
    sep='|',
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

user.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [74]:
# Load the held-out test ratings; same tab-separated layout as the training file.
test_data = pd.read_csv(
    'DataSet/u1.test',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
test_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [77]:
# Generate one genre profile per user from that user's training ratings.
#
# A profile is the rating-weighted sum of the genre flags of every movie the
# user rated, scaled to unit L2 norm. Rows of the resulting frame are indexed
# by user id, columns are the entries of movie_genre.

# Index the genre flags by movie_id so that each rating is paired with the
# genre row of its own movie by label, rather than by relying on two
# separately filtered frames happening to line up after sorting.
genre_by_movie = item.set_index('movie_id')[movie_genre]

user_profile = list()
for profile_user_id in user['user_id']:
    user_data = data[data['user_id'] == profile_user_id]
    user_item = genre_by_movie.loc[user_data['item_id']]

    # rating-weighted sum of genre flags: (n_ratings,) . (n_ratings, n_genres)
    temp_user_profile = np.dot(user_data['rating'], user_item)

    # sanity check: the first component must equal the explicit weighted sum
    # over the 'Action' column
    check_sum = (user_data['rating'].to_numpy() * user_item['Action'].to_numpy()).sum()
    assert np.isclose(check_sum, temp_user_profile[0]), "Error in dot product"

    # Normalize to unit length. A user without any training rating has an
    # all-zero profile; dividing by its zero norm would fill the row with NaN,
    # so such a profile is kept as zeros instead.
    profile_norm = np.linalg.norm(temp_user_profile)
    if profile_norm > 0:
        temp_user_profile = temp_user_profile / profile_norm
    else:
        temp_user_profile = temp_user_profile.astype(float)

    user_profile.append(list(temp_user_profile))

# Index by the actual user ids instead of assuming they are exactly 1..N.
user_profile = pd.DataFrame(user_profile, columns=movie_genre, index=list(user['user_id']))
user_profile.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,0.274875,0.121524,0.072336,0.09259,0.503456,0.107057,0.069442,0.645234,0.00868,0.014467,0.037615,0.037615,0.040508,0.266195,0.228581,0.257515,0.124417,0.011574
2,0.14176,0.033355,0.033355,0.066711,0.275182,0.200132,0.0,0.758834,0.0,0.041694,0.025017,0.025017,0.066711,0.441958,0.025017,0.275182,0.050033,0.0
3,0.280695,0.165115,0.0,0.0,0.330229,0.198137,0.082557,0.643947,0.0,0.033023,0.049534,0.066046,0.247672,0.148603,0.280695,0.363252,0.148603,0.0
4,0.488648,0.199901,0.0,0.0,0.222113,0.310958,0.111056,0.333169,0.0,0.0,0.0,0.111056,0.288746,0.066634,0.244324,0.53307,0.088845,0.0
5,0.486087,0.325631,0.169895,0.155737,0.59935,0.080228,0.0,0.160456,0.0,0.0,0.184052,0.122702,0.033035,0.108544,0.353947,0.099105,0.127421,0.004719


In [105]:
def predict_rating(user_id, item_id):
    """Return the raw (unscaled) affinity score of one user for one movie.

    The score is the dot product between the user's row in ``user_profile``
    and the ``movie_genre`` flag columns of the row(s) in ``item`` whose
    ``movie_id`` equals ``item_id``. Only the first matching row's score is
    returned.

    Parameters
    ----------
    user_id : label of a row in ``user_profile``.
    item_id : value to match against ``item['movie_id']``.

    Raises
    ------
    KeyError   if ``user_id`` is not in ``user_profile``'s index.
    IndexError if no row of ``item`` matches ``item_id``.
    """
    genre_weights = user_profile.loc[user_id]
    matching_flags = item.loc[item['movie_id'] == item_id, movie_genre]
    # (n_genres,) . (n_genres, n_matches) -> one score per matching row
    scores = np.dot(genre_weights, matching_flags.T)
    return scores[0]

def predict_rating_for_user(user_id):
    """Predict a rating on the 1-5 scale for every movie in ``item`` for one user.

    The raw score of a movie is the dot product between the user's row in
    ``user_profile`` and the movie's ``movie_genre`` flags. All scores are
    computed with a single matrix-vector product instead of filtering ``item``
    once per movie. The scores are then min-max rescaled so that the user's
    lowest-scoring movie maps to 1 and the highest-scoring one to 5.

    Parameters
    ----------
    user_id : label of a row in ``user_profile``.

    Returns
    -------
    numpy.ndarray
        One predicted rating per row of ``item``, in ``item``'s row order.
        If every movie gets the same raw score, or the scores are not finite
        (for example a NaN profile), min-max scaling is undefined and the
        neutral mid-scale value 3.0 is returned for every movie rather than NaN.
        An empty ``item`` table yields an empty array.

    Raises
    ------
    KeyError if ``user_id`` is not in ``user_profile``'s index.
    """
    genre_weights = user_profile.loc[user_id, movie_genre].to_numpy(dtype=float)
    genre_flags = item[movie_genre].to_numpy(dtype=float)

    # (n_movies, n_genres) @ (n_genres,) -> one raw score per movie
    scores = genre_flags @ genre_weights
    if scores.size == 0:
        return scores

    lowest = scores.min()
    spread = scores.max() - lowest
    if not (np.isfinite(spread) and spread > 0):
        # Flat or undefined scores: avoid 0/0 and fall back to the scale midpoint.
        return np.full(scores.shape, 3.0)

    # scale the rating to 1-5
    return (scores - lowest) / spread * 4 + 1

# Predict a rating for every (user, movie) pair and collect them in a matrix
# with one row per user and one column per movie.

# Print one '#' for roughly every 5% of users processed. The step has to be a
# positive integer: a fractional step such as 943/20 = 47.15 never divides an
# integer counter evenly, so no progress mark would ever be printed.
scale = max(1, len(user) // 20)
print("progress: ", end="")
predicted_rows = []
# Count processed users by position so the progress bar does not depend on
# the actual values of the user ids.
for processed, temp_user in enumerate(user['user_id'], start=1):
    predicted_rows.append(predict_rating_for_user(temp_user))
    if processed % scale == 0:
        print("#", end="", flush=True)
print()

rating_matrix = pd.DataFrame(predicted_rows, columns=item['movie_id'], index=user['user_id'])
rating_matrix.head()


progress: 

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.609756,2.574913,1.620209,4.428571,3.432056,2.554007,3.10453,3.989547,2.554007,2.853659,...,2.28223,2.554007,2.554007,2.554007,2.554007,2.554007,2.261324,3.195122,2.212544,2.554007
2,1.891089,2.069307,1.653465,3.792079,3.930693,2.80198,2.861386,3.613861,2.80198,2.920792,...,1.990099,2.80198,2.80198,2.80198,2.80198,2.80198,2.70297,3.851485,1.653465,2.80198
3,1.784314,2.921569,1.862745,3.980392,3.862745,2.529412,3.196078,3.313725,2.529412,2.882353,...,2.529412,2.529412,2.529412,2.529412,2.529412,2.529412,2.215686,2.882353,1.784314,2.529412
4,1.519481,3.857143,2.246753,3.441558,3.753247,1.779221,2.350649,2.298701,1.779221,1.987013,...,3.38961,1.779221,1.779221,1.779221,1.779221,1.779221,2.402597,1.935065,1.519481,1.779221
5,2.898305,2.869249,1.20339,3.556901,1.697337,1.329298,2.05569,2.878935,1.329298,1.590799,...,2.200969,1.329298,1.329298,1.329298,1.329298,1.329298,1.42615,1.552058,2.230024,1.329298


In [106]:
# Pair every test rating with the prediction for the same (user, movie):
# each entry is [predicted_rating, actual_rating].
answer = [
    [rating_matrix.at[user_test_id, item_test_id], raiting_test]
    for user_test_id, item_test_id, raiting_test in zip(
        test_data['user_id'].to_numpy(),
        test_data['item_id'].to_numpy(),
        test_data['rating'].to_numpy(),
    )
]

In [108]:
# Mean absolute error: average of |predicted - actual| over all test pairs.
error = 0
for predicted, actual in answer:
    error += np.abs(predicted - actual)

error = error / len(answer)

print('Mean Absolute Error:', error)

Mean Absolute Error: 1.350938449837401


In [109]:
# Root mean square error: square root of the average squared difference
# between predicted and actual ratings over all test pairs.
error = 0
for predicted, actual in answer:
    error += (predicted - actual) ** 2

error = np.sqrt(error / len(answer))

print('Root Mean Square Error:', error)

Root Mean Square Error: 1.6240385947993123
