# Table of Content
1. Introduction and Import
2. Feature Extraction
3. User Likings and Implementation

In [2]:
import pandas as pd

In [3]:
metadata=pd.read_csv('Dataset/prepared/final_metadata.csv')
metadata.head()

Unnamed: 0,movieId,title,year,genres,cast,director,collection,production_companies,vote_count,vote_average,popularity,keywords
0,862,Toy Story,1995,"Animation, Comedy, Family","Tom Hanks, Tim Allen, Don Rickles",John Lasseter,Toy Story Collection,Pixar Animation Studios,5415.0,7.7,21.946943,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,1995,"Adventure, Fantasy, Family","Robin Williams, Jonathan Hyde, Kirsten Dunst",Joe Johnston,,"TriStar Pictures, Teitler Film, Interscope Com...",2413.0,6.9,17.015539,"disappearance, based on children's book, new h..."
2,15602,Grumpier Old Men,1995,"Romance, Comedy","Walter Matthau, Jack Lemmon, Ann-Margret",Howard Deutch,Grumpy Old Men Collection,"Warner Bros., Lancaster Gate",92.0,6.5,11.7129,"fishing, best friend, duringcreditsstinger"
3,31357,Waiting to Exhale,1995,"Comedy, Drama, Romance","Whitney Houston, Angela Bassett, Loretta Devine",Forest Whitaker,,Twentieth Century Fox Film Corporation,34.0,6.1,3.859495,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,1995,Comedy,"Steve Martin, Diane Keaton, Martin Short",Charles Shyer,Father of the Bride Collection,"Sandollar Productions, Touchstone Pictures",173.0,5.7,8.387519,"baby, midlife crisis, confidence, aging, daugh..."


# 2. Features Extraction

In [4]:
movies=metadata[['movieId','genres','cast','keywords']]

movies.head()

Unnamed: 0,movieId,genres,cast,keywords
0,862,"Animation, Comedy, Family","Tom Hanks, Tim Allen, Don Rickles","jealousy, toy, boy, friendship, friends, rival..."
1,8844,"Adventure, Fantasy, Family","Robin Williams, Jonathan Hyde, Kirsten Dunst","disappearance, based on children's book, new h..."
2,15602,"Romance, Comedy","Walter Matthau, Jack Lemmon, Ann-Margret","fishing, best friend, duringcreditsstinger"
3,31357,"Comedy, Drama, Romance","Whitney Houston, Angela Bassett, Loretta Devine","based on novel, interracial relationship, sing..."
4,11862,Comedy,"Steve Martin, Diane Keaton, Martin Short","baby, midlife crisis, confidence, aging, daugh..."


In [5]:
# creating a feature column consisting of genres, cast and keywords
features=movies.copy()
features['features']=''
features.fillna(' ',inplace=True)
features['features']=features['genres']+', '+features['cast']+features['keywords']
features.drop(['genres','cast','keywords'],axis=1,inplace=True)
features['features']=features['features'].str.split(', ')
features.head()

Unnamed: 0,movieId,features
0,862,"[Animation, Comedy, Family, Tom Hanks, Tim All..."
1,8844,"[Adventure, Fantasy, Family, Robin Williams, J..."
2,15602,"[Romance, Comedy, Walter Matthau, Jack Lemmon,..."
3,31357,"[Comedy, Drama, Romance, Whitney Houston, Ange..."
4,11862,"[Comedy, Steve Martin, Diane Keaton, Martin Sh..."


In [6]:
rating=pd.read_csv('Dataset/ratings.csv')
rating.drop(['timestamp'],axis=1,inplace=True)
rating.columns=['userId','movieId','rating']

rating=rating.head(50000)
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [7]:
# merging the ratings with the features
merged=pd.merge(rating,features,on='movieId')
merged.head()

Unnamed: 0,userId,movieId,rating,features
0,1,110,1.0,"[Drama, Mystery, Romance, Irène Jacob, Jean-Lo..."
1,11,110,3.5,"[Drama, Mystery, Romance, Irène Jacob, Jean-Lo..."
2,22,110,5.0,"[Drama, Mystery, Romance, Irène Jacob, Jean-Lo..."
3,24,110,5.0,"[Drama, Mystery, Romance, Irène Jacob, Jean-Lo..."
4,29,110,3.0,"[Drama, Mystery, Romance, Irène Jacob, Jean-Lo..."


In [10]:
# calculating the average rating given by the user to the specific feature
# if user gives a rating to a certain movie, then the rating to all the features of that movie is considered to be same
# after that all the average rating for that specific feature is listed in th below df

likings_sum = merged.explode('features').pivot_table(index='userId', columns='features', values='rating', aggfunc='sum').fillna(0, downcast='infer')
likings_count = merged.explode('features').pivot_table(index='userId', columns='features', values='rating', aggfunc='count').fillna(0, downcast='infer')

likings_count['total']=likings_count.sum(axis=1)
likings_count.head()

features,Unnamed: 1_level_0,Unnamed: 2_level_0,Therese Giehsecoming out,musical,short,woman director,16th century,18th century,1910s,1930s,...,Ángela Molina,Átila Iório,Åke Grönberg,Çağan Irmakbar,Élodie Bouchez,Øverste hylde,Đơn Dương,Анатолій Солоніцин,Михаил Пореченков,total
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,104
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,238
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,127
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,240
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,151


In [9]:
likings_sum.head()

features,Unnamed: 1_level_0,Unnamed: 2_level_0,Therese Giehsecoming out,musical,short,woman director,16th century,18th century,1910s,1930s,...,zoo,Ángela Molina,Átila Iório,Åke Grönberg,Çağan Irmakbar,Élodie Bouchez,Øverste hylde,Đơn Dương,Анатолій Солоніцин,Михаил Пореченков
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0
2,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0
3,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0
4,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0
5,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,0,0.0,0,0.0,0


## 3. Implementation

In [35]:
import numpy as np

# this function calculates the average rating given by the user(specified) to all the features 
#  of that movie(specified) 
#  after that averages of that averages is returned as expected to be the rating of that user for that movie

def predict(user,movie,likings_count,likings_sum,features):
    
    if user not in list(likings_count.index):
        print('User not found')
        return 0
    if movie not in list(features.movieId):
        print('movie not found')
        return 0
    
#     extracting the features of the movie
    fet=features[features.movieId==movie].features
    fet=fet.values[0]

#     calculating the average rating given by the user to the features of the movie

    avg=[]
    for i in fet:
        c=likings_count.loc[user][i]
        s=likings_sum.loc[user][i]
        if c==0:
            avg.append(0)
        else:
            avg.append(s/c)

#     calculating the average of the averages
    avg=np.array(avg)
    avg=avg[avg!=0]
    if len(avg)==0:
        return 0

    return avg.mean()

In [36]:
user=11
movie=110

# for the user 11 , the predicted rating for the movie 110 is 3.50521448
predict(user,movie,likings_count,likings_sum,features)

3.50521448248721

In [None]:
likings_count.to_csv('Dataset/prepared/user_likings_count.csv')
likings_sum.to_csv('Dataset/prepared/user_likings_sum.csv')
features.to_csv('Dataset/prepared/movie_features.csv',index=False)