In [None]:
# inspired by https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system

In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from ast import literal_eval

In [2]:
credit = pd.read_csv('tmdb_5000_credits.csv')
movie = pd.read_csv('tmdb_5000_movies.csv')
ratings = pd.read_csv('ratings_small.csv')

# Demographic Filtering

In [3]:
#Merge DataFrame
merged= movie.merge(credit,left_on=['id','title'],right_on=['movie_id','title'])

In [5]:
#Calculate Weighted Rating

# movies which is higher than the mean can be in the chart
C= merged['vote_average'].mean()  
# movies voted have to be more than 90% percentile to enter the chart
m= merged['vote_count'].quantile(0.9)
#Create a copy file and filter the qualified movie
q_movies = merged.copy().loc[merged['vote_count'] >= m]
print(q_movies.shape)

# There are 481 movies qualified to be in the chart

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


(481, 23)


In [6]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 10 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


# Content Based Filtering

## Movie Description


In [7]:
#Import file, rename column and merge
df1 = credit
df2 = movie
user = ratings
user = user.rename(columns = {'movieId':'movie_id'})
user = pd.read_csv('ratings_small.csv').rename(columns = {'movieId':'movie_id'})
user_df = pd.merge(user, df1, on='movie_id', how='inner')
df1.columns = ['id','title','cast','crew']
df2= df2.merge(df1,on=['id','title'])

In [8]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(lowercase = True, stop_words='english', use_idf = True)

# Define lemmatizer
lemmatizer = WordNetLemmatizer()

#Replace NaN with an empty string
df2['overview'] = df2['overview'].fillna('.')
df2['tagline'] = df2['tagline'].fillna('.')

# Comine the overview with tagline descriptive data for each movie
df2['overview_tagline'] = [df2['overview'][i].split('.')[0] +' '+df2['tagline'][i] for i in range(len(df2['overview']))]

#Construct the required TF-IDF matrix by fitting and transforming the data

corpus = list(df2['overview_tagline'])
corpus_stemmed = []
for line in corpus:
    words = word_tokenize(line)
    doc_stemmed = ''
    for w in words:
        w_stemmed = lemmatizer.lemmatize(w)
        doc_stemmed += ' ' + w_stemmed
    corpus_stemmed.append(doc_stemmed)
    
tfidf_matrix = tfidf.fit_transform(corpus_stemmed)
 

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(4803, 14400)

In [9]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.01453412, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01453412, 1.        , 0.        , ..., 0.03937667, 0.03070786,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03937667, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.03070786, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [10]:
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()

In [11]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df2['title'].iloc[movie_indices]

In [12]:
#Get recommendations for 'The Dark Knight Rises'

print(get_recommendations('The Dark Knight Rises'))

2507                                  Slow Burn
65                              The Dark Knight
1181                                        JFK
1369                                  Cape Fear
2193                       Secret in Their Eyes
2035                        Our Kind of Traitor
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
975                           The International
198                                    R.I.P.D.
Name: title, dtype: object


In [13]:
#Get recommendations for 'The Avengers'

print(get_recommendations('The Avengers'))

215     Fantastic 4: Rise of the Silver Surfer
2136                Team America: World Police
1715                                   Timecop
133                               Dark Shadows
1626                    My Super Ex-Girlfriend
1234                            The Art of War
1183                               The Mexican
1907                              Maximum Risk
553                                The Kingdom
1704                             The Big Short
Name: title, dtype: object


In [14]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)

In [15]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [16]:
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [17]:
# Define new director, cast, genres and keywords features that are in a suitable form.
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

In [18]:
# Print the new features of the first 3 films
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"


In [19]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [20]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [21]:
def create_all(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_all, axis=1)

In [22]:
# Import CountVectorizer and create the count matrix

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])
count_matrix.shape

(4803, 11520)

In [23]:
# Compute the Cosine Similarity matrix based on the count_matrix

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [24]:
indices = pd.Series(df2.index, index=df2['title'])

In [25]:
#Get recommendations for 'The Dark Knight Rises'

print(get_recommendations('The Dark Knight Rises', cosine_sim2))

65               The Dark Knight
119                Batman Begins
4638    Amidst the Devil's Wings
1196                The Prestige
3073           Romeo Is Bleeding
3326              Black November
1503                      Takers
1986                      Faster
303                     Catwoman
747               Gangster Squad
Name: title, dtype: object


In [26]:
#Get recommendations for 'The Godfather'

print(get_recommendations('The Godfather', cosine_sim2))

867      The Godfather: Part III
2731      The Godfather: Part II
4638    Amidst the Devil's Wings
2649           The Son of No One
1525              Apocalypse Now
1018             The Cotton Club
1170     The Talented Mr. Ripley
1209               The Rainmaker
1394               Donnie Brasco
1850                    Scarface
Name: title, dtype: object


# Collaborative Filtering

In [27]:
#Preparing matrix for user-based and item-based
user_ratings_pivot0 = ratings.pivot(index='userId', columns='movieId', values='rating')
avg_ratings = user_ratings_pivot0.mean(axis=1)
user_ratings_pivot = user_ratings_pivot0.sub(avg_ratings, axis=0)
# user_based matrix
user_ratings_pivot.fillna(0, inplace=True)
# Change from user_based to item_based matrix
movie_ratings_pivot = user_ratings_pivot.T

## Item Baesd

In [28]:
#Matrix of similarity between each movie
similarities = cosine_similarity(movie_ratings_pivot)
cosine_similarity_df = pd.DataFrame(similarities,
                                    columns=movie_ratings_pivot.index,
                                    index=movie_ratings_pivot.index)
cosine_similarity_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,-0.042287,-0.064368,-0.080701,-0.041198,-0.014749,-0.028232,0.005169,-0.078349,-0.048015,...,-0.009536,0.04451,-0.116383,-0.150744,-0.150744,0.033538,0.116383,0.0,0.0,0.009536
2,-0.042287,1.0,-0.031495,-0.059223,-0.063168,-0.102614,0.036673,0.049508,-0.024567,0.200814,...,0.0,0.079861,0.112288,-0.013585,-0.013585,0.060176,-0.112288,0.0,0.0,0.0
3,-0.064368,-0.031495,1.0,0.049531,0.17535,-0.086597,-0.034197,0.179569,0.042033,0.016735,...,0.0,0.0,0.0,-0.012656,-0.012656,0.0,0.0,0.0,0.0,0.0
4,-0.080701,-0.059223,0.049531,1.0,0.052369,-0.005074,0.07509,-0.105059,0.04235,0.03938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.041198,-0.063168,0.17535,0.052369,1.0,0.066489,-0.009678,0.017522,-0.029532,-0.000736,...,0.0,-0.275465,0.0,-0.015511,-0.015511,0.0,0.0,0.0,0.0,0.0


In [29]:
#List the top 10 movie which is most similar to movie 1
cosine_similarity_df.loc[1].sort_values(ascending=False).head(10)


movieId
1        1.000000
3114     0.401537
78499    0.267200
2355     0.244279
471      0.206070
8961     0.205355
3034     0.200939
1198     0.198590
58559    0.198550
4886     0.198196
Name: 1, dtype: float64

## KNN(User Based)

In [30]:
similarities = cosine_similarity(user_ratings_pivot)
cosine_similarity_df = pd.DataFrame(similarities, 
                                    index=user_ratings_pivot.index,
                                    columns=user_ratings_pivot.index)
cosine_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.00362,-0.002274,0.0,-0.070321,0.0,0.042632,0.0,...,0.0,0.0,0.018643,0.001031,0.0,0.0,0.0,0.044095,0.0,-0.013096
2,0.0,1.0,-0.001852,-0.004854,0.012639,0.0,0.042691,0.021066,0.011109,-0.007989,...,-0.018248,-0.021546,0.018902,-0.058952,0.028515,-0.106828,-0.007999,-0.041628,-0.090233,0.056258
3,0.0,-0.001852,1.0,0.018594,-0.025903,-0.0632,0.0549,0.026488,-0.036187,0.038021,...,0.044297,0.019581,0.070702,0.030669,0.143705,0.096713,0.027451,0.089297,-0.009815,0.062276
4,0.00362,-0.004854,0.018594,1.0,0.010801,0.019224,0.057519,0.05543,-0.010442,0.005126,...,0.011978,0.006569,0.027687,0.092092,0.021334,0.040833,0.018428,0.028642,0.019848,0.032749
5,-0.002274,0.012639,-0.025903,0.010801,1.0,-0.005843,-0.015075,-0.038886,0.013708,0.0305,...,0.046134,0.001903,0.00162,0.036819,-0.038269,-0.019537,-0.071721,0.00376,-0.029455,-0.036814


In [31]:
#We find the 3 most similar user.
#We then find the ratings there users gave to the movie from the orginal rating dataframe and get the mean. 
#This rating represents the rating the user would likely give to movieX based on the rating users similar to them gave it.
nearest_neighbors = cosine_similarity_df[1].sort_values(ascending=False)[1:4].index
nearest_neighbors

Int64Index([35, 197, 539], dtype='int64', name='userId')

## SVD

In [32]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
from surprise import accuracy
reader = Reader()

In [33]:
# Load data
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

#SVD
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5,n_jobs=-1,verbose=True)

#Fit the model
data=data.build_full_trainset()
svd.fit(data)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8923  0.9042  0.8957  0.8940  0.9026  0.8978  0.0048  
MAE (testset)     0.6850  0.6947  0.6919  0.6879  0.6968  0.6913  0.0043  
Fit time          4.73    4.79    4.70    4.66    4.93    4.76    0.09    
Test time         0.16    0.14    0.13    0.13    0.12    0.13    0.01    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb714e97be0>

In [34]:
#Show ratings that user 1 has given 
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [35]:
#Predict what rating will user 1 give movie 302
svd.predict(1, 302, 3,verbose=True)

user: 1          item: 302        r_ui = 3.00   est = 2.71   {'was_impossible': False}


Prediction(uid=1, iid=302, r_ui=3, est=2.714368435663937, details={'was_impossible': False})

# Hybrid

In [36]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan


In [37]:
#Import data and clean data
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
md = pd.read_csv('movies_metadata.csv')
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
df = smd.copy()
df.reset_index(inplace=True)
id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(df[['title', 'id']], on='id').set_index('title')
indices = pd.Series(df.index, index=df['title'])
indices_map = id_map.set_index('id')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [38]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [39]:
hybrid(1, 'Toy Story')

Unnamed: 0,title,vote_count,vote_average,id,est
2825,Diner,84.0,6.9,13776,3.049462
1341,Alien Escape,2.0,4.5,29938,2.90707
312,Bitter Moon,115.0,7.0,10497,2.896159
2767,Dersu Uzala,90.0,8.0,9764,2.873248
1449,Quest for Camelot,193.0,6.9,18937,2.808553
4652,Quai des Orfèvres,15.0,7.8,49842,2.727452
634,Heavy,11.0,7.7,22621,2.697604
1345,The Butcher Boy,36.0,6.7,22797,2.693474
942,Strictly Ballroom,83.0,6.2,10409,2.682038
1784,One Crazy Summer,54.0,6.4,18282,2.670652
