In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('../data/processed/movies.csv')

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,genres,keywords,popularity,title,vote_average,vote_count,movie_id,cast,crew
0,0,"['Action', 'Adventure', 'Fantasy', 'Science Fi...","['culture clash', 'future', 'space war', 'spac...",150.437577,Avatar,7.2,11800,19995,"['Sam Worthington', 'Zoe Saldana', 'Sigourney ...",['James Cameron']
1,1,"['Adventure', 'Fantasy', 'Action']","['ocean', 'drug abuse', 'exotic island', 'east...",139.082615,Pirates of the Caribbean: At World's End,6.9,4500,285,"['Johnny Depp', 'Orlando Bloom', 'Keira Knight...",['Gore Verbinski']
2,2,"['Action', 'Adventure', 'Crime']","['spy', 'based on novel', 'secret agent', 'seq...",107.376788,Spectre,6.3,4466,206647,"['Daniel Craig', 'Christoph Waltz', 'Léa Seydo...",['Sam Mendes']


# Demographic Filtering

Including a Now Trending system is a quick and easy way to find movies that are very popular. We need to define a to rate films using both average rating and rating count. We'll be using IMDB's rating formula. Weighted Rating
\begin{equation}
WR = (v/v+m) * R + (m/v+m) * C
\end{equation}




Where, 

-  v is the number of votes for the movie
-  m is the min votes required to be in the list
-  R is the average rating of the movie
-  C is the mean vote across the report

In [4]:
C = df['vote_average'].mean()
print(f'The average movie rating for our dataset is: {round(C,2)}')

The average movie rating for our dataset is: 6.11


In [5]:
m = df['vote_count'].quantile(.9)
m

1862.0

Let's now remove movies that do not make the cut

In [6]:
qualified_m = df.copy().loc[df['vote_count'] >= m]

In [7]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)


Let us apply our fomula to our movie dataset

In [8]:
qualified_m['score'] = qualified_m.apply(weighted_rating, axis=1)


In [9]:
#Sort movies based on score calculated above
qualified_m = qualified_m.sort_values('score', ascending=False)

#Print the top 15 movies
qualified_m[['title', 'vote_count', 'vote_average', 'score']].head(10)


Unnamed: 0,title,vote_count,vote_average,score
1887,The Shawshank Redemption,8205,8.5,8.058779
662,Fight Club,9413,8.3,7.93908
65,The Dark Knight,12002,8.2,7.91991
3237,Pulp Fiction,8428,8.3,7.904531
96,Inception,13752,8.1,7.863227
3342,The Godfather,5893,8.4,7.851248
95,Interstellar,10867,8.1,7.809563
809,Forrest Gump,7927,8.2,7.803313
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727547
1996,The Empire Strikes Back,5879,8.2,7.698363


We have now identified how IMDB ranks their movies


In [10]:
import matplotlib.pyplot as plt
pop = df.sort_values('popularity', ascending=False)
plt.figure(figsize=(12, 4))

plt.barh(pop['title'].head(10), pop['popularity'].head(10), align='center',
         color='skyblue')
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")


Text(0.5, 1.0, 'Popular Movies')

# Content Filtering

Content based filtering uses different criterias when recommending movies. For example, we can recommend movies based on cast, crew and story plot. 

We need to convert names and keywords instances into lowercase and strip all the spaces between them so our CountVectorizer doesn't count them as two different instances of the same observation.

In [11]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


Our content filtering system is going to use the features below to compare films and return similar titles.

In [12]:

features = ['cast', 'keywords', 'crew', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)



We need to create a column that contains the metadata of our features.

In [13]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['crew'] + ' ' + ' '.join(x['genres'])
df['soup'] = df.apply(create_soup, axis=1)

In [14]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [16]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [17]:
def get_recommendations(title, cosine_sim=cosine_sim2):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [18]:
get_recommendations('The Dark Knight Rises', cosine_sim2)


65                               The Dark Knight
95                                  Interstellar
96                                     Inception
119                                Batman Begins
1036                                    Insomnia
1199                                The Prestige
3578                                     Memento
0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
Name: title, dtype: object

# Collaborative Filtering

The idea of collaborative filtering is based on the idea that users similar to me can be used to predict how much I will like a particular product or service using their experience. 

Our system will follow theses steps:

-   Get user input
-   Based on user input our system will return similar users
-   Get the most rated movies for similar users and calculate smilarity score
-   Recommend highest scored movies back to user

In [19]:
import pandas as pd
movies_df = pd.read_csv('../data/processed/collab_movies.csv')
ratings_df = pd.read_csv('../data/processed/collab_ratings.csv')

In [20]:
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Droping timestamp since it is not useful for our model

In [21]:

ratings_df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,83438,497,5.0,1081992433
1,83438,500,3.5,1081992600
2,83438,534,4.5,1081992229
3,83438,543,5.0,1081992067
4,83438,587,3.5,1081992611


#### Merge both Ratings and Movies dataset

In [22]:
movies = pd.merge(movies_df,ratings_df, on='movieId')
movies = movies[['userId', 'title', 'rating']]
movies.head()


Unnamed: 0,userId,title,rating
0,83439,Toy Story (1995),3.0
1,83442,Toy Story (1995),3.0
2,83446,Toy Story (1995),4.0
3,83450,Toy Story (1995),4.0
4,83456,Toy Story (1995),3.5


#### My Users Ratings Input

In [23]:

input_movies = pd.read_csv('../data/processed/my_input.csv')
input_movies


Unnamed: 0,userId,title,rating
0,696969,Remember the Titans,3.0
1,696969,Transformers,3.0
2,696969,Happy Gilmore,4.0
3,696969,Interstellar,5.0
4,696969,The Lion King,3.5
5,696969,Gravity,3.0
6,696969,Titanic,5.0
7,696969,The Godfather,5.0


#### Add my movie ratings to our dataframe

In [24]:
combined_movies_data = pd.concat([movies, input_movies], axis=0)

# rename the columns to userID, itemID and rating
combined_movies_data.columns = ['userID', 'itemID', 'rating']

# use the transform method group by userID and count to keep the movies with more than 25 reviews

combined_movies_data['reviews'] = combined_movies_data.groupby(['itemID'])[
    'rating'].transform('count')

combined_movies_data = combined_movies_data[combined_movies_data.reviews > 25][[
    'userID', 'itemID', 'rating']]


In [27]:
from surprise import NMF, SVD, SVDpp
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset


In [28]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(combined_movies_data, reader)

In [29]:
# get the list of the movie ids
unique_ids = combined_movies_data['itemID'].unique()

# get the list of the ids that the userid 696969 has rated
iids696969 = combined_movies_data.loc[combined_movies_data['userID']==696969, 'itemID']

# remove the rated movies for the recommendations
movies_to_predict = np.setdiff1d(unique_ids,iids696969)

#### NMF

In [30]:
algo = NMF()
algo.fit(data.build_full_trainset())
my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=696969, iid=iid).est))

pd.DataFrame(my_recs, columns=['Movies', 'Predictions']).sort_values(
    'predictions', ascending=False).head(10)


Unnamed: 0,Movies,predictions
0,"""Great Performances"" Cats (1998)",3.531935
5196,Pat and Mike (1952),3.531935
5164,Paprika (Papurika) (2006),3.531935
5163,Papillon (1973),3.531935
5162,Paperman (2012),3.531935
5161,"Paper, The (1994)",3.531935
5160,Paper Towns (2015),3.531935
5159,Paper Moon (1973),3.531935
5158,Paper Clips (2004),3.531935
5157,"Paper Chase, The (1973)",3.531935


#### SVD

In [32]:
algo = SVD()
algo.fit(data.build_full_trainset())

my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=696969, iid=iid).est))

pd.DataFrame(my_recs, columns=['Movies', 'Predictions']).sort_values(
    'Predictions', ascending=False).head(10)


Unnamed: 0,Movies,Predictions
5340,Planet Earth II (2016),4.522649
5339,Planet Earth (2006),4.477095
6784,The Godfather Trilogy: 1972-1990 (1992),4.426922
6100,"Shawshank Redemption, The (1994)",4.395602
623,Band of Brothers (2001),4.390496
878,Black Mirror: White Christmas (2014),4.385337
4884,"Night, The (Notte, La) (1960)",4.367073
1795,"Decalogue, The (Dekalog) (1989)",4.357217
7571,Winter Light (Nattvardsgästerna) (1963),4.354824
107,7 Plus Seven (1970),4.310763


#### SVD++

In [33]:
algo = SVDpp()
algo.fit(data.build_full_trainset())

my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=696969,iid=iid).est))
    
pd.DataFrame(my_recs, columns=['Movies', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,Movies,predictions
5340,Planet Earth II (2016),4.49061
5339,Planet Earth (2006),4.405024
878,Black Mirror: White Christmas (2014),4.340749
6784,The Godfather Trilogy: 1972-1990 (1992),4.328347
623,Band of Brothers (2001),4.322573
6100,"Shawshank Redemption, The (1994)",4.317663
5060,Open Hearts (Elsker dig for evigt) (2002),4.270364
3307,I Am a Fugitive from a Chain Gang (1932),4.256419
6883,"Thin Man Goes Home, The (1945)",4.244912
107,7 Plus Seven (1970),4.242237


#### Model Evaluation

In [34]:
cv = []
# Iterate over all recommender system algorithms
for recsys in [NMF(), SVD(), SVDpp()]:
    # Perform cross validation
    tmp = cross_validate(recsys, data, measures=['RMSE'], cv=3, verbose=False)
    cv.append((str(recsys).split(' ')[0].split('.')[-1], tmp['test_rmse'].mean()))

pd.DataFrame(cv, columns=['Recommendation Sys', 'RMSE'])

Unnamed: 0,Recommendation Sys,RMSE
0,NMF,0.885323
1,SVD,0.832026
2,SVDpp,0.824335
