In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the TMDB 5000 credits table and movie-metadata table from the working directory.
credits, movies = (
    pd.read_csv(csv_name)
    for csv_name in ("tmdb_5000_credits.csv", "tmdb_5000_movies.csv")
)

In [3]:
# Inspect the column names of the credits table.
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [4]:
# Inspect the column names of the movie-metadata table.
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [5]:
# Rename credits' first column from 'movie_id' to 'id' so both tables share the join key.
credits.columns = ['id', 'title', 'cast', 'crew']
# NOTE(review): the merged frame is bound to `moview` (probably a typo for `movies`) and is
# not referenced by any later cell shown here, so everything below works on the unmerged
# `movies`. Do not simply rename it to `movies`: both tables have a 'title' column, so the
# merged frame would carry pandas' default 'title_x'/'title_y' suffixes and the later
# `movies['title']` lookups would fail. Confirm whether cast/crew are needed at all.
moview = movies.merge(credits, on='id')

In [6]:
# Preview the first five plot overviews.
movies['overview'].head(5)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [7]:
# Swap NaN overviews for empty strings so they can be concatenated with other text.
movies = movies.assign(overview=movies['overview'].fillna(''))

We use each movie's description (the `overview` column), the keywords associated with the movie, and its genres to make movie recommendations.

In [8]:
def _soup_part(value):
    """Turn one field of a row into plain text ('' for missing values)."""
    # None and float NaN (NaN is the only float that differs from itself) mean "missing".
    if value is None or (isinstance(value, float) and value != value):
        return ''
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple, set)):
        # Keep list items as separate whitespace-delimited tokens.
        return ' '.join(str(item) for item in value)
    return str(value)


def create_soup(x):
    """Build the text "soup" for one movie from its keywords, genres and overview.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing the keys 'keywords', 'genres' and 'overview'. Each
        value may be a string, a list/tuple/set of items, or missing
        (None / NaN).

    Returns
    -------
    str
        The non-empty fields joined by single spaces, in the order
        keywords, genres, overview.
    """
    # The fields are separated by a space so that the last token of one field
    # cannot fuse with the first token of the next when the text is tokenised.
    parts = (_soup_part(x[column]) for column in ('keywords', 'genres', 'overview'))
    return ' '.join(part for part in parts if part)

# DataFrame.apply with axis='columns' calls create_soup once per row,
# handing it that row as a Series; the results form the new 'soup' column.
movies['soup'] = movies.apply(create_soup, axis='columns')

In [9]:
# Vectorise the 'soup' text with TF-IDF, dropping scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and transform every movie's soup in one pass.
tfidf_matrix = tfidf.fit_transform(movies['soup'])
# Display the dimensions of the resulting matrix.
tfidf_matrix.shape

(4803, 32768)

In [10]:
# Cosine similarity is used to compare movies because it is independent of magnitude
# and relatively easy and fast to calculate.
# linear_kernel computes plain dot products; the TfidfVectorizer above is left at its
# default norm='l2', so every row has unit length and the dot product equals the cosine
# similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
# Construct a reverse map from movie title to the movie's row label in `movies`.
title_to_row = pd.Series(movies.index, index=movies['title'])
# Series.drop_duplicates() would compare the *values* (row labels, already unique) and
# leave repeated titles in place; filter on the index instead so each title maps to
# exactly one row (the first occurrence) and `indices[title]` is always a scalar.
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

Steps to define the recommendation system:
- Retrieve the index of the movie given its title.
- Compute the cosine similarity scores between the target movie and every movie in the dataset, then convert them into a list of tuples in which the first element is the movie's position and the second is its similarity score.
- Sort this list of tuples by similarity score, in descending order.
- Take the top 10 elements of this list (ignoring the first element, as it refers to the target movie itself).
- Return the titles that correspond to the indices of the top elements.

In [12]:
def get_recommendation(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the module-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles from ``movies['title']`` of up to ``top_n`` other movies,
        most similar first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once in `indices` yields a Series rather
    # than a scalar; fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every movie position with its similarity to the target and drop the
    # target by position. Relying on it being first after sorting is unsafe:
    # a tie (e.g. an identical description, or an all-zero row) can put
    # another movie ahead of it.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # sorted() is stable, so equally scored movies keep ascending position order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # positions of the top_n most similar movies
    movie_indices = [position for position, _ in candidates[:top_n]]

    return movies['title'].iloc[movie_indices]
    

In [13]:
# Example: the movies most similar to 'The Avengers' (cosine_sim is also the default argument).
get_recommendation('The Avengers', cosine_sim)

7                  Avengers: Age of Ultron
26              Captain America: Civil War
64                       X-Men: Apocalypse
242                         Fantastic Four
511                                  X-Men
79                              Iron Man 2
85     Captain America: The Winter Soldier
169     Captain America: The First Avenger
182                                Ant-Man
68                                Iron Man
Name: title, dtype: object