# Compute the similarity between movies based on their overview

## read datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [2]:
data = pd.read_csv('data_movies_final.csv', low_memory=False)
data.shape

(49716, 16)

In [3]:
q_movies = pd.read_csv('topMovies.csv', low_memory=False)
q_movies.shape

(4977, 17)

In [4]:
#Print plot overviews of the first 5 movies.
data['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [5]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
data['overview'] = data['overview'].fillna('')

In [6]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['overview'])
#Output the shape of tfidf_matrix
tfidf_matrix.shape

(49716, 72064)

In [7]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names()[5000:5010]



['ayres',
 'ayreses',
 'ayrshire',
 'ayrton',
 'aysgarth',
 'ayu',
 'ayudante',
 'ayurveda',
 'ayuttaya',
 'ayça']

In [8]:
cosine_sim_overview = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
cosine_sim_overview.shape

(49716, 49716)

###  Similarity between top rated movies and all other movies based on overview

In [10]:
topMovID = q_movies['id'].tolist()
indx = []
for i in topMovID:
    indx.append(np.where(data['id']==i)[0][0])

In [11]:
cosin_sim_overview_topMov = cosine_sim_overview[:, indx]
cosin_sim_overview_topMov.shape

(49716, 4977)

In [12]:
f = open('cosin_sim_overview_topMov.pckl', 'wb')
pickle.dump(cosin_sim_overview_topMov, f)
f.close()

In [None]:
"""f = open('cosin_sim_overview_topMov.pckl', 'rb')
cosin_sim_overview_topMov = pickle.load(f)
f.close()"""

In [13]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(data.index, index=data['title'])

In [16]:
# Function to convert all strings to lower case and strip names of spaces
def lower_noSpace(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [17]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim):
    title = lower_noSpace(title)
    
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = []
    for i,sim in enumerate(cosine_sim[idx]):
        sim_scores.append([indx[i],sim])

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[0:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [18]:
get_recommendations('Interstellar', cosin_sim_overview_topMov)

22742                 interstellar
22897    spacepiratecaptainharlock
29720              thegreeninferno
18905                   prometheus
24777              dumbanddumberto
312                       stargate
9930                atriptothemoon
1142             theenglishpatient
45844         starwars:thelastjedi
16193                allgoodthings
23865              asabove,sobelow
Name: title, dtype: object