# Compute the similarity between movies based on other features

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the movie table; later cells read its 'id', 'title' and 'soup' columns.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so each column's dtype is inferred from the whole file.
data = pd.read_csv('data_movies_final.csv', low_memory=False)
data.shape

(49716, 17)

In [3]:
# Load the movies that serve as recommendation candidates; only the 'id'
# column of this table is used by the visible cells.
# NOTE(review): presumably the top-rated subset of `data`, judging by the
# file name — confirm how topMovies.csv was produced.
q_movies = pd.read_csv('topMovies.csv', low_memory=False)
q_movies.shape

(4977, 17)

In [4]:
# Show the title and the 'soup' text of the first 3 films; 'soup' is the
# column that is vectorized further down in this notebook.
data[['title', 'soup']].head(3)

Unnamed: 0,title,soup
0,toystory,tomhanks timallen donrickles tomhanks en johnl...
1,jumanji,robinwilliams kirstendunst bradleypierce robin...
2,grumpieroldmen,waltermatthau jacklemmon ann-margret waltermat...


In [5]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Token counts over the 'soup' text, with scikit-learn's built-in English
# stop-word list excluded from the vocabulary.
count = CountVectorizer(stop_words='english')
# fit_transform learns the vocabulary and returns the document-term count
# matrix: one row per row of `data`, one column per vocabulary term.
count_matrix = count.fit_transform(data['soup'])

In [6]:
# Sanity check: (number of rows of `data`, size of the learned vocabulary).
count_matrix.shape

(49716, 75129)

In [7]:
# cosine_similarity is also imported in the first cell; repeated here so the
# cell can be read on its own.
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows of count_matrix.
# NOTE(review): the result is square in the number of rows; with
# scikit-learn's default dense output this is presumably on the order of
# tens of GB for a ~50k-row matrix — confirm the memory budget, since only a
# subset of its columns is kept later.
cosine_sim_otherFeatures = cosine_similarity(count_matrix, count_matrix)

In [8]:
# Square matrix: entry [i, j] is the similarity between rows i and j of
# count_matrix.
cosine_sim_otherFeatures.shape

(49716, 49716)

In [9]:
# For every candidate movie id, record the row position of its first match in
# `data`; `indx` keeps the same order as `topMovID`.
topMovID = q_movies['id'].tolist()
indx = [np.where(data['id'] == movie_id)[0][0] for movie_id in topMovID]

In [10]:
# Keep only the columns that belong to the candidate movies: rows still cover
# every row of `data`, columns follow the order of `indx`.
cosine_sim_otherFeatures_topMov = cosine_sim_otherFeatures[:, indx]
cosine_sim_otherFeatures_topMov.shape

(49716, 4977)

In [11]:
# Persist the reduced similarity matrix. The context manager closes the file
# even if pickling fails part-way through, so no handle is leaked.
with open('cosine_sim_otherFeatures_topMov.pckl', 'wb') as f:
    pickle.dump(cosine_sim_otherFeatures_topMov, f)

In [None]:
# Disabled cell: the triple-quoted string below is a plain string literal and
# is never executed as code. It keeps the statements that would reload the
# pickled matrix instead of recomputing it.
# NOTE(review): pickle.load can execute arbitrary code while loading — only
# enable this for a file that this notebook itself wrote.
"""f = open('cosine_sim_otherFeatures_topMov.pckl', 'rb')
cosine_sim_otherFeatures_topMov = pickle.load(f)
f.close()"""

In [12]:
# Construct a reverse map from movie title to the row label of `data`
# (lookup: indices[title]). If a title occurs more than once in `data`, the
# lookup returns a Series holding every matching label instead of one value.
indices = pd.Series(data.index, index=data['title'])

In [13]:
# Function to convert all strings to lower case and strip names of spaces
def lower_noSpace(x):
    """Normalise a name, or a list of names, to lower case without spaces.

    A list yields a new list with every element normalised, a single string
    yields the normalised string, and any other value (for example a missing
    director) yields an empty string.
    """
    if isinstance(x, list):
        return [name.replace(" ", "").lower() for name in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat the value as missing
    return ''

In [14]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim, top_n=11):
    """Return the titles of the candidate movies most similar to a given movie.

    Relies on the notebook-level names ``lower_noSpace``, ``indices``,
    ``indx`` and ``data``.

    Parameters
    ----------
    title : str
        Movie title in any casing/spacing; it is normalised with
        ``lower_noSpace`` before being looked up in ``indices``.
    cosine_sim : array-like
        Similarity matrix in which row ``r`` holds the scores of movie ``r``
        against the candidate movies, column ``c`` corresponding to the row
        position ``indx[c]`` of ``data``.
    top_n : int, default 11
        Number of titles to return. With the default, 11 entries come back:
        when the queried movie is itself a candidate it normally ranks
        first, which leaves 10 other movies.

    Returns
    -------
    pandas.Series
        Entries of ``data['title']`` ordered from most to least similar.

    Raises
    ------
    KeyError
        If the normalised title is not present in ``indices``.
    """
    title = lower_noSpace(title)

    # Get the index of the movie that matches the title
    idx = indices[title]
    # Several movies can share one title, in which case the lookup yields a
    # Series; use the first match so that exactly one matrix row is selected
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores between that movie and every candidate
    scores = np.asarray(cosine_sim[idx])

    # Rank candidates by descending score; the stable sort keeps tied
    # candidates in column order
    best_columns = np.argsort(-scores, kind='stable')[:top_n]

    # Translate matrix columns back to row positions of `data`
    movie_indices = [indx[col] for col in best_columns]

    # Return the titles of the selected movies, best match first
    return data['title'].iloc[movie_indices]

In [17]:
# Example query; the title is normalised inside get_recommendations, so it
# can be written with spaces and in any capitalisation.
get_recommendations('the wolf of wall street', cosine_sim_otherFeatures_topMov)

22002          thewolfofwallstreet
47008    onceuponatime…inhollywood
9522                    theaviator
5817                gangsofnewyork
5850               catchmeifyoucan
144           thebasketballdiaries
14767                shutterisland
17984                      j.edgar
1638                       titanic
20794               thegreatgatsby
13203            revolutionaryroad
Name: title, dtype: object