In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import json
import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# Dataset source: https://www.kaggle.com/tmdb/tmdb-movie-metadata
# Uncomment the next line to download the CSV used below:
# !wget https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

In [None]:
# Read the TMDB movie metadata into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer="tmdb_5000_movies.csv")
df.head(n=5)

In [None]:
# Take the first movie as a sample row and show all of its fields.
x = df.iloc[0]
print(x)

# Show the raw 'genres' and 'keywords' values, one per line.
print(x['genres'], x["keywords"], sep="\n")

# The genres value is a JSON-encoded string; decode it to see its structure.
j = json.loads(x['genres'])
print(j)

In [None]:
# Example: convert the parsed JSON list into a single line of text
# ' '.join(''.join(jj['name'].split()) for jj in j)

In [None]:

# Convert the relevant data for each movie into a single string
# to be ingested by TfidfVectorizer.

def genres_and_kw_to_str(row):
  """Build one text string from a movie row's genres and keywords.

  The 'genres' and 'keywords' fields are each decoded with json.loads into a
  list of objects that carry a 'name' entry. Whitespace inside every name is
  removed so a multi-word name becomes a single token; the tokens are joined
  with single spaces, genres first and keywords second.
  """
  def names_as_tokens(encoded):
    # Collapse each name into one token, then space-separate the tokens.
    tokens = []
    for entry in json.loads(encoded):
      tokens.append(''.join(entry['name'].split()))
    return ' '.join(tokens)

  genre_text = names_as_tokens(row['genres'])
  keyword_text = names_as_tokens(row['keywords'])
  return "%s %s" % (genre_text, keyword_text)

In [None]:
# Create a new string representation of each movie.
df['string'] = df.apply(genres_and_kw_to_str, axis=1)

# Create a TF-IDF vectorizer limited to 2000 features and fit it on the
# movie strings, producing one row of X per movie.
tfidf = TfidfVectorizer(max_features=2000)
X = tfidf.fit_transform(df['string'])

print(X)

In [None]:

# Generate a mapping from movie title -> row index in df.
movie2idx = pd.Series(df.index, index=df['title'])
print(movie2idx)

idx = movie2idx['Scream 3']
print(idx)

# TF-IDF row of the query movie; it is passed to cosine_similarity as-is,
# so no dense copy of it is needed.
query = X[idx]

# Compute the similarity score between the query movie and every movie.
scores = cosine_similarity(query, X)
print(scores)

In [None]:
# Flatten the similarity scores into a 1-D array so they can be plotted
# as a single curve.
scores = scores.flatten()

# Plot the score of every movie against its row position.
plt.plot(scores)
     

In [None]:
# Indices that order the scores from highest to lowest (negating the scores
# turns argsort's ascending order into a descending one). Computed once and
# reused for the plot.
sorted_order = (-scores).argsort()
plt.plot(scores[sorted_order]);

In [None]:
#get top 5 matches
def recommend(title, k=5):
  """Return the titles of the k movies most similar to `title`.

  Similarity is the cosine similarity between the query movie's row of X
  and every row of X.

  Parameters
  ----------
  title : str
    Movie title to look up in movie2idx.
  k : int, default 5
    Number of recommendations to return.

  Returns
  -------
  pandas.Series
    Titles of the recommended movies, best match first. The query movie
    itself is never included.

  Raises
  ------
  KeyError
    If `title` is not present in movie2idx.
  ValueError
    If `k` is negative.
  """
  if k < 0:
    raise ValueError("k must be non-negative")

  idx = movie2idx[title]
  # Several movies can share a title, in which case the lookup yields a
  # Series instead of a scalar; use the first match.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # Calculate the pairwise similarity for this movie. The result is 1 x N,
  # so flatten it into a 1-D array.
  query = X[idx]
  scores = cosine_similarity(query, X).flatten()

  # Rank every movie from most to least similar. A stable sort keeps tied
  # scores in their original row order, so the result is deterministic.
  ranked_idx = np.argsort(-scores, kind="stable")

  # Drop the query movie explicitly instead of assuming it is ranked first:
  # with tied scores (e.g. an all-zero TF-IDF row, or another movie with an
  # identical string) it can land anywhere in the ranking.
  recommended_idx = ranked_idx[ranked_idx != idx][:k]

  # Return the titles of the recommendations.
  return df['title'].iloc[recommended_idx]

In [None]:

# Show the recommendations for a sample movie; the title is defined once so
# the printed header and the lookup cannot drift apart.
query_title = 'Scream 3'
print(f"Top 5 recommendations for '{query_title}' : ")
print(recommend(query_title))