In [None]:
import pandas as pd
import ast

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# import warnings
# warnings.filterwarnings("ignore")


In [None]:
movies  = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on='title')

In [4]:
movies = movies[['movie_id', 'title', 'overview', 'genres','keywords' ,'cast', 'crew']]

In [5]:
movies.dropna(inplace=True)

In [6]:
def convert(data):
    L = []
    for item in ast.literal_eval(data):
        L.append(item['name'])
    return L

In [7]:
movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)

In [8]:
def convert_cast(cast):
    L = []
    count = 0
    for actor in ast.literal_eval(cast):
        if count < 3:
            L.append(actor['name'])
            count += 1
        else:
            break
    return L

In [9]:
movies["cast"] = movies["cast"].apply(convert_cast)

In [10]:
def fetch_director(crew):
    L = []
    for member in ast.literal_eval(crew):
        if member['job'] == 'Director':
            L.append(member['name'])
    return L


In [11]:
movies["crew"] = movies["crew"].apply(fetch_director)
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [12]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [13]:
# Remove Spaces
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies['overview'] = movies['overview'].apply(lambda x:[i.replace(" ","") for i in x])

In [14]:
movies['tags'] =  movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [15]:
new_movies = movies[['movie_id', 'title', 'tags']]
new_movies['tags'] = new_movies['tags'].apply(lambda x: " ".join(x).lower())

In [16]:
# To convert text to its base form example: "running" -> "run" , "ran" -> "run"
ps = PorterStemmer()

def stem(text):
  return " ".join([ps.stem(word) for word in text.split()])
new_movies['tags'] = new_movies['tags'].apply(stem)

In [17]:
# Create a count vectorizer to convert text data into numerical vectors
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_movies['tags']).toarray()

In [18]:
# Compute the cosine similarity between the vectors (movies)
similarity = cosine_similarity(vectors)

In [19]:
def recommended(movie):
  movie_index = new_movies[new_movies['title'] == movie].index[0] # Get the index of the movie
  distances = similarity[movie_index] # Get the similarity scores for the movie
  recommended_movies = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:6] # Get the top 5 most similar movies
  print("Recommended movies:")
  for i in recommended_movies:
      print(new_movies.iloc[i[0]])

recommended('Batman Begins')

Recommended movies:
movie_id                                                  155
title                                         The Dark Knight
tags        batman rais the stake in hi war on crime. with...
Name: 65, dtype: object
movie_id                                                 2661
title                                                  Batman
tags        the dark knight of gotham citi begin hi war on...
Name: 1361, dtype: object
movie_id                                                49026
title                                   The Dark Knight Rises
tags        follow the death of district attorney harvey d...
Name: 3, dtype: object
movie_id                                                  268
title                                                  Batman
tags        the dark knight of gotham citi begin hi war on...
Name: 1360, dtype: object
movie_id                                                  415
title                                          Batman & Robin
tags        a

In [None]:
import pickle
pickle.dump(new_movies.to_dict(), open('../models/movies.pkl', 'wb'))
pickle.dump(similarity, open('../models/similarity.pkl', 'wb'))