In [14]:
import numpy as np
import pandas as pd
import warnings
import ast
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore")
%matplotlib inline

In [15]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')
pd.set_option('display.max.columns', None)

In [16]:
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [17]:
def preprocess_genres_keywords(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l

def preprocess_cast(text):
    l=[]
    ct=0
    for i in ast.literal_eval(text):
        if ct<3:
            l.append(i['name'])
        ct+=1
    return l

def preprocess_crew(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job'] == 'director':
            l.append(i['name'])
    return l

def preprocess_collapse(lst):
    l=[]
    for i in lst:
        l.append(i.replace(" ", ""))
    return l

In [18]:
movies['genres'] = movies['genres'].apply(preprocess_genres_keywords)
movies['keywords'] = movies['keywords'].apply(preprocess_genres_keywords)
movies['cast'] = movies['cast'].apply(preprocess_cast)
movies['crew'] = movies['crew'].apply(preprocess_crew)

In [19]:
movies['genres'] = movies['genres'].apply(preprocess_collapse)
movies['keywords'] = movies['keywords'].apply(preprocess_collapse)
movies['cast'] = movies['cast'].apply(preprocess_collapse)
movies['crew'] = movies['crew'].apply(preprocess_collapse)

movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[]


In [24]:
movies.dropna(inplace=True)

In [25]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
custom_df = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
custom_df['tags'] = custom_df['tags'].apply(lambda x:" ".join(x))

custom_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [27]:
cv = CountVectorizer(max_features=5000, stop_words="english")
bow = cv.fit_transform(custom_df['tags']).toarray()
cos_sim = cosine_similarity(bow)

In [28]:
def recommend(movie_name):
    movie_index = custom_df[custom_df['title']==movie_name].index[0]
    similarity = sorted(enumerate(cos_sim[movie_index]), reverse=True, key = (lambda x: x[1]))
    for i in similarity[1:6]:
        print(custom_df.iloc[i[0]].title)
    return 

In [29]:
recommend('Batman')

Batman
Batman & Robin
The Dark Knight Rises
Batman Begins
Batman Returns


In [30]:
pickle.dump(custom_df, open('movie_list.pkl','wb'))
pickle.dump(cos_sim, open('similarity.pkl','wb'))