In [1]:
import numpy as np
import pandas as pd

In [2]:
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies=movies.merge(credits,on='title')

In [4]:
# removing unwanted colomns
# and considering
movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [5]:
# check if there is a missing data
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace=True)

In [7]:
# check if duplicate
movies.duplicated().sum()

0

In [8]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
import ast as ast
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L
        

In [10]:
movies['genres']=movies['genres'].apply(convert)

In [11]:
movies['keywords']=movies['keywords'].apply(convert)

In [12]:
def convert_to_4(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=4:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [13]:
movies['cast']=movies['cast'].apply(convert_to_4)

In [14]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

In [15]:
movies['crew']=movies['crew'].apply(fetch_director)

In [16]:
movies['crew']

0           [James Cameron]
1          [Gore Verbinski]
2              [Sam Mendes]
3       [Christopher Nolan]
4          [Andrew Stanton]
               ...         
4804     [Robert Rodriguez]
4805         [Edward Burns]
4806          [Scott Smith]
4807          [Daniel Hsia]
4808     [Brian Herzlinger]
Name: crew, Length: 4806, dtype: object

In [17]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [18]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [19]:
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [20]:
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [21]:
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [22]:
movies['tags']

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: tags, Length: 4806, dtype: object

In [47]:
movies=movies[['movie_id','title','tags']]

In [48]:
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [49]:
import nltk

In [50]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [51]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [52]:
movies['tags']=movies['tags'].apply(lambda x:" ".join(x))

In [53]:
movies['tags']=movies['tags'].apply(lambda x:x.lower())

In [54]:
movies['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [55]:
movies['tags']=movies['tags'].apply(stem)

In [56]:
movies['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultur clash futur space war space coloni societi space travel futurist romanc space alien tribe alien planet cgi marin soldier battl love affair anti war power relat mind and soul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [58]:
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."


In [59]:
vectors=cv.fit_transform(movies['tags']).toarray()

In [60]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [61]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [62]:
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
similarity=cosine_similarity(vectors)

In [64]:
similarity[0]

array([1.        , 0.07156563, 0.05249442, ..., 0.04845016, 0.        ,
       0.        ])

In [65]:
def recommend(movie):
    movie_index=movies[movies['title']==movie].index[0]
    distances=similarity[movie_index]
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:5]
    
    for i in movies_list:
        print(movies.iloc[i[0]].title)



In [68]:
recommend('Avatar')

Aliens
Silent Running
Mission to Mars
Moonraker


In [69]:
import pickle

In [70]:
pickle.dump(movies,open('movies.pkl','wb'))

In [71]:
pickle.dump(movies.to_dict(),open('movies_dictionary.pkl','wb'))

In [76]:
movies.title

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4804                                 El Mariachi
4805                                   Newlyweds
4806                   Signed, Sealed, Delivered
4807                            Shanghai Calling
4808                           My Date with Drew
Name: title, Length: 4806, dtype: object

In [72]:
pickle.dump(similarity,open('similarity.pkl','wb'))