In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle 

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# movie = movies.copy(deep=True)
# credit = credits.copy(deep=True)
# credit.rename(columns={'movie_id': 'id'}, inplace=True)

In [4]:
# print(movies.merge(credits, on='title').shape)
# print(movie.merge(credit, on='id').shape)

In [5]:
movies = movies.merge(credits, on = 'title')

In [6]:
# movies.head(1)

In [7]:
# genres, id, keywords, title, overview, cast, crew

movies = movies[['genres', 'id', 'keywords', 'title', 'overview', 'cast', 'crew']]

In [8]:
# Preprocessing
movies.isnull().sum()
movies.duplicated().sum()

0

In [9]:
movies.dropna(inplace=True)

In [10]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [11]:
movies['genres'] = movies['genres'].apply(convert)

In [12]:
# x = [z for z in ast.literal_eval(movies.iloc[:].genres)]
# print(x)

In [13]:
movies['keywords'] = movies['keywords'].apply(convert)

In [14]:
# movies.iloc[0].cast

In [15]:
# movies.iloc[0].crew

In [16]:
def convert2(obj):
    L = []
    cnt = 0
    for i in ast.literal_eval(obj):
        if cnt != 3:
            L.append(i['name'])  
            cnt += 1
        else:
            break
    return L

In [17]:
movies['cast'] = movies['cast'].apply(convert2)

In [18]:
def convert3(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [19]:
movies['crew'] = movies['crew'].apply(convert3)

In [20]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [21]:
movies.head(4)

Unnamed: 0,genres,id,keywords,title,overview,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,"[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,"[Action, Adventure, Crime]",206647,"[spy, based on novel, secret agent, sequel, mi...",Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,"[Action, Crime, Drama, Thriller]",49026,"[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]


In [22]:
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(" ", "") for i in x])

In [23]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [24]:
new_df = movies[['id', 'title', 'tags']]

In [25]:
new_df['tags'] = new_df['tags'].apply(lambda x : " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x : " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())


In [26]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [27]:
cv = CountVectorizer(max_features=5000, stop_words = "english")
vectors = cv.fit_transform(new_df['tags']).toarray()

In [28]:
# Stemming
ps = PorterStemmer()

In [29]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [30]:
new_df['tags']  = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']  = new_df['tags'].apply(stem)


In [31]:
# cv.get_feature_names()

In [32]:
similarity = cosine_similarity(vectors)

In [33]:
distances = similarity[119]
print(distances)

[0.04829453 0.05050763 0.07576144 ... 0.02129589 0.02380952 0.02317449]


In [34]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x : x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [35]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman
Batman & Robin
Batman


In [36]:
# !pip install pickle5

In [39]:
# pickle.dump(new_df, open('movies.pkl', 'wb'))
# pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))
# pickle.dump(similarity, open('similarity.pkl', 'wb'))