### Preprocessing of Data

In [None]:
# kaggle (tmdb 5000 movies dataset)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the two TMDB 5000 CSV files; the relative paths are resolved against
# the current working directory.
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# (rows, columns) of the credits table.
credits.shape

In [None]:
# Column names, dtypes and non-null counts of the credits table.
credits.info()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# (rows, columns) of the movies table.
movies.shape

In [None]:
# Column names, dtypes and non-null counts of the movies table.
movies.info()
# Some columns contain null values.

In [None]:
# Merge the two data sets on the movie id rather than on the title: titles are
# not guaranteed to be unique, and a title join pairs every movie with every
# credits row of the same name, producing duplicated/mismatched rows.
# credits' own 'title' column is dropped first so the result keeps a single
# 'title' column (from movies) next to 'movie_id', 'cast' and 'crew'.
movies_data = movies.merge(
    credits.drop(columns='title'), left_on='id', right_on='movie_id'
)

In [None]:
# (rows, columns) of the merged table.
movies_data.shape

In [None]:
# Inspect one merged row.
movies_data.head(1)

In [None]:
# Inspect one credits row for comparison with the merged row.
credits.head(1)

In [None]:
# Column names, dtypes and non-null counts of the merged table.
movies_data.info()

In [None]:
# Keep only the columns used in the following steps; everything else is dropped.
# .copy() detaches the result from the merged frame.
movies_data = movies_data.loc[
    :, ['movie_id', 'title', 'cast', 'crew', 'genres', 'keywords']
].copy()
movies_data.head(3)

In [None]:
# (rows, columns) after the column selection.
movies_data.shape

In [None]:
# Count missing values per column. NOTE: this is not the last expression of
# the cell, so a notebook will not display its result.
movies_data.isnull().sum(axis=0)
# Drop every row that has a missing value in any remaining column.
# NOTE(review): the original remark here mentioned 3 nulls in 'overview', but
# 'overview' is not among the columns selected above -- confirm which
# column(s), if any, still contain nulls at this point.
movies_data.dropna(inplace=True)

In [None]:
# Missing values per column after dropna.
movies_data.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
movies_data.duplicated().sum()
# Observed: no duplicated rows.

In [None]:
 movies_data.iloc[0].genres

In [None]:
# Inspect the raw (still unparsed) 'keywords' value of the first row.
movies_data.iloc[0].keywords

In [None]:
# movies_data.iloc[0].cast

In [None]:
# movies_data.iloc[0].crew

In [None]:
from ast import literal_eval
def convert(obj):
    """Parse a stringified list of dicts and return their names without spaces.

    Args:
        obj: String holding a Python literal list of dicts, each with a
            ``'name'`` key, e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        List of the ``'name'`` values in their original order, with every
        space character removed.
    """
    return [item['name'].replace(" ", "") for item in literal_eval(obj)]

def convert_3(obj):
    """Parse a stringified list of dicts and return the first three names.

    Args:
        obj: String holding a Python literal list of dicts, each with a
            ``'name'`` key.

    Returns:
        List of at most three ``'name'`` values (the first entries of the
        parsed list, in order), with every space character removed.
    """
    # Slicing already copes with lists shorter than three entries.
    return [item['name'].replace(" ", "") for item in literal_eval(obj)[:3]]

def convert_crew(obj):
    """Parse a stringified crew list and return the directors' names.

    Args:
        obj: String holding a Python literal list of dicts, each with
            ``'job'`` and ``'name'`` keys.

    Returns:
        List of the ``'name'`` values of every entry whose ``'job'`` equals
        ``'Director'``, in order, with every space character removed.

    Raises:
        KeyError: If an entry has no ``'job'`` key (or a matching entry has
            no ``'name'`` key).
    """
    return [
        member['name'].replace(" ", "")
        for member in literal_eval(obj)
        if member['job'] == 'Director'
    ]

In [None]:
# Replace each stringified column with its parsed list of names. Plain column
# assignment is used instead of ``.loc[:, [col]] = ...``: the latter targets a
# one-column DataFrame and relies on alignment to place a Series of lists.
movies_data['genres'] = movies_data['genres'].apply(convert)
movies_data['keywords'] = movies_data['keywords'].apply(convert)
movies_data['cast'] = movies_data['cast'].apply(convert_3)
movies_data['crew'] = movies_data['crew'].apply(convert_crew)
# Titles are stored lower-cased; recommend() lower-cases its query to match.
movies_data['title'] = movies_data['title'].str.lower()

In [None]:
# Preview the converted columns.
movies_data.head(5)

In [None]:
# Build one combined token list per movie by concatenating the four list
# columns row-wise: genres first, then crew, cast and keywords.
movies_data['tags'] = (
    movies_data['genres']
    + movies_data['crew']
    + movies_data['cast']
    + movies_data['keywords']
)

In [None]:
# From here on only the id, the combined tags and the title are needed.
movies_data = movies_data.loc[:, ['movie_id', 'tags', 'title']].copy()

In [None]:
# Preview the reduced table (tags are still lists here).
movies_data.head()

In [None]:
# Turn each list of tags into a single space-separated, lower-cased string.
movies_data['tags'] = movies_data['tags'].apply(" ".join).str.lower()

In [None]:
# Preview the table with tags as plain strings.
movies_data.head()

In [None]:
# Positional access, consistent with the iloc[0] inspections earlier; the
# label 0 is not guaranteed to exist once rows have been dropped.
movies_data['tags'].iloc[0]

### Text vectorization (bag of words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the tag strings, with the vocabulary capped at
# 5000 terms (max_features).
cv = CountVectorizer(max_features=5000)
# fit_transform learns the vocabulary and returns the count matrix;
# .toarray() converts it to a dense array with one row per movie.
vectors = cv.fit_transform(movies_data['tags']).toarray()
vectors

In [None]:
# List the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
cv.get_feature_names_out()

In [None]:
# (number of movies, vocabulary size).
vectors.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all movie vectors: entry [i][j] is the
# similarity between movie i and movie j.
similarity_matrix = cosine_similarity(vectors)

In [None]:
# Similarity of the first movie to every movie.
similarity_matrix[0]

In [None]:
# One score per movie.
similarity_matrix[0].shape

In [None]:
def sort_matrix(matrix):
    """Rank the entries of one row of scores from highest to lowest.

    Args:
        matrix: Iterable of scores (a single row of the similarity matrix).

    Returns:
        List of ``(position, score)`` tuples ordered by score, descending;
        entries with equal scores keep their original relative order.
    """
    ranked = list(enumerate(matrix))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [None]:
# Replace every row of scores with its ranked list of (position, score)
# pairs, most similar first. After this cell the name refers to a list of
# lists of tuples rather than the 2-D array it held before.
similarity_matrix = list(map(sort_matrix, similarity_matrix))
# similarity_matrix[0]

### Recommend

In [None]:
def recommend(movie):
    """Return the five movies most similar to *movie*.

    Args:
        movie: Movie title. It is lower-cased before being compared with
            ``movies_data['title']``; the first matching row is used.

    Returns:
        DataFrame with the columns ``index``, ``movie_id`` and ``title``, one
        row per recommendation, most similar first.

    Raises:
        IndexError: If no row of ``movies_data`` has that title.
    """
    # similarity_matrix is ordered by row position, so look up the row's
    # position rather than its index label (the two differ as soon as rows
    # have been dropped from movies_data).
    positions = movies_data['title'].eq(movie.lower()).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie {movie!r} not found in movies_data")
    movie_pos = int(positions[0])
    ranked = similarity_matrix[movie_pos]
    # Skip the query movie explicitly instead of assuming it is ranked first:
    # another movie with an identical score may precede it.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]
    # Ranked entries are positions, hence iloc (not label-based loc).
    return movies_data.iloc[top_positions][['movie_id', 'title']].reset_index()

In [None]:
# Mixed-case query: recommend() lower-cases the title before matching.
recommend('Avatar')

In [None]:
# Example query with an already lower-cased title.
recommend('batman begins')

In [None]:
recommend('the avengers')
# Next steps (deployment notes):
# - dump movies_data (as a dict) and similarity_matrix to disk
# - load both in app.py and rewrite the recommend function above there
# - in app.py, build the website with a movie select box populated from movies_data
# - clicking the recommend button triggers the recommend function defined in app.py