In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Load the two TMDB CSV exports; both paths are relative to the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Peek at the first row to see which columns the movies table provides.
movies.head(1)

In [None]:
# Join the credits columns onto the movies table, matching rows by their title.
# NOTE(review): titles are presumably not guaranteed to be unique. If two films share
# a title, this join pairs every such movies row with every such credits row.
# Confirm whether a unique id key is available in both tables and should be used instead.
movies = movies.merge(credits, on='title')
movies.head(1)

In [None]:
# Keep only the columns the recommender uses:
#   movie_id, title                         - identifiers carried into the final frame
#   overview, genres, keywords, cast, crew  - text sources later combined into `tags`

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head(1)

In [None]:
# Count missing values in each column.
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column.
# Reassigning the result (rather than inplace=True) keeps the step explicit and avoids
# mutating in place a frame that was produced by column selection.
movies = movies.dropna()
len(movies)

In [None]:
# Count rows that exactly repeat an earlier row (all columns are compared).
movies.duplicated().sum()


In [None]:
# Keep only the first row for each title, so a title identifies exactly one movie.
# Reassigning the result (rather than inplace=True) keeps the step explicit.
movies = movies.drop_duplicates(subset='title', keep='first')
len(movies)

In [None]:
# Inspect the raw genres value of the first row (by position) before it is parsed.
movies.iloc[0].genres

In [None]:
# Convert the above to ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
import ast
def convert(obj):
    """Parse a string holding a Python-literal list of dicts and collect each 'name'.

    Example: "[{'id': 28, 'name': 'Action'}]" -> ['Action']

    Args:
        obj: String that ast.literal_eval can parse into an iterable of dicts,
            each containing a 'name' key.

    Returns:
        List with the 'name' value of every entry, in the original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]


In [None]:
# Replace each raw genres string with the list of names that convert() extracts.
# Not re-runnable as-is: on a second run the column already holds lists, which
# convert() cannot parse with ast.literal_eval.
movies['genres'] = movies['genres'].apply(convert)

In [None]:
# Replace each raw keywords string with the list of names that convert() extracts.
# Not re-runnable as-is: on a second run the column already holds lists, which
# convert() cannot parse with ast.literal_eval.
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
def convert3(obj, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list of dicts.

    Args:
        obj: String that ast.literal_eval can parse into an iterable of dicts,
            each containing a 'name' key.
        limit: Maximum number of names to return. Defaults to 3, which matches the
            previously hard-coded behaviour, so `.apply(convert3)` is unchanged.

    Returns:
        List of at most `limit` names, in their original order. Entries beyond
        the limit are never inspected.
    """
    parsed = ast.literal_eval(obj)
    # zip() stops as soon as range(limit) is exhausted, so no manual counter is needed
    # and entries past the limit are not touched.
    return [entry['name'] for _, entry in zip(range(limit), parsed)]

In [None]:
# Reduce each raw cast string to the names of its first three listed entries.
movies['cast'] = movies['cast'].apply(convert3)
movies.head()

In [None]:
# Inspect the raw crew value of the row whose index label is 0.
movies['crew'][0]

In [None]:
def fetch_director(obj):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        obj: String that ast.literal_eval can parse into an iterable of dicts,
            each containing 'job' and 'name' keys.

    Returns:
        A one-element list with that name, or an empty list when no entry has
        the job 'Director'. A list is returned either way, so the result can be
        concatenated with other list-valued columns.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first director is kept.
        return [member['name']]
    return []

In [None]:
# Replace each raw crew string with a list holding the director's name (empty if none found).
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head()

In [None]:
# Split every overview on whitespace so it becomes a list of words.
# With all text columns stored as lists, they can simply be concatenated
# into one combined column afterwards.
movies['overview'] = movies['overview'].map(str.split)
movies.head()

In [None]:
# Remove the spaces inside each entry (e.g. a two-word name) so that every entry
# becomes a single token once the lists are joined into one text string.
for column in ('genres', 'keywords', 'crew', 'cast'):
    movies[column] = movies[column].apply(
        lambda entries: [entry.replace(' ', '') for entry in entries]
    )
movies.head()

In [None]:
# Concatenate the per-movie word lists into a single combined list of tags.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
# .copy() makes new_df an independent frame, so later assignments to new_df['tags']
# do not act on a slice of `movies` (avoids SettingWithCopyWarning).
# reset_index(drop=True) renumbers the rows 0..n-1: the earlier dropna/drop_duplicates
# steps can leave gaps in the index, while anything computed from this frame as a plain
# array (the count vectors and the similarity matrix) is addressed by row position.
new_df = movies[['movie_id', 'title', 'tags']].copy().reset_index(drop=True)
new_df.head()


In [None]:
# Turn each list of tags into one space-separated string.
new_df['tags'] = new_df['tags'].map(" ".join)
new_df.head()

In [None]:
# Inspect the joined tags string of the row whose index label is 0.
new_df['tags'][0]

In [None]:
# Lower-case every tags string so that matching is case-insensitive.
new_df['tags'] = new_df['tags'].str.lower()
new_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 features, with scikit-learn's
# built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Fit the vectorizer on the tags and convert the sparse count matrix to a dense array
# (one row per movie).
# NOTE(review): at this point the tags have not been stemmed yet.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
!pip install nltk
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem(text):
    """Stem every whitespace-separated word of `text` with the Porter stemmer.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Reduce every word in the tags to its stem so that inflected forms share one token.
new_df['tags'] = new_df['tags'].apply(stem)
# The count vectors were fitted before this stemming step, so stemming would otherwise
# have no effect on the similarity computed from `vectors`. Refit on the stemmed tags.
vectors = cv.fit_transform(new_df['tags']).toarray()


In [None]:
# Make NumPy print every element of an array instead of an abbreviated summary.
# This is a global setting and affects the output of every later cell.
np.set_printoptions(threshold=np.inf)
# Keep the preview as the last expression of the cell; only the last expression is
# displayed, so placing it first would silently discard its output.
new_df.head()

In [None]:
# Vocabulary learned by the vectorizer: the word behind each column of the count vectors.
cv.get_feature_names_out()   ## list of all the words in the vectors

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `vectors`; entry [i, j] compares
# the movie at row position i with the movie at row position j.
similarity = cosine_similarity(vectors)

In [None]:
# Show the shape and only the top-left corner of the matrix. Because the NumPy print
# threshold was raised to infinity earlier, displaying the bare array would print
# every one of its (movies x movies) elements.
print(similarity.shape)
similarity[:5, :5]

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Args:
        movie: Exact title to look up in new_df['title'].
        top_n: Number of recommendations to print. Defaults to 5.

    Raises:
        IndexError: If no row of new_df has the given title.
    """
    # Find the row *position* of the movie, not its index label. `similarity` is a
    # plain array addressed by position, and index labels need not equal positions
    # once rows have been dropped from the frame.
    positions = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie title not found in new_df: {movie!r}")
    movie_pos = int(positions[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the movie itself explicitly instead of assuming it always sorts first.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
    for pos in neighbours:
        print(new_df.iloc[pos].title)

In [None]:
# Example: print the recommendations for one title.
recommend('Batman Begins')

In [None]:
import pickle

In [None]:
# Persist the movie table as a plain dict. The `with` block guarantees the file is
# flushed and closed, which a bare open(...) passed to pickle.dump does not.
with open('movie_dict.pkl', 'wb') as movie_file:
    pickle.dump(new_df.to_dict(), movie_file)

In [None]:
# Persist the similarity matrix. The `with` block guarantees the file is flushed and
# closed, which a bare open(...) passed to pickle.dump does not.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)