***Movie Recommender System***

In [None]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem.porter import PorterStemmer 
# import PorterStemmer Class For Stemming
from sklearn.feature_extraction.text import CountVectorizer  # To convert into vector
from sklearn.metrics.pairwise import cosine_similarity
import pickle


# Read both TMDB exports and join them into one frame keyed on the movie title.
# NOTE(review): a join on 'title' pairs every row of one file with every
# same-titled row of the other, so films that share a title can show up more
# than once -- confirm whether an id-based join was intended.
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv').merge(credits, on='title')

# Keep only the columns that feed the recommendation tags, then discard every
# row with a missing value in any of them.
# NOTE(review): dropna() leaves gaps in the index labels (the index is not
# reset), so labels and row positions can differ from this point on.
# Handy checks while exploring the data:
#   movies['original_language'].value_counts()
#   movies.info()
#   movies.isnull().sum()
#   movies.duplicated().sum()   # tells whether any row is duplicated
movies = movies[
    ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
].dropna()

# Pull the 'name' of every entry out of a stringified list of dicts
# (used for the genres and keywords columns).
def convert(obj):
    """Return the ``'name'`` value of each dict in *obj*.

    *obj* is a string holding a Python list literal of dicts; it is parsed
    with ``ast.literal_eval`` and the names are returned in order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]
# Replace the raw strings in both columns with their lists of names.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)


# Extract the leading names from a stringified list of dicts (cast)
def convert1(obj, limit=3):
    """Return the ``'name'`` of the first *limit* entries listed in *obj*.

    Parameters:
        obj: string holding a Python list literal of dicts, each with a
            ``'name'`` key; parsed with ``ast.literal_eval``.
        limit: maximum number of names to return (default 3, the value that
            used to be hard-coded). Pass ``None`` to return every name.
            Expected to be non-negative.

    Returns:
        A list with at most *limit* names, in their original order.
    """
    # Slice before reading 'name' so that, as before, only the leading
    # entries are ever inspected.
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]
# Swap each raw cast string for the list of its leading names.
movies['cast'] = movies['cast'].map(convert1)

# Extract the director's name from a stringified list of dicts (crew)
def fetch_director(obj):
    """Return ``[name]`` of the first entry whose ``'job'`` is "Director".

    *obj* is a string holding a Python list literal of dicts; entries are
    scanned in order and scanning stops at the first director. An empty
    list is returned when no entry has that job.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == "Director":
            return [member['name']]
    return []
# Swap each raw crew string for a list holding the director's name (if any).
movies['crew'] = movies['crew'].map(fetch_director)

# Split each overview string into a list of words so it can be concatenated
# with the other per-movie lists.
movies['overview'] = movies['overview'].apply(str.split)

# Remove the spaces inside every name so that a multi-word entry becomes a
# single token (e.g. "Science Fiction" -> "ScienceFiction").
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Concatenate the five token lists of each movie, store them as one
# space-separated string in 'tags', and keep only the columns still needed.
all_tokens = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)
movies['tags'] = all_tokens.apply(" ".join)
movies = movies[['movie_id', 'title', 'tags']]

# Stemming is an NLP preprocessing step that reduces each word to a common
# base form (its stem), so that variants of a word count as the same token.
ps = PorterStemmer()
def stem(text):
    """Return *text* with every whitespace-separated word replaced by its Porter stem."""
    return " ".join(ps.stem(word) for word in text.split())

# Stem every tag string, then lower-case the result.
movies['tags'] = movies['tags'].apply(stem).apply(str.lower)

# Bag of words: limit the vocabulary to the 5000 most frequent terms, with
# English stop words ('to', 'and', ...) removed, and turn every movie's tags
# into a dense count vector.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(movies['tags'])
vectors = tag_counts.toarray()
# cv.get_feature_names_out() lists the vocabulary that was kept.

# Pairwise cosine similarity between all movie vectors: a square matrix whose
# rows and columns follow the row order of `movies`.
similarity = cosine_similarity(vectors)

# Main function to recommend similar movies
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to *movie*.

    Parameters:
        movie: exact title to look up in ``movies['title']``. If several rows
            carry this title, the first one is used.
        top_n: how many titles to print (default 5).

    Returns:
        None; the titles are printed one per line, most similar first.

    Raises:
        IndexError: if no row of ``movies`` has the given title.
    """
    # `similarity` is indexed by row *position*, while the index labels of
    # `movies` have gaps after dropna(); so look up the position of the
    # title instead of its label.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Skip the queried row itself explicitly rather than assuming it is the
    # first entry after sorting (ties could put another row ahead of it).
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in neighbours:
        print(movies['title'].iloc[pos])


# Deployment starts from here
# The movies DataFrame is converted to a plain dict before pickling so the
# consumer can rebuild it with pd.DataFrame(...). Files are opened in binary
# write mode ('wb') inside `with` blocks so each handle is flushed and closed
# once its dump completes.
with open('movie_dict.pkl', 'wb') as movie_file:
    pickle.dump(movies.to_dict(), movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [59]:
# Example: print the titles of the five movies ranked most similar to 'Batman Begins'.
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf
