In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import ast


In [7]:
def convert_to_list(values):
    """Return the 'name' field of every entry encoded in ``values``.

    ``values`` is the string form of a Python list of dicts; it is parsed
    with ``ast.literal_eval`` and each dict must carry a 'name' key.
    """
    entries = ast.literal_eval(values)
    return [entry['name'] for entry in entries]

In [8]:
def convert_cast(values):
    """Return the 'name' of the first three entries encoded in ``values``.

    ``values`` is the string form of a Python list of dicts; fewer than
    three entries simply yield a shorter list.
    """
    names = []
    for member in ast.literal_eval(values)[:3]:
        names.append(member['name'])
    return names


In [9]:
def fetch_director(values):
    """Return a one-element list with the first director's name, or ``[]``.

    ``values`` is the string form of a Python list of dicts, each with a
    'job' and a 'name' key. Only the first entry whose job is exactly
    "Director" is reported.
    """
    for member in ast.literal_eval(values):
        if member["job"] == "Director":
            # Stop at the first hit: later directors are ignored on purpose.
            return [member['name']]
    return []

In [10]:
# Load the two input tables; both paths are relative to the working directory.
movies = pd.read_csv('database/tmdb_5000_movies.csv')
credits = pd.read_csv('database/tmdb_5000_credits.csv')

In [11]:
# Join the two tables on their shared 'title' column.
# NOTE(review): if titles are not unique, this join produces one row per
# matching pair of rows — confirm whether a unique id key should be used instead.
combined_data = movies.merge(credits, on = 'title')

In [12]:
# Keep only the columns used to build the tags. .copy() makes `movies` an
# independent frame, so the column assignments performed on it later do not
# raise SettingWithCopyWarning.
movies = combined_data[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew' ]].copy()

In [None]:
# Number of missing values in each column.
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value, then renumber the index from 0.
# Later cells use index labels as row positions into the similarity matrix,
# so the index must not keep gaps where rows were removed.
movies = movies.dropna().reset_index(drop=True)

In [None]:
# Parse unnecessary ids and change genres and keywords into a list
movies['genres'] = movies['genres'].apply(convert_to_list)
movies['keywords'] = movies['keywords'].apply(convert_to_list)
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

In [16]:
def remove_spaces(text):
    """Return a list with every space character removed from each item of ``text``.

    ``text`` is iterated item by item, so a list of strings yields one
    squeezed string per element (e.g. "Sam Worthington" -> "SamWorthington").
    """
    return [item.replace(" ", "") for item in text]

In [None]:
# Collapse multi-word names into single tokens in every list-valued column.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(remove_spaces)

In [None]:
# Row by row, concatenate the five list columns into one combined 'tags' list.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [19]:
# Final table used for vectorising and for the exported pickle. .copy()
# detaches it from `movies` so that rewriting df['tags'] afterwards does not
# raise SettingWithCopyWarning.
df = movies[['movie_id', 'title', 'tags']].copy()

In [None]:
# Flatten each tag list into one space-separated, lower-cased string.
df['tags'] = df['tags'].apply(lambda words: " ".join(words).lower())

In [None]:
# Show the tag string of the first row.
df.iloc[0]['tags']

In [None]:
# (rows, columns) of the tag table.
df.shape

In [23]:
import nltk
from nltk.stem import PorterStemmer

In [24]:
# Single stemmer instance, used by stems() for every token.
ps = PorterStemmer()

In [25]:
def stems(text):
    """Stem every whitespace-separated token of ``text`` with ``ps``.

    The stemmed tokens are joined back together with single spaces, so any
    run of whitespace in the input collapses to one space in the result.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tag string with its stemmed form.
df['tags'] = df['tags'].apply(stems)

In [None]:
# Show the tag string of the first row after stemming.
df.iloc[0]['tags']

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms (max_features),
# with English stop words removed.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [29]:
# Learn the vocabulary from the tag strings and convert the resulting count
# matrix to a dense array (one row per row of df).
vector = cv.fit_transform(df['tags']).toarray()

In [None]:
# Shape of the count matrix: (rows of df, vocabulary size).
vector.shape

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Pairwise cosine similarity between every pair of rows in `vector`;
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(vector) 

In [None]:
# Shape of the similarity matrix (square: one row and one column per row of `vector`).
similarity.shape

In [34]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. When several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title (same exception type the
        bare ``.index[0]`` lookup used to raise, but with a readable message).
    """
    # `similarity` is addressed by row position, so the title is resolved to a
    # position rather than an index label; the two differ as soon as the
    # frame's index has gaps (for example after rows were dropped).
    position = next(
        (pos for pos, title in enumerate(df['title']) if title == movie),
        None,
    )
    if position is None:
        raise IndexError(f"movie {movie!r} not found in df['title']")

    # Exclude the query movie explicitly instead of assuming it sorts first:
    # with tied scores another row could take the top spot.
    ranked = sorted(
        (pair for pair in enumerate(similarity[position]) if pair[0] != position),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for other, _score in ranked[:top_n]:
        print(df['title'].iloc[other])

In [36]:
import pickle

# Persist the tag table and the similarity matrix. The `with` blocks make sure
# each file is flushed and closed as soon as its dump finishes.
with open('recommendation_data/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(df, movie_list_file)
with open('recommendation_data/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)