In [237]:
import pandas as pd
import ast as ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

# Load the two raw TMDB 5000 CSV exports (movie metadata first, then the
# cast/crew credits) from the notebook's working directory.
movies_data, credits_data = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [238]:
# Trim stray leading/trailing whitespace from every column label of both frames
# so later lookups by column name are reliable.
movies_data.columns = movies_data.columns.map(str.strip)
credits_data.columns = credits_data.columns.map(str.strip)

In [239]:
# Join movie metadata with its credits: movies_data.id is matched against
# credits_data.movie_id (default inner join).
merged_data = pd.merge(movies_data, credits_data, left_on='id', right_on='movie_id')

In [240]:
# Keep only the rows whose original language is English ('en').
merged_data = merged_data.loc[merged_data['original_language'].eq('en')]


In [241]:
# Discard every column that is not used to build the recommendation tags.
# Both merged title columns (title_x / title_y) are dropped as well; the
# original_title column is the one that is kept.
merged_data = merged_data.drop(
    columns=[
        'budget',
        'movie_id',
        'title_y',
        'original_language',
        'production_countries',
        'release_date',
        'revenue',
        'runtime',
        'spoken_languages',
        'status',
        'vote_average',
        'vote_count',
        'title_x',
        'production_companies',
        'homepage',
        'tagline',
    ]
)


In [242]:
# Expose the surviving original_title column under the shorter name 'title'.
merged_data = merged_data.rename({'original_title': 'title'}, axis='columns')

In [243]:
# Preview the first five rows of the trimmed frame.
merged_data.head(n=5)

Unnamed: 0,genres,id,keywords,title,overview,popularity,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [244]:
# Remove movies without an overview, then renumber the rows 0..n-1 so that
# index labels and row positions coincide from here on.
merged_data = (
    merged_data
    .dropna(subset=['overview'])
    .reset_index(drop=True)
)

In [245]:
# Sanity check: number of missing values remaining in each column.
print(merged_data.isna().sum())

genres        0
id            0
keywords      0
title         0
overview      0
popularity    0
cast          0
crew          0
dtype: int64


In [246]:
# Parses the stringified list of genre records and keeps just the genre names
def convert_genres(text):
    """Return the normalized genre names found in ``text``.

    ``text`` is the string form of a list of dicts, each carrying a 'name'
    key. Every name is lower-cased and stripped of spaces so that a
    multi-word genre becomes a single token.
    """
    return [
        genre['name'].lower().replace(' ', '')
        for genre in ast.literal_eval(text)
    ]

# Collects the first `limit` (default 5) cast members listed for the movie
def convert_cast(text, limit=5):
    """Return the normalized names of the first ``limit`` cast entries.

    Parameters
    ----------
    text : str
        String form of a list of dicts, each carrying a 'name' key.
    limit : int or None, default 5
        Maximum number of names to keep, counted from the start of the list.
        ``None`` keeps every entry. Expected to be non-negative.

    Returns
    -------
    list of str
        Names lower-cased with spaces removed, in their original order.
        Empty when the list is empty.
    """
    # Slice before normalizing so entries past the cap are never touched.
    leading_members = ast.literal_eval(text)[:limit]
    return [
        member['name'].lower().replace(' ', '')
        for member in leading_members
    ]

# Scans the crew records and keeps only the first one whose job is 'Director'
def convert_crew(text):
    """Return a one-element list with the first director's normalized name.

    ``text`` is the string form of a list of dicts with 'job' and 'name'
    keys. The name is lower-cased with spaces removed. An empty list is
    returned when no entry has the job 'Director'.
    """
    first_director = next(
        (member for member in ast.literal_eval(text) if member['job'] == 'Director'),
        None,
    )
    if first_director is None:
        return []
    return [first_director['name'].lower().replace(' ', '')]

# Collects the first `limit` (default 10) keywords listed for the movie
def convert_keywords(text, limit=10):
    """Return the normalized names of the first ``limit`` keyword entries.

    Parameters
    ----------
    text : str
        String form of a list of dicts, each carrying a 'name' key.
    limit : int or None, default 10
        Maximum number of keywords to keep, counted from the start of the
        list. ``None`` keeps every entry. Expected to be non-negative.

    Returns
    -------
    list of str
        Keywords lower-cased with spaces removed, in their original order.
        Empty when the list is empty.
    """
    # Slice before normalizing so entries past the cap are never touched.
    leading_keywords = ast.literal_eval(text)[:limit]
    return [
        keyword['name'].lower().replace(' ', '')
        for keyword in leading_keywords
    ]

In [247]:
# Swap each raw stringified-record column for the list of normalized names
# produced by its matching converter.
merged_data = merged_data.assign(
    genres=merged_data['genres'].apply(convert_genres),
    cast=merged_data['cast'].apply(convert_cast),
    crew=merged_data['crew'].apply(convert_crew),
    keywords=merged_data['keywords'].apply(convert_keywords),
)

In [248]:
# Break each overview into a list of whitespace-separated words so it can be
# concatenated with the other list-valued columns.
merged_data['overview'] = merged_data['overview'].apply(str.split)

# One combined token list per movie: overview words, then keywords, director,
# cast and genres (list concatenation, so the order is preserved).
merged_data['tags'] = (
    merged_data['overview']
    + merged_data['keywords']
    + merged_data['crew']
    + merged_data['cast']
    + merged_data['genres']
)

In [249]:
# Independent working frame holding only the combined tags plus the title and
# id needed to identify each movie.
df = merged_data.loc[:, ['tags', 'title', 'id']].copy()

In [250]:
# Collapse each movie's token list into one space-separated string.
df['tags'] = df['tags'].apply(" ".join)

In [251]:
# Words are reduced to their root form so that different inflections of the
# same word are counted as one token.
stemmer = PorterStemmer()
def stem(text):
    """Return ``text`` with every whitespace-separated word replaced by its Porter stem.

    The stemmed words are re-joined with single spaces.
    """
    return " ".join(stemmer.stem(word) for word in text.split())

In [252]:
# Stem every movie's tag string.
df['tags'] = df['tags'].map(stem)

In [253]:
# Spot-check the stemmed tag string of the first movie.
print(df['tags'].iat[0])

in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien jamescameron samworthington zoesaldana sigourneyweav stephenlang michellerodriguez action adventur fantasi sciencefict


In [254]:
# Bag-of-words model for the tag strings.
vectorizer = CountVectorizer(
    stop_words='english',  # ignore scikit-learn's built-in English stop words
    max_features=5000,     # cap the vocabulary at 5000 terms
)

In [255]:
# Learn the vocabulary from the tags and turn every movie into a dense row of
# term counts.
vectors = (
    vectorizer
    .fit_transform(df['tags'])
    .toarray()
)

In [256]:
# Pairwise cosine similarity between every pair of movie vectors; row i holds
# movie i's score against every movie (including itself).
similarity = cosine_similarity(X=vectors)

In [257]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df['title']``. When several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title. The exception type matches
        what the bare ``.index[0]`` lookup used to raise, but the message now
        names the missing title.
    """
    # Look the movie up by row POSITION rather than index label, so the result
    # always lines up with the rows of `similarity` even if df's index is not
    # a clean 0..n-1 range.
    matches = (df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Movie {movie!r} not found in the dataset")
    movie_pos = int(matches[0])

    # Similarity of the chosen movie against every movie.
    scores = similarity[movie_pos]

    # Rank all OTHER movies by score, best first. The query movie is excluded
    # explicitly instead of assuming it sorts into slot 0: another row with a
    # tied top score could otherwise push the movie into its own results.
    ranked = sorted(
        (pair for pair in enumerate(scores) if pair[0] != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    # Print the recommended titles, one per line.
    for pos, _score in ranked:
        print(df['title'].iloc[pos])

In [258]:
# Persist the movie table (as a plain dict) and the similarity matrix.
# `with` blocks make sure each file is flushed and closed as soon as its dump
# finishes, instead of leaving the handle open until garbage collection.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(df.to_dict(), movie_dict_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)