In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')


In [3]:
movie_keywords = pd.read_csv("/home/kimberly/PycharmProjects/pythonProject2/keywords.csv", low_memory=True)
movie_credit = pd.read_csv("/home/kimberly/PycharmProjects/pythonProject2/credits.csv", low_memory=True)
movie_metadata = pd.read_csv("/home/kimberly/PycharmProjects/pythonProject2/movies_metadata.csv", low_memory=True)

In [4]:
movie_metadata1 = movie_metadata.drop([19730, 29503, 35587])

In [5]:
movie_metadata1['id'] = movie_metadata1['id'].astype(int)

In [6]:
movie_ = movie_metadata1.merge(movie_credit, on="id")
movie_ = movie_.merge(movie_keywords, on="id")

In [7]:
movie_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46628 entries, 0 to 46627
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  46628 non-null  object 
 1   belongs_to_collection  4574 non-null   object 
 2   budget                 46628 non-null  object 
 3   genres                 46628 non-null  object 
 4   homepage               8009 non-null   object 
 5   id                     46628 non-null  int64  
 6   imdb_id                46611 non-null  object 
 7   original_language      46617 non-null  object 
 8   original_title         46628 non-null  object 
 9   overview               45633 non-null  object 
 10  popularity             46624 non-null  object 
 11  poster_path            46229 non-null  object 
 12  production_companies   46624 non-null  object 
 13  production_countries   46624 non-null  object 
 14  release_date           46540 non-null  object 
 15  re

In [8]:
movie_.isna().sum()

adult                        0
belongs_to_collection    42054
budget                       0
genres                       0
homepage                 38619
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   995
popularity                   4
poster_path                399
production_companies         4
production_countries         4
release_date                88
revenue                      4
runtime                    268
spoken_languages             4
status                      86
tagline                  25845
title                        4
video                        4
vote_average                 4
vote_count                   4
cast                         0
crew                         0
keywords                     0
dtype: int64

In [9]:
features = ['genres', 'cast', 'crew', 'keywords']
for feature in features:
    movie_[feature] = movie_[feature].apply(literal_eval)

In [10]:
movie_.to_csv('recommending_movies')

In [11]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [12]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #check if more than 3 elements exist. If yes, return only first three. if no,return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #return empty list in case of missing/ malformed data
    return []

In [13]:
#Define new director, cast, genres, and keywords features that are in a suitable form.
movie_['director'] = movie_['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    movie_[feature] = movie_[feature].apply(get_list)

In [14]:
movie_[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]


In [15]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #check if director exists, if not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [16]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movie_[feature] = movie_[feature].apply(clean_data)

In [17]:
def create_soup(x):
    return " ".join(x['keywords'])+ " "+" ".join(x['director']+" "+" ".join(x['genres']))

In [18]:
#create a new soup feature
movie_['soup'] = movie_.apply(create_soup, axis = 1)

In [19]:
movie_[['soup']].head()

Unnamed: 0,soup
0,jealousy toy boy j o h n l a s s e t e r a n...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger h o w ...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence c h a r l e s s ...


In [20]:
tfidf_count = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_count.fit_transform(movie_['soup'])

In [21]:
tfidf_matrix.shape

(46628, 9878)

In [22]:
from scipy.spatial.distance import hamming
cosine_similarity_matric = hamming(tfidf_matrix, tfidf_matrix)

In [25]:
metadata = movie_.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [24]:
def get_recommendations(title, cosine_sim=cosine_similarity_matric):
    # get the index of the movie that matches the title
    idx = indices[title]

    # get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    #sort the movies based on the similarity scores



