**Cleaning Movies Metadata**

In [2]:
import ast

import numpy as np
import pandas as pd

# Load the raw movie metadata. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
movies = pd.read_csv(
    'C:/Coding Projects/Movie-Recommender-Engine/data/movies_metadata.csv',
    low_memory=False,
)

# Rows per imdb_id; the boolean view below flags ids that occur more than once.
id_counts = movies['imdb_id'].value_counts()
id_counts > 1

  movies = pd.read_csv('C:/Coding Projects/Movie-Recommender-Engine/data/movies_metadata.csv')


imdb_id
tt1180333     True
0             True
tt0499537     True
tt0082992     True
tt0046468     True
             ...  
tt6209470    False
tt2028550    False
tt0303758    False
tt0008536    False
tt6980792    False
Name: count, Length: 45417, dtype: bool

In [None]:
# Keep only the imdb_ids that occur on more than one row.
duplicate_ids = id_counts.loc[id_counts.gt(1)]
duplicate_ids

In [None]:
# Select every row whose imdb_id is repeated and show them with matching ids side by side.
duplicate_movies = movies.loc[movies['imdb_id'].isin(duplicate_ids.index)]
duplicate_movies.sort_values(['imdb_id', 'id'])

In [None]:
# Remove rows that repeat the same (imdb_id, id) pair, keeping the first occurrence.
movies.drop_duplicates(subset=['imdb_id', 'id'], keep='first', inplace=True)

In [None]:
# Recount rows per imdb_id after dropping exact (imdb_id, id) duplicates. An imdb_id that
# is shared by rows with different ids would still be counted more than once here.
movies['imdb_id'].value_counts()

In [None]:
# Drop rows whose id is not numeric or whose imdb_id is missing or malformed.

# Non-numeric ids are coerced to NaN so they can be filtered out below.
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')

# An IMDb title id is "tt" followed by digits; na=False treats missing ids as invalid.
has_valid_imdb_id = movies['imdb_id'].str.fullmatch(r'tt\d+', na=False)
movies = movies.loc[movies['id'].notna() & has_valid_imdb_id].copy()

# Store the numeric id as an integer `movie_id` column and retire the original column.
movies['movie_id'] = movies['id'].astype(int)
movies = movies.drop(columns=['id'])

In [None]:
# Keep the existing `title` column as the movie title and discard `original_title`.
# (The previous self-assignment of `title` had no effect and has been removed.)
movies = movies.drop(columns=['original_title'])

In [None]:
# Preview the first rows of the frame at this point.
movies.head()

In [None]:
# Parse the stringified list-of-dicts in `genres` into real Python objects.
# ast.literal_eval only evaluates Python literals, so (unlike eval) text coming from
# the CSV can never be executed as code.
movies['genres'] = movies['genres'].apply(ast.literal_eval)

In [None]:
# Reduce each parsed genre list to just the genre names; anything that is not a list
# becomes an empty list.
movies['genres'] = movies['genres'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)

In [None]:
# Discard these metadata columns; `overview` is kept.
movies = movies.drop(columns=[
    'belongs_to_collection',
    'budget',
    'homepage',
    'popularity',
    'production_companies',
    'production_countries',
    'release_date',
    'revenue',
    'runtime',
    'spoken_languages',
    'status',
    'poster_path',
    'video',
    'original_language',
    'adult',
])

In [None]:
# Prefix each overview with its tagline, separated by one space; missing values on
# either side are treated as empty strings. The tagline column is then dropped.
movies['overview'] = movies['tagline'].fillna('') + ' ' + movies['overview'].fillna('')
movies = movies.drop(columns=['tagline'])
movies.head()

In [None]:
"""Stem the keywords so there are less features in the TF-IDF matrix"""
import string
from nltk.stem.snowball import SnowballStemmer

# Shared English stemmer (also used later for the keywords).
stemmer = SnowballStemmer('english')

# Split each overview on whitespace, strip surrounding punctuation from every word and
# stem it. Words that consist only of punctuation end up empty and are dropped so they
# do not become empty tokens.
movies['overview'] = movies['overview'].str.split().apply(
    lambda words: [
        stemmer.stem(token)
        for token in (word.strip(string.punctuation) for word in words)
        if token
    ]
)
movies.head()

**Cleaning Keywords**

Using the clean_keywords function for better organization and efficiency.

In [3]:
# Load the keyword lists and parse the stringified list-of-dicts column.
# ast.literal_eval only evaluates Python literals, so (unlike eval) text coming from
# the CSV can never be executed as code.
keywords = pd.read_csv('C:/Coding Projects/Movie-Recommender-Engine/data/keywords.csv')
keywords['keywords'] = keywords['keywords'].apply(ast.literal_eval)

# Expose the key under the name `movie_id`, keeping it as the last column.
keywords['movie_id'] = keywords['id']
keywords = keywords.drop(columns=['id'])
keywords.head()

Unnamed: 0,keywords,movie_id
0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",862
1,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",8844
2,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",15602
3,"[{'id': 818, 'name': 'based on novel'}, {'id':...",31357
4,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",11862


In [4]:
# Reduce each parsed keyword list to just the keyword names; anything that is not a
# list becomes an empty list.
keywords['keywords'] = keywords['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)
keywords.head()

Unnamed: 0,keywords,movie_id
0,"[jealousy, toy, boy, friendship, friends, riva...",862
1,"[board game, disappearance, based on children'...",8844
2,"[fishing, best friend, duringcreditsstinger, o...",15602
3,"[based on novel, interracial relationship, sin...",31357
4,"[baby, midlife crisis, confidence, aging, daug...",11862


In [5]:
# Count how often each keyword occurs across all movies. explode() flattens the lists
# into one long Series directly, avoiding the wide intermediate DataFrame that
# apply(pd.Series).stack() builds; empty lists become NaN, which value_counts() ignores.
all_keywords = keywords['keywords'].explode().value_counts()

# Keywords that occur exactly once.
unique_keywords = all_keywords[all_keywords == 1]
unique_keywords

penniless             1
acab                  1
remains               1
ancient tablet        1
coming home           1
                     ..
helping animals       1
animal agriculture    1
brother sister        1
bad boy               1
biloxi mississippi    1
Name: count, Length: 8559, dtype: int64

In [None]:
# Remove the keywords that occur only once from every movie's keyword list.
keywords['keywords'] = keywords['keywords'].apply(
    lambda names: [name for name in names if name not in unique_keywords.index]
)
keywords.head()

In [None]:
"""Stem the keywords so there are less features in the TF-IDF matrix"""
# Stem every word of every keyword. A keyword can be a multi-word phrase, and the
# stemmer works on single words, so each phrase is split, stemmed word by word and
# re-joined with single spaces.
keywords['keywords'] = keywords['keywords'].apply(
    lambda phrases: [
        ' '.join(stemmer.stem(word) for word in phrase.split())
        for phrase in phrases
    ]
)

In [None]:
# Preview the keyword lists after filtering and stemming.
keywords.head()

**Cleaning Credits**

In [None]:
# Load the raw cast/crew table.
credits = pd.read_csv(
    filepath_or_buffer='C:/Coding Projects/Movie-Recommender-Engine/data/credits.csv'
)
credits.head()

In [None]:
# Parse the stringified list-of-dicts columns. ast.literal_eval only evaluates Python
# literals, so (unlike eval) text coming from the CSV can never be executed as code.
credits['cast'] = credits['cast'].apply(ast.literal_eval)
credits['crew'] = credits['crew'].apply(ast.literal_eval)
credits.head()

In [None]:
# Names of the crew members credited with the job "Director". The job is compared
# exactly: a substring test on 'director' would also match jobs such as
# "Director of Photography" or "Assistant Director".
credits['director'] = credits['crew'].apply(
    lambda crew: [member['name'] for member in crew if member.get('job') == 'Director']
    if isinstance(crew, list) else []
)

# Collapse each full name into one space-free token; otherwise first and last names
# would be treated as separate terms by the TF-IDF vectorizer.
credits['director'] = credits['director'].apply(
    lambda names: [''.join(full_name.split()) for full_name in names]
)
credits['director'].head(1).values[0]

In [None]:
"""Find the cast members that only appear in one movie"""

# Count how often each director name occurs. explode() flattens the per-movie lists
# into one long Series; empty lists become NaN, which value_counts() ignores.
director_name_counts = credits['director'].explode().value_counts()

# Director names that occur exactly once.
unique_directors = director_name_counts[director_name_counts == 1]
unique_directors.info()

In [None]:
"""Get rid of the cast members that only appear in one movie (they will not be useful for finding similarities between movies)"""

# Drop the director names that occur only once from every movie's director list.
credits['director'] = credits['director'].apply(
    lambda names: [name for name in names if name not in unique_directors.index]
)
credits['director'].head()

In [None]:
# Preview the credits frame with the derived `director` column.
credits.head()

In [None]:
# For each movie take the names of the first three cast entries and collapse every
# name into a single space-free token.
top_3_actors = credits['cast'].apply(
    lambda members: [
        ''.join(full_name.split())
        for full_name in [member['name'] for member in members][:3]
    ]
)
top_3_actors.head()

In [None]:
# Attach the per-movie actor tokens as a new column.
credits = credits.assign(top_3_actors=top_3_actors)
credits.head()

In [None]:
# Expose the key under the name `movie_id` (added as the last column) and drop `id`.
credits = credits.assign(movie_id=credits['id']).drop(columns=['id'])
credits.head()

In [None]:
# Repeat every director list three times so director tokens carry more weight than
# the other cast members in the text features.
credits['director'] = credits['director'].apply(lambda names: names * 3)
credits.head()

**Merge Keywords and Credits into Movies**

In [None]:
# Attach the keyword lists to each movie. The key is de-duplicated first because a
# movie_id that appears on several keyword rows would otherwise repeat the movie once
# per matching row.
movies = movies.merge(
    keywords.drop_duplicates(subset='movie_id'),
    on='movie_id',
    how='inner',
)
movies.head()

In [None]:
# Attach actors and directors. The key is de-duplicated first so that a movie_id
# appearing on several credits rows cannot repeat the movie.
movies = movies.merge(
    credits.loc[:, ['movie_id', 'top_3_actors', 'director']].drop_duplicates(subset='movie_id'),
    on='movie_id',
    how='left',
)

# A left merge leaves NaN where a movie has no credits row. Normalise BOTH list columns
# to empty lists so later code can join their tokens without special-casing NaN.
for column in ('top_3_actors', 'director'):
    movies[column] = movies[column].apply(lambda value: value if isinstance(value, list) else [])
movies.head()

In [None]:
# Preview the merged frame.
movies.head()

**Create the Cosine Similarity Matrix between movies**

In [None]:
# Preview the merged frame before the text fields are combined. The abandoned
# cumsum-based attempt that used to follow was dead code kept in a string literal; as the
# cell's last expression it was displayed instead of this preview.
movies.head()

In [None]:
#movies.apply(sum, axis = 1)

In [None]:
# Columns whose token lists are combined into the text used for vectorization.
fields = ['overview', 'genres', 'keywords', 'top_3_actors', 'director']


def generate_metadata_dump_entry():
    """Build ``movies['metadata_dump']`` from the token lists in ``fields``.

    For every row, the tokens of each column in ``fields`` are joined with single
    spaces, and the resulting per-column strings are joined with single spaces in
    ``fields`` order. A value that is not a list (for example NaN left behind by a
    left merge) contributes an empty string instead of raising.

    Side effects: writes the ``metadata_dump`` column of the global ``movies`` frame
    and then overwrites every column in ``fields`` with the empty string.
    """
    # Build the whole column in one pass instead of writing cell by cell with `.at`.
    movies['metadata_dump'] = [
        ' '.join(' '.join(tokens) if isinstance(tokens, list) else '' for tokens in row)
        for row in zip(*(movies[field] for field in fields))
    ]

    # The source columns are blanked once their content lives in `metadata_dump`.
    movies[fields] = ''


generate_metadata_dump_entry()


In [None]:
# Remove the source text columns now that `metadata_dump` holds their content.
movies = movies.drop(fields, axis='columns')
movies.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, ignoring English stop words and any term that
# appears in fewer than 3 documents.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
)

# Document-term matrix: one row per movie.
dtm = tfidf_vectorizer.fit_transform(movies['metadata_dump'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Dense movie-by-movie cosine similarity, labelled on both axes with the row labels
# of `movies`.
cosine_sim_matrix = pd.DataFrame(
    cosine_similarity(dtm, dense_output=True),
    index=movies.index,
    columns=movies.index,
)

In [None]:
# Add the row labels as a 'movie_id' column and move that column to the front.
# NOTE(review): the values come from `movies.index` (row labels), not from
# movies['movie_id'] — confirm that this is the identifier consumers expect.
cosine_sim_matrix['movie_id'] = movies.index
columns = ['movie_id', *cosine_sim_matrix.columns.drop('movie_id')]
cosine_sim_matrix = cosine_sim_matrix[columns]
cosine_sim_matrix.head()


**Create the correlation matrix for movie ratings**

In [181]:
# Load the user ratings in long format (one row per user/movie rating).
movie_ratings = pd.read_csv(
    'C:/Coding Projects/Movie-Recommender-Engine/data/ratings_small.csv'
)
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [182]:
# Reshape to a user-by-movie matrix of ratings (mean is pivot_table's default
# aggregation); movies a user has not rated are NaN.
movie_ratings = movie_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
    aggfunc='mean',
)
movies.head()

Unnamed: 0,imdb_id,title,vote_average,vote_count,movie_id,metadata_dump
0,tt0114709,Toy Story,7.7,5415.0,862,led by woodi andi toy live happili in his room...
1,tt0113497,Jumanji,6.9,2413.0,8844,roll the dice and unleash the excit when sibl ...
2,tt0113228,Grumpier Old Men,6.5,92.0,15602,still yell still fight still readi for love a ...
3,tt0114885,Waiting to Exhale,6.1,34.0,31357,friend are the peopl who let you be yourself a...
4,tt0113041,Father of the Bride Part II,5.7,173.0,11862,just when his world is back to normal he in fo...


In [183]:
# Summarise the shape, dtypes and memory footprint of the user-by-movie matrix.
movie_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 671 entries, 1 to 671
Columns: 9066 entries, 1 to 163949
dtypes: float64(9066)
memory usage: 46.4 MB


In [186]:
# Pairwise Pearson correlation between movie rating columns. A pair of movies needs at
# least 7 overlapping ratings to get a value; otherwise the entry is NaN.
correlation_matrix = movie_ratings.corr(
    method='pearson',
    min_periods=7,
)
correlation_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.363256,0.25508,-0.031564,0.279521,0.030998,0.350519,,-0.312002,0.096792,...,,,,,,,,,,
2,0.363256,1.0,0.187217,,0.035446,-0.01963,0.425928,,,0.438066,...,,,,,,,,,,
3,0.25508,0.187217,1.0,,0.56018,-0.075918,0.05012,,0.496904,0.164039,...,,,,,,,,,,
4,-0.031564,,,1.0,,,,,,,...,,,,,,,,,,
5,0.279521,0.035446,0.56018,,1.0,0.383228,0.012493,,0.225689,0.077864,...,,,,,,,,,,


In [187]:
# Add the row labels (movieId values) as a 'movie_id' column and move it to the front.
correlation_matrix['movie_id'] = correlation_matrix.index
columns = ['movie_id', *correlation_matrix.columns.drop('movie_id')]
correlation_matrix = correlation_matrix[columns]

# Clear the axis names inherited from the pivot so they do not show up in the header.
correlation_matrix.columns.name = None
correlation_matrix.index.name = None

correlation_matrix.head()

Unnamed: 0,movie_id,1,2,3,4,5,6,7,8,9,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
1,1,1.0,0.363256,0.25508,-0.031564,0.279521,0.030998,0.350519,,-0.312002,...,,,,,,,,,,
2,2,0.363256,1.0,0.187217,,0.035446,-0.01963,0.425928,,,...,,,,,,,,,,
3,3,0.25508,0.187217,1.0,,0.56018,-0.075918,0.05012,,0.496904,...,,,,,,,,,,
4,4,-0.031564,,,1.0,,,,,,...,,,,,,,,,,
5,5,0.279521,0.035446,0.56018,,1.0,0.383228,0.012493,,0.225689,...,,,,,,,,,,
