#### Import and view data

In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('../data/ml-latest/movies.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [7]:
movies.genres.value_counts()

Drama                                     8402
Comedy                                    5372
(no genres listed)                        4266
Documentary                               4250
Comedy|Drama                              2212
                                          ... 
Comedy|Fantasy|Mystery|Romance               1
Comedy|Crime|Drama|Horror|Thriller           1
Adventure|Documentary|Sci-Fi                 1
Action|Fantasy|Thriller|IMAX                 1
Action|Fantasy|Sci-Fi|Thriller|Western       1
Name: genres, Length: 1643, dtype: int64

In [9]:
movies['genre_list'] = movies.genres.map(lambda x: x.split('|'))

In [10]:
movies

Unnamed: 0,movieId,title,genres,genre_list
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]
...,...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed),[(no genres listed)]
58094,193878,Les tribulations d'une caissière (2011),Comedy,[Comedy]
58095,193880,Her Name Was Mumu (2016),Drama,[Drama]
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,"[Adventure, Drama, Horror, Sci-Fi]"


Create a column for each genre

In [12]:
genres = """* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)""".split('* ')

clean list

In [14]:
genres = genres[1:]

genres = [genre.strip() for genre in genres]

Create columns

In [20]:
for column in genres:
    movies[column] = movies['genre_list'].map(lambda x: 1 if column in x else 0)

In [21]:
movies

Unnamed: 0,movieId,title,genres,genre_list,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed),[(no genres listed)],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
58094,193878,Les tribulations d'une caissière (2011),Comedy,[Comedy],0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
58095,193880,Her Name Was Mumu (2016),Drama,[Drama],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,"[Adventure, Drama, Horror, Sci-Fi]",1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


Drop old columns

In [22]:
movies.drop(['genres', 'genre_list'], axis=1, inplace=True)

Calculate cosine similarity

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
genre_matrix = movies.drop(['title', 'movieId'], axis=1)

In [31]:
toy_story = genre_matrix.loc[0]

In [33]:
genre_matrix

Unnamed: 0,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58093,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
58094,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58095,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
58096,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0


In [35]:
import numpy as np

In [39]:
genre_numpy = genre_matrix.to_numpy()

In [43]:
toy_story = genre_numpy[0]

In [55]:
%%time

similarity_list = [cosine_similarity(toy_story.reshape(1,-1), film.reshape(1,-1)) for film in genre_numpy]

CPU times: user 13.9 s, sys: 12 ms, total: 13.9 s
Wall time: 13.9 s


In [58]:
similarity_list = [entry[0][0] for entry in similarity_list]

In [69]:
similarity_df = pd.DataFrame({'title': movies.title, 'similarity': similarity_list})
similarity_df

Unnamed: 0,title,similarity
0,Toy Story (1995),1.000000
1,Jumanji (1995),0.707107
2,Grumpier Old Men (1995),0.353553
3,Waiting to Exhale (1995),0.288675
4,Father of the Bride Part II (1995),0.500000
...,...,...
58093,The Great Glinka (1946),0.000000
58094,Les tribulations d'une caissière (2011),0.500000
58095,Her Name Was Mumu (2016),0.000000
58096,Flora (2017),0.250000


In [74]:
similarity_df.sort_values(by='similarity', ascending=True, inplace=True)