In [1]:
import numpy as np
import pandas as pd

In [3]:
metadata_df = pd.read_csv("data/cleaned_movies_metadata.csv")
metadata_df.head(2)

Unnamed: 0,movieId,original_title,genres,original_language,overview,production_companies,production_countries
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,TriStar Pictures Teitler Film Interscope Commu...,United States of America


In [4]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45463 entries, 0 to 45462
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   movieId               45463 non-null  int64 
 1   original_title        45463 non-null  object
 2   genres                43021 non-null  object
 3   original_language     45452 non-null  object
 4   overview              44509 non-null  object
 5   production_companies  33585 non-null  object
 6   production_countries  39178 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.4+ MB


## Build title to id and id to title map

In [6]:
# build title to id and id to title mappings
id_map = metadata_df[["movieId", "original_title"]]
id_to_title = id_map.set_index("movieId")
id_to_title.head()

Unnamed: 0_level_0,original_title
movieId,Unnamed: 1_level_1
862,Toy Story
8844,Jumanji
15602,Grumpier Old Men
31357,Waiting to Exhale
11862,Father of the Bride Part II


In [10]:
id_to_title.loc[31357]

original_title    Waiting to Exhale
Name: 31357, dtype: object

In [12]:
title_to_id = id_map.set_index("original_title")
title_to_id.head()

Unnamed: 0_level_0,movieId
original_title,Unnamed: 1_level_1
Toy Story,862
Jumanji,8844
Grumpier Old Men,15602
Waiting to Exhale,31357
Father of the Bride Part II,11862


In [13]:
title_to_id.loc["Toy Story"]

movieId    862
Name: Toy Story, dtype: int64

In [14]:
id_to_title.to_csv("id_to_title.csv", index=False)
title_to_id.to_csv("title_to_id.csv", index=False)

## Content Based Recommender

In [15]:
metadata_df.head(2)

Unnamed: 0,movieId,original_title,genres,original_language,overview,production_companies,production_countries
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,TriStar Pictures Teitler Film Interscope Commu...,United States of America


In [16]:
metadata_df.columns

Index(['movieId', 'original_title', 'genres', 'original_language', 'overview',
       'production_companies', 'production_countries'],
      dtype='object')

In [46]:
def combine_all(x):
    result = ""
    for col in x[1:]:
        result += " " + str(col).lower()
    return result

In [47]:
metadata_df["combined"] = metadata_df.apply(combine_all, axis=1)

In [48]:
metadata_df.head()

Unnamed: 0,movieId,original_title,genres,original_language,overview,production_companies,production_countries,combined
0,862,Toy Story,Animation Comedy Family,en,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America,toy story animation comedy family en led by w...
1,8844,Jumanji,Adventure Fantasy Family,en,When siblings Judy and Peter discover an encha...,TriStar Pictures Teitler Film Interscope Commu...,United States of America,jumanji adventure fantasy family en when sibl...
2,15602,Grumpier Old Men,Romance Comedy,en,A family wedding reignites the ancient feud be...,Warner Bros. Lancaster Gate,United States of America,grumpier old men romance comedy en a family w...
3,31357,Waiting to Exhale,Comedy Drama Romance,en,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,United States of America,waiting to exhale comedy drama romance en che...
4,11862,Father of the Bride Part II,Comedy,en,Just when George Banks has recovered from his ...,Sandollar Productions Touchstone Pictures,United States of America,father of the bride part ii comedy en just wh...


### Text Encoding

In [49]:
from sentence_transformers import SentenceTransformer

In [50]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [51]:
movies_encoding = model.encode(metadata_df["combined"])

In [52]:
type(movies_encoding), movies_encoding.shape

(numpy.ndarray, (45463, 384))

In [53]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities_sbert = linear_kernel(movies_encoding, movies_encoding)

In [54]:
cosine_similarities_sbert.shape, type(cosine_similarities_sbert)

((45463, 45463), numpy.ndarray)

In [98]:
indices = pd.Series(metadata_df.index, index=metadata_df["original_title"])
indices[:2]

original_title
Toy Story    0
Jumanji      1
dtype: int64

In [99]:
if "Friends" in indices:
    print("yes")

In [100]:
movie_title = metadata_df["original_title"]
movie_title[:2]

0    Toy Story
1      Jumanji
Name: original_title, dtype: object

In [101]:
def content_recommender(title):
  if title not in indices:
    raise KeyError("Title Not Found in database.")
  idx = indices[title]
  sim_scores = list(enumerate(cosine_similarities_sbert[idx]))
  sim_scores = sorted(sim_scores, key = lambda x:x[1], reverse = True)
  sim_scores = sim_scores[1:31]
  movie_indices = [i[0] for i in sim_scores]
  return movie_title.iloc[movie_indices]
     

In [102]:
content_recommender("Jumanji")

11174                    Mind Game
23061                         Noah
2367               Nettoyage à sec
20970                         Epic
17368                  In Her Skin
2682                           Big
15348                  Toy Story 3
0                        Toy Story
30799                        Noobz
32259                         Joni
13724                           Up
34119                   Yolngu Boy
23470                       Tracks
41125                         Lion
15065              Juoksuhaudantie
44441                        Gamer
35847                       Jigsaw
23640           They Came Together
22708    Gamera tai daiakuju Giron
17862             In jeder Sekunde
13550                Adventureland
11694                         TMNT
14551                       Avatar
18182                       Lifted
2181                          Antz
35689            Sevimli Tehlikeli
28955               20 Years After
27909                  Animal Room
9207      An Extreme

In [103]:
import pickle

In [89]:
with open("cosine_similarities_sbert.pkl", "wb") as f:
    pickle.dump(cosine_similarities_sbert, f)

In [90]:
type(indices), type(movie_title)

(pandas.core.series.Series, pandas.core.series.Series)

In [104]:
with open("indices.pkl", "wb") as f:
    pickle.dump(indices, f)

with open("movie_title.pkl", "wb") as f:
    pickle.dump(movie_title, f)

### Application

In [105]:
# load all required files
with open("cosine_similarities_sbert.pkl", "rb") as f:
    cosine_similarities_sbert = pickle.load(f)

with open("indices.pkl", "rb") as f:
    indices = pickle.load(f)

with open("movie_title.pkl", "rb") as f:
    movie_title = pickle.load(f)

In [106]:
indices[:2]

original_title
Toy Story    0
Jumanji      1
dtype: int64

In [114]:
def content_recommender(title, num_movie = 25):
  if title not in indices:
    raise KeyError("Title Not Found in database.")
  idx = indices[title]
  sim_scores = list(enumerate(cosine_similarities_sbert[idx]))
  sim_scores = sorted(sim_scores, key = lambda x:x[1], reverse = True)
  sim_scores = sim_scores[1:num_movie + 1]
  movie_indices = [i[0] for i in sim_scores]
  return movie_title.iloc[movie_indices]

In [115]:
content_recommender("Toy Story")

15348                            Toy Story 3
23843                            Sexual Life
23640                     They Came Together
21680                  Supporting Characters
27629                Invitation to Happiness
15538                               In a Day
17862                       In jeder Sekunde
2997                             Toy Story 2
18910             Diaries Notes and Sketches
41893                               Sobriety
18337                              Newlyweds
30184                                   Mars
29144                                 Loaded
21066                     His New Profession
104                        Keiner liebt mich
27909                            Animal Room
7368                           Noises Off...
24891                                Illicit
28984                        Superstar Goofy
34798                         Organize İşler
9050                             Floundering
27868                                Results
300       

In [116]:
content_recommender("Jumanji", 10)

11174          Mind Game
23061               Noah
2367     Nettoyage à sec
20970               Epic
17368        In Her Skin
2682                 Big
15348        Toy Story 3
0              Toy Story
30799              Noobz
32259               Joni
Name: original_title, dtype: object