In [1]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

import torch
device_ = "cuda" if torch.cuda.is_available() else "cpu"
device_

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [2]:
df=pd.read_csv('data/netflix_titles_raw.csv')

In [3]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [4]:
# drop duration and date_added
df.drop(['duration','date_added'], axis=1 ,inplace=True)
df['director'] = df['director'].fillna('UnKnown')
df['country'] = df['country'].fillna('UnKnown')
df['cast'] = df['cast'].fillna('UnKnown')

In [5]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
release_year    0
rating          4
listed_in       0
description     0
dtype: int64

In [6]:
# drop rating = nan
df.dropna(inplace=True)
df.to_csv('data/netflix_titles_clean.csv', index=False)

In [7]:
df = pd.read_csv('data/netflix_titles_clean.csv')

In [9]:
request = [
    'A netflix original series',
    'Two man in spaceship',
    'A movie about a dog that lost his owner',
]

In [47]:
# model_name = 'bert-base-nli-mean-tokens'
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name, device=device_)
try:
    sentence_embeddings = np.load(f'data/movie_descriptions_{model_name}.npy')
except:
    sentence_embeddings = model.encode(df['description'].values)
    np.save(f'data/movie_descriptions{model_name}.npy', sentence_embeddings)

sentence_embeddings.shape

(8803, 384)

In [15]:
cs=cosine_similarity([sentence_embeddings[0]], sentence_embeddings[1:])

In [16]:
df.description[0]

'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [17]:
# argmax
df.description[cs.argmax()]

'Four best friends navigate loss and major life changes – and smoke a lot of weed – during their last two weeks of high school.'

In [30]:
# cosine similarity between request and all descriptions
request_embeddings = model.encode(request)

cs=cosine_similarity(request_embeddings, sentence_embeddings)

In [21]:
df.iloc[cs[2].argsort()[-10:][::-1]]

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,listed_in,description
1089,s1090,Movie,Two Distant Strangers,"Travon Free, Martin Desmond Roe","Joey Bada$$, Andrew Howard, Zaria",United States,2021,TV-MA,Dramas,"In this Oscar-nominated short film, a man tryi..."
6353,s6355,Movie,Bolt,"Byron Howard, Chris Williams","John Travolta, Miley Cyrus, Susie Essman, Mark...",United States,2008,PG,"Children & Family Movies, Comedies","This tale follows German shepherd Bolt, a cani..."
4977,s4978,Movie,Benji,Brandon Camp,"Gabriel Bateman, Darby Camp, Kiele Sanchez, Je...","United Arab Emirates, United States",2018,TV-PG,"Children & Family Movies, Dramas",A determined dog comes to the rescue and helps...
829,s830,Movie,Dog Gone Trouble,Kevin Johnson,"Big Sean, Pamela Adlon, Lucy Hale, Marissa Jar...",UnKnown,2021,TV-Y7,"Children & Family Movies, Comedies",The privileged life of a pampered dog named Tr...
1319,s1320,Movie,Xico's Journey,Eric D. Cabello Díaz,"Verónica Alva, Pablo Gama Iturrarán “Mago Gami...",Mexico,2021,TV-Y7,Children & Family Movies,"A girl, a dog and her best pal set out to save..."
1372,s1373,Movie,June & Kopi,Noviandra Santosa,"Acha Septriasa, Ryan Delon, Makayla Rose Hilli...",Indonesia,2021,TV-PG,"Children & Family Movies, Dramas, Internationa...","A street dog is taken in by a young couple, an..."
3405,s3406,Movie,Seventeen,Daniel Sánchez Arévalo,"Biel Montoro, Nacho Sánchez, Lola Cordón, Itsa...",Spain,2019,TV-MA,"Comedies, Dramas, International Movies","To find his therapy dog, a 17-year-old escapes..."
325,s326,Movie,Beethoven,Brian Levant,"Charles Grodin, Bonnie Hunt, Dean Jones, Olive...",United States,1992,PG,"Children & Family Movies, Comedies",A father reluctantly agrees to let his childre...
2056,s2057,Movie,Hotel for Dogs,Thor Freudenthal,"Emma Roberts, Jake T. Austin, Kyla Pratt, Lisa...","Germany, United States",2009,PG,"Children & Family Movies, Comedies",Placed in a foster home that doesn't allow pet...
7297,s7300,Movie,Life in the Doghouse,Ron Davis,UnKnown,United States,2018,TV-PG,"Documentaries, LGBTQ Movies",A couple operates a bustling dog rescue out of...


In [27]:
df.description[cs[1].argsort()[-10:][::-1]]

array(['Liam Neeson stars as a man who regains consciousness after a car accident, only to discover that another man is impersonating him.',
       "A doctor performs plastic surgery on a fat man who's been jilted, and continues his transformation by turning him into a handsome, desirable man.",
       'Fearing rejection, a young man struggles to declare his feelings for his best friend, who soon falls for another man — until a fateful incident.',
       'Jilted by his wife, a man with OCD finds eye-opening common ground and camaraderie with his neighbor, a young man with Down Syndrome.',
       "When a late poet leaves behind his property to a young man, the man's brother suspects the poet may have been more than just a family friend.",
       "A gay man deals with his sexuality, a struggling actor has an epiphany, a boy becomes a dancer, and a man tries to fulfill his father's dying wish.",
       'A Punjabi man attempts to build a life in London in hopes of impressing his girlfriend

In [32]:
# euclidean distance

ed=euclidean_distances(request_embeddings, sentence_embeddings)
df.description[ed[1].argsort()[-10:][::-1]]

7064    In the streets of Senegal, a youth movement ri...
1371    A Brooklyn youth football program and its self...
7649    Seeking to heal after a traumatic assault, a y...
550     After a series of brutal slayings, a teen and ...
511     It's not her first talk show, but it is a firs...
6933    Fast-talking city girl Zoe Hart winds up patch...
459     After a traumatic year, all an Indian-American...
6899    On the surface, the Social Welfare Agency appe...
1964    The five rival gangs ruling the SWORD district...
1241    Inspired by her mom's rebellious past and a co...
Name: description, dtype: object

In [24]:
# count vectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['description'])

In [53]:
# cosine similarity
request_embeddings = count.transform('space')
cs=cosine_similarity(request_embeddings, count_matrix)

df.description[cs[0].argsort()[-10:][::-1]]

ValueError: Iterable over raw text documents expected, string object received.

In [26]:
ed=euclidean_distances(request_embeddings, count_matrix)

df.description[ed[2].argsort()[-10:][::-1]]

7674    Leader of the Ten Kwangtung Tigers, Tieh Chiao...
7934    Raised in working-class Mumbai, Manisha has an...
7094    A record-setting triathlete dares to complete ...
7312    Jake and Tony become friends when Jake's dad i...
8181    In the City Without a Voice, only faceless sin...
4797    To her Indian parents' dismay, London-born Jas...
8138    Shaolin warrior Tung Chien-chen is injured in ...
5749    When a group of vicious Sinners plots to escap...
593     When a snow day shuts down the whole town, the...
215     Based on a true story, this action film follow...
Name: description, dtype: object