In [74]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [8]:
df=pd.read_csv('data/netflix_titles_raw.csv')

In [9]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [10]:
# drop duration and date_added
df.drop(['duration','date_added'], axis=1 ,inplace=True)
df['director'] = df['director'].fillna('UnKnown')
df['country'] = df['country'].fillna('UnKnown')
df['cast'] = df['cast'].fillna('UnKnown')

In [11]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
release_year    0
rating          4
listed_in       0
description     0
dtype: int64

In [57]:
# drop rating = nan
df.dropna(inplace=True)
df.to_csv('data/netflix_titles_clean.csv', index=False)

In [58]:
df = pd.read_csv('data/netflix_titles_clean.csv')

In [13]:
# # cosine similarity
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # count = CountVectorizer(stop_words='english')
# # count_matrix = count.fit_transform(df['description'])

# # cosine similarity between two paragraphs

# pa1 = df.description[3]
# pa2 = df.description[1]

# count = CountVectorizer(stop_words='english')
# count_matrix = count.fit_transform(df['description'])

# cosine_similarity(count_matrix,count_matrix)

In [41]:
request = [
    'A netflix original series',
    'Two man in spaceship',
    'A movie about a dog that lost his owner',
]

In [55]:
model_name = 'bert-base-nli-mean-tokens'
try:
    sentence_embeddings = np.load('data/movie_descriptions.npy')
except:
    model = SentenceTransformer(model_name)
    sentence_embeddings = model.encode(df['description'].values)
    np.save('data/movie_descriptions.npy', sentence_embeddings)

sentence_embeddings.shape

(8803, 768)

In [21]:
cs=cosine_similarity([sentence_embeddings[0]], sentence_embeddings[1:])

In [24]:
df.description[0]

'As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [26]:
# argmax
df.description[cs.argmax()]

'A deputy curator of a chaebol-funded art gallery and her husband, a politically ambitious economics professor, will do anything to join the über-elite.'

In [69]:
# cosine similarity between request and all descriptions
request_embeddings = model.encode(request)

cs=cosine_similarity(request_embeddings, sentence_embeddings)

In [43]:
df.iloc[cs[2].argsort()[-10:][::-1]]

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,listed_in,description
7048,s7049,Movie,I'll See You in My Dreams,Brett Haley,"Blythe Danner, Martin Starr, June Squibb, Rhea...",United States,2015,PG-13,"Comedies, Dramas, Independent Movies","After the passing of her dog, a long-widowed s..."
8375,s8376,Movie,The Keeping Hours,Karen Moncrieff,"Lee Pace, Carrie Coon, Sander Thomas, Amy Smar...",United States,2017,PG-13,"Dramas, Thrillers",Years after the death of their only child and ...
5777,s5778,TV Show,Someone Like You,UnKnown,"Kingone Wang, Lorene Ren, Sean Lee, Nita Lei, ...",Taiwan,2015,TV-14,"International TV Shows, Romantic TV Shows, TV ...",After losing his sight and his fiancee in an a...
5073,s5074,Movie,The Open House,"Matt Angel, Suzanne Coote","Dylan Minnette, Piercey Dalton, Patricia Bethu...","Canada, United States",2018,TV-MA,"Horror Movies, Thrillers","Following a tragedy, a mother and her teen son..."
8552,s8553,Movie,The Water Diviner,Russell Crowe,"Russell Crowe, Olga Kurylenko, Yılmaz Erdoğan,...","Australia, United States",2014,R,"Dramas, International Movies",Years after the presumed death of his three so...
2617,s2618,Movie,İstanbul Kırmızısı,Ferzan Özpetek,"Halit Ergenç, Nejat İşler, Mehmet Günsür, Çiğd...","Italy, Turkey",2017,TV-MA,"Dramas, Independent Movies, International Movies","Upon his return to Istanbul, an expat writer-e..."
1340,s1341,Movie,Little Big Women,Joseph Hsu,"Chen Shu-fang, Hsieh Ying-xuan, Vivian Hsu, Su...",Taiwan,2020,TV-14,"Dramas, International Movies",A family grapples with the passing of their es...
4754,s4755,TV Show,Welcome to the Family,UnKnown,"Melani Olivares, Ivan Massagué, Yolanda Ramos,...",Spain,2018,TV-MA,"International TV Shows, TV Comedies, TV Dramas",When an evicted single mom's estranged father ...
3529,s3530,Movie,Evelyn,Orlando von Einsiedel,UnKnown,United Kingdom,2019,TV-MA,"Documentaries, International Movies","Haunted by the suicide of a brother, a directo..."
1264,s1265,Movie,Our Idiot Brother,Jesse Peretz,"Paul Rudd, Elizabeth Banks, Zooey Deschanel, E...",United States,2011,R,"Comedies, Dramas, Independent Movies",A seemingly clueless idealist relies on his ex...


In [70]:
df.description[cs[1].argsort()[-10:][::-1]]

347     After landing on a planet reminiscent of 1950s...
5650    In this "Star Trek" spin-off, Commander Sisko ...
1364    A mysterious game sends two brothers flying in...
3187    With humankind's future at stake, a group of s...
5244    Capt. Archer and his crew explore space and di...
3642    After a massive alien artifact lands on Earth,...
981     Carpet dealer and UFO photo forger Arif is abd...
2662    Traversing trippy worlds inside his universe s...
7614    With access to the scientists and engineers re...
7406    Fact meets fiction in this docudrama chronicli...
Name: description, dtype: object

In [72]:
# euclidean distance

ed=euclidean_distances(request_embeddings, sentence_embeddings)
df.description[ed[1].argsort()[-10:][::-1]]

641     Three track star sisters face obstacles in lif...
8562    A widow searches for her missing teenage daugh...
2688    Two fierce mothers become rivals when a school...
6942    This documentary tells the story of a teenage ...
2714    A lonely young woman feels trapped at her all ...
3845    Two female friends, each with confidence issue...
654     Two days before their final exams, three teen ...
2705    Implicated in her daughter's disappearance, a ...
8126    A socially awkward Catholic schoolgirl vows to...
8334    A troubled teenage girl hopes to start fresh a...
Name: description, dtype: object

In [75]:
# count vectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['description'])

In [79]:
# cosine similarity
request_embeddings = count.transform(request)
cs=cosine_similarity(request_embeddings, count_matrix)

df.description[cs[2].argsort()[-10:][::-1]]

7398    An Indian business owner struggles to keep his...
7001    Two teenage geeks inadvertently find a lifelik...
829     The privileged life of a pampered dog named Tr...
2022    Julie lost her passion for music when she lost...
2892    To survive in a dog-eat-dog world, two rival l...
1765    Lazy, lasagna-loving fat cat Garfield lives li...
1274    No dog, no breed, no behavior is unfixable for...
94      A rough and tough police dog must go undercove...
4845    It's the Emmy-winning story of a wealthy famil...
247     He lost the love of his life to a pharmaceutic...
Name: description, dtype: object

In [80]:
ed=euclidean_distances(request_embeddings, count_matrix)

df.description[ed[2].argsort()[-10:][::-1]]

7674    Leader of the Ten Kwangtung Tigers, Tieh Chiao...
7934    Raised in working-class Mumbai, Manisha has an...
7094    A record-setting triathlete dares to complete ...
7312    Jake and Tony become friends when Jake's dad i...
8181    In the City Without a Voice, only faceless sin...
4797    To her Indian parents' dismay, London-born Jas...
8138    Shaolin warrior Tung Chien-chen is injured in ...
5749    When a group of vicious Sinners plots to escap...
593     When a snow day shuts down the whole town, the...
215     Based on a true story, this action film follow...
Name: description, dtype: object