In [250]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv("netflixData.csv")
data.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [251]:
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [252]:
data = data[["Title", "Description", "Content Type", "Genres"]]
data.head()

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"


In [253]:
data = data.dropna()

In [254]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melinadiaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [255]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text


In [256]:
data["Description Clean"]=data["Description"].apply(clean)
data[["Description","Description Clean"]].sample(10)

Unnamed: 0,Description,Description Clean
1219,"As Metropolis High students, super teens Wonde...",metropoli high student super teen wonder woman...
3859,"When he becomes lost in the desert, pet chamel...",becom lost desert pet chameleon rango pretend ...
4712,Determined to bring a Zika vaccine to the remo...,determin bring zika vaccin remot pantan three ...
1911,"From Sierra de las Minas to Esquipulas, explor...",sierra de las mina esquipula explor guatemala ...
3363,Concert and behind-the-scenes footage capture ...,concert behindthescen footag captur life tour ...
254,"Tormented by bullies, an aspiring drag star wo...",torment bulli aspir drag star work alaskan can...
1034,Spirited sister-brother duo Cleo and Cuquín tr...,spirit sisterbroth duo cleo cuquín tri differ ...
3732,After uncovering a magical locket that allows ...,uncov magic locket allow shrink size polli fri...
3569,A seemingly clueless idealist relies on his ex...,seem clueless idealist reli exasper sister she...
697,This sci-fi anthology series explores a twiste...,scifi antholog seri explor twist hightech near...


In [257]:
data["Title Clean"] = data["Title"].apply(clean)
data[["Title","Title Clean"]].sample(10)

Unnamed: 0,Title,Title Clean
4075,Sand Castle,sand castl
4574,Teen Mom 2,teen mom
5782,What the F* Is Going On?,f go
5503,Triple Threat,tripl threat
2338,Japan Sinks: 2020,japan sink
1688,Francesco De Carlo: Cose di Questo Mondo,francesco de carlo cose di questo mondo
1163,Da 5 Bloods,da blood
4711,The Chef Show,chef show
2781,Los Tigres del Norte at Folsom Prison,los tigr del nort folsom prison
1192,Dark City Beneath the Beat,dark citi beneath beat


In [193]:
data["Genres"]=data["Genres"].str.replace(' ','_')
data["Genres"]=data["Genres"].str.replace(',_',', ')
data["Genres"].unique()

array(['Reality_TV', 'Horror_Movies, International_Movies, Thrillers',
       'Documentaries, International_Movies', 'TV_Comedies',
       'Dramas, International_Movies, Romantic_Movies', 'Comedies',
       'Documentaries, Sports_Movies',
       'Comedies, Dramas, International_Movies',
       'Comedies, International_Movies, Romantic_Movies',
       'International_TV_Shows, Romantic_TV_Shows, TV_Dramas',
       'Docuseries, Science_&_Nature_TV',
       'Dramas, International_Movies, Sports_Movies', 'Movies',
       'Dramas, International_Movies',
       'Horror_Movies, International_Movies',
       'Crime_TV_Shows, TV_Dramas, TV_Mysteries',
       'Crime_TV_Shows, Docuseries', 'Documentaries',
       'Comedies, Dramas, Independent_Movies',
       'Dramas, Independent_Movies, International_Movies',
       'Dramas, Thrillers',
       'Crime_TV_Shows, International_TV_Shows, TV_Dramas',
       'Crime_TV_Shows, Docuseries, International_TV_Shows',
       'Horror_Movies, Independent_Movies

In [280]:
np.unique(np.array(" ".join(data["Genres"].unique().tolist()).replace(",","").split(" ")))

array(['&', 'Action', 'Adventure', 'Anime', 'British', 'Children',
       'Classic', 'Comedies', 'Comedy', 'Crime', 'Cult', 'Documentaries',
       'Docuseries', 'Dramas', 'Faith', 'Family', 'Fantasy', 'Features',
       'Horror', 'Independent', 'International', "Kids'", 'Korean',
       'LGBTQ', 'Movies', 'Music', 'Musicals', 'Mysteries', 'Nature',
       'Reality', 'Romantic', 'Sci-Fi', 'Science', 'Series', 'Shows',
       'Spanish-Language', 'Spirituality', 'Sports', 'Stand-Up', 'TV',
       'Talk', 'Teen', 'Thrillers'], dtype='<U16')

In [60]:
text.TfidfVectorizer(input=data["Genres"].tolist(), stop_words="english").fit_transform( data["Genres"].tolist())

<5967x44 sparse matrix of type '<class 'numpy.float64'>'
	with 22096 stored elements in Compressed Sparse Row format>

In [281]:
feature = data["Genres"].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words=None, lowercase=True, token_pattern=r'\w[-\w&\']*\w')
tfidf_matrix = tfidf.fit_transform(feature)
similaritygenre= cosine_similarity(tfidf_matrix)

In [224]:
tfidf.get_feature_names()

['action_&_adventure',
 'anime_features',
 'anime_series',
 'british_tv_shows',
 'children_&_family_movies',
 'classic_&_cult_tv',
 'classic_movies',
 'comedies',
 'crime_tv_shows',
 'cult_movies',
 'documentaries',
 'docuseries',
 'dramas',
 'faith_&_spirituality',
 'horror_movies',
 'independent_movies',
 'international_movies',
 'international_tv_shows',
 "kids'_tv",
 'korean_tv_shows',
 'lgbtq_movies',
 'movies',
 'music_&_musicals',
 'reality_tv',
 'romantic_movies',
 'romantic_tv_shows',
 'sci-fi_&_fantasy',
 'science_&_nature_tv',
 'spanish-language_tv_shows',
 'sports_movies',
 'stand-up_comedy',
 'stand-up_comedy_&_talk_shows',
 'teen_tv_shows',
 'thrillers',
 'tv_action_&_adventure',
 'tv_comedies',
 'tv_dramas',
 'tv_horror',
 'tv_mysteries',
 'tv_sci-fi_&_fantasy',
 'tv_shows',
 'tv_thrillers']

In [239]:
def netFlix_recommendation(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    index = indices[title]
    similarity_scores = list(enumerate(similarity1[index],similarity2[index]))
    test=similarity_scores
    print(type(test[0]), test)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("Fred Claus"))

<class 'tuple'> [(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.5817269163337007), (8, 0.0), (9, 0.37655885992125426), (10, 0.37655885992125426), (11, 0.311655387624007), (12, 0.37655885992125426), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.3075005738998031), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.37655885992125426), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.37655885992125426), (47, 0.45573913943355604), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 1.0000000000000002), (58, 0.3075005738998031), (59, 0.0), (60, 0.3075005738998031), (61, 0.45573913943355604), (62, 0.0), (63, 0.45573913943355604), (64, 0.0), (65, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.0), (70, 0

In [283]:
import itertools

def netFlix_recommendation(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][0], x[1][1]), reverse=True)
    test=similarity_scores
    print(type(test[0]), test)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("Fred Claus"))

<class 'tuple'> [(1701, (1.0000000000000002, 1.0000000000000004)), (151, (1.0000000000000002, 0.26616723940603804)), (3794, (1.0000000000000002, 0.24845812699330438)), (57, (1.0000000000000002, 0.20447431171981262)), (4714, (1.0000000000000002, 0.1334511446740561)), (4624, (1.0000000000000002, 0.11497777619335114)), (4713, (1.0000000000000002, 0.08743271372460491)), (3619, (1.0000000000000002, 0.06340932047887436)), (2366, (1.0000000000000002, 0.05975467017562497)), (2111, (1.0000000000000002, 0.05434405750358889)), (3669, (1.0000000000000002, 0.05067570852941503)), (2522, (1.0000000000000002, 0.05054364698328134)), (887, (1.0000000000000002, 0.05042475443789131)), (995, (1.0000000000000002, 0.04979800102298132)), (5215, (1.0000000000000002, 0.04931116050880822)), (1384, (1.0000000000000002, 0.043365160215902185)), (4022, (1.0000000000000002, 0.04266836957318961)), (4683, (1.0000000000000002, 0.036235405671269066)), (5175, (1.0000000000000002, 0.0349724118390775)), (2127, (1.0000000000

In [284]:
import itertools

def netFlix_recommendation(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][1], x[1][0]), reverse=True)
    test=similarity_scores
    print(type(test[0]), test)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("Fred Claus"))

<class 'tuple'> [(1701, (1.0000000000000002, 1.0000000000000004)), (2856, (0.7206501995085879, 0.30471516748213945)), (151, (1.0000000000000002, 0.26616723940603804)), (3794, (1.0000000000000002, 0.24845812699330438)), (3467, (0.28678425070150176, 0.22696265114745418)), (57, (1.0000000000000002, 0.20447431171981262)), (1493, (0.9228444626637865, 0.18654535785299292)), (2983, (0.6141715726509542, 0.1392158467385276)), (4714, (1.0000000000000002, 0.1334511446740561)), (3941, (0.0, 0.12886030189218164)), (15, (0.15099255208756932, 0.12699161959836008)), (2401, (0.0, 0.12691829385140085)), (4624, (1.0000000000000002, 0.11497777619335114)), (4091, (0.3573487901967257, 0.10978515779782938)), (2149, (0.1376137489301628, 0.10910497084727452)), (157, (0.9228444626637865, 0.1086735036804537)), (4486, (0.9228444626637865, 0.10697149101607482)), (5459, (0.28678425070150176, 0.10669213813264289)), (1481, (0.9228444626637865, 0.10232743156892016)), (5394, (0.17602266627541652, 0.10091522811028589)),

In [282]:
feature = data["Description Clean"].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words='english')
print(tfidf)
tfidf_matrix = tfidf.fit_transform(feature)
similaritydesc= cosine_similarity(tfidf_matrix)

TfidfVectorizer(input=['docuseri take deep dive lucrat well industri tout '
                       'health heal product live promis',
                       'grisli virus rampag citi lone man stay lock insid '
                       'apart digit cut seek help desper find way',
                       'diari ann frank stori retold alongsid five holocaust '
                       'survivor poignant documentari oscar winner helen '
                       'mirren',
                       'kenya barri famili navig relationship race cultur '
                       'grappl newfound success comed...
                       'spain relinquish last coloni battlefatigu outpost '
                       'engag long brutal sometim bizarr clash filipino insurg',
                       'farmer pen confess admit wife murder death begin '
                       'macabr tale base stephen king novella',
                       'dark althistori thriller naïv law student worldweari '
                       'det

In [279]:
def netFlix_recommendation(title, similarity = similaritydesc):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    test=similarity_scores
    print(type(test[0]), test)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("Fred Claus"))

<class 'tuple'> [(0, 0.0), (1, 0.02081926252791954), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.025819117793725938), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.024073034798990717), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.12699161959836008), (16, 0.0), (17, 0.023310847882338257), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.025526762082683733), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.03913031810451755), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.0358214673995255), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.20447431171981262), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.03660681196961399), (65, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.0), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 

In [232]:
data[data["Title"].isin(["Bee Movie","Fred Claus","Blood & Water"])]

Unnamed: 0,Title,Description,Content Type,Genres,Title Clean
581,Bee Movie,"Barry, a worker bee stuck in a dead-end job ma...",Movie,"Children_&_Family_Movies, Comedies",bee movi
724,Blood & Water,"After crossing paths at a party, a Cape Town t...",TV Show,"International_TV_Shows, TV_Dramas, TV_Mysteries",blood water
1701,Fred Claus,The holiday season is ruined for Santa Claus w...,Movie,"Children_&_Family_Movies, Comedies",fred claus


In [134]:
indices = pd.Series(data.index, index=data['Title Clean']).drop_duplicates()

In [135]:
def netFlix_recommendation(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    print (similarity_scores)
    movieindices = [i[0] for i in similarity_scores]
    return data['Title Clean'].iloc[movieindices]

print(netFlix_recommendation("fred claus"))

[(57, 1.0000000000000002), (101, 1.0000000000000002), (104, 1.0000000000000002), (146, 1.0000000000000002), (151, 1.0000000000000002), (282, 1.0000000000000002), (376, 1.0000000000000002), (403, 1.0000000000000002), (581, 1.0000000000000002), (662, 1.0000000000000002)]
57                    christma wish
101                 fair odd summer
104         famili reunion christma
146    shaun sheep movi farmageddon
151               storybot christma
282                alien stole bodi
376                    anim cracker
403                      arctic dog
581                        bee movi
662                  bigfoot famili
Name: Title Clean, dtype: object


In [119]:
data[data["Title"].str.contains("Fred Claus")]

Unnamed: 0,Title,Description,Content Type,Genres,Title Clean
1701,Fred Claus,The holiday season is ruined for Santa Claus w...,Movie,"Children & Family Movies, Comedies",fred claus


In [86]:
print(netFlix_recommendation("The Queen's Gambit"))

1346    dolli parton heartstr
1420                  dynasti
1535                     evil
1851                  godless
1901                greenleaf
1925                    gypsi
1942                  halston
1943          halt catch fire
2008                heartland
2088                hollywood
Name: Title Clean, dtype: object


In [23]:
pd.set_option('display.max_rows', None)
print(data["Title"])
pd.reset_option('display.max_columns')

0                                                (Un)Well
1                                                  #Alive
2                           #AnneFrank - Parallel Stories
3                                                #blackAF
4                                        #cats_the_mewvie
5                                       #FriendButMarried
6                                     #FriendButMarried 2
7                                            #realityhigh
8                                               #Rucker50
9                                                 #Selfie
10                                             #Selfie 69
11                                    10 Days in Sun City
12                                         10 jours en or
13                                     100 Days My Prince
14                                             100 Humans
15                                             100 Meters
16                    100 Things to do Before High School
17            

In [19]:
data.isnull().sum()

Title           0
Description     0
Content Type    0
Genres          0
dtype: int64