In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv("netflixData.csv")
data.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [2]:
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [3]:
data = data[["Title", "Description", "Content Type", "Genres", "Imdb Score", "Rating"]]
data.head()

Unnamed: 0,Title,Description,Content Type,Genres,Imdb Score,Rating
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV,6.6/10,TV-MA
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers",6.2/10,TV-MA
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies",6.4/10,TV-14
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies,6.6/10,TV-MA
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies",5.1/10,TV-14


In [4]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melinadiaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text


In [6]:
data["Description Clean"]=data["Description"].apply(clean)
data[["Description","Description Clean"]].sample(10)

Unnamed: 0,Description,Description Clean
2002,"Headspace takes a friendly, animated look at t...",headspac take friend anim look benefit medit o...
864,"In the Hazelnut Chipmunk Family, Dominic is a ...",hazelnut chipmunk famili domin littl boy love ...
2663,"When an evil force threatens his village, a gi...",evil forc threaten villag gift teen talk ghost...
2573,The speech-and-hearing-impaired heiress of a p...,speechandhearingimpair heiress palati mansion ...
52,"After crash-landing on Earth, two royal teen a...",crashland earth two royal teen alien run strug...
530,A small but fierce group of resistance fighter...,small fierc group resist fighter engag brutal ...
2363,Jerry Seinfeld takes the stage in New York and...,jerri seinfeld take stage new york tackl talk ...
1645,Sea shanties have long united 10 Cornish fishe...,sea shanti long unit cornish fishermen chant ...
785,A former goalkeeper-turned-talent scout embark...,former goalkeeperturnedtal scout embark mischi...
2577,A young woman returns to Malaysia to take over...,young woman return malaysia take father old co...


In [7]:
data["Title Clean"] = data["Title"].apply(clean)
data[["Title","Title Clean"]].sample(10)

Unnamed: 0,Title,Title Clean
5748,Wedding Unplanned,wed unplan
5711,War Against Women,war women
4184,Seven Souls in the Skull Castle: Season Wind,seven soul skull castl season wind
402,Archibald's Next Big Thing,archibald next big thing
3382,Netflix Presents: The Characters,netflix present charact
4254,Silent,silent
1058,Code Lyoko,code lyoko
3055,Metro,metro
4524,SWORDGAI The Animation,swordgai anim
1737,FullMetal Alchemist,fullmet alchemist


In [8]:
data["Genres"]=data["Genres"].str.replace(' ','_')
data["Genres"]=data["Genres"].str.replace(',_',', ')
data["Genres"].unique()[:10]

array(['Reality_TV', 'Horror_Movies, International_Movies, Thrillers',
       'Documentaries, International_Movies', 'TV_Comedies',
       'Dramas, International_Movies, Romantic_Movies', 'Comedies',
       'Documentaries, Sports_Movies',
       'Comedies, Dramas, International_Movies',
       'Comedies, International_Movies, Romantic_Movies',
       'International_TV_Shows, Romantic_TV_Shows, TV_Dramas'],
      dtype=object)

In [9]:
np.unique(np.array(" ".join(data["Genres"].unique().tolist()).replace(",","").split(" ")))

array(['Action_&_Adventure', 'Anime_Features', 'Anime_Series',
       'British_TV_Shows', 'Children_&_Family_Movies',
       'Classic_&_Cult_TV', 'Classic_Movies', 'Comedies',
       'Crime_TV_Shows', 'Cult_Movies', 'Documentaries', 'Docuseries',
       'Dramas', 'Faith_&_Spirituality', 'Horror_Movies',
       'Independent_Movies', 'International_Movies',
       'International_TV_Shows', "Kids'_TV", 'Korean_TV_Shows',
       'LGBTQ_Movies', 'Movies', 'Music_&_Musicals', 'Reality_TV',
       'Romantic_Movies', 'Romantic_TV_Shows', 'Sci-Fi_&_Fantasy',
       'Science_&_Nature_TV', 'Spanish-Language_TV_Shows',
       'Sports_Movies', 'Stand-Up_Comedy', 'Stand-Up_Comedy_&_Talk_Shows',
       'TV_Action_&_Adventure', 'TV_Comedies', 'TV_Dramas', 'TV_Horror',
       'TV_Mysteries', 'TV_Sci-Fi_&_Fantasy', 'TV_Shows', 'TV_Thrillers',
       'Teen_TV_Shows', 'Thrillers'], dtype='<U28')

In [10]:
text.TfidfVectorizer(input=data["Genres"].tolist(), stop_words="english").fit_transform( data["Genres"].tolist())

<5967x54 sparse matrix of type '<class 'numpy.float64'>'
	with 15976 stored elements in Compressed Sparse Row format>

In [15]:
feature = data["Genres"].tolist()
#print(feature)
tfidf = text.TfidfVectorizer(input=feature, stop_words=None, lowercase=True, token_pattern=r'\w[-\w&\']*\w')
#print(tfidf)
tfidf_matrix = tfidf.fit_transform(feature)
similaritygenre= cosine_similarity(tfidf_matrix)
print(similaritygenre)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.17604959 ... 0.         0.62401158 0.        ]
 [0.         0.17604959 1.         ... 0.         0.22002569 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.62401158 0.22002569 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


1 i walked the dog in the park -> walk- dog park
2 we found a cat walking in the park -> found cat walk park -> dog ~ cat
3 i went to school to play soccer -> went/go school play soccer

    cat dog go park play school soccer walk 
1   0   1    0  1    0     0      0    0
2    1   0    

Reality, Horror, international, thrillers
1: 1, 0, 0, 0, 0
2: 0, 1, 1, 1
3:

In [16]:
feature = data["Description Clean"].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words='english')
print(tfidf)
tfidf_matrix = tfidf.fit_transform(feature)
similaritydesc= cosine_similarity(tfidf_matrix)

TfidfVectorizer(input=['docuseri take deep dive lucrat well industri tout '
                       'health heal product live promis',
                       'grisli virus rampag citi lone man stay lock insid '
                       'apart digit cut seek help desper find way',
                       'diari ann frank stori retold alongsid five holocaust '
                       'survivor poignant documentari oscar winner helen '
                       'mirren',
                       'kenya barri famili navig relationship race cultur '
                       'grappl newfound success comed...
                       'spain relinquish last coloni battlefatigu outpost '
                       'engag long brutal sometim bizarr clash filipino insurg',
                       'farmer pen confess admit wife murder death begin '
                       'macabr tale base stephen king novella',
                       'dark althistori thriller naïv law student worldweari '
                       'det

In [17]:
tfidf.get_feature_names_out()

array(['aaliya', 'aamir', 'aang', ..., 'łukasz', 'ōarai', 'şeref'],
      dtype=object)

In [18]:
indices = pd.Series(data.index, index=data['Title']).drop_duplicates()

In [19]:
import itertools

def netflix_recommendation_g(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    """genre, then description"""
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][0], x[1][1]), reverse=True)
    test=similarity_scores
    #print(type(test[0]), test)
    similarity_scores = similarity_scores[1:11]
    movieindices = [i[0] for i in similarity_scores]
    return data[['Title',"Imdb Score", "Rating"]].iloc[movieindices]

print(netflix_recommendation_g("Henry Danger"))

                                         Title Imdb Score Rating
4805      The Epic Tales of Captain Underpants     6.6/10  TV-Y7
4910                         The InBESTigators     8.1/10   TV-Y
1151         Cupcake & Dino - General Services     7.9/10  TV-Y7
621                   Best.Worst.Weekend.Ever.     6.5/10  TV-PG
1508  Equestria Girls: Tales of Canterlot High     6.7/10  TV-Y7
1043         Cloudy with a Chance of Meatballs     3.6/10  TV-Y7
3597        Pac-Man and the Ghostly Adventures     4.6/10  TV-Y7
2529                      Kicko & Super Speedo        NaN  TV-Y7
2742                            Little Singham     3.5/10  TV-Y7
1976                        Harvey Street Kids     6.5/10  TV-Y7


In [20]:
import itertools

def netflix_recommendation_d(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    """description, then genre"""
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][1], x[1][0]), reverse=True)
    test=similarity_scores
    #print(type(test[0]), test[0])
    similarity_scores = similarity_scores[1:11]
    movieindices = [i[0] for i in similarity_scores]
    return data[['Title',"Imdb Score", "Rating"]].iloc[movieindices]

print(netflix_recommendation_d("Fred Claus"))

                                           Title Imdb Score Rating
2856        Luccas Neto in: The End of Christmas        NaN   TV-Y
151                        A StoryBots Christmas     6.0/10   TV-Y
3794                        Puppy Star Christmas     3.9/10     PG
3467                 Oddbods: The Festive Menace        NaN   TV-Y
57                           48 Christmas Wishes     3.8/10   TV-G
1493                Elliot the Littlest Reindeer     5.0/10     PG
2983  Marvel Super Hero Adventures: Frost Fight!     5.0/10     PG
4714          The Christmas Chronicles: Part Two     5.8/10     PG
3941          Ricardo O'Farrill: Abrazo navideño     5.8/10  TV-MA
15                                    100 Meters     7.6/10  TV-MA


In [21]:
import itertools

def netflix_recommendation_c(title, similarity1 = similaritygenre, similarity2=similaritydesc):
    """linear combo of description and genre"""
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][0]+1.6* x[1][1]), reverse=True)
    test=similarity_scores
    #print(type(test[0]), test)
    similarity_scores = similarity_scores[1:11]
    movieindices = [i[0] for i in similarity_scores]
    return data[['Title',"Imdb Score", "Rating"]].iloc[movieindices]

print(netflix_recommendation_c("The Queen's Gambit"))

                            Title Imdb Score Rating
4843                 The Get Down     8.3/10  TV-MA
5542     TURN: Washington's Spies     8.2/10  TV-14
246                 Akulah Balqis        NaN  TV-14
471                Away From Home        NaN  TV-PG
1346  Dolly Parton's Heartstrings     7.5/10  TV-14
1420                      Dynasty     7.2/10  TV-14
1535                         Evil     7.8/10  TV-14
1851                      Godless     8.4/10  TV-MA
1901                    Greenleaf     7.3/10  TV-14
1925                        Gypsy     6.7/10  TV-MA


In [22]:
def netFlix_recommendation(title, similarity = similaritygenre):
    """only genre, control"""
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    test=similarity_scores
    #print(type(test[0]), test)
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("Fred Claus"))

57                      48 Christmas Wishes
101                     A Fairly Odd Summer
104              A Family Reunion Christmas
146    A Shaun the Sheep Movie: Farmageddon
151                   A StoryBots Christmas
282                    Aliens Stole My Body
376                         Animal Crackers
403                             Arctic Dogs
581                               Bee Movie
662                          Bigfoot Family
Name: Title, dtype: object


In [23]:
def compare_fun(title):
    print("genre priority")
    print(netflix_recommendation_g(title))
    print("\ndesc priority")
    print(netflix_recommendation_d(title))
    print("\nlinear combo")
    print(netflix_recommendation_c(title))
    print("\ncontrol")
    print(netFlix_recommendation(title))

In [24]:
compare_fun("DEATH NOTE")

genre priority
                             Title Imdb Score Rating
3541                 One-Punch Man     8.8/10  TV-14
2243        Ingress: The Animation     6.8/10  TV-MA
244                 Akame ga Kill!     7.7/10  TV-14
482               B: The Beginning     7.3/10  TV-MA
1417                    Durarara!!     8.0/10  TV-MA
1800  Ghost in the Shell: SAC_2045     6.4/10  TV-14
1893               Great Pretender     8.0/10  TV-MA
2032                     HERO MASK     6.0/10  TV-14
2662            Legend of Exorcism        NaN  TV-14
4148                    Seis Manos     7.1/10  TV-MA

desc priority
                     Title Imdb Score Rating
3189                Mortel     6.4/10  TV-MA
3813    Queen of the South     8.0/10  TV-MA
2058    High-Rise Invasion     6.9/10  TV-MA
2283                Ip Man     8.0/10      R
2738  Little Dragon Maiden     5.7/10  TV-14
713                 Bleach     6.2/10  TV-14
3782         Psychokinesis     6.1/10  TV-MA
3446      Notes for My Son 

In [31]:
compare_fun("The Queen's Gambit")

genre priority
                            Title Imdb Score Rating
4843                 The Get Down     8.3/10  TV-MA
5542     TURN: Washington's Spies     8.2/10  TV-14
1346  Dolly Parton's Heartstrings     7.5/10  TV-14
1420                      Dynasty     7.2/10  TV-14
1535                         Evil     7.8/10  TV-14
1851                      Godless     8.4/10  TV-MA
1901                    Greenleaf     7.3/10  TV-14
1925                        Gypsy     6.7/10  TV-MA
1942                      Halston     7.7/10  TV-MA
1943          Halt and Catch Fire     8.5/10  TV-14

desc priority
                            Title Imdb Score Rating
3293            My Mother's Wound     7.3/10  TV-MA
1124  Creating The Queen's Gambit     6.8/10  TV-14
246                 Akulah Balqis        NaN  TV-14
4541                    Talentime     6.6/10  TV-14
4078                        Sanju     7.6/10  TV-MA
1176               Dancing Angels        NaN  TV-PG
471                Away From Home 

In [47]:
data[data["Title"].isin(["Bee Movie","Fred Claus","Avatar: The Last Airbender", "You", "Beauty & the Beast","DEATH NOTE"])]

Unnamed: 0,Title,Description,Content Type,Genres,Imdb Score,Rating,Description Clean,Title Clean
461,Avatar: The Last Airbender,Siblings Katara and Sokka wake young Aang from...,TV Show,"Classic_&_Cult_TV, Kids'_TV, TV_Action_&_Adven...",9.3/10,TV-Y7,sibl katara sokka wake young aang long hibern ...,avatar last airbend
575,Beauty & the Beast,A homicide detective and a veteran who has bee...,TV Show,"Crime_TV_Shows, Romantic_TV_Shows, TV_Dramas",7.0/10,TV-14,homicid detect veteran turn beast militari exp...,beauti beast
581,Bee Movie,"Barry, a worker bee stuck in a dead-end job ma...",Movie,"Children_&_Family_Movies, Comedies",5.9/10,PG,barri worker bee stuck deadend job make honey ...,bee movi
1239,DEATH NOTE,When a Japanese high schooler comes into posse...,TV Show,"Anime_Series, Crime_TV_Shows, International_TV...",8.9/10,TV-14,japanes high schooler come possess mystic note...,death note
1701,Fred Claus,The holiday season is ruined for Santa Claus w...,Movie,"Children_&_Family_Movies, Comedies",5.5/10,PG,holiday season ruin santa claus cranki older b...,fred claus
5906,You,"Obsessed with an aspiring writer, a charming b...",TV Show,"Crime_TV_Shows, Romantic_TV_Shows, TV_Dramas",7.6/10,TV-MA,obsess aspir writer charm bookstor manag goe e...,


In [37]:
compare_fun("Burnt")

genre priority


KeyError: 'Burnt'

In [67]:
0.7693118355854538+1.6*0.1492536368339866

1.0081176545198325

In [23]:
pd.set_option('display.max_rows', None)
print(data["Title"])
pd.reset_option('display.max_columns')

0                                                (Un)Well
1                                                  #Alive
2                           #AnneFrank - Parallel Stories
3                                                #blackAF
4                                        #cats_the_mewvie
5                                       #FriendButMarried
6                                     #FriendButMarried 2
7                                            #realityhigh
8                                               #Rucker50
9                                                 #Selfie
10                                             #Selfie 69
11                                    10 Days in Sun City
12                                         10 jours en or
13                                     100 Days My Prince
14                                             100 Humans
15                                             100 Meters
16                    100 Things to do Before High School
17            

In [25]:
data.Rating.unique()

array(['TV-MA', 'TV-14', 'TV-G', 'TV-PG', 'TV-Y', 'R', 'PG-13', 'TV-Y7',
       'PG', 'G', nan, 'NC-17'], dtype=object)

In [26]:
rating={"G":1, "TV-Y":1, "TV-G":1, "PG":2,"TV-PG":2, "TV-Y7":2, 
        "PG-13":3, "TV-14":3, "TV-MA":4, "R":5, "NC-17":5}

In [27]:
data.isnull().sum()

Title                  0
Description            0
Content Type           0
Genres                 0
Imdb Score           608
Rating                 4
Description Clean      0
Title Clean            0
dtype: int64

In [28]:
def netflix_recommendation_lc(title, num, similarity1 = similaritygenre, similarity2=similaritydesc):
    """linear combo of description and genre"""
    index = indices[title]
    similarity_scores = list(enumerate(zip(similarity1[index],similarity2[index])))
    similarity_scores = sorted(similarity_scores, key=lambda x: (x[1][0]+num* x[1][1]), reverse=True)
    test=similarity_scores
    #print(type(test[0]), test)
    similarity_scores = similarity_scores[1:11]
    movieindices = [i[0] for i in similarity_scores]
    return data[['Title',"Imdb Score", "Rating"]].iloc[movieindices]

In [30]:
for i in range(0, 11):
    print(i,"\n",netflix_recommendation_lc("The Queen's Gambit", i),"\n")

0 
                     Title Imdb Score Rating
1420              Dynasty     7.2/10  TV-14
1535                 Evil     7.8/10  TV-14
1851              Godless     8.4/10  TV-MA
1901            Greenleaf     7.3/10  TV-14
1925                Gypsy     6.7/10  TV-MA
1942              Halston     7.7/10  TV-MA
1943  Halt and Catch Fire     8.5/10  TV-14
2008            Heartland     8.3/10  TV-14
2088            Hollywood     7.6/10  TV-MA
2566           Knightfall     6.7/10  TV-MA 

1 
                             Title Imdb Score Rating
4843                 The Get Down     8.3/10  TV-MA
5542     TURN: Washington's Spies     8.2/10  TV-14
1346  Dolly Parton's Heartstrings     7.5/10  TV-14
1420                      Dynasty     7.2/10  TV-14
1535                         Evil     7.8/10  TV-14
1851                      Godless     8.4/10  TV-MA
1901                    Greenleaf     7.3/10  TV-14
1925                        Gypsy     6.7/10  TV-MA
1942                      Halston     

2.5,4, 10