**METHOD 1: RECOMMENDATION USING NATURAL LANGUAGE PROCESSING AND COSINE SIMILARITY**

1. https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

2. https://www.kaggle.com/vikassingh1996/netflix-movies-and-shows-plotly-recommender-sys/data#7.-Content-Based-Movie-Recommender-System

In [1]:
import pandas as pd
import numpy as np

# Read the Netflix TV-show catalogue from the CSV file under ./data.
tv_shows_df = pd.read_csv(filepath_or_buffer="./data/netflix_tv_shows.csv")

# Display the first four rows as a quick check of the loaded columns.
tv_shows_df.head(n=4)

Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
0,81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,"November 30, 2019",2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...,TV Show
1,80213643,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmast...","Canada, United Kingdom",,2019,TV-Y,2 Seasons,Kids' TV,"Lovable pug Chip starts kindergarten, makes ne...",TV Show
2,70205672,La Reina del Sur,,"Kate del Castillo, Cristina Urgel, Alberto Jim...","United States, Spain, Colombia, Mexico",,2019,TV-14,2 Seasons,"Crime TV Shows, International TV Shows, Spanis...",This compelling show tells the story of the le...,TV Show
3,81094391,Sugar Rush Christmas,,"Hunter March, Candace Nelson, Adriano Zumbo",United States,"November 29, 2019",2019,TV-PG,1 Season,Reality TV,"It's everything you love about ""Sugar Rush"" – ...",TV Show


In [2]:
# Keep only the text features that are used to describe each show.
# .copy() turns the selection into an independent DataFrame: the following
# cells drop rows and add/overwrite columns on it, and doing that on a plain
# selection of tv_shows_df raises SettingWithCopyWarning and makes the result
# depend on pandas' copy/view behaviour.
new_tv_shows_df = tv_shows_df[['title', 'director', 'cast', 'listed_in', 'description']].copy()
new_tv_shows_df.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...","International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...
1,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmast...",Kids' TV,"Lovable pug Chip starts kindergarten, makes ne..."
2,La Reina del Sur,,"Kate del Castillo, Cristina Urgel, Alberto Jim...","Crime TV Shows, International TV Shows, Spanis...",This compelling show tells the story of the le...
3,Sugar Rush Christmas,,"Hunter March, Candace Nelson, Adriano Zumbo",Reality TV,"It's everything you love about ""Sugar Rush"" – ..."
4,The Charming Stepmom,,"Shahkrit Yamnarm, View Wannarot Sontichai, Kri...","International TV Shows, Romantic TV Shows, TV ...",A quirky fashion student becomes the nanny of ...


In [3]:
!pip install rake-nltk
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer



In [4]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
cols = ['title', 'director', 'cast', 'listed_in', 'description']

# Rebind instead of dropna(inplace=True): new_tv_shows_df may be a selection
# of tv_shows_df, and mutating a selection in place triggers
# SettingWithCopyWarning.
new_tv_shows_df = new_tv_shows_df.dropna()

# Flag every row in which any of the text columns is empty or whitespace-only.
# The cells have to be inspected one column at a time: iterrows() yields
# (index, Series) pairs, so a type(...) == str test on the row itself never
# matches and no blank row would ever be collected.
is_blank = pd.Series(False, index=new_tv_shows_df.index)
for col in cols:
    is_blank |= new_tv_shows_df[col].map(
        # str.strip() leaves '' for both empty and whitespace-only strings.
        lambda value: isinstance(value, str) and not value.strip()
    ).astype(bool)

# Index labels of the rows to discard.
blanks = is_blank[is_blank].index.tolist()

new_tv_shows_df = new_tv_shows_df.drop(blanks)

In [5]:
# Build the 'key_words' column from each show's description.

# Instantiating Rake; by default it uses the English stopwords from NLTK
# and discards all punctuation characters as well.
# A single instance is reused: Rake recomputes its word scores on every
# extract_keywords_from_text call, so nothing carries over between rows.
r = Rake()

# The key words are collected in a list and attached to the frame in one step.
# Writing to `row[...]` inside iterrows() is not guaranteed to reach the
# DataFrame (iterrows may hand out copies), which would leave the column empty.
key_words = []
for description in new_tv_shows_df['description']:
    # extracting the words by passing the text
    r.extract_keywords_from_text(description)

    # getting the dictionary with key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()

    # only the words themselves are kept; the scores are not used
    key_words.append(list(key_words_dict_scores.keys()))

# attaching the key words and dropping the description column, which is no
# longer needed once its key words have been extracted
new_tv_shows_df = new_tv_shows_df.assign(key_words=key_words).drop(columns=['description'])


In [6]:
# Normalise the cast, genre and director columns.
#
# Every transformation is applied with Series.map and assigned back as a whole
# column. Modifying `row[...]` while looping over iterrows() is not guaranteed
# to write through to the DataFrame, so the lower-casing / name merging could
# silently be lost.
new_tv_shows_df = new_tv_shows_df.assign(
    # keeping only the first three actors and merging first and last name of
    # each one, so a full name is considered as one word and there is no mix
    # up between people sharing a first name
    cast=new_tv_shows_df['cast'].map(
        lambda names: [name.lower().replace(' ', '') for name in names.split(',')[:3]]
    ),
    # putting the lower-cased genres in a list (split on commas)
    listed_in=new_tv_shows_df['listed_in'].map(lambda genres: genres.lower().split(',')),
    # merging the director's name into a single lower-case word by removing
    # the spaces
    director=new_tv_shows_df['director'].map(lambda name: name.replace(' ', '').lower()),
)

In [7]:

# Use the show title as the row label of the feature table.
new_tv_shows_df.set_index(keys='title', inplace=True)

# Preview the first five rows of the re-indexed table.
new_tv_shows_df.head(n=5)

Unnamed: 0_level_0,director,cast,listed_in,key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mars,everardogout,"[jihae, albertoammann, clémentinepoidatz]","[docuseries, science & nature tv, tv dramas]","[spacecraft, crew, mission, fact, meets, ficti..."
Nowhere Man,djchen,"[alyssachia, mavisfan, josephchang]","[crime tv shows, international tv shows, tv ...","[two, nefarious, schemes, taking, place, 10, y..."
"Bring It On, Ghost",parkjoon-hwa,"[taecyeon, kimso-hyun, kwonyul]","[international tv shows, korean tv shows, ro...","[roommate, –, college, student, hunt, spooky, ..."
Black Money Love,ahmetkatıksız,"[gülerökten, hazaltüresan, i̇lkintüfekçi]","[crime tv shows, international tv shows, tv ...","[jewelry, designer, found, dead, together, cop..."
Cheese in the Trap,leeyoon-jung,"[parkhae-jin, kimgo-eun, seokang-jun]","[international tv shows, korean tv shows, ro...","[attention, navigate, college, life, gains, da..."


In [8]:
# Combine every feature column into one space-separated 'bag_of_words' string
# per show.

# Feature columns to merge. 'bag_of_words' itself is excluded so that the
# result column is never fed back in as one of its own inputs.
columns = [col for col in new_tv_shows_df.columns if col != 'bag_of_words']

# The strings are built in a list and assigned as a whole column: writing to
# `row[...]` inside iterrows() is not guaranteed to reach the DataFrame.
bag_of_words = [
    ' '.join(
        # plain strings (e.g. the director) are used as they are; list-valued
        # cells (cast, genres, key words) are joined with spaces first
        value if isinstance(value, str) else ' '.join(value)
        for value in values
    )
    for values in new_tv_shows_df[columns].itertuples(index=False, name=None)
]
new_tv_shows_df = new_tv_shows_df.assign(bag_of_words=bag_of_words)

# keeping only 'bag_of_words' (and 'type', if that column is present)
new_tv_shows_df = new_tv_shows_df.drop(
    columns=[col for col in new_tv_shows_df.columns if col not in ('bag_of_words', 'type')]
)


In [13]:
# Preview the first five rows of the table.
new_tv_shows_df.head(n=5)

Unnamed: 0_level_0,bag_of_words
title,Unnamed: 1_level_1
Mars,everardogout jihae albertoammann clémentinepoi...
Nowhere Man,djchen alyssachia mavisfan josephchang crime t...
"Bring It On, Ghost",parkjoon-hwa taecyeon kimso-hyun kwonyul inter...
Black Money Love,ahmetkatıksız gülerökten hazaltüresan i̇lkintü...
Cheese in the Trap,leeyoon-jung parkhae-jin kimgo-eun seokang-jun...


***Feature Extraction and Modelling***

In [9]:
# Turn every bag of words into a row of token counts.
bag_of_words_corpus = new_tv_shows_df['bag_of_words']
count = CountVectorizer()
count_matrix = count.fit_transform(bag_of_words_corpus)

# Titles as a Series with a fresh 0..n-1 integer index; the integer position
# of a title is used later to find its row in the similarity matrix.
indices = pd.Series(data=new_tv_shows_df.index)

# Show a sample of ten titles (positions 50 to 59).
indices.iloc[50:60]

50              Bobby Kennedy for President
51                             Father Brown
52    Th Eena Meena Deeka Chase Comedy Show
53                James Acaster: Repertoire
54                                 A.I.C.O.
55                         Brave Miss World
56                         Revolting Rhymes
57                                 Godzilla
58                         Devilman Crybaby
59         Fullmetal Alchemist: Brotherhood
Name: title, dtype: object

In [10]:
# Pairwise cosine similarity between the count vectors of all shows.
cosine_sim = cosine_similarity(X=count_matrix, Y=count_matrix)

# Display the resulting matrix.
cosine_sim

array([[1.        , 0.18842229, 0.1860521 , ..., 0.19611614, 0.24535825,
        0.25745831],
       [0.18842229, 1.        , 0.40509575, ..., 0.35228194, 0.36727931,
        0.44378474],
       [0.1860521 , 0.40509575, 1.        , ..., 0.4110961 , 0.42859731,
        0.43820232],
       ...,
       [0.19611614, 0.35228194, 0.4110961 , ..., 1.        , 0.54213748,
        0.37924898],
       [0.24535825, 0.36727931, 0.42859731, ..., 0.54213748, 1.        ,
        0.42580935],
       [0.25745831, 0.44378474, 0.43820232, ..., 0.37924898, 0.42580935,
        1.        ]])

In [11]:
# function that takes in a TV show title as input and returns the most similar TV shows
def tv_shows_recommendations(Title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the shows most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Title to look up in ``indices``. The comparison is an exact,
        case-sensitive match; if several rows share the title, the first one
        is used.
    cosine_sim : array-like
        Similarity matrix indexed by the integer positions of ``indices``.
        Defaults to the module-level ``cosine_sim``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles, ordered from most to least similar. The
        queried show itself is never part of the result.

    Raises
    ------
    IndexError
        If ``Title`` is not present in ``indices``.
    """
    # getting the position of the show that matches the title
    matches = indices[indices == Title]
    if matches.empty:
        raise IndexError(f"Title {Title!r} was not found in the list of TV shows")
    idx = matches.index[0]

    # Similarity of every other show to the queried one, in descending order.
    # The queried show is removed by its label instead of skipping the first
    # sorted entry: when another show ties with a score of 1.0 the queried show
    # is not guaranteed to sort first, and it would otherwise recommend itself.
    score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)

    # positions of the top_n most similar shows
    top_indexes = score_series.iloc[:top_n].index

    # translating positions back to titles; the index is looked up once rather
    # than being converted to a list for every recommendation
    titles = new_tv_shows_df.index
    recommended_tv_shows = [titles[i] for i in top_indexes]

    return recommended_tv_shows

GET RECOMMENDATIONS FOR A TV SHOW

In [12]:
# Example query: shows most similar to 'Godzilla'.
tv_shows_recommendations(Title='Godzilla')

['Nowhere Man',
 'Mr. Sunshine',
 'London Spy',
 'Reply 1994',
 'Reply 1997',
 'Leyla and Mecnun',
 'The Five',
 'Justice',
 'Age of Rebellion',
 'Velvet']

In [14]:
# Example query: shows most similar to 'GHOUL' (the title must match exactly).
tv_shows_recommendations(Title='GHOUL')

['Old Money',
 'Jack Taylor',
 'Justice',
 'London Spy',
 'Sacred Games',
 'Innocent',
 'Call the Midwife',
 'Criminal: Spain',
 'Sadqay Tumhare',
 'Bitter Daisies']