**METHOD 1: RECOMMENDATION USING NATURAL LANGUAGE PROCESSING AND COSINE SIMILARITY**

1. https://towardsdatascience.com/how-to-build-from-scratch-a-content-based-movie-recommender-with-natural-language-processing-25ad400eb243

2. https://www.kaggle.com/vikassingh1996/netflix-movies-and-shows-plotly-recommender-sys/data#7.-Content-Based-Movie-Recommender-System

In [None]:
import pandas as pd
import numpy as np

# Read the movies dataset from the local CSV file and preview its first rows.
movies_df = pd.read_csv("./data/netflix_movies.csv")
movies_df.head(n=5)

Error: Jupyter cannot be started. Error attempting to locate jupyter: 

In [3]:
# Keep only the text columns that feed the content-based recommender.
# .copy() makes this an independent frame: the cells below clean it and add
# columns to it, which on a plain column subset raises SettingWithCopyWarning.
new_movies_df = movies_df[['title', 'director', 'cast', 'listed_in', 'description']].copy()
new_movies_df.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor..."
1,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...","Comedies, Dramas, International Movies",A goofy copywriter unwittingly convinces the I...
2,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...","Dramas, Independent Movies, International Movies","Arranged to marry a rich man, young Ada is cru..."
3,Crazy people,Moses Inwang,"Ramsey Nouah, Chigul, Sola Sobowale, Ireti Doy...","Comedies, International Movies, Thrillers",Nollywood star Ramsey Nouah learns that someon...
4,I Lost My Body,Jérémy Clapin,"Hakim Faris, Victoire Du Bois, Patrick d'Assum...","Dramas, Independent Movies, International Movies","Romance, mystery and adventure intertwine as a..."


In [4]:
!pip install rake-nltk
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer



In [5]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
cols = ['title', 'director', 'cast', 'listed_in', 'description']

# Drop every row that has a missing value in any column.
new_movies_df = new_movies_df.dropna()

# Flag rows in which any text column is empty or whitespace-only.
# The check must look at the cell values: iterating with iterrows() yields
# (index, row Series) pairs, so testing the row itself for `str` never matches
# and no blank row would ever be found.
blank_mask = (
    new_movies_df[cols]
    .apply(lambda column: column.astype(str).str.strip() == '')
    .any(axis=1)
)

# Index labels of the blank rows, kept under the original name for inspection.
blanks = list(new_movies_df.index[blank_mask])

# .copy() gives an independent frame so later column assignments are safe.
new_movies_df = new_movies_df.loc[~blank_mask].copy()

In [6]:
def extract_key_words(description):
    """Return the list of key words RAKE extracts from one description.

    Parameters
    ----------
    description : str
        Free-text description of a single title.

    Returns
    -------
    list of str
        The keys of RAKE's word-degree dictionary for this text.
    """
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(description)

    # getting the dictionary with key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()

    return list(key_words_dict_scores.keys())


# Build the new column in one assignment. Writing into the `row` object handed
# out by iterrows() is not guaranteed to reach the DataFrame (the row may be a
# copy), so the key words are computed per description and assigned as a whole.
new_movies_df['key_words'] = new_movies_df['description'].map(extract_key_words)

# dropping the description column, it is now represented by key_words
new_movies_df = new_movies_df.drop(columns=['description'])


In [7]:
# Keep only the first three actors and merge each actor's first and last name
# into one lowercase token, so people sharing a first name are not mixed up.
# The values are computed with map() and assigned back as whole columns:
# writing into the `row` returned by iterrows() may only modify a copy.
new_movies_df['cast'] = new_movies_df['cast'].map(
    lambda names: [name.lower().replace(' ', '') for name in names.split(',')[:3]]
)

# putting the genres in a list of lowercase words
new_movies_df['listed_in'] = new_movies_df['listed_in'].map(lambda x: x.lower().split(','))

# Merge the director's names into a single lowercase token as well; commas
# between several directors are kept, only the spaces are removed.
new_movies_df['director'] = new_movies_df['director'].map(
    lambda name: name.replace(' ', '').lower()
)

In [8]:

# Use the movie title as the row index, then preview the result.
new_movies_df = new_movies_df.set_index('title')
new_movies_df.head()

Unnamed: 0_level_0,director,cast,listed_in,key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Guatemala: Heart of the Mayan World,"luisara,ignaciojaunsolo",[christianmorales],"[documentaries, international movies]","[sierra, de, las, minas, cultural, geological,..."
The Zoya Factor,abhisheksharma,"[sonamkapoor, dulquersalmaan, sanjaykapoor]","[comedies, dramas, international movies]","[lucky, mascot, shunning, captain, ’, supersti..."
Atlantics,matidiop,"[mamasane, amadoumbow, ibrahimatraore]","[dramas, independent movies, international m...","[marry, young, ada, crushed, true, love, goes,..."
Crazy people,mosesinwang,"[ramseynouah, chigul, solasobowale]","[comedies, international movies, thrillers]","[imposter, impersonating, someone, nollywood, ..."
I Lost My Body,jérémyclapin,"[hakimfaris, victoiredubois, patrickd'assumçao]","[dramas, independent movies, international m...","[mystery, mesmerizing, animated, film, adventu..."


In [9]:
# Feature columns to merge. The list is captured BEFORE 'bag_of_words' exists,
# so the new column is never merged into itself.
columns = [col for col in new_movies_df.columns if col != 'bag_of_words']


def row_to_bag_of_words(row):
    """Join all feature values of one movie into a single space-separated string.

    String values (e.g. the director token) are used as they are; list values
    (cast, genres, key words) are joined with single spaces first.
    """
    parts = []
    for col in columns:
        value = row[col]
        parts.append(value if isinstance(value, str) else ' '.join(value))
    return ' '.join(parts)


# Assign the whole column at once: writing into the `row` handed out by
# iterrows() is not guaranteed to modify the DataFrame.
new_movies_df['bag_of_words'] = new_movies_df.apply(row_to_bag_of_words, axis=1)

# Keep only 'bag_of_words' (and a 'type' column, should the frame carry one).
new_movies_df = new_movies_df[
    [col for col in new_movies_df.columns if col == 'bag_of_words' or col == 'type']
]


In [10]:
# Preview the frame now that the feature columns are merged into 'bag_of_words'.
new_movies_df.head()

Unnamed: 0_level_0,bag_of_words
title,Unnamed: 1_level_1
Guatemala: Heart of the Mayan World,"luisara,ignaciojaunsolo christianmorales docum..."
The Zoya Factor,abhisheksharma sonamkapoor dulquersalmaan sanj...
Atlantics,matidiop mamasane amadoumbow ibrahimatraore dr...
Crazy people,mosesinwang ramseynouah chigul solasobowale co...
I Lost My Body,jérémyclapin hakimfaris victoiredubois patrick...


***Feature Extraction and Modelling***

In [11]:
# Turn every movie's bag-of-words string into a row of token counts.
movies_count = CountVectorizer()
movies_count_matrix = movies_count.fit_transform(new_movies_df['bag_of_words'])

# Titles as a Series with a 0..n-1 index, used later to translate between a
# title and its row position.
movies_indices = pd.Series(new_movies_df.index)
movies_indices.head(5)

0    Guatemala: Heart of the Mayan World
1                        The Zoya Factor
2                              Atlantics
3                           Crazy people
4                         I Lost My Body
Name: title, dtype: object

In [12]:
# Pairwise cosine similarity between all rows of the count matrix.
# Called with a single argument, cosine_similarity compares the matrix
# against itself.
movies_cosine_sim = cosine_similarity(movies_count_matrix)
movies_cosine_sim

array([[1.        , 0.09304842, 0.12309149, ..., 0.05170877, 0.08891084,
        0.04351941],
       [0.09304842, 1.        , 0.16798421, ..., 0.05292561, 0.13650473,
        0.08908708],
       [0.12309149, 0.16798421, 1.        , ..., 0.09335201, 0.24077171,
        0.19641855],
       ...,
       [0.05170877, 0.05292561, 0.09335201, ..., 1.        , 0.05057217,
        0.04950738],
       [0.08891084, 0.13650473, 0.24077171, ..., 0.05057217, 1.        ,
        0.08512565],
       [0.04351941, 0.08908708, 0.19641855, ..., 0.04950738, 0.08512565,
        1.        ]])

In [13]:
# function that takes in movie title as input and returns the top 10 recommended movies
def movie_recommendations(Title, movies_cosine_sim = movies_cosine_sim, top_n = 10):
    """Return the titles most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Exact title to look up in ``movies_indices``. If the title occurs more
        than once, the first occurrence is used.
    movies_cosine_sim : array-like
        Square similarity matrix whose row/column ``i`` corresponds to
        ``movies_indices[i]``. Defaults to the matrix computed above.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, ordered from most to least similar. The
        queried movie itself is never part of the result.

    Raises
    ------
    IndexError
        If ``Title`` is not present in ``movies_indices``. The exception type
        matches what the positional lookup raised before; the message now
        names the missing title.
    """
    # getting the index of the movie that matches the title
    matches = movies_indices[movies_indices == Title].index
    if len(matches) == 0:
        raise IndexError(f"Title {Title!r} not found in movies_indices")
    idx = matches[0]

    # similarity of the queried movie to every movie, labelled by row position
    score_series = pd.Series(movies_cosine_sim[idx])

    # Remove the queried movie explicitly instead of skipping the first sorted
    # entry: when another movie ties at the top score, the queried movie is not
    # guaranteed to sort first and could otherwise be recommended to itself.
    top_indexes = score_series.drop(idx).nlargest(top_n).index

    # translating row positions back to titles
    return movies_indices.iloc[list(top_indexes)].tolist()

GET RECOMMENDATIONS FOR A MOVIE

In [14]:
# Titles recommended for 'Rocky', based on the cosine-similarity matrix.
movie_recommendations('Rocky')

['Rocky III',
 'Rocky IV',
 'Rocky II',
 'Rocky V',
 "Logan's Run",
 'Indiana Jones and the Last Crusade',
 'Thong Dee Fun Khao',
 'Arjun: The Warrior Prince',
 'The Bleeder',
 'The Age of Shadows']

In [15]:
# Titles recommended for 'Kai Po Che!', based on the cosine-similarity matrix.
movie_recommendations('Kai Po Che!')

['LSD: Love, Sex Aur Dhokha',
 'Beyond All Boundaries',
 'Dangal',
 '100 Meters',
 'Ferrari Ki Sawaari',
 'Iqbal',
 'Le K Benzema',
 'Hazaaron Khwaishein Aisi',
 '3 Heroines',
 'Ho Mann Jahaan']