In [111]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [112]:
# Read the Netflix catalogue from the CSV next to the notebook and preview
# its first row to check the column layout.
data = pd.read_csv('netflix_titles.csv')
data.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [113]:
# Count the missing values in every column of the loaded frame.
data.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [114]:
# Narrow the frame to the columns kept for the rest of the notebook, then
# re-check that none of them contains missing values.
selected_columns = ["title", "description", "type", "listed_in", "release_year"]
data = data.loc[:, selected_columns]
data.isna().sum()


title           0
description     0
type            0
listed_in       0
release_year    0
dtype: int64

In [115]:
# Discard every row that still has a missing value in any remaining column.
data = data.dropna(axis=0, how="any")

In [116]:
import re
import string

import nltk  # NLP library
from nltk.corpus import stopwords

# Stemmer instance created for English text.
stemmer = nltk.SnowballStemmer("english")

# The English stop-word list is distributed separately from the nltk package.
# If it has not been installed yet, nltk raises LookupError on first access;
# in that case download the "stopwords" corpus once and retry, so the
# notebook also runs on a fresh environment without the interactive
# `nltk.download()` pop-up.
try:
    stopword = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopword = set(stopwords.words('english'))

In [117]:
def clean(text, stop_words=None):
    """Normalise a piece of text so that it can be used as a lookup key.

    The steps run in this order: lower-case the text, remove ``[...]`` spans,
    remove URLs, remove ``<...>`` tags, remove ASCII punctuation, remove every
    token that contains a digit, remove stop words, and finally join the
    remaining words with single spaces.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stopword`` set is
        used.

    Returns
    -------
    str
        The cleaned text. It may be empty when every token was removed.
    """
    if stop_words is None:
        stop_words = stopword

    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # The two URL forms must be alternatives ("|"); without it the pattern
    # only matches an "http(s)://" URL that also contains "www." further on.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # split() without an argument breaks on any whitespace run (including
    # newlines) and never yields empty tokens, so removed punctuation or
    # tokens do not leave double spaces behind and lines are not glued
    # together.
    return " ".join(word for word in text.split() if word not in stop_words)

# Replace every title with its cleaned form; later lookups are done against
# these cleaned strings.
data['title'] = data['title'].map(clean)


In [118]:
# The 'listed_in' strings are the only feature used to compare titles.
feature = data['listed_in'].tolist()
# Create an instance of TF-IDF vectorizer (English stop words removed).
tfidf = TfidfVectorizer(stop_words="english")
# Fit and transform the vectorizer on our corpus: one row per entry of `data`.
tfidf_matrix = tfidf.fit_transform(feature)
# Compute the pairwise cosine similarity matrix; row/column i corresponds to
# the i-th row of `data` by POSITION.
similarity = cosine_similarity(tfidf_matrix)

# Map each (cleaned) title to the position of its row. Positions, not index
# labels, are stored because `similarity` and `.iloc` are both positional;
# with a default RangeIndex the two coincide, but they would diverge as soon
# as rows had been dropped from `data`.
indices = pd.Series(np.arange(len(data)), index=data['title'])
# Series.drop_duplicates() would compare the *values* (already unique), so
# duplicate titles have to be removed on the index instead. The first
# occurrence of each title is kept, which makes `indices[title]` a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [119]:
def movies_recommendation(title, top_n=10):
    """Return the titles most similar to ``title``.

    Similarity is read from the notebook-level ``similarity`` matrix, and the
    row for ``title`` is found through the notebook-level ``indices`` lookup.

    Parameters
    ----------
    title : str
        Title exactly as it is stored in ``indices``.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series or None
        Rows of ``data['title']`` ordered by descending similarity score;
        entries with equal scores keep their original row order. The queried
        title itself is not filtered out. ``None`` is returned (after
        printing a message) when ``title`` is unknown.
    """
    # Only a genuinely unknown title is reported as missing; any other error
    # is allowed to surface instead of being hidden by a blanket `except`.
    if title not in indices.index:
        print(title, "is not in db")
        return None

    index = indices[title]
    # When several rows share the same title the lookup yields a Series;
    # use the first match so that a single similarity row is selected.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    similarity_scores = sorted(
        enumerate(similarity[index]), key=lambda x: x[1], reverse=True
    )
    movie_indices = [position for position, _ in similarity_scores[:top_n]]
    return data['title'].iloc[movie_indices]

In [120]:
# Show the first ten cleaned titles.
print(data['title'].iloc[:10])


0             dick johnson dead
1                  blood  water
2                     ganglands
3         jailbirds new orleans
4                  kota factory
5                 midnight mass
6    little pony new generation
7                       sankofa
8     great british baking show
9                      starling
Name: title, dtype: object


In [121]:
# Example query; the title is given in its cleaned (lower-case) form.
movies_recommendation(title='great british baking show')

8                   great british baking show
1596       great british baking show holidays
1628                              repair shop
2305                                         
4461     great british baking show beginnings
6618                              diva brides
7671                      operation gold rush
8209                                big catch
8331    great british baking show masterclass
366                                      glow
Name: title, dtype: object