<a href="https://colab.research.google.com/github/namanyadav2706/DataScience-Projects/blob/main/2_Netflix_Recommandation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [90]:
import pandas as pd                      ## Import the Libraries

In [91]:
df=pd.read_csv('2_netflixData.csv')      ## Read the dataset

In [92]:
df.head()

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [93]:
df.shape

(5967, 13)

In [94]:
df.isnull().sum()  ## To check the null values in the dataset

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64

In [95]:
df=df[['Title','Description','Genres','Content Type']]  ## only these columns will be required in the recommandation system

In [96]:
df.head()

Unnamed: 0,Title,Description,Genres,Content Type
0,(Un)Well,This docuseries takes a deep dive into the luc...,Reality TV,TV Show
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie


In [97]:
import nltk                  ## import the requied libraries
import re
nltk.download('stopwords')   ## download the stopwords
from nltk.corpus import stopwords
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [98]:
def clean(text):                               ## function to remove unnecessary text from the movie titles
    text = re.sub('[^a-zA-Z0-9]', ' ', text)   ## title should consist of alphabets and numbers only
    text = str(text).lower()                   ## turn complete text into the lowercase
    text = [word for word in text.split(' ') if word not in stopword]   ## store all the words in a list and remove the stopwords
    text=" ".join(text)                        ## join the elements of the list using whitespace
    return text

In [99]:
df["Title(updated)"] = df["Title"].apply(clean)  ## apply the function to the title column and add the new column into the dataset.

In [100]:
df.head()

Unnamed: 0,Title,Description,Genres,Content Type,Title(updated)
0,(Un)Well,This docuseries takes a deep dive into the luc...,Reality TV,TV Show,un well
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie,alive
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie,annefrank parallel stories
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show,blackaf
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie,cats mewvie


In [101]:
df.isnull().sum()

Title             0
Description       0
Genres            0
Content Type      0
Title(updated)    0
dtype: int64

In [102]:
feature = df["Genres"].tolist()  ## add the whole Genres column to a list to find the similarity among all the movies

In [103]:
feature[:6] , type(feature)  , len(feature) 

(['Reality TV',
  'Horror Movies, International Movies, Thrillers',
  'Documentaries, International Movies',
  'TV Comedies',
  'Documentaries, International Movies',
  'Dramas, International Movies, Romantic Movies'],
 list,
 5967)

In [104]:
## import the required libraries to find the cosine similarity
from sklearn.feature_extraction import text             ## library to create the bag of words
from sklearn.metrics.pairwise import cosine_similarity  ## library to calculate the similarity scores

In [105]:
# code to find the cosine similarity score
tfidf = text.TfidfVectorizer(input=feature, stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

In [106]:
similarity.shape                ## each movie has been compared to all the other movie in the dataset thats why the shape is (5967 X 5967)

(5967, 5967)

In [107]:
indices = pd.Series(df.index, index=df['Title(updated)']).drop_duplicates()  #convert title into index so that we can search the movie in df

In [108]:
def netFlix_recommendation(title, similarity = similarity):  ## function to get the top 10 movies with highest similarity score
    index = indices[title]                                   ## index of the movie entered
    similarity_scores = list(enumerate(similarity[index]))   ## get the similarity score of all the movies w.r.t the entered movie in a list
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)   ## sort the list in the decending order
    similarity_scores = similarity_scores[0:10]       ## get the top 10 similarity scores
    movieindices = [i[0] for i in similarity_scores]  ## get the index of top 10 movies on the basis of similarity score
    return df['Title(updated)'].iloc[movieindices]    ##  return the titles of those movies

In [109]:
print(netFlix_recommendation("stranger things"))      ## get the recommanded shows for --stranger things--

978     chilling adventures sabrina
3406                    nightflyers
4438                stranger things
2941                       manifest
4605                           4400
5085                             oa
5285                vampire diaries
5956                            zoo
5267               umbrella academy
5715                    warrior nun
Name: Title(updated), dtype: object


Resourses and info:

TF-IDF: https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

Cosine Similarity: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html

Re-module: https://www.w3schools.com/python/python_regex.asp

StopWords: https://medium.com/@saitejaponugoti/stop-words-in-nlp-5b248dadad47