### IMPORTING ALL THE REQUIRED LIBRARIES

In [53]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### DATA COLLECTION AND PRE-PROCESSING

In [54]:
#reading the dataset
df = pd.read_csv('netflix.csv')
df.head(6)

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"
5,c293788a-41f7-49a3-a7fc-005ea33bce2b,#FriendButMarried,"Pining for his high school crush for years, a ...",Rako Prijanto,"Dramas, International Movies, Romantic Movies","Adipati Dolken, Vanesha Prescilla, Rendi Jhon,...",Indonesia,2018.0,TV-G,102 min,7.0/10,Movie,"May 21, 2020"


In [55]:
#create an index
df = df.reset_index()

In [56]:
df

Unnamed: 0,index,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962,5962,62b8b682-f191-4c10-aa04-32319329bd8d,الف مبروك,"On his wedding day, an arrogant, greedy accoun...",Ahmed Nader Galal,"Comedies, Dramas, International Movies","Ahmed Helmy, Laila Ezz El Arab, Mahmoud El Fis...",Egypt,2009.0,TV-14,115 min,7.4/10,Movie,"April 25, 2020"
5963,5963,5bed77ab-5e31-4216-8b51-44c9a35442e6,دفعة القاهرة,A group of women leaves Kuwait to attend unive...,,"International TV Shows, TV Dramas","Bashar al-Shatti, Fatima Al Safi, Maram Baloch...",,2019.0,TV-14,1 Season,,TV Show,
5964,5964,4661ec0c-8692-4661-bc76-a96412b311fd,海的儿子,"Two brothers start a new life in Singapore, wh...",,"International TV Shows, TV Dramas","Li Nanxing, Christopher Lee, Jesseca Liu, Appl...",,2016.0,TV-14,1 Season,,TV Show,
5965,5965,145c93a7-1924-403c-a933-4ede8ad66f26,반드시 잡는다,After people in his town start turning up dead...,Hong-seon Kim,"Dramas, International Movies, Thrillers",Baek Yoon-sik,South Korea,2017.0,TV-MA,110 min,6.5/10,Movie,"February 28, 2018"


In [57]:
df.shape

(5967, 14)

In [58]:
#select certain features
selcted_features = ['Title', 'Description', 'Director', 'Genres', 'Content Type']
print(selcted_features)

['Title', 'Description', 'Director', 'Genres', 'Content Type']


In [59]:
#clean the data and fill null values
for feature in selcted_features:
    df[feature] = df[feature].fillna('')

In [60]:
df['Title'] = df['Title'].str.replace('#', '')

In [61]:
cleaned_title = df['Title']

In [62]:
#put all the features into one
combined_features = cleaned_title+ ''+ df['Description']+''+ df['Director']+''+ df['Genres']+ ''+ df['Content Type'] 

In [63]:
combined_features

0       (Un)WellThis docuseries takes a deep dive into...
1       AliveAs a grisly virus rampages a city, a lone...
2       AnneFrank - Parallel StoriesThrough her diary,...
3       blackAFKenya Barris and his family navigate re...
4       cats_the_mewvieThis pawesome documentary explo...
                              ...                        
5962    الف مبروكOn his wedding day, an arrogant, gree...
5963    دفعة القاهرةA group of women leaves Kuwait to ...
5964    海的儿子Two brothers start a new life in Singapore...
5965    반드시 잡는다After people in his town start turning ...
5966    최강전사 미니특공대 : 영웅의 탄생Miniforce, a special task f...
Length: 5967, dtype: object

In [64]:
#change all the text data into vectors
vectorizer = TfidfVectorizer()

In [65]:
featured_vectors = vectorizer.fit_transform(combined_features)

In [66]:
print(featured_vectors)

  (0, 21831)	0.0649216292474742
  (0, 24890)	0.12153660451556257
  (0, 19549)	0.13732989809570623
  (0, 18913)	0.26915800084143526
  (0, 24346)	0.05136836233621543
  (0, 25286)	0.09782901934894675
  (0, 14096)	0.16208106046321036
  (0, 18868)	0.27847830864373213
  (0, 6964)	0.1798420398323072
  (0, 3736)	0.1097016656555808
  (0, 10717)	0.22633582856240458
  (0, 1277)	0.04934637678947077
  (0, 10718)	0.23097200174529553
  (0, 24513)	0.2916145446922719
  (0, 26209)	0.19147112299853603
  (0, 11784)	0.22228080000990497
  (0, 26148)	0.2619286084617809
  (0, 14378)	0.23097200174529553
  (0, 24077)	0.13598564186602324
  (0, 12034)	0.11391839832640618
  (0, 6931)	0.24670145699059862
  (0, 6264)	0.1922159361328916
  (0, 23698)	0.1439082257897301
  (0, 6984)	0.14749175826353456
  (0, 26154)	0.2916145446922719
  :	:
  (5966, 23834)	0.23155379261943954
  (5966, 12705)	0.2279721653823745
  (5966, 13646)	0.1826129020114238
  (5966, 18721)	0.2356143066838119
  (5966, 24426)	0.18417266364630988
  (596

In [67]:
print(featured_vectors.shape)

(5967, 27052)


In [68]:
similarity = cosine_similarity(featured_vectors)

In [69]:
print(similarity)

[[1.         0.00474379 0.         ... 0.0349747  0.00874619 0.03596248]
 [0.00474379 1.         0.00947082 ... 0.0034465  0.05245561 0.00463123]
 [0.         0.00947082 1.         ... 0.01050651 0.01980926 0.01633836]
 ...
 [0.0349747  0.0034465  0.01050651 ... 1.         0.0656108  0.02922997]
 [0.00874619 0.05245561 0.01980926 ... 0.0656108  1.         0.00331367]
 [0.03596248 0.00463123 0.01633836 ... 0.02922997 0.00331367 1.        ]]


In [70]:
print(similarity.shape)

(5967, 5967)


In [71]:
movie_name = input('enter a movie name')

In [72]:
list_of_titles = cleaned_title.to_list()

In [73]:
print(list_of_titles)



In [74]:
#find close match of our input to our dataset
find_close_match = difflib.get_close_matches(movie_name, list_of_titles)
print(find_close_match)

['Skin', 'Smoking', 'Skins']


In [75]:
close_match = find_close_match[0]
print(close_match)

Skin


In [76]:
#finding the index of the movie
index_of_the_movie = df[cleaned_title == close_match]['index'].values[0]
print(index_of_the_movie)

4291


In [78]:
#getting a list of similar movies
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.007763021069415175), (1, 0.028116021023515306), (2, 0.018022929795080522), (3, 0.011355962174227317), (4, 0.05108314679282629), (5, 0.016314627609970753), (6, 0.017456236346273107), (7, 0.01738144323276561), (8, 0.011907067247708337), (9, 0.01817147376234442), (10, 0.015794115019096458), (11, 0.0822877454380672), (12, 0.024556859930334584), (13, 0.010828897385710894), (14, 0.008984221535966152), (15, 0.010809226285129103), (16, 0.020739691879757584), (17, 0.026165934872409626), (18, 0.010864673338505498), (19, 0.04069875006383719), (20, 0.014522433082652655), (21, 0.009657511883664878), (22, 0.007884136412480883), (23, 0.015018673665339165), (24, 0.01915926763272006), (25, 0.017781461402132354), (26, 0.014773339330289098), (27, 0.02304600695370443), (28, 0.005742227268128987), (29, 0.016412036219372737), (30, 0.019769770653801335), (31, 0.024297344170109827), (32, 0.015295858428485229), (33, 0.10994832963408165), (34, 0.017557995738918447), (35, 0.025839332213223376), (36, 0.013

In [79]:
len(similarity_score)

5967

In [80]:
#sorting the movies based on their similarity score
sorted_similar_movie = sorted(similarity_score, key= lambda x:x[1], reverse=True)
print(sorted_similar_movie)

[(4291, 1.0000000000000002), (631, 0.17404731785534594), (2972, 0.15754734507658802), (33, 0.10994832963408165), (3132, 0.10452372880598629), (1184, 0.10220740884530699), (286, 0.09864503442338492), (1446, 0.0978147785699385), (5348, 0.09610327485059905), (1023, 0.0938188511997883), (779, 0.0925311141245796), (5590, 0.09221402081434675), (3064, 0.09035217960144938), (5205, 0.08903161624608145), (4305, 0.08814361585572779), (1103, 0.08781151727298343), (1655, 0.08606296185826337), (346, 0.08525821035874316), (459, 0.08509027977891288), (5238, 0.08496988644472707), (2289, 0.0826069133421005), (161, 0.08254665671319876), (11, 0.0822877454380672), (2667, 0.08166118396808103), (5129, 0.08133162854705422), (5227, 0.08131554885276089), (1774, 0.08053313083110245), (4174, 0.08027966149367834), (4186, 0.08025372011009625), (5279, 0.08018475369085673), (3298, 0.08003878714454131), (5002, 0.07962202391419101), (589, 0.07954907337410891), (4042, 0.07950341188734224), (4293, 0.07929444883076484), (

In [81]:
#print the name of familiar movies based on the index
print('Movies suggested for you : \n')
i = 1
for movie in sorted_similar_movie:
    index = movie[0]
    title_from_index = df[df.index==index]['Title'].values[0]
    if (i<30):
        print(i, ',', title_from_index)
        i+=1

Movies suggested for you : 

1 , Skin
2 , Beverly Hills Ninja
3 , Marked
4 , 2 Weeks in Lagos
5 , Miss Virginia
6 , Daniel Sloss: Live Shows
7 , All American
8 , Eggnoid: Love & Time Portal
9 , Thi Mai
10 , Citation
11 , Born Racer
12 , Under the Eiffel Tower
13 , Michael Lost and Found
14 , The Social Dilemma
15 , Sleepless Society: Nyctophobia
16 , Country Comfort
17 , Flavors of Youth: International Version
18 , An Evening with Beverly Luff Linn
19 , Autumn's Concerto
20 , The Tenth Man
21 , Iron Ladies
22 , A Very Country Christmas
23 , 10 Days in Sun City
24 , LEGO Bionicle: The Journey to One
25 , The Power of Grayskull: The Definitive History of He-Man and the Masters of the Universe
26 , The Strange House
27 , Garth Brooks: The Road I’m On
28 , Seven and a half dates
29 , Seventeen


### MOVIE RECOMMENDATION SYSTEM

In [None]:
movie_name = input('enter a movie name')

list_of_titles = cleaned_title.to_list()

find_close_match = difflib.get_close_matches(movie_name, list_of_titles)

close_match = find_close_match[0]

index_of_the_movie = df[cleaned_title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movie = sorted(similarity_score, key= lambda x:x[1], reverse=True)

print('Movies suggested for you : \n')
i = 1
for movie in sorted_similar_movie:
    index = movie[0]
    title_from_index = df[df.index==index]['Title'].values[0]
    if (i<30):
        print(i, ',', title_from_index)
        i+=1