In [121]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [122]:
movies_data = pd.read_csv('movies.csv')

In [123]:
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [124]:
for feature in selected_features:
    movies_data[feature] = movies_data[feature].fillna('')

In [125]:
combined_features = movies_data['genres'] + ' ' + movies_data['keywords'] + ' ' + movies_data['tagline'] + ' ' + movies_data['cast'] + ' ' + movies_data['director']

In [126]:
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [127]:
vectorized = TfidfVectorizer()

In [128]:
feature_matrix = vectorized.fit_transform(combined_features)

In [129]:
print(feature_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 124266 stored elements and shape (4803, 17318)>
  Coords	Values
  (0, 201)	0.07860022416510505
  (0, 274)	0.09021200873707368
  (0, 5274)	0.11108562744414445
  (0, 13599)	0.1036413987316636
  (0, 5437)	0.1036413987316636
  (0, 3678)	0.21392179219912877
  (0, 3065)	0.22208377802661425
  (0, 5836)	0.1646750903586285
  (0, 14378)	0.33962752210959823
  (0, 16587)	0.12549432354918996
  (0, 3225)	0.24960162956997736
  (0, 14271)	0.21392179219912877
  (0, 4945)	0.24025852494110758
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481456
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.15021264094167086
  (0, 17007)	0.23643326319898797
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084142
  (0, 16668)	0.19843263965100372
  (0, 14608)	0.15150672398763912
  (0, 8756)	0.22709015857011816
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.247137650

In [130]:
similarity = cosine_similarity(feature_matrix)

In [131]:
print(similarity.shape)

(4803, 4803)


In [132]:
movie_name = input('Enter the movie name: ')
print('You entered:', movie_name)

You entered: all of us are dead


In [133]:
list_of_all_titles = movies_data['title'].tolist()

In [134]:
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Day of the Dead']


In [135]:
close_match = find_close_match[0]
print(close_match)

Day of the Dead


In [136]:
index_of_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_movie)

2387


In [137]:
similarity_scores = list(enumerate(similarity[index_of_movie]))
print(similarity_scores)

[(0, np.float64(0.048272493929264346)), (1, np.float64(0.047266649893072166)), (2, np.float64(0.0)), (3, np.float64(0.009661299341282086)), (4, np.float64(0.03172815036678623)), (5, np.float64(0.0162525180036249)), (6, np.float64(0.0)), (7, np.float64(0.05722953392780726)), (8, np.float64(0.007267523814444145)), (9, np.float64(0.0)), (10, np.float64(0.04638122941549845)), (11, np.float64(0.0)), (12, np.float64(0.0)), (13, np.float64(0.007976593315490652)), (14, np.float64(0.04096749715940973)), (15, np.float64(0.017059806983094797)), (16, np.float64(0.03716401828833489)), (17, np.float64(0.0)), (18, np.float64(0.02590306072687794)), (19, np.float64(0.021109003975555034)), (20, np.float64(0.015623828425335294)), (21, np.float64(0.0)), (22, np.float64(0.014169673586409096)), (23, np.float64(0.014964016914095815)), (24, np.float64(0.03628909616706802)), (25, np.float64(0.0)), (26, np.float64(0.08258362190715438)), (27, np.float64(0.050838630651586135)), (28, np.float64(0.02711481455871870

In [138]:
sorted_similar_movies = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
print(sorted_similar_movies)

[(2387, np.float64(1.0)), (2308, np.float64(0.16493122843065025)), (3988, np.float64(0.1573851624022617)), (3736, np.float64(0.1433360736873614)), (149, np.float64(0.13031749944885648)), (3147, np.float64(0.10763164640953356)), (2964, np.float64(0.1038499962759063)), (417, np.float64(0.10376535450781671)), (4401, np.float64(0.10322854285595326)), (3230, np.float64(0.10249933462426955)), (4425, np.float64(0.09949006144813859)), (587, np.float64(0.09556343721958144)), (2826, np.float64(0.09037370350229043)), (4276, np.float64(0.08940214580257601)), (4579, np.float64(0.08928201201348557)), (3163, np.float64(0.08693442241296073)), (3364, np.float64(0.086460076991808)), (2551, np.float64(0.08393321869494252)), (3014, np.float64(0.0829943141264869)), (3360, np.float64(0.08262975314523116)), (26, np.float64(0.08258362190715438)), (2902, np.float64(0.08227789192522547)), (2871, np.float64(0.08151583641500572)), (3737, np.float64(0.08044321747718816)), (1567, np.float64(0.08007428890962226)), (

In [139]:
print('Movie suggested for you are: ')
i = 0
for movie in sorted_similar_movies:
    index = movie[0]
    title = movies_data[movies_data.index == index]['title'].values[0]
    if(i<20):
        print(i,'.', title)
        i = i+1

Movie suggested for you are: 
0 . Day of the Dead
1 . Land of the Dead
2 . Diary of the Dead
3 . Survival of the Dead
4 . Armageddon
5 . Re-Kill
6 . The Last Days on Mars
7 . Creepshow
8 . The Helix... Loaded
9 . Repo! The Genetic Opera
10 . Zombie Hunter
11 . The Abyss
12 . Time Bandits
13 . Def-Con 4
14 . Monty Python and the Holy Grail
15 . Detention
16 . Warlock
17 . Halloween II
18 . The Dead Zone
19 . House of 1000 Corpses
