# Problem statement

#### Create a movie recommendation engine using information from a dataset that includes movie descriptions, plots, genres, and more.

# Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Importing the dataset

In [2]:
# Load the movie dataset; the path is relative to the notebook's working directory.
movies=pd.read_csv('movies_dataset.csv')
# Preview the first rows to check which columns were read.
movies.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,tags
0,0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,4,49529,John Carter,"John Carter is a war-weary, former military ca..."


# Descriptive Analysis

In [3]:
# Column dtypes and non-null counts. In the recorded output every column reports
# 4806 non-null values for 4806 rows, i.e. no missing values were found.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4806 entries, 0 to 4805
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  4806 non-null   int64 
 1   movie_id    4806 non-null   int64 
 2   title       4806 non-null   object
 3   tags        4806 non-null   object
dtypes: int64(2), object(2)
memory usage: 150.3+ KB


In [4]:
# Summary statistics for every column (numeric and object alike).
# NOTE: in the recorded output `title` has 4797 unique values for 4806 rows, so some
# titles repeat (the most frequent one, "Out of the Blue", appears 4 times).
movies.describe(include='all')

Unnamed: 0.1,Unnamed: 0,movie_id,title,tags
count,4806.0,4806.0,4806,4806
unique,,,4797,4806
top,,,Out of the Blue,"In the 22nd century, a paraplegic Marine is di..."
freq,,,4,1
mean,2403.160633,56922.559509,,
std,1388.273752,88309.447559,,
min,0.0,5.0,,
25%,1201.25,9009.75,,
50%,2402.5,14615.5,,
75%,3604.75,58476.75,,


In [5]:
# Drop the leftover 'Unnamed: 0' column (presumably an index that was saved into the CSV — confirm).
# Reassigning instead of using inplace=True, combined with errors='ignore', keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
movies = movies.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Check the remaining columns after the drop (movie_id, title, tags in the recorded output).
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


## Generating TF-IDF features (a weighted 'Bag of Words') from the 'tags' column

In [7]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [8]:
# TF-IDF vectorizer; stop_words='english' selects scikit-learn's built-in English stop-word list.
tfid=TfidfVectorizer(stop_words='english')

In [9]:
# Fit the vectorizer on the 'tags' text and transform it into a document-term matrix
# with one row per movie.
overview_matrix=tfid.fit_transform(movies['tags'])

In [10]:
# (number of movies, number of vocabulary terms)
overview_matrix.shape

(4806, 35547)

In [11]:
# Convert the vectorizer output into a dense NumPy array.
# NOTE(review): despite the name, this holds TF-IDF weights, not raw word counts.
# NOTE(review): a dense 4806 x 35547 array is large (over 1 GB if the dtype is float64 — confirm);
# cosine_similarity also accepts sparse input, so densifying may be avoidable.
count_array=overview_matrix.toarray()
count_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Wrap the dense TF-IDF array in a DataFrame: rows are the movies and columns are the
# vocabulary terms extracted from the 'tags' column.
# NOTE: this rebinds `overview_matrix` (previously the vectorizer output) to a DataFrame,
# so the cell that calls overview_matrix.toarray() cannot be re-run after this one.
overview_matrix=pd.DataFrame(data=count_array,columns=tfid.get_feature_names_out())
overview_matrix

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,única,über,đỗthịhảiyến,špelacolja,γη,юлияснигирь,卧底肥妈,张立,绝地奶霸,超级妈妈
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Pairwise cosine similarity between all movie vectors: entry (i, j) is the similarity
# between movie i and movie j. Rows and columns get the default 0..n-1 integer labels,
# which line up with the row positions of `movies`.
similarity_matrix=pd.DataFrame(data=cosine_similarity(count_array))
similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4796,4797,4798,4799,4800,4801,4802,4803,4804,4805
0,1.000000,0.014639,0.007852,0.013737,0.061190,0.032827,0.001993,0.049723,0.012119,0.011939,...,0.000000,0.000000,0.015397,0.023433,0.000000,0.002570,0.005976,0.003233,0.003501,0.000000
1,0.014639,1.000000,0.008786,0.002398,0.021553,0.045884,0.002230,0.026534,0.013560,0.013359,...,0.000000,0.000000,0.003694,0.000000,0.000000,0.002876,0.000000,0.011305,0.000000,0.000000
2,0.007852,0.008786,1.000000,0.005825,0.015766,0.018652,0.002231,0.049387,0.019526,0.007165,...,0.011180,0.000000,0.000000,0.000000,0.014191,0.006986,0.000000,0.010239,0.000000,0.000000
3,0.013737,0.002398,0.005825,1.000000,0.007942,0.013620,0.009424,0.031928,0.011346,0.154448,...,0.001300,0.002335,0.000870,0.002853,0.010566,0.006158,0.000000,0.023996,0.031850,0.014796
4,0.061190,0.021553,0.015766,0.007942,1.000000,0.014538,0.016574,0.049659,0.004159,0.015152,...,0.005224,0.000000,0.000000,0.004205,0.000000,0.002221,0.000000,0.003965,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0.002570,0.002876,0.006986,0.006158,0.002221,0.002245,0.003758,0.014054,0.021990,0.002345,...,0.000000,0.011689,0.000000,0.002194,0.003776,1.000000,0.000000,0.000000,0.007266,0.000000
4802,0.005976,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.004032,0.000000,0.000000,0.000000,0.000000,1.000000,0.008254,0.005642,0.000000
4803,0.003233,0.011305,0.010239,0.023996,0.003965,0.011782,0.000000,0.000000,0.018320,0.011307,...,0.006867,0.002181,0.013058,0.013459,0.000000,0.000000,0.008254,1.000000,0.016356,0.018551
4804,0.003501,0.000000,0.000000,0.031850,0.000000,0.003960,0.007155,0.000000,0.024223,0.004137,...,0.000000,0.009847,0.000000,0.000000,0.000000,0.007266,0.005642,0.016356,1.000000,0.009307


In [15]:
# Let's find the movies most similar to movie #4 (John Carter) using the similarity matrix above

In [16]:
# Column 4 holds the similarity of every movie to movie #4.
similarity_matrix[4]

0       0.061190
1       0.021553
2       0.015766
3       0.007942
4       1.000000
          ...   
4801    0.002221
4802    0.000000
4803    0.003965
4804    0.000000
4805    0.000000
Name: 4, Length: 4806, dtype: float64

In [17]:
# Pair each similarity score for movie #4 with the positional index of the movie it
# belongs to, giving a list of (movie_index, score) tuples.
similarity_score_4 = [(position, score) for position, score in enumerate(similarity_matrix[4])]
similarity_score_4

[(0, 0.0611900058014839),
 (1, 0.021553493463877846),
 (2, 0.015765772501535165),
 (3, 0.007941560921339394),
 (4, 1.0),
 (5, 0.014537683925908823),
 (6, 0.016573631568806393),
 (7, 0.049658886232173016),
 (8, 0.004159171240191602),
 (9, 0.015151951543959909),
 (10, 0.04660425115912152),
 (11, 0.0057866825282066985),
 (12, 0.03691819797089589),
 (13, 0.012206912147893057),
 (14, 0.018929645799537008),
 (15, 0.008669181809823585),
 (16, 0.038800754615080715),
 (17, 0.040761969172857654),
 (18, 0.02735924870312725),
 (19, 0.014086816505452352),
 (20, 0.010302278001350033),
 (21, 0.016480377544245656),
 (22, 0.0037387182526804237),
 (23, 0.0073460911647279924),
 (24, 0.01574020564893075),
 (25, 0.013161856454273473),
 (26, 0.060755274616927805),
 (27, 0.06561496886832667),
 (28, 0.043337468176871584),
 (29, 0.015051436454091384),
 (30, 0.005683220563915804),
 (31, 0.02657259421086516),
 (32, 0.03239222209807367),
 (33, 0.013718295861253452),
 (34, 0.0),
 (35, 0.01894099437050745),
 (36, 0

In [18]:
# Rank all movies by their similarity to movie #4, highest score first.
# Each item is an (index, score) tuple, so the sort key is the score at position 1.
similarity_score_4 = list(enumerate(similarity_matrix[4]))
similarity_score_4.sort(key=lambda pair: pair[1], reverse=True)
similarity_score_4

[(4, 1.0),
 (373, 0.15894056313936678),
 (1257, 0.13420591333037657),
 (487, 0.13239263643488067),
 (1741, 0.12648175834096262),
 (473, 0.11297453851426707),
 (270, 0.11215654339078907),
 (4166, 0.11113555524993918),
 (207, 0.09346108329670322),
 (752, 0.091105232657358),
 (2936, 0.08422774168478483),
 (2968, 0.08042148984854001),
 (833, 0.07932208785111869),
 (778, 0.07731738707479346),
 (2760, 0.07667646771427652),
 (581, 0.07528476358007148),
 (1201, 0.07513471578847279),
 (322, 0.07505739321969326),
 (1147, 0.07498431044906594),
 (1161, 0.0749601499430668),
 (4405, 0.0705773124218938),
 (870, 0.0699890631768371),
 (3910, 0.06942570321009302),
 (85, 0.06826742699335328),
 (345, 0.06743784823925295),
 (91, 0.06717995525371098),
 (1322, 0.06679724993255612),
 (1774, 0.06616021299306886),
 (27, 0.06561496886832667),
 (1071, 0.06504733743180058),
 (47, 0.06483551605466428),
 (931, 0.06303030626314496),
 (1310, 0.0616117095822259),
 (0, 0.0611900058014839),
 (2409, 0.06093260286275379),


In [19]:
# Title -> row index lookup: a Series indexed by title whose values are movies.index.
# NOTE: titles are not unique in this dataset (see the describe() output), so looking up
# a repeated title returns a Series of several indices instead of a single integer.
mapping=pd.Series(movies.index,index=movies["title"])
mapping

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4801
Newlyweds                                   4802
Signed, Sealed, Delivered                   4803
Shanghai Calling                            4804
My Date with Drew                           4805
Length: 4806, dtype: int64

In [20]:
# Example lookup: the row index of 'John Carter'.
movie_index=mapping["John Carter"]
movie_index

4

In [21]:
# Now, create a function that takes a movie title and recommends the
# top 10 most similar movies using the cosine similarity matrix computed above

In [24]:
def recommend_movie(movie_input, top_n=10):
    """Return the titles of the movies most similar to ``movie_input``.

    Similarity scores are read from the precomputed ``similarity_matrix``.
    The queried movie itself is never part of the result.

    Parameters
    ----------
    movie_input : str
        Exact title of the movie, as it appears in ``movies['title']``.
        If several movies share the title, the first one in the dataset is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the recommended movies, most similar first, indexed by
        their row position in ``movies``.

    Raises
    ------
    KeyError
        If ``movie_input`` is not a title in the dataset.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Titles are not unique in the dataset, so a plain mapping[title] lookup can
    # return several indices. Selecting with a boolean mask always yields a Series;
    # take the first match so the rest of the function works with one integer.
    matches = mapping[mapping.index == movie_input]
    if matches.empty:
        raise KeyError(f"Movie title not found in the dataset: {movie_input!r}")
    movie_index = int(matches.iloc[0])

    # Rank every *other* movie by similarity. The queried movie is excluded by its
    # index instead of by dropping the first-ranked entry, which would discard the
    # wrong movie whenever the queried movie is not ranked first (e.g. on a tie).
    similarity_score = sorted(
        (
            (index, score)
            for index, score in enumerate(similarity_matrix[movie_index])
            if index != movie_index
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    movie_ind = [index for index, _ in similarity_score[:top_n]]
    recommended_movies = movies['title'].iloc[movie_ind]
    print('Recommended movies for', movie_input,"is:")
    return recommended_movies

In [25]:
# Try the recommender on a sample title.
recommend_movie('John Carter')

Recommended movies for John Carter is:


373                 Mission to Mars
1257                     Get Carter
487                      Red Planet
1741                 Ghosts of Mars
473                   Mars Attacks!
270                     The Martian
4166    The Marine 4: Moving Target
207                    Total Recall
752             My Favorite Martian
2936                   Raising Cain
Name: title, dtype: object