In [2]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Load the data
df = pd.read_csv("IMDB-Movie-Data.csv")
# Add a movie id column: one sequential id per row (0 .. n-1). The range is
# sized from the frame itself so the cell also works when the CSV does not
# contain exactly 1000 rows.
df["Movie_id"] = range(len(df))
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4


In [8]:
# Number of rows (movies) and number of columns in the data set, as (rows, columns)
df.shape

(1000, 13)

In [11]:
# Columns of interest; they are previewed and checked for missing values in the cells below
columns = ["Title","Genre","Actors","Director"]

In [12]:
# Preview the first rows of the selected columns
df[columns].head()

Unnamed: 0,Title,Genre,Actors,Director
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi","Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn
1,Prometheus,"Adventure,Mystery,Sci-Fi","Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott
2,Split,"Horror,Thriller","James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan
3,Sing,"Animation,Comedy,Family","Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet
4,Suicide Squad,"Action,Adventure,Fantasy","Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer


In [13]:
# Count the missing values in each of the selected columns
df[columns].isnull().sum()

Title       0
Genre       0
Actors      0
Director    0
dtype: int64

In [14]:
#Function to combine values of the columns to a single string
def string(data):
    """Build one combined text string per row of ``data``.

    For every row, the values of the "Title", "Genre", "Actors" and
    "Director" columns are concatenated in that order, separated by single
    spaces.

    Rows are walked by position rather than looked up by index label, so the
    frame may carry any index (for example after filtering or sorting), not
    only the default 0..n-1 one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns named above, holding strings.

    Returns
    -------
    list of str
        One combined string per row, in row order. Empty for an empty frame.
    """
    return [
        title + " " + genre + " " + actors + " " + director
        for title, genre, actors, director in zip(
            data["Title"], data["Genre"], data["Actors"], data["Director"]
        )
    ]

In [17]:
# New column holding the combined feature string of each row
df["Features"] = string(df)

# Preview the data with the new column
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,Features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Guardians of the Galaxy Action,Adventure,Sci-F..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Prometheus Adventure,Mystery,Sci-Fi Noomi Rapa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"Split Horror,Thriller James McAvoy, Anya Taylo..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Sing Animation,Comedy,Family Matthew McConaugh..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Suicide Squad Action,Adventure,Fantasy Will Sm..."


In [18]:
# Convert the text to a matrix of token counts
cm = CountVectorizer().fit_transform(df["Features"])

In [19]:
# Get the cosine similarity matrix computed from the token-count matrix
cs = cosine_similarity(cm)
# Print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [20]:
# Shape of the cosine similarity matrix
cs.shape

(1000, 1000)

In [22]:
# Title of the movie the user likes
title = "The Amazing Spider-Man"

# Find the movie's id. The lookup is checked explicitly so a missing or
# misspelled title fails with a readable message instead of an opaque
# "index 0 is out of bounds" error.
matches = df.loc[df["Title"] == title, "Movie_id"]
if matches.empty:
    raise ValueError(f"Title not found in the data set: {title!r}")
# When several rows share the title, the first match is used
movie_id = matches.values[0]
movie_id

368

In [23]:
# Pair each similarity score in the selected movie's row with its position: (index, score)
scores = list(enumerate(cs[movie_id]))

In [27]:
# Rank the other movies by similarity, highest first. The selected movie is
# excluded by its index rather than by dropping the first entry: the sort is
# stable, so another movie with an equally high score and a lower index would
# take the first slot, be dropped instead, and leave the selected movie in the
# recommendations.
sorted_scores = sorted(
    (pair for pair in scores if pair[0] != movie_id),
    key=lambda pair: pair[1],
    reverse=True,
)

In [28]:
# Print every (index, score) pair, most similar first
print(sorted_scores)

[(253, 0.7071067811865475), (149, 0.2672612419124244), (239, 0.2581988897471611), (344, 0.2581988897471611), (104, 0.25), (821, 0.23570226039551587), (78, 0.22360679774997896), (558, 0.2182178902359924), (739, 0.21320071635561041), (314, 0.20851441405707477), (767, 0.20412414523193154), (26, 0.2004459314343183), (55, 0.2004459314343183), (92, 0.2004459314343183), (363, 0.2004459314343183), (718, 0.2004459314343183), (176, 0.2), (313, 0.19611613513818404), (179, 0.19364916731037082), (303, 0.19364916731037082), (324, 0.19364916731037082), (379, 0.19364916731037082), (600, 0.19364916731037082), (694, 0.19364916731037082), (728, 0.19364916731037082), (5, 0.1875), (38, 0.1875), (294, 0.1875), (345, 0.1875), (389, 0.1875), (432, 0.1875), (529, 0.1875), (537, 0.1875), (581, 0.1875), (758, 0.1875), (770, 0.1875), (969, 0.1875), (8, 0.18190171877724973), (65, 0.18190171877724973), (107, 0.18190171877724973), (203, 0.18190171877724973), (214, 0.18190171877724973), (388, 0.18190171877724973), (3

In [34]:
# Show the most similar movies. The list length is set once in `top_n`, so the
# heading and the loop bound cannot drift apart.
top_n = 7
j = 0  # stays 0 when there is nothing to recommend
print(f"The {top_n} most recommended movies to", title, "are:\n")
for j, item in enumerate(sorted_scores[:top_n], start=1):
    # item is an (index, score) pair; the index is matched against Movie_id
    movie_title = df.loc[df["Movie_id"] == item[0], "Title"].values[0]
    print(j, movie_title)

The 7 most recommended movies to The Amazing Spider-Man are:

1 The Amazing Spider-Man 2
2 Inferno
3 The Host
4 Spider-Man 3
5 The Man from U.N.C.L.E.
6 The Imaginarium of Doctor Parnassus
7 Pirates of the Caribbean: Dead Man's Chest
