In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the movie metadata table; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer=r"movie_dataset.csv")

This notebook adapts the code from the article at https://towardsdatascience.com/using-cosine-similarity-to-build-a-movie-recommendation-system-ae7f20842599

In [5]:
# Text columns used to describe a movie's content: keywords, cast, genres and director.
features = ['keywords', 'cast', 'genres', 'director']

# Replace missing values with empty strings in all four columns in one step,
# so the columns can be concatenated as text afterwards.
df[features] = df[features].fillna('')

In [6]:
#combine all our useful features in a single string

def combined_features(row):
    """Return the row's keywords, cast, genres and director as one string.

    ``row`` is anything indexable by those four column names (here, a
    DataFrame row). The four values are joined with single spaces, in that
    order, so each value must already be a string.
    """
    text_columns = ('keywords', 'cast', 'genres', 'director')
    return " ".join(row[column] for column in text_columns)
# Apply combined_features to every row and keep the resulting text as a new column.
df["combined_features"] = df.apply(combined_features, axis="columns")

In [7]:
# Build a bag-of-words matrix: one row per movie, one column per distinct word,
# each entry counting how often that word occurs in the movie's combined text.
movie_texts = df["combined_features"]
cv = CountVectorizer()
count_matrix = cv.fit_transform(movie_texts)

# toarray() converts the matrix to a dense array purely for display.
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


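Cosine similarity is a metric used to measure how similar two items are. Mathematically, it is the cosine of the angle between two vectors in a multi-dimensional space: $\cos\theta = \frac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$. In general the value lies between -1 and 1; because the word-count vectors used here contain no negative entries, it ranges from 0 to 1.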

0 means no similarity, whereas 1 means that the two items are 100% similar.

In [8]:
# Pairwise cosine similarity between all rows (movies) of the count matrix.
cosine_sim = cosine_similarity(X=count_matrix)


Since we are building a content-based filtering system, we need to know what the user likes in order to recommend similar items.

In [9]:
# Suppose a user likes the movie "Dead Poets Society".
# get_index_from_title (defined below) turns that title into an index,
# which is then stored in the movie_index variable.

movie_user_likes = "Dead Poets Society"
def get_index_from_title(title, movies=None):
    """Return the row position of the first movie whose title equals ``title``.

    The result is a 0-based position, so it can be used directly to select a
    row of a similarity matrix built from the same frame, independent of the
    frame's index labels or of any "index" column in the data.

    Parameters
    ----------
    title : str
        Exact title to look up (the comparison is case-sensitive).
    movies : pandas.DataFrame, optional
        Frame with a "title" column. Defaults to the notebook-level ``df``.

    Returns
    -------
    int
        Position of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title.
    """
    if movies is None:
        movies = df
    # Positions (not labels) of all rows whose title matches exactly.
    matches = np.flatnonzero((movies["title"] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {title!r} found in the dataset")
    return int(matches[0])
# Resolve the liked movie's title to its index for the similarity lookup.
movie_index = get_index_from_title(title=movie_user_likes)

In [12]:
# Pair every movie's position with its similarity to the liked movie: (position, score).
similar_movies = [(position, score) for position, score in enumerate(cosine_sim[movie_index])]


In [13]:
# Order the (position, score) pairs from most to least similar; the sort is
# stable, so pairs with equal scores keep their original relative order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [14]:
#call this function inside the for loop to print the first ‘x’ number of movies from the sorted_similar_movies

def get_title_from_index(index, movies=None):
    """Return the title of the movie at row position ``index``.

    ``index`` is a 0-based row position, matching the positions produced by
    enumerating a row of the similarity matrix.

    Parameters
    ----------
    index : int
        0-based row position of the movie.
    movies : pandas.DataFrame, optional
        Frame with a "title" column. Defaults to the notebook-level ``df``.

    Returns
    -------
    The value stored in the "title" column of that row.

    Raises
    ------
    IndexError
        If ``index`` is negative or not smaller than the number of rows.
    """
    if movies is None:
        movies = df
    # Reject negative positions explicitly: iloc would silently count from the end.
    if not 0 <= index < len(movies):
        raise IndexError(f"Movie position {index} is out of range for {len(movies)} movies")
    # Direct positional access instead of building a boolean mask per lookup.
    return movies["title"].iloc[index]
# Print the titles of the 16 best matches; in the recorded output the first
# entry is the liked movie itself.
for movie_position, _score in sorted_similar_movies[:16]:
    print(get_title_from_index(movie_position))

Dead Poets Society
Much Ado About Nothing
Patch Adams
Good Will Hunting
Flightplan
Alive
The Basket
What Just Happened
Adulterers
The Sting
The Tree of Life
The Greatest Game Ever Played
Light It Up
The Wood
The Naked Ape
Jakob the Liar
