In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#load the movie dataset; the relative path is resolved against the current working directory
df = pd.read_csv("movie_dataset.csv")

#names of the dataframe columns whose text is merged into one feature string per movie
features = ['keywords','cast','genres','director']

#join the descriptive columns of one row into a single space-separated string
def combine_features(row):
    """Return the row's keywords, cast, genres and director as one string.

    The four values are looked up on `row` by column name and joined with
    single spaces, in that order. The values must already be strings.
    """
    columns = ('keywords', 'cast', 'genres', 'director')
    return " ".join(row[column] for column in columns)

#replace NaNs in every feature column with an empty string so that
#combine_features() does not hit a NaN while concatenating
df[features] = df[features].fillna('')

#run combine_features() on each row and store the result in the "combined_features" column
df["combined_features"] = df.apply(combine_features, axis=1)

#turn the combined strings into a matrix of token counts
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

#pairwise cosine similarity between all rows of the count matrix
cosine_sim = cosine_similarity(count_matrix)

#define two helper functions to get movie title from movie index and vice-versa
def get_title_from_index(index):
    """Return the "title" of the first row of df whose index equals `index`.

    Raises IndexError when no row has that index.
    """
    matching_titles = df.loc[df.index == index, "title"]
    return matching_titles.values[0]
def get_index_from_title(title):
    """Return the dataframe index of the first row whose title equals `title`.

    The value is taken from df.index, the same index get_title_from_index()
    matches against, so the two helpers are exact inverses and the lookup
    does not depend on the CSV carrying a separate "index" column that
    mirrors the row order. The comparison is an exact, case-sensitive match.

    Raises IndexError (the exception the bare `.values[0]` lookup raised
    before) with a descriptive message when no row has that title.
    """
    matches = df.index[df.title == title]
    if matches.empty:
        raise IndexError("no movie titled {!r} in the dataset".format(title))
    return matches[0]

#read the title of the movie to base the recommendations on from standard input
movie_user_likes = input("Enter your movie choice: ")
movie_index = get_index_from_title(movie_user_likes)

#pair every row index with its similarity score against the chosen movie
similar_movies = list(enumerate(cosine_sim[movie_index]))

#exclude the chosen movie by its index instead of dropping whatever sorts
#first: another movie with an identical feature string (or a movie with empty
#feature text) can tie with it, and the stable sort would then put a different
#row in front. Keep the 5 highest-scoring remaining movies.
sorted_similar_movies = sorted(
    (pair for pair in similar_movies if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)[:5]

#the list already holds at most 5 entries, so no extra counter is needed
print("5 movie recommendations similar to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))

5 movie recommendations similar to Cars are:

Cars 2
The Fast and the Furious: Tokyo Drift
2 Fast 2 Furious
Herbie Fully Loaded
Back to the Future Part II
