In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("movie_dataset.csv")

We have our dataframe ready, so let`s visualize it

In [3]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
df.describe()

Unnamed: 0,index,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,2401.0,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,1386.651002,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,1200.5,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,2401.0,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,3601.5,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,4802.0,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [5]:
print(df.columns.values)

['index' 'budget' 'genres' 'homepage' 'id' 'keywords' 'original_language'
 'original_title' 'overview' 'popularity' 'production_companies'
 'production_countries' 'release_date' 'revenue' 'runtime'
 'spoken_languages' 'status' 'tagline' 'title' 'vote_average' 'vote_count'
 'cast' 'crew' 'director']


Onvisualizing the dataset, you may have noticed that it has many extra info about a movie. We don’t need all of them. So, we choose keywords, cast, genres, director and title column to use as our feature set.

In [6]:
features = ['genres', 'keywords', 'title', 'cast', 'director']

As you may can noticed that some columns have NaN data points that will create a problem for us, so what we will do is instead of NaN values we will replace it with empty string ('').

In [7]:
df['cast'].isnull().values.any()

True

Our next task is to create a function for combining the values of these columns into a single string

In [8]:
def combine_features(row):
    return row['title']+' '+row['genres']+' '+row['director']+' '+row['keywords']+' '+row['cast']

Now, we need to call this function over each row of our dataframe. But, before doing that, we need to clean and preprocess the data for our use. We will fill all the NaN values with blank string in the dataframe

In [9]:
for feature in features:
    df[feature] = df[feature].fillna('')

applying combine_feature method over each row of Dataframe and storing the combined string in "combined_features" column

In [10]:
df['combined_features'] = df.apply(combine_features, axis = 1)

In [13]:
print(df.loc[0, 'combined_features'])

Avatar Action Adventure Fantasy Science Fiction James Cameron culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez


Now that we have obtained the combined strings, we can now feed these strings to a CountVectorizer() object for getting the count matrix.

In [14]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

In [15]:
count_matrix

<4803x17482 sparse matrix of type '<class 'numpy.int64'>'
	with 109310 stored elements in Compressed Sparse Row format>

Now, we need to obtain the cosine similarity matrix from the count matrix.

In [16]:
cosine_sim = cosine_similarity(count_matrix)

In [17]:
cosine_sim

array([[1.        , 0.09078413, 0.11572751, ..., 0.        , 0.        ,
        0.        ],
       [0.09078413, 1.        , 0.06537205, ..., 0.06052275, 0.        ,
        0.        ],
       [0.11572751, 0.06537205, 1.        , ..., 0.        , 0.10206207,
        0.        ],
       ...,
       [0.        , 0.06052275, 0.        , ..., 1.        , 0.        ,
        0.07142857],
       [0.        , 0.        , 0.10206207, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.07142857, 0.        ,
        1.        ]])

Now, we will define two helper functions to get movie title from movie index and vice-versa.

In [20]:
def get_title_from_index(index):
    print(df.index)
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    print(df.title)
    return df[df.title == title]["index"].values[0]

In [21]:
movie_user_likes = "Star Trek Beyond"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index])) 

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4798                                 El Mariachi
4799                                   Newlyweds
4800                   Signed, Sealed, Delivered
4801                            Shanghai Calling
4802                           My Date with Drew
Name: title, Length: 4803, dtype: object


We will sort the list similar_movies according to similarity scores in descending order. Since the most similar movie to a given movie will be itself, we will discard the first element after sorting the movies.

In [22]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [23]:
sorted_similar_movies

[(47, 0.7499999999999999),
 (158, 0.4583333333333335),
 (94, 0.34114411961050656),
 (0, 0.3086066999241838),
 (581, 0.29329423004270666),
 (229, 0.24873416908154558),
 (7, 0.2450490147049017),
 (3208, 0.2450490147049017),
 (755, 0.24494897427831788),
 (2995, 0.24494897427831788),
 (661, 0.24152294576982403),
 (2912, 0.24019223070763074),
 (2317, 0.23145502494313788),
 (4401, 0.23145502494313785),
 (1531, 0.223606797749979),
 (1034, 0.22360679774997896),
 (1750, 0.22360679774997896),
 (1650, 0.21997067253203),
 (1217, 0.21997067253202998),
 (1303, 0.21997067253202998),
 (415, 0.2175970699446223),
 (4117, 0.2175970699446223),
 (230, 0.2165063509461097),
 (223, 0.21650635094610968),
 (2815, 0.21650635094610968),
 (16, 0.2132007163556105),
 (1319, 0.21281413268968719),
 (4042, 0.21281413268968719),
 (278, 0.2100420126042015),
 (228, 0.2083333333333334),
 (539, 0.2083333333333334),
 (134, 0.20412414523193156),
 (542, 0.20412414523193156),
 (1583, 0.2041241452319315),
 (28, 0.200160192256358

Then, we will run a loop to print first 5 entries from sorted_similar_movies list.

In [17]:
i=0
print("Top 10 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break

Top 10 similar movies to Star Trek Beyond are:

Star Trek Into Darkness
Star Trek
Guardians of the Galaxy
Avatar
Star Trek: Insurrection
Star Wars: Episode III - Revenge of the Sith
Avengers: Age of Ultron
Star Wars: Clone Wars: Volume 1
Star Trek: Nemesis
Mad Max Beyond Thunderdome
Zathura: A Space Adventure


<img src="files/test.PNG">