In [4]:
import pandas as pd
# import numpy as np

# Read the movie catalogue (movieId, title, genres) from the local data folder
# and preview the first five rows.
movies = pd.read_csv(filepath_or_buffer="data/movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# One-hot encode the pipe-separated genres: 1 = the genre is present, 0 = not present.
genre_flags = movies["genres"].str.get_dummies("|")
# Drop indicator columns left over from a previous run of this cell before joining,
# so re-running the cell does not fail with overlapping column names.
# On a fresh kernel there is nothing to drop (errors="ignore" makes that a no-op).
movies = movies.drop(columns=genre_flags.columns, errors="ignore").join(genre_flags)
movies.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Similarity

E.g. What are the movies that are similar to Frozen? How do we calculate this similarity?

In [15]:
# Exploring the dataset beforehand showed that Frozen has movieId 106696.
# Filter the movies dataframe on that id to confirm it.
movies[movies["movieId"] == 106696]

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
22440,106696,Frozen (2013),Adventure|Animation|Comedy|Fantasy|Musical|Rom...,0,0,1,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0


In [17]:
# Build the seed: the genre indicator columns (Action through Western) of the
# row for Frozen (movieId 106696).
# Every other movie is later compared against this seed to find similar items.
is_seed = movies["movieId"] == 106696
seed_genre = movies.loc[is_seed, "Action":"Western"]
seed_genre.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
22440,0,1,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0


In [18]:
# Build the base dataframe holding only the genre indicator columns; this is the
# pool of movies in which we look for items similar to Frozen.
# Label slices are inclusive at both ends, so both Action and Western are kept.
genre_span = slice("Action", "Western")
base_genres = movies.loc[:, genre_span]
base_genres.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# Let's compute the cosine similarity using sklearn
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity

# Both arguments are (n_samples, n_features). The result holds one row of scores
# per sample in the first argument; row 0 is the row for the Frozen seed.
seed_vs_all = cosine_similarity(seed_genre, base_genres)
cos_sim = seed_vs_all[0]
print(cos_sim)

[0.73029674 0.47140452 0.57735027 ... 0.         0.20412415 0.        ]


In [21]:
import heapq

# Pick the 10 highest-scoring movies without sorting the whole score array:
# heapq.nlargest returns the same items, in the same order, as
# sorted(..., reverse=True)[:10], but only tracks the current top 10.
TOP_N = 10
top_matches = heapq.nlargest(TOP_N, enumerate(cos_sim), key=lambda pair: pair[1])

for idx, score in top_matches:
    # idx is a row position (cos_sim follows the row order of base_genres/movies),
    # so fetch the row with iloc and read its columns by name rather than by position.
    match = movies.iloc[idx]
    print(idx, match["movieId"], match["title"], match["genres"], score)

22440 106696 Frozen (2013) Adventure|Animation|Comedy|Fantasy|Musical|Romance 1.0000000000000002
12286 56152 Enchanted (2007) Adventure|Animation|Children|Comedy|Fantasy|Musical|Romance 0.9258200997725514
10449 37729 Corpse Bride (2005) Animation|Comedy|Fantasy|Musical|Romance 0.912870929175277
19677 96792 Kismet (1955) Adventure|Comedy|Fantasy|Musical|Romance 0.912870929175277
26703 120925 Shinbone Alley (1970) Animation|Comedy|Fantasy|Musical|Romance 0.912870929175277
2772 2857 Yellow Submarine (1968) Adventure|Animation|Comedy|Fantasy|Musical 0.9128709291752769
9056 26776 Porco Rosso (Crimson Pig) (Kurenai no buta) (1992) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
13273 64652 Delgo (2008) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
30351 130402 Cardcaptor Sakura: The Sealed Card (2000) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
8605 26093 Wonderful World of the Brothers Grimm, The (1962) Adventure|Animation|Children|Comedy|Dram

# TF-IDF

If we instead have qualitative representations of items (e.g. descriptions, reviews, etc.), we can use a Natural Language Processing technique called TF-IDF (Term Frequency - Inverse Document Frequency) to parse the descriptions, identify distinctive phrases in each item's description, and then find 'similar' products based on those phrases.

In [26]:
# A tiny sample dataset: one [ID, Description] pair per item.
data = [
    [0, "You know what they call a quarter pounder with cheese in Paris?"],
    [1, "I'm gonna make him an offer he can't refuse."],
    [2, "Vito's gonna make him an offer he can't refuse."],
]
df = pd.DataFrame(data, columns=["ID", "Description"])
df

Unnamed: 0,ID,Description
0,0,You know what they call a quarter pounder with...
1,1,I'm gonna make him an offer he can't refuse.
2,2,Vito's gonna make him an offer he can't refuse.


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each description.
# The 'stop_words' param tells the TF-IDF module to ignore common english words like 'the', etc.
# min_df=1 keeps every term that appears in at least one document. It selects the
# same vocabulary as an integer min_df of 0, which recent scikit-learn releases reject.
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3),
                     min_df=1,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(df['Description'])

# Compute the pairwise cosine similarity between all descriptions.
# Entry [i, j] compares the i-th and j-th rows of df (by position).
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# We do not normally show more than 100 items in a recommendation.
TOP_N = 100

# For each item, print its TOP_N most similar items (the item itself comes first).
for idx in range(len(df)):
    # argsort is ascending: reverse it, then keep the first TOP_N positions.
    similar_indices = cosine_similarities[idx].argsort()[::-1][:TOP_N]
    # The indices are row positions, so look the ID up with .iloc; this stays
    # correct even if df does not have the default 0..n-1 index.
    similar_items = [(cosine_similarities[idx][i], df['ID'].iloc[i])
                     for i in similar_indices]

    # Flatten the list of (similarity, id) pairs into a single tuple:
    # [(1,2), (3,4)] -> (1,2,3,4)
    flattened = tuple(value for pair in similar_items for value in pair)
    print(flattened)

(1.0000000000000002, 0, 0.0, 2, 0.0, 1)
(1.0000000000000002, 1, 0.7964896758501381, 2, 0.0, 0)
(1.0000000000000007, 2, 0.7964896758501381, 1, 0.0, 0)


The result above is derived from the pairwise cosine-similarity matrix of all the rows. In other words, each printed row compares one item with every item (including itself), listed from most similar to least similar.

Each row is organised as follows: 
(similarity, id, similarity, id, similarity, id) 

We can see from the first row that ID 0 is not similar to either ID 1 or ID 2 (both similarities are 0.0).

From both second and third rows, we can see that ID 1 and ID 2 are very similar.

We can confirm that these results make sense by checking the dataframe.