In [1]:
import pandas as pd
# import numpy as np

# Load the movie catalogue. Each row carries a movieId, a title and a
# pipe-separated genres string (see the preview below).
MOVIES_CSV = "data/movies.csv"
movies = pd.read_csv(MOVIES_CSV)
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# One-hot encode the pipe-separated `genres` string: one 0/1 column per genre.
genre_dummies = movies["genres"].str.get_dummies(sep="|")

# Drop indicator columns left over from an earlier run of this cell before
# joining. Without this, re-running the cell fails with
# "columns overlap but no suffix specified" because `movies` is reassigned.
movies = movies.drop(columns=genre_dummies.columns, errors="ignore").join(genre_dummies)
movies.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


How similar is Toy Story to Jumanji?
How do we calculate this similarity?

# Similarity

In [203]:
# Let's create a seed item: the movie we want recommendations for.
# The goal is to find the items most similar to the seed.
SEED_MOVIE_ID = 106696  # "Frozen (2013)" in this dataset

# dataframe.loc[<rows>, <columns>]
# Rows: the movie whose movieId matches the seed.
# Columns: the genre indicator columns, 'Action' through 'Western' (label slice, both ends included).
seed_genre = movies.loc[movies['movieId'] == SEED_MOVIE_ID, 'Action':'Western']

# The similarity step downstream only uses the first seed row, so fail early
# and clearly if the ID is missing or duplicated.
assert len(seed_genre) == 1, (
    f"expected exactly one movie with movieId={SEED_MOVIE_ID}, found {len(seed_genre)}"
)
seed_genre.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
22440,0,1,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0


In [193]:
# Build the comparison matrix: every movie, restricted to the genre indicator
# columns ('Action' through 'Western'; a label slice includes both ends).
genre_columns = slice('Action', 'Western')
base_genres = movies.loc[:, genre_columns]
base_genres.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [204]:
# Cosine similarity between the seed movie and every movie, via scikit-learn.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html
from sklearn.metrics.pairwise import cosine_similarity

# Both arguments are laid out as (n_samples, n_features).
# The result has one row per seed row; we keep the first row as a flat
# array of scores, one score per movie in `base_genres`.
similarity_matrix = cosine_similarity(seed_genre, base_genres)
cos_sim = similarity_matrix[0]
print(cos_sim)


[0.73029674 0.47140452 0.57735027 ... 0.         0.20412415 0.        ]


In [205]:
# Rank the movies by similarity to the seed and show the best matches.
# heapq.nlargest keeps only the top-k candidates while scanning the scores, so we
# really do avoid sorting the whole array. It returns the same rows, in the same
# order (ties included), as sorted(..., reverse=True)[:k].
import heapq

TOP_K_MOVIES = 10

# Look the display columns up by name rather than by position, so the output
# does not depend on the column order of `movies`.
catalog = movies[["movieId", "title", "genres"]]

top_matches = heapq.nlargest(TOP_K_MOVIES, enumerate(cos_sim), key=lambda pair: pair[1])
for idx, score in top_matches:
    # `idx` is a row position in `cos_sim` / `base_genres`, hence .iloc.
    movie_id, title, genres = catalog.iloc[idx]
    print(idx, movie_id, title, genres, score)


22440 106696 Frozen (2013) Adventure|Animation|Comedy|Fantasy|Musical|Romance 1.0000000000000002
12286 56152 Enchanted (2007) Adventure|Animation|Children|Comedy|Fantasy|Musical|Romance 0.9258200997725515
2772 2857 Yellow Submarine (1968) Adventure|Animation|Comedy|Fantasy|Musical 0.9128709291752769
9056 26776 Porco Rosso (Crimson Pig) (Kurenai no buta) (1992) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
10449 37729 Corpse Bride (2005) Animation|Comedy|Fantasy|Musical|Romance 0.9128709291752769
13273 64652 Delgo (2008) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
19677 96792 Kismet (1955) Adventure|Comedy|Fantasy|Musical|Romance 0.9128709291752769
26703 120925 Shinbone Alley (1970) Animation|Comedy|Fantasy|Musical|Romance 0.9128709291752769
30351 130402 Cardcaptor Sakura: The Sealed Card (2000) Adventure|Animation|Comedy|Fantasy|Romance 0.9128709291752769
8605 26093 Wonderful World of the Brothers Grimm, The (1962) Adventure|Animation|Children|Comedy|D

# TF-IDF

If we instead have qualitative, free-text representations of items (e.g. descriptions or reviews), we can use a Natural Language Processing technique called TF-IDF (Term Frequency – Inverse Document Frequency) to parse the descriptions, identify the distinctive terms and phrases in each item's description, and then find 'similar' items based on those phrases.

In [185]:
# A toy corpus: two near-duplicate sentences and one unrelated sentence.
descriptions = [
    "You two look very similar to each other",
    "You are very similar to each other",
    "A completely different person which completely different looks.",
]

# One [ID, Description] record per sentence; the ID is the sentence's position.
data = [[item_id, text] for item_id, text in enumerate(descriptions)]
df = pd.DataFrame(data, columns=['ID', 'Description'])
df

Unnamed: 0,ID,Description
0,0,You two look very similar to each other
1,1,You are very similar to each other
2,2,A completely different person which completely...


In [191]:
from sklearn.feature_extraction.text import TfidfVectorizer

# We do not normally show more than 100 items in a recommendation list,
# so keep at most this many neighbours per item.
MAX_SIMILAR_ITEMS = 100

# Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each item.
# The 'stop_words' param tells the TF-IDF module to ignore common English words like 'the', etc.
# min_df=1 keeps every term that occurs in at least one document. It is the smallest
# integer recent scikit-learn releases accept (min_df=0 is rejected by parameter
# validation) and yields the same vocabulary.
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3),
                     min_df=1,
                     stop_words='english')
tfidf_matrix = tf.fit_transform(df['Description'])

# Compute the similarity between all pairs of items using scikit-learn.
# Row i / column j of the result refer to the i-th / j-th row of `df`.
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# The similarity matrix is indexed by row position, so look IDs up by position too
# (index labels would only coincide with positions for a default RangeIndex).
item_ids = df['ID'].to_numpy()

# For each item, list its most similar items (the item itself comes first, with score ~1).
for idx in range(len(df)):
    scores = cosine_similarities[idx]

    # argsort is ascending: reverse it, then keep the first MAX_SIMILAR_ITEMS positions.
    # (The earlier slice [:-100:-1] kept only 99.)
    similar_indices = scores.argsort()[::-1][:MAX_SIMILAR_ITEMS]
    similar_items = [(scores[i], item_ids[i]) for i in similar_indices]

    # Flatten the list of (score, ID) pairs into a single tuple:
    # [(1,2), (3,4)] -> (1,2,3,4)
    flattened = tuple(value for pair in similar_items for value in pair)
    print(flattened)

(1.0000000000000002, 0, 0.4736296010332684, 1, 0.0, 2)
(1.0, 1, 0.4736296010332684, 0, 0.0, 2)
(1.0000000000000002, 2, 0.0, 1, 0.0, 0)
