In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [23]:
import time
import re
import nltk
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [71]:
# Tabular inputs, read from CSV files next to the notebook.
# ted_clean uses the first CSV column as its index.
ted = pd.read_csv('./ted.csv')
ted_clean = pd.read_csv('./ted_clean.csv', index_col=0)
metadata = pd.read_csv('./movie_metadata.csv')

# Plain-text inputs, each read whole as UTF-8 (read mode is open()'s default).
with open('./mother.txt', encoding='utf-8') as f:
    mother = f.read()
with open('./hopes.txt', encoding='utf-8') as f:
    hopes = f.read()
with open('./hey.txt', encoding='utf-8') as f:
    hey = f.read()

# Building tf-idf document vectors

## tf-idf vectors for TED talks

In [8]:
# Preview the first rows of the `ted` frame (columns: transcript, url).
ted.head()

Unnamed: 0,transcript,url
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...


In [10]:
# Work on a copy of the frame and keep only the transcript column.
ted_ = ted.copy()['transcript']

In [12]:
# Build a TfidfVectorizer with default settings (no stop-word removal here).
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the transcripts and produce the tf-idf matrix
# in one step; each row corresponds to one entry of `ted_`.
tfidf_matrix = vectorizer.fit_transform(ted_)

# Show the matrix dimensions as (number of documents, vocabulary size).
print(tfidf_matrix.shape)

(500, 29158)


# Cosine similarity

## Computing dot product

In [14]:
# Two 2-dimensional example vectors.
A = np.array([1, 3])
B = np.array([-2, 2])

# For 1-D arrays the matrix-multiplication operator yields the inner product:
# 1 * (-2) + 3 * 2.
dot_prod = A @ B

print(dot_prod)

4


## Cosine similarity matrix of a corpus

In [18]:
# Five short example documents used to demonstrate pairwise similarity.
corpus = [
    'The sun is the largest celestial body in the solar system',
    'The solar system consists of the sun and eight revolving planets',
    'Ra was the Egyptian Sun God',
    'The Pyramids were the pinnacle of Egyptian architecture',
    'The quick brown fox jumps over the lazy dog',
]

In [19]:
# Vectorize the example corpus with a default TfidfVectorizer.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, giving the full document-by-document similarity matrix.
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]


# Building a plot line based recommender

## Comparing `linear_kernel` and `cosine_similarity`

In [24]:
# Record start time. perf_counter() is a monotonic, high-resolution clock
# intended for measuring short durations.
start = time.perf_counter()

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Stop the clock before printing so that console output is not counted
# as part of the computation time.
elapsed = time.perf_counter() - start

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print(f'Time taken: {elapsed} seconds')

[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]
Time taken: 0.0029973983764648438 seconds


In [25]:
# Record start time. perf_counter() is a monotonic, high-resolution clock
# intended for measuring short durations.
start = time.perf_counter()

# Compute cosine similarity matrix using linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Stop the clock before printing so that console output is not counted
# as part of the computation time.
elapsed = time.perf_counter() - start

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print(f'Time taken: {elapsed} seconds')

[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]
Time taken: 0.0009999275207519531 seconds


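Good job! Notice how both `linear_kernel` and `cosine_similarity` produced the same result. This is because `TfidfVectorizer` L2-normalizes every document vector by default, so the plain dot product computed by `linear_kernel` already equals the cosine similarity. However, `linear_kernel` took a smaller amount of time to execute, because it skips the normalization step. When you're working with a very large amount of data and your vectors are in the tf-idf representation, it is good practice to default to `linear_kernel`.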

## Recommender function & plot recommendation engine

In [26]:
# Preview the first rows of the movie metadata (id, title, overview, tagline).
metadata.head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,tagline
0,0,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,The Legend Ends
1,1,414,Batman Forever,The Dark Knight of Gotham City confronts a das...,"Courage now, truth always..."
2,2,268,Batman,The Dark Knight of Gotham City begins his war ...,Have you ever danced with the devil in the pal...
3,3,364,Batman Returns,"Having defeated the Joker, Batman now faces th...","The Bat, the Cat, the Penguin."
4,4,415,Batman & Robin,Along with crime-fighting partner Robin and ne...,Strength. Courage. Honor. And loyalty.


In [51]:
# Plot summaries, with missing overviews replaced by empty strings so they
# can be vectorized.
movie_plots = metadata['overview'].fillna('')

# Lookup table: movie title -> row label of that movie in `metadata`.
indices = pd.Series(data=metadata.index, index=metadata['title'])

In [63]:
def get_recommendations(title, cosine_sim, indices, top_n=10, titles=None):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the similarity of
        item ``i`` to every item, in row order.
    indices : mapping (e.g. pandas.Series)
        Maps a title to its integer row position in ``cosine_sim``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    titles : pandas.Series, optional
        Titles in the same row order as ``cosine_sim``. Defaults to
        ``metadata['title']``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first. The queried movie itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = metadata['title']

    # Get the row position of the movie that matches the title
    idx = indices[title]
    # A duplicated title makes the lookup return several positions;
    # use the first match instead of failing later during sorting.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity score. The queried movie is
    # excluded explicitly rather than by assuming it sorts first: a tied score
    # (e.g. a duplicate plot) could otherwise put another movie in front of it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    # Most similar first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies
    movie_indices = [i for i, _ in sim_scores[:top_n]]
    return titles.iloc[movie_indices]

In [64]:
# Vectorize the movie plots, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_plots)

# With a single argument, linear_kernel takes dot products of the rows with
# themselves, giving the movie-by-movie similarity matrix.
cosine_sim = linear_kernel(tfidf_matrix)

print(get_recommendations('The Dark Knight Rises', cosine_sim, indices))

1                              Batman Forever
2                                      Batman
3                              Batman Returns
8                  Batman: Under the Red Hood
9                            Batman: Year One
10    Batman: The Dark Knight Returns, Part 1
11    Batman: The Dark Knight Returns, Part 2
5                Batman: Mask of the Phantasm
7                               Batman Begins
4                              Batman & Robin
Name: title, dtype: object


## TED talk recommender

In [72]:
# Preview the first three rows of `ted_clean` (title, url, transcript).
ted_clean.head(3)

Unnamed: 0_level_0,Unnamed: 0.1,title,url,transcript
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1407,10 top time-saving tech tips,https://www.ted.com/talks/david_pogue_10_top_t...,I've noticed something interesting about socie...
1,1524,Who am I? Think again,https://www.ted.com/talks/hetain_patel_who_am_...,"Hetain Patel: (In Chinese)Yuyu Rau: Hi, I'm He..."
2,2393,"""Awoo""",https://www.ted.com/talks/sofi_tukker_awoo\n,"(Music)Sophie Hawley-Weld: OK, you don't have ..."


In [73]:
def get_recommendations(title, cosine_sim, indices, top_n=10, titles=None):
    """Return the titles of the TED talks most similar to ``title``.

    Parameters
    ----------
    title : str
        Talk title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the similarity of
        talk ``i`` to every talk, in row order.
    indices : mapping (e.g. pandas.Series)
        Maps a title to its integer row position in ``cosine_sim``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    titles : pandas.Series, optional
        Titles in the same row order as ``cosine_sim``. Defaults to
        ``ted_clean['title']``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first. The queried talk itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if titles is None:
        titles = ted_clean['title']

    # Get the row position of the talk that matches the title
    idx = indices[title]
    # A duplicated title makes the lookup return several positions;
    # use the first match instead of failing later during sorting.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other talk with its similarity score. The queried talk is
    # excluded explicitly rather than by assuming it sorts first: a tied score
    # (e.g. a duplicate transcript) could otherwise put another talk in front.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    # Most similar first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar talks
    ted_indices = [i for i, _ in sim_scores[:top_n]]
    return titles.iloc[ted_indices]

In [75]:
# Lookup table: talk title -> row position (0..n-1) in `ted_clean`.
# Positions are used rather than `ted_clean.index` because that index comes
# from the CSV's first column, while the similarity matrix rows and `.iloc`
# lookups are positional; the two agree only if the labels are exactly 0..n-1.
indices = pd.Series(range(len(ted_clean)), index=ted_clean['title'])
transcripts = ted_clean['transcript']

In [78]:
# Vectorize the talk transcripts, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(transcripts)

# With a single argument, linear_kernel takes dot products of the rows with
# themselves, giving the talk-by-talk similarity matrix.
cosine_sim = linear_kernel(tfidf_matrix)

print(get_recommendations('5 ways to kill your dreams', cosine_sim, indices))

Unnamed: 0
453             Success is a continuous journey
157                        Why we do what we do
494                   How to find work you love
149          My journey into movies that matter
447                        One Laptop per Child
230             How to get your ideas to spread
497         Plug into your hard-wired happiness
495    Why you will fail to have a great career
179             Be suspicious of simple stories
53                          To upgrade is human
Name: title, dtype: object


# Beyond n-grams: word embeddings

In [79]:
# Load the spaCy pipeline named 'en_core_web_lg'; the package has to be
# installed in the environment beforehand.
# NOTE(review): presumably chosen because it ships word vectors, which the
# similarity calls further down rely on — confirm.
nlp = spacy.load('en_core_web_lg')

## Generating word vectors

In [80]:
sent = 'I like apples and oranges'
doc = nlp(sent)

# Visit every ordered pair of tokens (self-pairs included), in document
# order, and print both texts with their similarity score.
for token1, token2 in ((first, second) for first in doc for second in doc):
    print(token1.text, token2.text, token1.similarity(token2))

I I 1.0
I like 0.55549127
I apples 0.20442721
I and 0.31607857
I oranges 0.18824081
like I 0.55549127
like like 1.0
like apples 0.32987145
like and 0.5267484
like oranges 0.27717474
apples I 0.20442721
apples like 0.32987145
apples apples 1.0
apples and 0.24097733
apples oranges 0.77809423
and I 0.31607857
and like 0.5267484
and apples 0.24097733
and and 1.0
and oranges 0.19245948
oranges I 0.18824081
oranges like 0.27717474
oranges apples 0.77809423
oranges and 0.19245948
oranges oranges 1.0


Notice how the words `apples` and `oranges` have the highest pairwise similarity score (about 0.78) among pairs of distinct words. This is expected as they are both fruits and are more related to each other than any other pair of words.

## Computing similarity of Pink Floyd songs

In [81]:
# Run the three lyric texts through the pipeline, in the same order as before.
mother_doc, hopes_doc, hey_doc = (nlp(lyrics) for lyrics in (mother, hopes, hey))

# Compare "Mother" against each of the other two songs.
print(f'Similarity between Mother and Hopes: {mother_doc.similarity(hopes_doc)}')
print(f'Similarity between Mother and Hey: {mother_doc.similarity(hey_doc)}')

Similarity between Mother and Hopes: 0.8653562687318176
Similarity between Mother and Hey: 0.9595267490921296
