In [38]:
import pandas as pd
import numpy as np

#SKLearn
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cluster import KMeans

We'll do topic modelling with our tuned vectorizer and the tuned LDA parameters we found earlier. Then we'll go on to build our recommendation system.

**Build the LDA Topic Modeler first**

In [3]:
ted = pd.read_csv("ted_clean.csv")

In [4]:
script = ted.transcript

In [5]:
# Restore `data` from IPython's %store database. It must have been saved with
# `%store data` in an earlier notebook/session; on a fresh setup the name will
# not exist and the assignment below will fail.
%store -r data
# `data` is handed to CountVectorizer as `stop_words` in the next cell.
# NOTE(review): presumably a list of stop words -- confirm against the notebook that stored it.
stop_words = data

In [6]:
# Bag-of-words vectorizer: tokens are runs of 3+ ASCII letters; terms found in
# more than 25% of the documents or in fewer than 250 documents are dropped.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
    min_df=250,
)

In [7]:
doc_term_matrix = tf.fit_transform(script)

Understanding the matrices: 
- doc_term_matrix is our X
- lda.transform(X), which is our doc to topic matrix, is our V
- lda.components_, which is our topic to term matrix, is our U 

Since **X = V x U**:
\
Their shapes are going to be: (4005, 1132) = (4005, 15) x (15, 1132)

In [8]:
doc_term_matrix.shape

(4005, 1132)

In [9]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older versions.
# The result is kept as a plain list, as before.
if hasattr(tf, "get_feature_names_out"):
    vocab = list(tf.get_feature_names_out())
else:
    vocab = tf.get_feature_names()

In [10]:
def show_topics(num_words=10, model=None, feature_names=None):
    """Print and return the highest-weighted words of every topic.

    Parameters
    ----------
    num_words : int, default 10
        How many top words to report per topic.
    model : object with a ``components_`` array, optional
        Fitted topic model (rows = topics, columns = terms). When omitted,
        the notebook-level ``tm`` is used.
    feature_names : sequence of str, optional
        Vocabulary aligned with the columns of ``components_``. When omitted,
        the notebook-level ``vocab`` is used.

    Returns
    -------
    list of list of str
        One list per topic, words ordered from strongest to weakest.
    """
    # Fall back to the notebook globals only when no explicit object is given,
    # so existing calls such as `show_topics()` keep working unchanged.
    model = tm if model is None else model
    feature_names = vocab if feature_names is None else feature_names

    topic_words = []
    for idx, topic in enumerate(model.components_):
        # Indices of the `num_words` largest weights, largest first.
        top_indices = topic.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_indices]
        print(f"Topic {idx}:\n", ", ".join(top_words), "\n")
        topic_words.append(top_words)
    return topic_words

In [11]:
# Fit a 15-topic LDA model (fixed seed) on the document-term counts, then
# print the top words of each topic.
tm = LatentDirichletAllocation(
    n_components=15,
    random_state=0,
    doc_topic_prior=5,
    topic_word_prior=2,
)
tm.fit(doc_term_matrix)
topic_words = show_topics()

Topic 0:
 climate, ocean, sea, carbon, ice, earth, species, planet, global, trees 

Topic 1:
 black, community, self, house, amazing, night, team, decided, walk, friends 

Topic 2:
 countries, china, india, government, africa, economic, growth, education, global, political 

Topic 3:
 women, men, woman, black, girls, sex, female, male, gender, white 

Topic 4:
 city, cities, states, united, data, police, countries, public, war, government 

Topic 5:
 kids, students, dna, science, stuff, god, evolution, teachers, teach, learning 

Topic 6:
 cells, health, cancer, disease, patients, blood, heart, medical, cell, patient 

Topic 7:
 food, fish, sleep, eat, plants, feed, waste, species, plant, night 

Topic 8:
 design, ideas, science, questions, project, nature, art, control, beautiful, self 

Topic 9:
 music, play, stories, language, father, word, voice, hear, book, read 

Topic 10:
 brain, animals, animal, brains, surface, image, images, light, blue, skin 

Topic 11:
 companies, business,

In [12]:
doc_topic = tm.transform(doc_term_matrix)
doc_topic.shape

(4005, 15)

In [13]:
topic_word = tm.components_
topic_word.shape

(15, 1132)

## Rec Sys

Put the document-topic matrix (**V**) into a dataframe, then use the `pairwise_distances` function to compute cosine distances between talks (a smaller distance means a more similar talk).

In [14]:
# Topic weights per document, rounded to 3 decimals and labelled by talk title.
doc_topic_lda = pd.DataFrame(data=doc_topic.round(3), index=ted["title"])
doc_topic_lda

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Averting the climate crisis,0.175,0.051,0.074,0.038,0.101,0.036,0.030,0.060,0.058,0.056,0.039,0.104,0.058,0.058,0.064
The best stats you've ever seen,0.025,0.023,0.429,0.026,0.175,0.047,0.041,0.025,0.026,0.019,0.023,0.026,0.022,0.069,0.024
Simplicity sells,0.028,0.042,0.031,0.026,0.038,0.040,0.026,0.027,0.077,0.185,0.034,0.106,0.071,0.238,0.031
Greening the ghetto,0.048,0.053,0.142,0.061,0.270,0.030,0.027,0.157,0.032,0.026,0.019,0.035,0.031,0.022,0.045
Do schools kill creativity?,0.039,0.057,0.079,0.074,0.028,0.123,0.033,0.032,0.052,0.266,0.050,0.045,0.046,0.032,0.042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Crisis support for the world, one text away",0.046,0.057,0.090,0.098,0.085,0.037,0.083,0.065,0.051,0.083,0.036,0.104,0.050,0.087,0.028
The dark history of IQ tests,0.055,0.052,0.096,0.048,0.127,0.092,0.072,0.051,0.075,0.038,0.062,0.057,0.071,0.056,0.050
"How ""policing for profit"" undermines your rights",0.031,0.040,0.046,0.035,0.399,0.030,0.092,0.032,0.034,0.038,0.027,0.061,0.038,0.069,0.027
The electrifying speeches of Sojourner Truth,0.053,0.075,0.063,0.154,0.086,0.050,0.039,0.045,0.055,0.168,0.038,0.050,0.047,0.040,0.038


In [15]:
# Swap the title index for a 0..n-1 positional index and keep the titles as an
# ordinary column. The length is taken from the frame itself instead of a
# hard-coded 4005, and the frame is rebound rather than mutated in place so the
# cell behaves the same on every re-run.
a = np.arange(len(doc_topic_lda))
doc_topic_lda = doc_topic_lda.set_index(a)
doc_topic_lda["title"] = ted.title
doc_topic_lda

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,title
0,0.175,0.051,0.074,0.038,0.101,0.036,0.030,0.060,0.058,0.056,0.039,0.104,0.058,0.058,0.064,Averting the climate crisis
1,0.025,0.023,0.429,0.026,0.175,0.047,0.041,0.025,0.026,0.019,0.023,0.026,0.022,0.069,0.024,The best stats you've ever seen
2,0.028,0.042,0.031,0.026,0.038,0.040,0.026,0.027,0.077,0.185,0.034,0.106,0.071,0.238,0.031,Simplicity sells
3,0.048,0.053,0.142,0.061,0.270,0.030,0.027,0.157,0.032,0.026,0.019,0.035,0.031,0.022,0.045,Greening the ghetto
4,0.039,0.057,0.079,0.074,0.028,0.123,0.033,0.032,0.052,0.266,0.050,0.045,0.046,0.032,0.042,Do schools kill creativity?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4000,0.046,0.057,0.090,0.098,0.085,0.037,0.083,0.065,0.051,0.083,0.036,0.104,0.050,0.087,0.028,"Crisis support for the world, one text away"
4001,0.055,0.052,0.096,0.048,0.127,0.092,0.072,0.051,0.075,0.038,0.062,0.057,0.071,0.056,0.050,The dark history of IQ tests
4002,0.031,0.040,0.046,0.035,0.399,0.030,0.092,0.032,0.034,0.038,0.027,0.061,0.038,0.069,0.027,"How ""policing for profit"" undermines your rights"
4003,0.053,0.075,0.063,0.154,0.086,0.050,0.039,0.045,0.055,0.168,0.038,0.050,0.047,0.040,0.038,The electrifying speeches of Sojourner Truth


Let's look at the distance of the first one to the rest. 

In [16]:
# Cosine distance from the first document to every document; argsort turns the
# distances into row positions ordered from nearest to farthest.
dist_pairs = pairwise_distances(doc_topic[:1], doc_topic, metric="cosine")[0].argsort()
dist_pairs

array([   0, 3062, 3959, ...,  655, 1463, 2375])

In [17]:
first_talk_rec = doc_topic_lda.iloc[dist_pairs]
first_talk_rec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,title
0,0.175,0.051,0.074,0.038,0.101,0.036,0.030,0.060,0.058,0.056,0.039,0.104,0.058,0.058,0.064,Averting the climate crisis
3062,0.165,0.080,0.071,0.047,0.066,0.048,0.049,0.072,0.054,0.045,0.046,0.084,0.065,0.049,0.059,Why do competitors open their stores next to o...
3959,0.162,0.057,0.062,0.041,0.116,0.054,0.048,0.064,0.059,0.041,0.063,0.052,0.054,0.054,0.075,Why isn't the Netherlands underwater?
3762,0.142,0.059,0.113,0.046,0.137,0.059,0.048,0.047,0.056,0.039,0.037,0.084,0.054,0.033,0.047,Why I protest for climate justice
3994,0.221,0.076,0.071,0.041,0.066,0.027,0.043,0.032,0.065,0.047,0.037,0.137,0.075,0.029,0.033,How to shift your mindset and choose your future
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3626,0.021,0.022,0.021,0.019,0.026,0.023,0.629,0.021,0.022,0.018,0.018,0.024,0.025,0.093,0.019,"A ""living drug"" that could change the way we t..."
589,0.020,0.022,0.025,0.023,0.030,0.030,0.643,0.018,0.026,0.017,0.033,0.029,0.026,0.033,0.025,A new strategy in the war on cancer
655,0.023,0.019,0.021,0.032,0.017,0.019,0.645,0.069,0.021,0.014,0.027,0.026,0.024,0.019,0.026,Can we eat to starve cancer?
1463,0.022,0.020,0.021,0.715,0.022,0.020,0.015,0.018,0.021,0.023,0.018,0.024,0.028,0.017,0.016,Violence against women -- it's a men's issue


In [18]:
first_talk_rec.iloc[0:5].title

0                             Averting the climate crisis
3062    Why do competitors open their stores next to o...
3959                Why isn't the Netherlands underwater?
3762                    Why I protest for climate justice
3994     How to shift your mindset and choose your future
Name: title, dtype: object

**Try with own sentence**

Give the recommendation system a try: Let's insert our own sentence and see what it comes up with. 

In [19]:
t = ["I've read a lot of art history over my life"]

In [20]:
vt = tf.transform(t)

In [21]:
tt = tm.transform(vt)

In [22]:
dist_pairs = pairwise_distances(tt,doc_topic,metric='cosine').argsort()[0]
dist_pairs

array([  95,  145, 1393, ..., 2375, 1463,  967])

In [23]:
first_talk_rec = doc_topic_lda.iloc[dist_pairs]
first_talk_rec.iloc[0:5].title

95                                     "La Vie en Rose"
145                                      My magic moves
1393                                Dance, tiny robots!
1825    A dance in a hurricane of paper, wind and light
2344                                    "Rollercoaster"
Name: title, dtype: object

Let's see what those are and if they make sense 

In [24]:
# Print the descriptions of the five closest talks, separated by blank lines.
# The previous version wrapped print() inside print(), which also printed the
# inner call's return value ("None") after each description. It also
# hard-coded the row numbers; here they come from `dist_pairs` directly.
for rank, talk_pos in enumerate(dist_pairs[:5]):
    if rank:
        print("\n")
    print(ted.description.iloc[talk_pos])

Featuring the vocals and mischievous bell-playing of accordionist and singer Rachelle Garniez, the TED House Band -- led by Thomas Dolby on keyboard -- delivers this delightful rendition of the Edith Piaf standard "La Vie en Rose."


Kenichi Ebina moves his body in a manner that appears to defy the limits imposed by the human skeleton. He combines breakdancing and hip-hop with mime using movements that are simultaneously precise and fluid.
None


There's a place in France where the robots do a dance. And that place is TEDxConcorde, where Bruno Maisonnier of Aldebaran Robotics choreographs a troupe of tiny humanoid Nao robots through a surprisingly emotive performance.
None


Choreographer Aakash Odedra is dyslexic and has always felt that his best expression comes through movement. “Murmur” is his ode to that experience, teaming up with co-creators Lewis Major and Ars Electronica Futurelab. Watch him spin his way through the center of a storm, as pages of books take flight all around h

Yup, I'd say that's doing pretty well. 

**Now let's make it official.** 

Write a function for this: 

In [25]:
def recommend_ted(string, num_talks, print_or_not = False):
    """Recommend the talks whose topic mix is closest to a piece of text.

    The text is vectorized with the fitted ``tf`` vectorizer, mapped to topic
    space with the fitted ``tm`` model, and compared with every row of
    ``doc_topic`` using cosine distance.

    Parameters
    ----------
    string : str
        Free text describing what the user is interested in.
    num_talks : int
        Number of closest talks to return.
    print_or_not : bool, default False
        When True, also print the recommended titles followed by the
        description of each recommended talk.

    Returns
    -------
    tuple
        ``(titles, index)`` -- the ``title`` Series of the recommended rows of
        ``doc_topic_lda`` (nearest first) and that Series' index.
    """
    query_topics = tm.transform(tf.transform([string]))
    distances = pairwise_distances(query_topics, doc_topic, metric='cosine')[0]
    # Row positions of the closest documents, nearest first.
    top_positions = distances.argsort()[:num_talks]
    top_talks = doc_topic_lda.iloc[top_positions].title
    if print_or_not:
        print(top_talks)
        # Look descriptions up by row position, not by index label, so the
        # lookup stays correct even if doc_topic_lda's index is not 0..n-1.
        for pos in top_positions:
            print(ted.description.iloc[pos])
            print('\n')
    return top_talks, top_talks.index

In [26]:
my_interest = "I tend to fly a lot for work and I'm worried about the carbon print of that"

In [27]:
recommendation, idx = recommend_ted(my_interest, 5)
recommendation

84      Global warming's theme song, "Manhattan in Jan...
95                                       "La Vie en Rose"
2344                                      "Rollercoaster"
1393                                  Dance, tiny robots!
1825      A dance in a hurricane of paper, wind and light
Name: title, dtype: object

In [28]:
recommend_ted(my_interest, 5, True)

84      Global warming's theme song, "Manhattan in Jan...
95                                       "La Vie en Rose"
2344                                      "Rollercoaster"
1393                                  Dance, tiny robots!
1825      A dance in a hurricane of paper, wind and light
Name: title, dtype: object
A happy song about global warming, from Jill Sobule.


Featuring the vocals and mischievous bell-playing of accordionist and singer Rachelle Garniez, the TED House Band -- led by Thomas Dolby on keyboard -- delivers this delightful rendition of the Edith Piaf standard "La Vie en Rose."


Singer, songwriter and actress Sara Ramirez is a woman of many talents. Joined by Michael Pemberton on guitar, Ramirez sings of opportunity, wisdom and the highs and lows of life in this live performance of her song, "Rollercoaster."


There's a place in France where the robots do a dance. And that place is TEDxConcorde, where Bruno Maisonnier of Aldebaran Robotics choreographs a troupe of t

(84      Global warming's theme song, "Manhattan in Jan...
 95                                       "La Vie en Rose"
 2344                                      "Rollercoaster"
 1393                                  Dance, tiny robots!
 1825      A dance in a hurricane of paper, wind and light
 Name: title, dtype: object,
 Int64Index([84, 95, 2344, 1393, 1825], dtype='int64'))

Let's get another rec to see if this works: 

In [29]:
new_video = "All feminisms are different"

In [30]:
recommend_ted(new_video, 5, True)

95                                       "La Vie en Rose"
2344                                      "Rollercoaster"
84      Global warming's theme song, "Manhattan in Jan...
1393                                  Dance, tiny robots!
145                                        My magic moves
Name: title, dtype: object
Featuring the vocals and mischievous bell-playing of accordionist and singer Rachelle Garniez, the TED House Band -- led by Thomas Dolby on keyboard -- delivers this delightful rendition of the Edith Piaf standard "La Vie en Rose."


Singer, songwriter and actress Sara Ramirez is a woman of many talents. Joined by Michael Pemberton on guitar, Ramirez sings of opportunity, wisdom and the highs and lows of life in this live performance of her song, "Rollercoaster."


A happy song about global warming, from Jill Sobule.


There's a place in France where the robots do a dance. And that place is TEDxConcorde, where Bruno Maisonnier of Aldebaran Robotics choreographs a troupe of t

(95                                       "La Vie en Rose"
 2344                                      "Rollercoaster"
 84      Global warming's theme song, "Manhattan in Jan...
 1393                                  Dance, tiny robots!
 145                                        My magic moves
 Name: title, dtype: object,
 Int64Index([95, 2344, 84, 1393, 145], dtype='int64'))

## Attempt: K-Means Clustering

This was an attempt to use clustering as an additional signal for the recommendation system, but it did not get very far.
\
More analysis is needed before it can be incorporated back into the RecSys.

In [34]:
# Standardise each topic column of the (rounded) document-topic matrix.
# NOTE(review): `doc_topic_knn_scaled` is not what the K-Means model below is
# fitted on (that uses `doc_term_matrix_scaled`) -- confirm this is intended.
scaler = StandardScaler()
doc_topic_knn = pd.DataFrame(data=doc_topic.round(3), index=ted["title"])
doc_topic_knn_scaled = scaler.fit_transform(doc_topic_knn)
doc_topic_knn_scaled.shape

(4005, 15)

In [35]:
# Convert the document-term matrix to a dense array and standardise every
# term column.
# Note: this re-fits the same `scaler` object that was fitted on the topic
# matrix in the previous cell, so its earlier learned statistics are overwritten.
doc_term_matrix_scaled = scaler.fit_transform(doc_term_matrix.toarray())
doc_term_matrix_scaled.shape

(4005, 1132)

In [36]:
doc_term_matrix_scaled

array([[-0.36106041, -0.34714142, -0.23660682, ..., -0.35878089,
        -0.30653496, -0.17667735],
       [-0.36106041, -0.34714142,  2.40297625, ..., -0.35878089,
        -0.30653496, -0.17667735],
       [-0.36106041,  4.10895274, -0.23660682, ...,  2.57071448,
         0.71143561, -0.17667735],
       ...,
       [-0.36106041, -0.34714142, -0.23660682, ..., -0.35878089,
        -0.30653496, -0.17667735],
       [-0.36106041, -0.34714142, -0.23660682, ..., -0.35878089,
         3.76534732, -0.17667735],
       [-0.36106041, -0.34714142, -0.23660682, ..., -0.35878089,
        -0.30653496, -0.17667735]])

In [39]:
# K-Means configuration; the fixed seed keeps cluster assignments reproducible.
num_clusters = 15
km = KMeans(
    n_clusters=num_clusters,
    random_state=0,
    max_iter=1000,
)

In [40]:
# Cluster the documents on the standardised term-count matrix (not on the
# document-topic matrix), then check there is one label per document.
km.fit(doc_term_matrix_scaled)
km.labels_.shape

(4005,)

In [41]:
print(km.labels_)
km.cluster_centers_.shape

[14  4 10 ... 14  7  7]


(15, 1132)

In [42]:
km.inertia_

4329668.516379901

That does not look so good! :(

In [43]:
# Attach each document's K-Means label to a copy of the topic-weight frame and
# count how many documents landed in each cluster.
df_doc_topic_clusters = doc_topic_knn.assign(Cluster=km.labels_)
df_doc_topic_clusters['Cluster'].value_counts()

7     1701
14     548
3      449
13     427
10     360
4      268
12     237
1        5
0        2
2        2
11       2
5        1
6        1
9        1
8        1
Name: Cluster, dtype: int64

In [44]:
# Median topic weight per cluster, to see which topics characterise each one.
df_clusters = df_doc_topic_clusters.groupby('Cluster').median()
df_clusters

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.056,0.024,0.053,0.0155,0.0585,0.024,0.0355,0.042,0.0235,0.016,0.02,0.043,0.0265,0.027,0.535
1,0.022,0.029,0.333,0.012,0.034,0.026,0.105,0.019,0.035,0.018,0.021,0.125,0.034,0.029,0.014
2,0.0365,0.0515,0.109,0.0555,0.076,0.03,0.018,0.0275,0.0415,0.3335,0.0265,0.095,0.042,0.035,0.022
3,0.061,0.043,0.035,0.028,0.035,0.05,0.036,0.053,0.046,0.031,0.065,0.038,0.045,0.046,0.108
4,0.043,0.035,0.226,0.029,0.0665,0.0325,0.029,0.043,0.036,0.027,0.027,0.065,0.036,0.034,0.036
5,0.03,0.032,0.027,0.043,0.038,0.029,0.016,0.028,0.034,0.069,0.019,0.17,0.035,0.4,0.03
6,0.056,0.016,0.246,0.012,0.04,0.015,0.028,0.048,0.018,0.012,0.013,0.097,0.019,0.101,0.279
7,0.056,0.062,0.051,0.052,0.054,0.056,0.051,0.056,0.062,0.063,0.06,0.055,0.061,0.058,0.058
8,0.353,0.014,0.045,0.012,0.04,0.016,0.009,0.037,0.015,0.013,0.012,0.371,0.016,0.014,0.033
9,0.023,0.034,0.024,0.279,0.027,0.026,0.384,0.029,0.027,0.021,0.025,0.03,0.027,0.021,0.023
