In [1]:
# data manipulation
import pandas as pd
import numpy as np

In [2]:
# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD

In [3]:
# LDA visualization 
import pyLDAvis
import pyLDAvis.sklearn

We know we will be using LDA because our text is long, but let's give the other topic modellers a try as well.

## Load data + create Vectorizer

In [4]:
# Read ted_clean.csv (resolved against the working directory) into a DataFrame.
ted = pd.read_csv(filepath_or_buffer="ted_clean.csv")

In [5]:
# The transcript column is the corpus that gets handed to the vectorizer.
script = ted["transcript"]

In [6]:
# Term-count vectorizer that drops scikit-learn's built-in English stop words.
tf = CountVectorizer(stop_words="english")
# Learn the vocabulary and build the document-term count matrix in one call.
dtm_tf = tf.fit_transform(raw_documents=script)

We'll go with a default of 10 topics and try three models: LDA, NMF, and LSA.

# LDA

In [7]:
# Ten-topic LDA on the raw term counts; random_state is fixed so the fit is
# repeatable. The fit call is left as the last expression so the estimator
# repr is displayed.
lda_tf = LatentDirichletAllocation(random_state=0, n_components=10)
lda_tf.fit(X=dtm_tf)

LatentDirichletAllocation(random_state=0)

In [8]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, "get_feature_names_out"):
    vocab = tf.get_feature_names_out()
else:
    vocab = tf.get_feature_names()

# Number of highest-weighted words to list per topic.
n_top_words = 15

for idx, topic in enumerate(lda_tf.components_):
    # argsort is ascending, so walk it backwards to get the heaviest words first.
    top_words = [vocab[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

Topic 0:
 like, earth, light, just, time, universe, years, life, know, space, planet, look, way, ocean, actually 

Topic 1:
 music, sound, like, laughter, language, applause, going, just, know, people, think, play, way, sounds, right 

Topic 2:
 women, like, men, just, laughter, time, know, work, did, people, think, going, really, world, make 

Topic 3:
 like, just, laughter, going, really, know, did, think, time, little, way, things, got, people, thing 

Topic 4:
 brain, cells, health, cancer, body, disease, patients, like, blood, medical, heart, care, patient, just, time 

Topic 5:
 people, think, like, just, going, really, actually, know, things, want, right, way, time, make, data 

Topic 6:
 like, just, water, world, really, going, new, years, make, need, actually, use, people, think, energy 

Topic 7:
 people, world, country, years, countries, just, percent, africa, new, need, today, like, government, time, states 

Topic 8:
 people, know, like, said, laughter, just, did, time, go

# NMF

In [9]:
# Seed NMF just like the LDA model: the default initialisation relies on a
# randomized SVD, so without a fixed random_state the topics can shift
# between kernel restarts.
nmf = NMF(n_components=10, random_state=0)

In [10]:
# Fit NMF on the count matrix and keep the transformed (document x topic) output.
# NOTE: the LSA cell further down rebinds the same doc_topic name.
doc_topic = nmf.fit_transform(X=dtm_tf)



In [11]:
def get_top_terms(topic, n_terms, nmf=nmf, terms=vocab):
    """Return the ``n_terms`` highest-weighted terms of one topic.

    Parameters
    ----------
    topic : int
        Row index into ``nmf.components_``.
    n_terms : int
        How many terms to return. Values <= 0 yield an empty list.
    nmf : fitted model, optional
        Object exposing ``components_``. The default is bound when this cell
        is executed, so re-run the cell after re-creating ``nmf``.
    terms : sequence, optional
        Vocabulary indexed by the column positions of ``components_``. The
        default is likewise bound when this cell is executed.

    Returns
    -------
    list
        The selected terms in ascending weight order (strongest term last).
    """
    if n_terms <= 0:
        # Without this guard, n_terms == 0 slices [-0:] and returns the whole
        # vocabulary, and negative values return an arbitrary tail.
        return []
    components = nmf.components_[topic, :]
    # argsort is ascending, so the last n_terms positions carry the largest weights.
    top_term_indices = components.argsort()[-n_terms:]
    top_terms = np.array(terms)[top_term_indices]
    return top_terms.tolist()

In [12]:
# List the 15 top terms for each of the 10 NMF topics, one blank line apart.
for topic_idx in range(10):
    topic_terms = get_top_terms(topic_idx, 15)
    # Header line, the term list, then an empty line -- same three lines as
    # separate print calls would emit.
    print(f"for topic {topic_idx}", topic_terms, "", sep="\n")

for topic 0
['want', 'like', 'actually', 'things', 'mean', 'lot', 'way', 'say', 'people', 'right', 'really', 'just', 'world', 'ca', 'think']

for topic 1
['better', 'change', 'health', 'need', 'percent', 'help', 'money', 'make', 'things', 'person', 'want', 'social', 'work', 'like', 'people']

for topic 2
['thought', 'come', 'kids', 'story', 'came', 'got', 'children', 'went', 'school', 'day', 'years', 'life', 'time', 'did', 'said']

for topic 3
['country', '000', 'time', 'global', 'energy', 'year', 'change', 'countries', 'today', 'water', 'new', 'percent', 'need', 'years', 'world']

for topic 4
['space', 'earth', 'make', 'light', 'music', 'look', 'little', 'way', 'time', 'life', 'kind', 'right', 'know', 'just', 'like']

for topic 5
['blood', 'right', 'sleep', 'time', 'does', 'going', 'disease', 'neurons', 'cell', 'different', 'human', 'body', 'cancer', 'cells', 'brain']

for topic 6
['little', 'yeah', 'say', 'does', 'want', 'got', 'said', 'ok', 'did', 'right', 'good', 'just', 'like', 'a

# LSA

In [13]:
# TruncatedSVD's default solver is randomized, so fix the seed (as the LDA
# cell does) to keep the components and variance ratios stable between runs.
lsa = TruncatedSVD(n_components=10, random_state=0)
# NOTE: this rebinds doc_topic, replacing the NMF document-topic matrix.
doc_topic = lsa.fit_transform(dtm_tf)
# Share of variance captured by each of the 10 components.
lsa.explained_variance_ratio_

array([0.08749042, 0.01930134, 0.01663689, 0.01296377, 0.00974322,
       0.00936043, 0.0084517 , 0.00812133, 0.00731244, 0.00697759])

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, "get_feature_names_out"):
    lsa_terms = tf.get_feature_names_out()
else:
    lsa_terms = tf.get_feature_names()

# Component-by-term loading table. Row labels are derived from the fitted
# matrix so they stay correct if the number of components changes.
topic_word = pd.DataFrame(
    lsa.components_.round(3),
    index=[f"component_{k}" for k in range(1, lsa.components_.shape[0] + 1)],
    columns=lsa_terms,
)
topic_word

Unnamed: 0,00,000,0000,000000004,0000001,000001,00001,000042,0001,00046,...,عسل,مسكين,مطعم,وله,อย,อยman,อร,你会说中文吗,你好,送你葱
component_1,0.0,0.038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.0,0.03,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0
component_3,0.0,-0.042,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
component_4,-0.0,-0.072,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0
component_5,0.0,0.011,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,...,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
component_6,-0.0,-0.028,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_7,0.001,-0.005,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0
component_8,-0.0,-0.062,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0
component_9,-0.0,-0.003,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0
component_10,-0.001,-0.003,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,...,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0


In [15]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D arrays that
        support ``argsort`` (one array per topic).
    feature_names : sequence of str
        Vocabulary indexed by the positions returned from ``argsort``.
    no_top_words : int
        Number of terms to print per topic, strongest first.
    topic_names : sequence or mapping, optional
        Custom labels looked up by topic index. A missing, empty, or absent
        entry falls back to the numeric "Topic  <index>" heading.
    """
    for ix, topic in enumerate(model.components_):
        # A label list shorter than the number of topics used to raise
        # IndexError; treat a missing entry like "no label provided".
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending; the reversed slice yields the top terms in
        # descending weight order.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

In [16]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    tf.get_feature_names_out()
    if hasattr(tf, "get_feature_names_out")
    else tf.get_feature_names()
)
display_topics(lsa, feature_names, 15)


Topic  0
people, like, just, know, going, think, really, laughter, time, world, way, actually, did, things, want

Topic  1
people, world, countries, percent, need, country, think, health, social, change, china, global, ca, today, government

Topic  2
laughter, said, people, did, women, know, men, say, love, school, applause, got, went, day, man

Topic  3
think, people, really, going, know, ca, things, actually, lot, kind, right, thing, mean, say, sort

Topic  4
going, know, said, think, women, got, really, need, ca, want, world, say, did, years, countries

Topic  5
brain, women, cells, cancer, body, men, disease, patients, health, children, life, said, heart, blood, time

Topic  6
like, know, people, did, life, kind, time, kids, school, city, started, water, said, story, just

Topic  7
women, like, think, world, men, ca, love, woman, just, know, feel, say, mean, sex, life

Topic  8
women, actually, really, work, data, men, make, use, new, design, like, health, need, woman, working

To

LDA is definitely the leader here, but I'd say NMF is second best and LSA is a bit behind. \
And I'd say 'like' is definitely a stop word :) We'll deal with that in the next chapter. 