In [1]:
# data manipulation
import pandas as pd
import numpy as np

In [2]:
# sklearn & NLTK
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import PorterStemmer, SnowballStemmer

In [3]:
# LDA visualization 
import pyLDAvis
import pyLDAvis.sklearn

### Prep: Load data + Helper Function for Pipeline

In [4]:
# Load the cleaned TED talks table from the working directory.
ted = pd.read_csv(filepath_or_buffer="ted_clean.csv")

In [5]:
# The transcript column is the corpus fed to every vectorizer below.
script = ted["transcript"]

**Helper Function**

In [6]:
def topic_modeller(corpus, vectorizer, modeller, n_components = 10, num_words = 10):
    """Vectorize ``corpus``, fit a topic model on it, and print each topic's top words.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to ``vectorizer.fit_transform``.
    vectorizer : object
        Vectorizer exposing ``fit_transform`` and either ``get_feature_names_out``
        (scikit-learn >= 1.0) or the legacy ``get_feature_names``.
    modeller : callable
        Model class/factory invoked as ``modeller(n_components=..., random_state=0)``;
        the returned object must provide ``fit`` and, once fitted, ``components_``.
    n_components : int, default 10
        Number of topics requested from ``modeller``.
    num_words : int, default 10
        Number of highest-weighted vocabulary terms printed per topic.

    Returns
    -------
    tuple
        ``(tm, doc_term_matrix)``: the fitted model and the matrix it was fitted on.
    """
    doc_term_matrix = vectorizer.fit_transform(corpus)

    # random_state is pinned so repeated runs print the same topics.
    tm = modeller(n_components = n_components, random_state = 0)
    tm.fit(doc_term_matrix)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    for idx, topic in enumerate(tm.components_):
        # Indices of the num_words largest weights, heaviest first.
        top_indices = topic.argsort()[::-1][:num_words]
        top_words = [vocab[i] for i in top_indices]
        print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

    return tm, doc_term_matrix

# Part 1: Choose Vectorizer

**Try: CountVectorizer**

In [7]:
# Raw term counts, dropping scikit-learn's built-in English stop words.
tf = CountVectorizer(
    stop_words='english',
)

In [8]:
# Fit LDA on the count matrix and print the top words per topic.
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 like, earth, light, just, time, universe, years, life, know, space 

Topic 1:
 music, sound, like, laughter, language, applause, going, just, know, people 

Topic 2:
 women, like, men, just, laughter, time, know, work, did, people 

Topic 3:
 like, just, laughter, going, really, know, did, think, time, little 

Topic 4:
 brain, cells, health, cancer, body, disease, patients, like, blood, medical 

Topic 5:
 people, think, like, just, going, really, actually, know, things, want 

Topic 6:
 like, just, water, world, really, going, new, years, make, need 

Topic 7:
 people, world, country, years, countries, just, percent, africa, new, need 

Topic 8:
 people, know, like, said, laughter, just, did, time, going, life 

Topic 9:
 like, people, did, said, life, know, just, say, world, time 



In [9]:
# Interactive pyLDAvis view of the CountVectorizer-based LDA fit.
# The prepare(...) call is left as the cell's last expression so its result is displayed.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf)

  default_term_info = default_term_info.sort_values(


**Try: TF-IDF**

In [10]:
# TF-IDF weighted alternative, with the same built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
)

In [11]:
# Same LDA pipeline, this time on the TF-IDF matrix.
lda_tfidf, dtm_tfidf = topic_modeller(
    corpus=script,
    vectorizer=tfidf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 tm, lt, dragonfly, jf, pms, kabuki, mizzone, carers, vivaldi, tribology 

Topic 1:
 bf, biscuit, sekou, peseshet, cymatics, oumuamua, gando, dante, lucius, fb 

Topic 2:
 marshmallow, yawn, tk, quixote, romo, bailey, beauvoir, boraqchin, tardigrades, bananananana 

Topic 3:
 people, like, just, going, laughter, know, think, really, world, time 

Topic 4:
 musa, connectome, domitia, ect, rubies, antimatter, whitopia, zeno, enheduanna, elvish 

Topic 5:
 sdgs, progeria, rufus, neruda, tapirs, saleem, doodling, floaters, wg, glamour 

Topic 6:
 hs, sw, daedalus, ynh, loisfoeribari, juana, heforshe, asl, diogenes, foie 

Topic 7:
 stasi, givers, obituaries, edi, ems, milo, abed, 404, fireball, kiteflyer 

Topic 8:
 heh, telomeres, clonie, sloths, sankara, nathaniel, wildebeest, forfeiture, rasputin, futura 

Topic 9:
 rb, mk, wk, neurogenesis, fonio, phages, claudio, dyslexia, td, vonnegut 



In [12]:
# Interactive pyLDAvis view of the TF-IDF-based LDA fit, for comparison with the count-based one.
# The prepare(...) call is left as the cell's last expression so its result is displayed.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf)

  default_term_info = default_term_info.sort_values(


CountVectorizer() is clearly giving us better results, so let's go with it. Next we'll tune its parameters and add preprocessing one step at a time, re-running the topic modeller after each change to see what we get. 

# Part 2: Tune Vectorizer

**Tune: stop_words**

In [13]:
# Baseline: default CountVectorizer with no stop-word filtering at all.
tf = CountVectorizer()
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 sex, to, women, male, is, men, female, that, they, you 

Topic 1:
 the, and, of, in, to, they, is, was, for, their 

Topic 2:
 is, the, you, and, to, it, that, of, in, are 

Topic 3:
 cells, brain, cancer, body, blood, patients, disease, cell, heart, patient 

Topic 4:
 and, the, to, that, it, is, you, of, was, this 

Topic 5:
 the, and, to, of, in, that, we, is, not, are 

Topic 6:
 the, and, to, is, of, that, we, in, it, you 

Topic 7:
 biscuit, stress, your, bf, dh, gg, mk, gabby, da, telomeres 

Topic 8:
 the, to, and, of, in, that, was, he, for, his 

Topic 9:
 the, of, and, to, is, in, that, it, we, this 



In [14]:
# Visualize the fit obtained without stop-word removal.
# The prepare(...) call is left as the cell's last expression so its result is displayed.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf)

  default_term_info = default_term_info.sort_values(


Let's add `stop_words = "english"` (lowercase — it is the only built-in list scikit-learn accepts) and see. 

In [15]:
# Re-run with scikit-learn's built-in English stop-word list.
tf = CountVectorizer(
    stop_words="english",
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 like, earth, light, just, time, universe, years, life, know, space 

Topic 1:
 music, sound, like, laughter, language, applause, going, just, know, people 

Topic 2:
 women, like, men, just, laughter, time, know, work, did, people 

Topic 3:
 like, just, laughter, going, really, know, did, think, time, little 

Topic 4:
 brain, cells, health, cancer, body, disease, patients, like, blood, medical 

Topic 5:
 people, think, like, just, going, really, actually, know, things, want 

Topic 6:
 like, just, water, world, really, going, new, years, make, need 

Topic 7:
 people, world, country, years, countries, just, percent, africa, new, need 

Topic 8:
 people, know, like, said, laughter, just, did, time, going, life 

Topic 9:
 like, people, did, said, life, know, just, say, world, time 



In [16]:
# Visualize the fit obtained with the built-in English stop words removed.
# The prepare(...) call is left as the cell's last expression so its result is displayed.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf)

  default_term_info = default_term_info.sort_values(


Let's also add some frequently appearing other words: 

In [17]:
# Extend the built-in list with filler words that dominated the earlier topics.
stop_words = ENGLISH_STOP_WORDS | {
    "people",
    'like',
    "said",
    'know',
    'just',
    'really',
    'actually',
    'right',
    'going',
}

In [18]:
# Count vectorizer driven by the extended stop-word set.
tf = CountVectorizer(
    stop_words=stop_words,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 laughter, life, time, did, love, feel, think, way, want, say 

Topic 1:
 world, years, africa, percent, countries, new, china, dollars, today, million 

Topic 2:
 think, world, time, want, things, work, need, make, way, did 

Topic 3:
 things, think, time, way, little, laughter, look, thing, kind, make 

Topic 4:
 water, years, energy, world, food, need, earth, ocean, use, make 

Topic 5:
 children, school, kids, students, education, child, did, parents, teachers, time 

Topic 6:
 women, did, laughter, world, time, applause, life, day, years, story 

Topic 7:
 cells, cancer, body, disease, blood, cell, dna, heart, different, make 

Topic 8:
 brain, think, human, use, robot, information, different, technology, way, time 

Topic 9:
 laughter, did, got, think, want, city, little, good, time, make 



**Tune: token pattern** We're seeing a lot of 've' (left over from contractions), so let's get rid of it by keeping only alphabetic tokens of at least three letters. 

In [19]:
# Keep only purely alphabetic tokens that are at least three letters long.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 women, did, time, children, school, years, want, laughter, work, world 

Topic 1:
 dna, human, species, life, bacteria, different, cells, think, years, animals 

Topic 2:
 water, years, earth, time, life, world, planet, way, laughter, little 

Topic 3:
 brain, cells, body, different, time, does, human, use, make, think 

Topic 4:
 cancer, health, new, think, city, disease, years, way, work, make 

Topic 5:
 universe, energy, time, quantum, physics, particles, way, space, matter, make 

Topic 6:
 think, things, want, make, way, laughter, time, thing, world, new 

Topic 7:
 light, data, space, world, time, image, look, new, images, black 

Topic 8:
 laughter, did, think, time, life, way, say, music, love, want 

Topic 9:
 world, country, countries, government, global, need, today, years, war, states 



**Tune: max_df & min_df**

OK, let's get rid of some more frequently appearing words. Let's also try to make the topics more distinct from one another by lowering `max_df`.

In [20]:
# More filler tokens to exclude ('don' is what remains of "don't" after tokenizing).
stop_words = stop_words | {'got', 'don', 'let'}

In [21]:
# Drop any term that appears in more than 10% of the transcripts.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.10,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 cars, fly, consciousness, mathematics, muscle, network, bridge, legs, objects, driving 

Topic 1:
 girls, violence, teachers, police, gender, schools, mom, teacher, son, justice 

Topic 2:
 sleep, stress, memory, pain, mental, depression, neurons, brains, virus, patients 

Topic 3:
 dna, computers, code, machines, intelligence, software, biology, bees, gene, camera 

Topic 4:
 robot, robots, compassion, biscuit, prison, military, emotions, dance, suffering, pain 

Topic 5:
 fish, bacteria, ocean, sea, microbes, dna, cancer, sharks, ants, birds 

Topic 6:
 cancer, china, patients, patient, drug, sex, income, security, poverty, drugs 

Topic 7:
 ice, architecture, web, record, buildings, page, site, museum, mountain, dinosaurs 

Topic 8:
 universe, stars, mars, planets, physics, particles, theory, star, solar, galaxy 

Topic 9:
 carbon, oil, ocean, plants, nuclear, plant, solar, trees, waste, forest 



In [22]:
# Loosen the cap: drop terms appearing in more than 25% of the transcripts.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


Topic 0:
 plastic, bees, mars, city, waste, food, yeah, science, paper, nuclear 

Topic 1:
 cells, health, cancer, disease, patients, blood, dna, medical, heart, cell 

Topic 2:
 brain, food, brains, neurons, language, memory, play, behavior, learning, sleep 

Topic 3:
 city, cities, data, public, community, government, kids, education, states, law 

Topic 4:
 earth, universe, energy, planet, science, light, data, sun, stars, climate 

Topic 5:
 countries, women, africa, global, business, war, states, dollars, china, government 

Topic 6:
 women, men, woman, stories, black, kids, mother, god, felt, friends 

Topic 7:
 design, light, computer, video, car, machine, data, project, stuff, energy 

Topic 8:
 music, sound, language, play, voice, hear, sounds, song, video, english 

Topic 9:
 species, ocean, fish, animals, food, sea, planet, carbon, land, earth 



Also, let's see whether setting a `min_df` makes our topic set tighter; if not, we'll drop it. 

In [23]:
# An integer min_df is an absolute document count: keep terms found in at least 500 transcripts.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
    min_df=500,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 health, disease, heart, medical, death, risk, study, child, early, brain 

Topic 1:
 music, language, sound, play, word, hear, voice, sounds, audience, video 

Topic 2:
 women, men, black, woman, white, mother, culture, society, stories, community 

Topic 3:
 stories, book, friends, god, felt, mother, father, read, night, yeah 

Topic 4:
 light, energy, air, earth, design, image, video, surface, beautiful, blue 

Topic 5:
 city, kids, community, students, education, design, game, play, project, learning 

Topic 6:
 brain, data, science, computer, scientists, behavior, numbers, control, questions, reality 

Topic 7:
 food, planet, species, earth, animals, nature, land, global, billion, natural 

Topic 8:
 countries, africa, government, states, united, war, public, global, political, state 

Topic 9:
 dollars, business, company, companies, car, cost, buy, value, stuff, pay 



I would say our tightness went down, so let's experiment once more with min_df and possibly drop it. 

In [24]:
# Exclude a few more conversational fillers, then retry with a lower document-count floor.
stop_words = stop_words | {'yeah', 'everybody', 'somebody', 'guy'}
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
    min_df=250,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 food, dollars, countries, africa, city, cities, business, billion, global, economic 

Topic 1:
 data, internet, computer, video, car, online, company, digital, phone, media 

Topic 2:
 earth, light, energy, planet, brain, species, ocean, animals, fish, air 

Topic 3:
 cells, health, brain, cancer, disease, patients, blood, medical, cell, dna 

Topic 4:
 music, play, sound, hear, god, voice, okay, night, game, head 

Topic 5:
 city, stories, design, community, art, felt, project, house, decided, york 

Topic 6:
 science, reality, brain, ideas, book, theory, universe, self, machine, computer 

Topic 7:
 kids, students, education, language, child, parents, learning, teachers, english, study 

Topic 8:
 war, government, states, political, united, global, countries, state, public, china 

Topic 9:
 women, men, black, woman, sex, girls, female, male, white, gender 



**Tune: ngram_range** Probably not the smartest idea, but let's give it a try nonetheless.

In [25]:
# Same settings as before, but with bigrams added alongside unigrams.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
    min_df=250,
    ngram_range=(1, 2),
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)

Topic 0:
 women, men, woman, black, mother, stories, girls, father, girl, white 

Topic 1:
 countries, global, china, climate, economic, economy, africa, india, growth, century 

Topic 2:
 brain, language, sex, study, sleep, brains, heart, pain, blood, memory 

Topic 3:
 cells, cancer, cell, disease, dna, blood, science, game, patients, drug 

Topic 4:
 music, play, video, art, book, read, word, audience, hear, stuff 

Topic 5:
 light, computer, universe, earth, machine, data, energy, science, planet, image 

Topic 6:
 species, animals, earth, ocean, fish, planet, sea, science, animal, ice 

Topic 7:
 city, energy, food, cities, design, car, air, oil, carbon, plant 

Topic 8:
 data, government, states, united, united states, public, war, internet, media, law 

Topic 9:
 kids, health, education, students, dollars, business, community, company, child, food 



Not a huge difference, but I'd say we were better off earlier, so let's drop that range and go with just unigrams. \
It looks like we are pretty in tune here. One last visualization: 

In [26]:
# Final configuration: unigrams only, then one last interactive look at the topics.
tf = CountVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.25,
    min_df=250,
)
lda_tf, dtm_tf = topic_modeller(
    corpus=script,
    vectorizer=tf,
    modeller=LatentDirichletAllocation,
)
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf)

Topic 0:
 food, dollars, countries, africa, city, cities, business, billion, global, economic 

Topic 1:
 data, internet, computer, video, car, online, company, digital, phone, media 

Topic 2:
 earth, light, energy, planet, brain, species, ocean, animals, fish, air 

Topic 3:
 cells, health, brain, cancer, disease, patients, blood, medical, cell, dna 

Topic 4:
 music, play, sound, hear, god, voice, okay, night, game, head 

Topic 5:
 city, stories, design, community, art, felt, project, house, decided, york 

Topic 6:
 science, reality, brain, ideas, book, theory, universe, self, machine, computer 

Topic 7:
 kids, students, education, language, child, parents, learning, teachers, english, study 

Topic 8:
 war, government, states, political, united, global, countries, state, public, china 

Topic 9:
 women, men, black, woman, sex, girls, female, male, white, gender 



  default_term_info = default_term_info.sort_values(


Also, let's add to our stop_words. 

In [27]:
# Last additions to the stop-word set before it is saved.
stop_words = stop_words | {'couldn', 'okay', 'basically'}

And let's save our stop_words for use later. 

In [28]:
# Alias the final stop-word set as `data` and persist it with IPython's %store magic
# (restorable elsewhere with `%store -r data`), then delete the temporary alias.
# `stop_words` itself stays defined.
data = stop_words
%store data
del data

Stored 'data' (frozenset)


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
