In [3]:
# data manipulation
import pandas as pd

In [1]:
# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# LDA visualization 
import pyLDAvis
import pyLDAvis.sklearn

## Load data

In [4]:
# Read ted_clean.csv (path is relative to the notebook's working directory)
# into a DataFrame; its `transcript` column is used below as the corpus.
ted = pd.read_csv("ted_clean.csv")

In [5]:
# Corpus to vectorize: one transcript per row. Bracket indexing is used
# instead of attribute access so the lookup always refers to the column and
# can never be shadowed by a DataFrame attribute or method of the same name.
script = ted["transcript"]

## Text Preprocessing

Try both vectorizers with some initial text preprocessing; the preprocessor will be tuned later, together with the vectorizer.

**Try with: Count Vectorizer**

In [6]:
# Bag-of-words baseline: raw term counts, with English stop words removed.
tf_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary and build the document-term matrix from the transcripts.
dtm_tf = tf_vectorizer.fit_transform(script)

# Sanity check on the matrix dimensions: (documents, vocabulary terms).
print(dtm_tf.shape)

(4005, 68209)


**Try with: TF-IDF**

In [7]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(script)
print(dtm_tfidf.shape)



(4005, 68209)


# LDA

The transcripts are long texts, so we expect to end up using LDA, but we will compare it with other topic models later.

**Count Vectorizer**

In [8]:
# 20-topic LDA model on the raw term-count matrix; random_state is fixed so
# the fit can be reproduced on a re-run.
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
# fit() is the cell's last expression, so the fitted estimator's repr is shown.
lda_tf.fit(dtm_tf)

LatentDirichletAllocation(n_components=20, random_state=0)

In [9]:
# Vocabulary terms in the column order of dtm_tf.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and only fall back on older installs.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    vocab = tf_vectorizer.get_feature_names_out()
else:
    vocab = tf_vectorizer.get_feature_names()

# Number of highest-weighted terms to list per topic.
N_TOP_WORDS = 15

for idx, topic in enumerate(lda_tf.components_):
    # argsort() is ascending, so reverse it and keep the first N_TOP_WORDS
    # indices to get the most heavily weighted terms for this topic.
    top_words = [vocab[i] for i in topic.argsort()[::-1][:N_TOP_WORDS]]
    print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

Topic 0:
 light, universe, like, space, just, science, stars, theory, time, energy, physics, black, matter, know, way 

Topic 1:
 music, sound, play, like, applause, hear, sounds, laughter, going, song, just, really, time, know, way 

Topic 2:
 going, like, computer, really, data, time, just, actually, game, world, things, technology, use, way, make 

Topic 3:
 like, just, really, laughter, people, think, going, did, work, new, time, know, things, design, way 

Topic 4:
 health, cancer, patients, disease, blood, medical, care, patient, body, brain, heart, treatment, drugs, drug, doctors 

Topic 5:
 people, actually, just, think, really, like, going, know, things, time, want, right, way, make, did 

Topic 6:
 like, cells, new, actually, just, really, make, cell, dna, going, different, way, think, years, human 

Topic 7:
 africa, world, people, countries, war, country, african, today, years, time, just, political, europe, like, family 

Topic 8:
 people, said, school, know, did, like, ki

**TF-IDF**

In [10]:
# Same 20-topic LDA configuration and seed as the count-based model, fitted
# on the TF-IDF weighted matrix instead, so the two runs differ only in input.
# NOTE(review): LDA is formulated over word counts; fitting it on TF-IDF
# weights is a heuristic comparison — confirm this is intended.
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
# fit() is the cell's last expression, so the fitted estimator's repr is shown.
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=20, random_state=0)

In [11]:
# Vocabulary terms in the column order of dtm_tfidf.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and only fall back on older installs.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    vocab = tfidf_vectorizer.get_feature_names_out()
else:
    vocab = tfidf_vectorizer.get_feature_names()

# Number of highest-weighted terms to list per topic.
N_TOP_WORDS = 15

for idx, topic in enumerate(lda_tfidf.components_):
    # argsort() is ascending, so reverse it and keep the first N_TOP_WORDS
    # indices to get the most heavily weighted terms for this topic.
    top_words = [vocab[i] for i in topic.argsort()[::-1][:N_TOP_WORDS]]
    print(f"Topic {idx}:\n", ", ".join(top_words), "\n")

Topic 0:
 fonio, slime, teszler, pheromones, microbiome, gāo, lungi, microbiomes, shirley, lollipop, minto, prophets, augustus, av, minty 

Topic 1:
 olfactory, mycelium, boraqchin, mario, galen, bonica, enheduanna, rng, lt, betsy, wukong, nm, bj, zk, golding 

Topic 2:
 crows, bioluminescence, neruda, whitopia, quad, cayla, norden, mo, fosbury, octopus, saleem, sd, angiogenesis, rufus, cuttlefish 

Topic 3:
 happiness, hiv, artists, planets, encourage, clothes, solving, la, forests, antibiotics, challenging, tower, struggling, philosophy, dancing 

Topic 4:
 dragonfly, sankara, futura, iris, oumuamua, rabies, carousel, eclipse, stumai, nodules, steno, hg, nanopatch, superconductor, archimedes 

Topic 5:
 hercules, clonie, doodling, redwood, tapirs, gk, fishes, blockchain, civility, sw, cr, tomlinson, lhc, cornea, visicalc 

Topic 6:
 nb, yawn, milo, bonobos, bailey, sisyphus, perfectionism, matrices, poe, tribology, anas, adx, mongol, aiva, solemn 

Topic 7:
 givers, chopsticks, thyro

## Visualize

After inspecting the visualizations, the topics make a lot more sense with the CountVectorizer than with TF-IDF, so we will go ahead with the CountVectorizer.

In [12]:
# Switch pyLDAvis to notebook mode so prepared visualizations render inline.
pyLDAvis.enable_notebook()

# Build the interactive topic view for the count-based LDA model from the
# fitted model, its document-term matrix and the vectorizer that produced it.
# As the cell's last expression, the result is displayed as the cell output.
# NOTE(review): newer pyLDAvis releases expose this helper as
# pyLDAvis.lda_model rather than pyLDAvis.sklearn — confirm against the
# installed version.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  default_term_info = default_term_info.sort_values(


In [13]:
# Same interactive topic view for the TF-IDF based LDA model, for comparison
# with the count-based one; displayed as the cell output.
# NOTE(review): newer pyLDAvis releases expose this helper as
# pyLDAvis.lda_model rather than pyLDAvis.sklearn — confirm against the
# installed version.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

  default_term_info = default_term_info.sort_values(
