In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install bertopic

env: TOKENIZERS_PARALLELISM=false
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: hdbscan,

In [2]:
import pandas as pd
# we only use the text fields so let's not load anything else
df = pd.read_csv(filepath_or_buffer='/kaggle/input/stranger-things-dialogue-dataset/stranger_things_all_dialogue.csv',
                usecols=['raw_text', 'stage_direction', 'dialogue'])
# make a big text field that includes everything
df['text'] = df['raw_text'] + ' ' + df['stage_direction'] + ' ' + df['dialogue'].fillna(value='')
df = df.dropna(subset=['text'])
# try to remove blank tokens
df['text'] = df['text'].apply(func=lambda x: ' '.join(x.split()))
df.head()

Unnamed: 0,raw_text,stage_direction,dialogue,text
0,[crickets chirping],[crickets chirping],,[crickets chirping] [crickets chirping]
1,[alarm blaring],[alarm blaring],,[alarm blaring] [alarm blaring]
2,[panting],[panting],,[panting] [panting]
3,[elevator descending],[elevator descending],,[elevator descending] [elevator descending]
4,[elevator dings],[elevator dings],,[elevator dings] [elevator dings]


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10678 entries, 0 to 32518
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   raw_text         10678 non-null  object
 1   stage_direction  10678 non-null  object
 2   dialogue         4594 non-null   object
 3   text             10678 non-null  object
dtypes: object(4)
memory usage: 417.1+ KB


In [4]:
from collections import Counter
text = ' '.join(df['text'].dropna().values).replace('[', '').replace(']', '')
counts = Counter(text.split())
print(counts.most_common(n=20))
unique = set(text.split())
print(len(unique))

[('music', 1801), ('playing', 1392), ('I', 960), ('you', 782), ('in', 769), ('sighs', 764), ('grunts', 637), ('the', 594), ('chuckles', 548), ('a', 522), ('to', 510), ('continues', 478), ('panting', 477), ('Russian', 476), ('Dustin', 458), ('Hopper', 418), ('Mike', 408), ('on', 377), ('Joyce', 368), ('is', 345)]
7127


Some of our most common/prevalent tokens are coming from the stage directions, which is weird.

In [5]:
from collections import defaultdict
document_counts = defaultdict(int)
for index, row in df.dropna(subset=['text']).iterrows():
    current = set(row['text'].replace('[', '').replace(']', '').split())
    for token in current:
        document_counts[token] += 1
percentages = {key: value/len(df['text'].dropna()) for key, value in document_counts.items()}
percentages = sorted(list(percentages.items()), reverse=True, key=lambda x: x[1])
print(percentages[:10])

[('music', 0.08540925266903915), ('playing', 0.06518074545795093), ('I', 0.043079228319910096), ('sighs', 0.038209402509833304), ('in', 0.03568083910844728), ('you', 0.03511893613036149), ('grunts', 0.03137291627645627), ('chuckles', 0.0278141974152463), ('the', 0.02612848848098895), ('Dustin', 0.025285634013860272)]


We count token prevalence in documents to see if we will benefit from setting the count vectorizer max_df parameter; this little analysis suggests not.

In [6]:
from arrow import now
from bertopic import BERTopic
from plotly.express import histogram
from sklearn.feature_extraction.text import CountVectorizer

# we want to leave proper names intact
LOWERCASE = False
# we tune this to control run time during testing
SAMPLE_FRACTION = 1.0
time_start = now()
large_df = df[['text']].sample(frac=SAMPLE_FRACTION, random_state=2023)
large_model = BERTopic(verbose=True, top_n_words=20, nr_topics=None, language='english', 
                       vectorizer_model=CountVectorizer(stop_words='english', lowercase=LOWERCASE, min_df = 3),
                      embedding_model='all-MiniLM-L6-v2', )
large_topics, large_probs = large_model.fit_transform(large_df['text'].dropna().values)
print('After {} our model says we have {} topics.'.format(now() - time_start, len(set(large_topics))))
print('And we have {} documents with no topic ({}%).'.format(large_topics.count(-1), round(100 * large_topics.count(-1)/len(large_topics))))
histogram(x=[item for item in large_topics if item > -1], title = 'topic > -1 histogram')

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

2023-10-25 15:40:21,165 - BERTopic - Transformed documents to Embeddings
2023-10-25 15:40:59,746 - BERTopic - Reduced dimensionality
2023-10-25 15:41:00,197 - BERTopic - Clustered reduced embeddings


After 0:01:38.107212 our model says we have 295 topics.
And we have 1407 documents with no topic (13%).


In [7]:
# What's in that big topic?
large_model.get_topic(topic=0)

[('grunts', 0.1495348420849909),
 ('grunt', 0.05965315594577972),
 ('Tom', 0.022892293487378673),
 ('cool', 0.02065504421426721),
 ('agree', 0.015440032659949051),
 ('tight', 0.015440032659949051),
 ('tomorrow', 0.015440032659949051),
 ('person', 0.015440032659949051),
 ('shouldn', 0.015440032659949051),
 ('strained', 0.015440032659949051),
 ('Nice', 0.014187756633163575),
 ('Try', 0.014187756633163575),
 ('Trust', 0.014187756633163575),
 ('straining', 0.013376487876678857),
 ('Too', 0.013226975478367221),
 ('Son', 0.013226975478367221),
 ('Be', 0.012450386031864023),
 ('hurts', 0.011800754491154876),
 ('Or', 0.011800754491154876),
 ('Ooh', 0.011243921492253397)]

BERTopic will easily visualize all the topics together with a limited number of words per topic, so let's do that.

In [8]:
large_model.visualize_topics(width=1000, height=1000)

And let's pick some words that are important and see what topics/words are nearby in our corpus.

In [9]:
from math import sqrt
from plotly.express import scatter
from sklearn.manifold import TSNE
from plotly.express.colors import sequential


# 1200 seems big but 1000 is cramped
HEIGHT = 1200

def visualize(model, term):
    # let's build a DataFrame of the first 20 topics that include our term
    # and their top terms, along with their weights
    word = []
    topic_indices = []
    topic_probability = []
    word_indices = []
    word_weight = []
    overall_weight = []
    topics, probabilities = model.find_topics(search_term=term, top_n=20)
    for topic_index, topic in enumerate(topics):
        probability = probabilities[topic_index]
        topic_result = model.get_topic(topic=topic)
        for word_index, word_result in enumerate(topic_result):
            word.append(word_result[0])
            topic_indices.append(topic_index)
            topic_probability.append(probability)
            word_indices.append(word_index)
            word_weight.append(word_result[1])
            # let's use the geometric mean to get individual word weights within our sample
            overall_weight.append(sqrt(probability * word_result[1]))
    plot_df = pd.DataFrame(data={'word': word, 'topic_index': topic_indices, 
                                 'topic_probability': topic_probability, 'word_index': word_indices,
                                 'word_weight': word_weight,'overall_weight': overall_weight})
    # terms can recur, so let's get the 'heaviest' occurrence using the overall weight
    plot_df = plot_df.sort_values(ascending=False, by='overall_weight').drop_duplicates(keep='first', subset=['word'])
    # now we want to transform the data we have into something that looks nice in a graph but also
    # keeps the relationships we have from the weight and rank data above
    tsne = TSNE(n_components=2, learning_rate='auto', n_iter=1000, init='pca', random_state=2023, verbose=0)
    plot_df[['x', 'y']] = tsne.fit_transform(X=plot_df.drop(columns=['word']))
    # we use the overall weight for both the color and the size to differentiate the more
    # important words from the others
    scatter(data_frame=plot_df,
            x='x', y='y', text='word', color='overall_weight', size='overall_weight',
            # Shades of red seem appropriate for Stranger Things
            color_continuous_scale=sequential.Reds, title=term, height=HEIGHT,
            # our colorbar doesn't mean anything intuitive so let's remove it
           ).update_traces(textposition='middle right').update_coloraxes(showscale=False).show()

visualize(model=large_model, term='grunt')

In [10]:
for term in ['chuckle', 'Hopper', 'Eleven', 'Dustin', 'Demodog']:
    visualize(model=large_model, term=term)

In [11]:
for term in ['Dustin']:
    visualize(model=large_model, term=term)