In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install bertopic

env: TOKENIZERS_PARALLELISM=false
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: hdbs

In [2]:
import pandas as pd
# Load the dialogue table from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/stranger-things-dialogue-dataset/stranger_things_all_dialogue.csv')
# Join raw_text and stage_direction with a single space.  A missing value in
# either column yields a missing `text`, so only rows that carry a stage
# direction get a usable `text`.
# NOTE(review): in the preview below raw_text already contains the stage
# direction, so it appears twice in `text` — confirm this is intended.
df['text'] = df['raw_text'].str.cat(df['stage_direction'], sep=' ')
df.head()

Unnamed: 0,season,episode,line,raw_text,stage_direction,dialogue,start_time,end_time,text
0,1,1,1,[crickets chirping],[crickets chirping],,00:00:07,00:00:09,[crickets chirping] [crickets chirping]
1,1,1,2,[alarm blaring],[alarm blaring],,00:00:49,00:00:51,[alarm blaring] [alarm blaring]
2,1,1,3,[panting],[panting],,00:00:52,00:00:54,[panting] [panting]
3,1,1,4,[elevator descending],[elevator descending],,00:01:01,00:01:02,[elevator descending] [elevator descending]
4,1,1,5,[elevator dings],[elevator dings],,00:01:09,00:01:10,[elevator dings] [elevator dings]


In [3]:
# Column overview: dtypes and non-null counts, to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32519 entries, 0 to 32518
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   season           32519 non-null  int64 
 1   episode          32519 non-null  int64 
 2   line             32519 non-null  int64 
 3   raw_text         32519 non-null  object
 4   stage_direction  10678 non-null  object
 5   dialogue         26435 non-null  object
 6   start_time       32519 non-null  object
 7   end_time         32519 non-null  object
 8   text             10678 non-null  object
dtypes: int64(3), object(6)
memory usage: 2.2+ MB


In [4]:
from collections import Counter
# Pool every non-missing `text` into one string and delete the square brackets,
# so that "[sighs]" is counted as the token "sighs".
text = ' '.join(df['text'].dropna().values).translate(str.maketrans('', '', '[]'))
# Whitespace-delimited token frequencies over the whole corpus.
counts = Counter(text.split())
print(counts.most_common(n=20))
# The Counter's keys are exactly the distinct tokens.
unique = set(counts)
print(len(unique))

[('music', 1801), ('playing', 1392), ('sighs', 764), ('in', 665), ('grunts', 637), ('chuckles', 548), ('continues', 478), ('panting', 477), ('Russian', 472), ('I', 460), ('Dustin', 454), ('Hopper', 416), ('Mike', 405), ('you', 390), ('Joyce', 365), ('door', 337), ('on', 330), ('Eleven', 313), ('Steve', 312), ('the', 307)]
6924


In [5]:
from collections import defaultdict
# Document frequency: for each token, the number of texts it occurs in at least once.
non_null_texts = df['text'].dropna()
document_counts = defaultdict(int)
for document in non_null_texts:
    # A set, so that a token repeated within one text is counted only once.
    for token in set(document.replace('[', '').replace(']', '').split()):
        document_counts[token] += 1
# Compute the denominator once instead of once per vocabulary entry.
document_total = len(non_null_texts)
# Despite the name, these are fractions in [0, 1] of the non-missing texts.
percentages = {token: count / document_total for token, count in document_counts.items()}
print(list(percentages.items())[:10])

[('chirping', 0.001311106948866829), ('crickets', 0.00028095148904289194), ('blaring', 0.002341262408690766), ('alarm', 0.003652369357557595), ('panting', 0.023038022101517137), ('descending', 0.00018730099269526128), ('elevator', 0.001311106948866829), ('dings', 0.0014047574452144597), ('breathing', 0.013017418992320659), ('heavily', 0.009833302116501217)]


In [6]:
# Rank the (token, share) pairs by share, largest first, and show the top ten.
pcts = sorted(percentages.items(), key=lambda pair: pair[1], reverse=True)
print(pcts[:10])

[('music', 0.08540925266903915), ('playing', 0.06518074545795093), ('I', 0.038677654991571456), ('sighs', 0.038209402509833304), ('in', 0.03568083910844728), ('you', 0.03493163513766623), ('grunts', 0.03137291627645627), ('chuckles', 0.0278141974152463), ('the', 0.025941187488293688), ('Dustin', 0.02519198351751264)]


In [7]:
from arrow import now
from bertopic import BERTopic
from plotly.express import histogram
from sklearn.feature_extraction.text import CountVectorizer

# Fraction of rows to model; 1.0 keeps every row (in shuffled order).
SAMPLE_FRACTION = 1.0
time_start = now()
large_df = df[['text']].sample(frac=SAMPLE_FRACTION, random_state=2023)

# Bag-of-words settings for the topic representations: lowercase the text, drop
# English stop words, and ignore terms seen in fewer than 3 documents.
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=3)
large_model = BERTopic(
    language='english',
    embedding_model='all-MiniLM-L6-v2',
    vectorizer_model=vectorizer,
    top_n_words=20,
    nr_topics=None,
    verbose=True,
)

# Only rows that actually have a text are passed to the model.
documents = large_df['text'].dropna().values
large_topics, large_probs = large_model.fit_transform(documents)

elapsed = now() - time_start
# Label -1 marks documents without a topic.  It is included in the topic count
# printed below whenever it occurs.
outlier_count = large_topics.count(-1)
print('After {} our model says we have {} topics.'.format(elapsed, len(set(large_topics))))
print('And we have {} documents with no topic ({}%).'.format(outlier_count, round(100 * outlier_count/len(large_topics))))
# Distribution of documents over the real topics (the -1 label is left out).
histogram(x=[label for label in large_topics if label > -1], title = 'topic > -1 histogram')

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/334 [00:00<?, ?it/s]

2023-10-24 00:03:45,831 - BERTopic - Transformed documents to Embeddings
2023-10-24 00:04:28,601 - BERTopic - Reduced dimensionality
2023-10-24 00:04:29,150 - BERTopic - Clustered reduced embeddings


After 0:01:39.848207 our model says we have 288 topics.
And we have 1107 documents with no topic (10%).


In [8]:
# Render the fitted model's built-in topic visualization as a 1000x1000 figure.
large_model.visualize_topics(width=1000, height=1000)

In [9]:
from math import log10
from plotly.express import scatter

def visualize(model, term):
    """Scatter-plot the words of the topics that best match ``term``.

    Asks ``model.find_topics`` for the 20 topics closest to ``term`` and plots
    every word of those topics: the topic's match score on the x axis, the
    word's weight within its topic on the y axis (both log-scaled), coloured
    by ``log10(score * weight)``.

    Parameters
    ----------
    model
        Fitted topic model exposing ``find_topics(search_term=, top_n=)`` and
        ``get_topic(topic=)``; the latter must yield ``(word, weight)`` pairs.
    term : str
        Search term; also used as the figure title.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    columns = ['word', 'topic_probability', 'word_weight', 'overall_weight']
    records = []
    topics, probabilities = model.find_topics(search_term=term, top_n=20)
    for topic, probability in zip(topics, probabilities):
        for word_result in model.get_topic(topic=topic):
            token, weight = word_result[0], word_result[1]
            if probability <= 0 or weight <= 0:
                # log10 of a non-positive product raises ValueError, and such a
                # point cannot be placed on the log-scaled axes anyway.
                continue
            records.append({
                'word': token,
                'topic_probability': probability,
                'word_weight': weight,
                'overall_weight': log10(probability * weight),
            })
    # Passing `columns` keeps the expected columns even when no point survives.
    scatter(data_frame=pd.DataFrame(data=records, columns=columns),
            x='topic_probability', y='word_weight', hover_name='word', color='overall_weight',
            log_x=True, log_y=True, color_continuous_scale='Reds', title=term,
           ).show()

# Plot the words of the topics that best match the search term 'grunt'.
visualize(model=large_model, term='grunt')

In [10]:
# Same plot for the search term 'hopper'.
visualize(model=large_model, term='hopper')

In [11]:
# Same plot for the search term 'chuckle'.
visualize(model=large_model, term='chuckle')