In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install bertopic

env: TOKENIZERS_PARALLELISM=false
Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/143.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: hdbscan,

In [2]:
import pandas as pd
# we are doing word stuff so we don't really need any of the numerical columns
usecols=['text', 'speaker', 'season', 'episode', 'scene', 'utterance'][:2]
df = pd.read_csv(filepath_or_buffer='/kaggle/input/friends/friends.csv', usecols=usecols)
# we are going to throw away very short documents
df['token count'] = df['text'].apply(lambda x: len(x.split()))
df = df[df['token count'] > 2].drop(columns=['token count'])
df.head()

Unnamed: 0,text,speaker
0,There's nothing to tell! He's just some guy I ...,Monica Geller
1,"C'mon, you're going out with the guy! There's ...",Joey Tribbiani
2,"All right Joey, be nice. So does he have a hum...",Chandler Bing
3,"Wait, does he eat chalk?",Phoebe Buffay
4,"(They all stare, bemused.)",Scene Directions


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54751 entries, 0 to 67371
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     54751 non-null  object
 1   speaker  54672 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [4]:
# let's build our stopword collection
# we collected these from the wordcloud package in a previous run
wordcloud_stopwords = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever', 'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'hence', 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'therefore', 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']
# let's add in some stopwords from looking at topic -1 in prior runs
prior_stopwords = ['Oh', 'Im', 'dont', 'know', 'No', 'What', 'great', 'believe', 'Well', 'thing', 'um', 'feel', 'way', 'uh', 'youre', 'You', 'Youre',
                  'thats', 'theres', 'right', 'think', 'Yeah', 'We', 'said', 'Ooh', 'things', 'little', 'gonna', 'want',
                   'And', 'one', 'got', 'The', 'really', 'So', 'All', 'Its', 'go', 'see', 'Okay', 'Thats', 'need',  'guys', 'Go',  'back', 'Ill', 'wanna', 'mean', 'cant', 'play', 'Hey', 'going',
                   'getting', 'Scene', 'look', 'now', 'tell', 'good', 'us', 'take', 'time', 'well', 'something', 'Hi', 'Look', 'didnt',
                   'Do', 'lot', 'better', 'new', 'guy', 'much', 'big', 'This', 'give', 'maybe', 'But', 'That', 'make', 'Yknow', 'sorry', 'say', 'Just', 'help',
                   'Ive', 'talking', 'will', 'II', 'come', 'Wow', 'never', 'yknow', 'okay', 'wait', 'night', 'nothing', 'together', 'actually', 'hundred', 'yeah', 'man', 'enough',
                  'uhoh', 'Uhoh', 'OK', 'iswhat', 'Are', 'knowJust', ]
stopwords = sorted(wordcloud_stopwords + prior_stopwords)
print(stopwords[:20])

['All', 'And', 'Are', 'But', 'Do', 'Go', 'Hey', 'Hi', 'II', 'Ill', 'Im', 'Its', 'Ive', 'Just', 'Look', 'No', 'OK', 'Oh', 'Okay', 'Ooh']


In [5]:
from arrow import now
from bertopic import BERTopic
from plotly.express import histogram
from sklearn.feature_extraction.text import CountVectorizer

# we can dial this up and down for testing; a full run takes nearly seven minutes
SAMPLE_FRACTION = 0.99
time_start = now()
large_model = BERTopic(verbose=True, top_n_words=20, nr_topics=None, language='english', 
                       vectorizer_model=CountVectorizer(stop_words=stopwords, lowercase=False))
large_topics, large_probs = large_model.fit_transform(df['text'].sample(frac=SAMPLE_FRACTION, random_state=2023).dropna().values)
print('After {} our topic says we have {} topics.'.format(now() - time_start, len(set(large_topics))))
print('And we have {} documents with no topic ({}%).'.format(large_topics.count(-1), round(100 * large_topics.count(-1)/len(large_topics))))
histogram(x=[item for item in large_topics if item > -1], title = 'topic > -1 histogram')

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1694 [00:00<?, ?it/s]

2023-10-20 16:21:03,696 - BERTopic - Transformed documents to Embeddings
2023-10-20 16:22:01,944 - BERTopic - Reduced dimensionality
2023-10-20 16:22:05,320 - BERTopic - Clustered reduced embeddings


After 0:05:13.536782 our topic says we have 771 topics.
And we have 17368 documents with no topic (32%).


In [6]:
from plotly.express import scatter
def visualize(arg_model, search_term):
    words = []
    values = []
    topic_result = arg_model.get_topic(arg_model.find_topics(search_term=search_term, top_n=1)[0][0],)
    for item in topic_result:
        if item[0] not in {search_term, search_term + 's'}: # drop self and possessive/plural
            words.append(item[0])
            values.append(item[1])
    scatter(x=words, y=values, title=search_term).show()

for term in ['Phoebe', 'Rachel', 'Chandler', 'relationship', 'breakup', 'Marcel', 'U2', 'leather', 'armadillo']:
    visualize(arg_model=large_model, search_term=term)