In [None]:
from bertopic import BERTopic
import pandas as pd

# Location of the r/wallstreetbets posts CSV; the path points into a mounted
# Google Drive under /content, so it must be adjusted when run elsewhere.
dataset_path = '/content/drive/MyDrive/Datasets/reddit_wsb.csv'

In [None]:
# Read the posts; the first CSV column becomes the row index
# (comma is already pandas' default field separator).
df = pd.read_csv(dataset_path, index_col=0)

In [None]:
# Preview the loaded frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0_level_0,score,id,url,comms_num,created,body,timestamp
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"It's not about the money, it's about sending a message. 🚀💎🙌",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1.611863e+09,,2021-01-28 21:37:41
Math Professor Scott Steiner says the numbers spell DISASTER for Gamestop shorts,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1.611862e+09,,2021-01-28 21:32:10
Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1.611862e+09,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
NEW SEC FILING FOR GME! CAN SOMEONE LESS RETARDED THAN ME PLEASE INTERPRET?,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1.611862e+09,,2021-01-28 21:28:57
"Not to distract from GME, just thought our AMC brothers should be aware of this",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1.611862e+09,,2021-01-28 21:26:56
...,...,...,...,...,...,...,...
Save The American Family. Short Lumber Futures (LBSN1).,5,nw4b9g,https://www.reddit.com/r/wallstreetbets/commen...,18,1.623296e+09,"As everyone one here knows, many commodities h...",2021-06-10 06:35:52
Guh,32,nw4afg,https://i.redd.it/4skl1uwjla471.jpg,41,1.623296e+09,,2021-06-10 06:34:52
$NOK Nok NOK nok'n down heaven's door,43,nw48t7,https://i.redd.it/ftfjlr87la471.gif,14,1.623296e+09,,2021-06-10 06:32:56
Lordstown Motors gains on report of capital raise,30,nw46j0,https://newsfilter.io/a/0f45da878cd91d88433f55...,28,1.623296e+09,,2021-06-10 06:30:21


In [None]:
# Discard rows whose 'body' is missing so only posts with text remain.
# NOTE: this mutates `df` in place, so re-displaying `df` afterwards shows the
# filtered frame, not the one read from disk.
df.dropna(subset=['body'], inplace=True)

# Preprocessing

In [None]:
import re

def remove_urls(text):
    """Return ``text`` with every ``http://``/``https://`` or ``www.`` link removed.

    A link is matched from its prefix up to the next whitespace character;
    the surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Literal substrings stripped from the post bodies: markdown markup, leftover
# HTML entities and typographic quotes. Entries are applied in list order, so
# '#' is removed before '&x200B' (which lets '&#x200B' be caught as well).
# Each entry must be its own list element: adjacent string literals without a
# comma are silently concatenated by Python into a single entry.
other_chars = ['*', '#', '&x200B', '[', ']', '; ', ' ;', "&nbsp", "“", "”", "x200b"]
def remove_other_chars(x: str):
    """Remove every substring listed in ``other_chars`` from ``x``.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with all occurrences of each entry deleted, processed in the
        order the entries appear in ``other_chars``.
    """
    for char in other_chars:
        x = x.replace(char, '')

    return x

def trim(x):
    """Return ``x`` without leading or trailing whitespace."""
    return x.strip()

In [None]:
def lower_case_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_emoji(string):
    """Return ``string`` with emoji and other symbol characters removed.

    The character class below is intentionally kept exactly as it was, but
    note how broad it is: the ranges U+24C2-U+1F251 and U+10000-U+10FFFF
    overlap most of the others, so in practice every code point from U+24C2
    upwards (which includes CJK text, not only emoji) is deleted, plus the
    four individual characters U+200D, U+231A, U+23CF and U+23E9.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicator letters (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (repeated entry; harmless inside a class)
        "\U000024C2-\U0001F251",  # very wide span, see docstring
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(stripped_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', string)

In [None]:
# Cleaning steps, applied to every post body in this order.
funcs = [remove_urls, remove_emoji, remove_other_chars, lower_case_text, trim]

# Chain the steps on a local Series and write the column back once at the end.
cleaned_body = df['body']
for fun in funcs:
    cleaned_body = cleaned_body.apply(fun)
df['body'] = cleaned_body

In [None]:
# Plain Python list of the cleaned post bodies, used as the model's documents.
data = df['body'].tolist()

In [None]:
# --- Run configuration ---------------------------------------------------
# Name of the sentence-embedding model handed to BERTopic.
embedding_model = "paraphrase-MiniLM-L6-v2"
# Where the topic model is read from / written to.
model_path = '/content/drive/MyDrive/DL Experiments/NLP/WSBert'
# load=True reads an existing model from `model_path` instead of building a new one.
load = False
# save=True writes the model to `model_path` at the end of the notebook.
save = True

In [None]:
if load:
    # Reuse the persisted model as-is: only assign the documents to its
    # existing topics. Calling fit_transform here would retrain the model from
    # scratch and throw away everything that was just loaded.
    topic_model = BERTopic.load(model_path, embedding_model=embedding_model)
    topics, _ = topic_model.transform(data)
else:
    # Train a fresh model on the cleaned post bodies.
    topic_model = BERTopic(
        language="english",
        calculate_probabilities=True,
        embedding_model=embedding_model,
        nr_topics=50,
        n_gram_range=(1, 2))
    topics, _ = topic_model.fit_transform(data)

In [None]:
# Get the most frequent topics
topic_freq = topic_model.get_topic_freq()
# Topic -1 collects the documents that were not assigned to any topic.
# Summing over a boolean mask (instead of taking `.iloc[0]`) also works when
# no such row exists: the outlier count is then 0 and no topic is subtracted.
is_outlier = topic_freq['Topic'] == -1
outliers = topic_freq.loc[is_outlier, 'Count'].sum()
n_topics = (~is_outlier).sum()
print(f"{outliers} documents have not been classified")
print(f"The other {topic_freq['Count'].sum() - outliers} documents are {n_topics} topics")

10011 documents have not been classified
The other 12826 documents are 50 topics


In [None]:
# Show the first five rows of the topic frequency table.
topic_freq.head(n=5)

Unnamed: 0,Topic,Count
0,-1,10011
1,0,731
2,1,716
3,2,645
4,3,615


In [None]:
# Row 1 is read as the largest real topic; this assumes row 0 is the
# outlier topic (-1), as in the table displayed above.
top_count = topic_freq['Count'].iloc[1]
top_topic_id = topic_freq['Topic'].iloc[1]
print(f"There are {top_count} documents that are talking about topic ID {top_topic_id}")

There are 731 documents that are talking about topic ID 0


In [None]:
# Terms and weights for the topic in the second row of the frequency table
# (assumes the first row is the outlier topic -1).
second_row_topic = topic_freq['Topic'].iloc[1]
topic_model.get_topic(second_row_topic)

[('amc', 0.0341975689140426),
 ('shares', 0.008474022918010601),
 ('gme', 0.008309746548079313),
 ('are', 0.006577739279598116),
 ('at', 0.006458138735048932),
 ('will', 0.005951601492204748),
 ('buy', 0.0057156037287010965),
 ('amc is', 0.005490058057845866),
 ('stock', 0.005452807481736496),
 ('if', 0.005277048005082728)]

In [None]:
# Render BERTopic's built-in topic visualization for the fitted model.
topic_model.visualize_topics()

In [None]:
# Persist the topic model when saving is enabled. The embedding model is not
# stored with it (save_embedding_model=False), so it has to be supplied again
# on load, as the BERTopic.load call earlier in this notebook does.
if save:
  topic_model.save(model_path, save_embedding_model=False)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.

