In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bertopic import BERTopic
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [1]:
import numpy as np
import re, string
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt

def NER(df):
    #remove web characters
    df['tweet'] = df['text'].replace(regex = {'\n': ' ', 
                                    '<br>': ' ', 
                                    '<\br>': ' ',
                                    '<b>': ' ',
                                    '<\b>': ' ',
                                    '&quot;': '"',
                                    '&#39;': '"',
                                    '&amp;': '&'})

    #create new column with '@'s
    df["mentions"] = df['tweet'].apply(lambda x: re.findall(r'@[^ ]+', str(x)))

    #remove 'RT'
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'^RT[\s]+', '', str(x)))

    #remove hyperlinks
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'https?://[^\s\n\r]+', '', str(x)))

    #remove #'s
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'#', '', str(x)))

    #remove duplicates
    df = df.drop_duplicates(subset=['text'], keep=False)

    #remove tweets of len < 2
    df['len'] = df['tweet'].apply(lambda x: len(x))
    df = df[df['len'] > 1]

    return df

def TopicExtraction(df):
    #remove stopwords
    stop_words = stopwords.words('english')
    punctuation = string.punctuation 
    punctuation += "’"
    punctuation += "—"
    stopwords_dict = Counter(stop_words)
    df['tokens'] = df['tweet'].apply(lambda x: [word for sent in sent_tokenize(x) for word in word_tokenize(sent)])
    df['stopwords'] = df['tokens'].apply(lambda x: [w for w in x if w.lower() not in stopwords_dict])
    df['stopwords'] = df['stopwords'].apply(lambda x: [w for w in x if w.lower() not in punctuation])
    df['tweet'] = df['stopwords'].apply(lambda x: ' '.join(x))

    #lemmatize
    lemmatizer = WordNetLemmatizer()    
    df['tweet'] = df['tweet'].apply(lambda x: [lemmatizer.lemmatize(word, pos ='v') for word in word_tokenize(x)])
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x))
    return df

In [5]:
df = pd.read_excel("results-01.xlsx")
df = NER(df)
df = TopicExtraction(df)

In [20]:
def bert_model(df):
    tweets = df['tweet'].tolist()
    model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
    topics, probs = model.fit_transform(tweets)
    model.visualize_topics()
    model.visualize_distribution(probs[200], min_probability=0.02)
    return model

def bert_freq(model, n):
    freq = model.get_topic_info()
    top_freqs = freq.sort_values(by=['Count'], ascending=False).head(n)
    print(top_freqs)
    #save top_freqs data
    #os.makedirs('results', exist_ok=True)  
    #top_freqs.to_json('results/frequent_topics.json')
    return None

In [7]:
model = bert_model(df)
bert_freq(model)
#-1 refers to all outliers and should typically be ignored

Batches: 100%|████████████████████████████████| 933/933 [01:58<00:00,  7.87it/s]
2023-06-20 12:23:32,837 - BERTopic - Transformed documents to Embeddings
2023-06-20 12:23:53,090 - BERTopic - Reduced dimensionality
2023-06-20 12:26:30,860 - BERTopic - Clustered reduced embeddings


    Topic  Count                                               Name   
0      -1  10387                    -1_vote_democrats_work_families  \
1       0    777                         0_student_debt_loan_cancel   
2       1    640                    1_border_illegal_alien_southern   
3       2    602                  2_abortion_ban_reproductive_women   
4       3    517                   3_ukraine_putin_russia_ukrainian   
5       4    415                  4_climate_reduction_change_planet   
6       5    411                        5_oil_energy_gas_production   
7       6    320  6_housegop_commitment_commitmenttoamerica_acco...   
8       7    305                    7_crime_bail_criminals_cashless   
9       8    287                      8_judge_jackson_ketanji_brown   
10      9    278                       9_union_workers_unions_labor   
11     10    233               10_drug_prescription_pharma_medicare   
12     11    218                     11_china_communist_chinese_ccp   
13    

In [22]:
topics = bert_freq(model, 6)

   Topic  Count                               Name   
0     -1  10387    -1_vote_democrats_work_families  \
1      0    777         0_student_debt_loan_cancel   
2      1    640    1_border_illegal_alien_southern   
3      2    602  2_abortion_ban_reproductive_women   
4      3    517   3_ukraine_putin_russia_ukrainian   
5      4    415  4_climate_reduction_change_planet   

                                      Representation   
0  [vote, democrats, work, families, get, join, h...  \
1  [student, debt, loan, cancel, relief, borrower...   
2  [border, illegal, alien, southern, bidenborder...   
3  [abortion, ban, reproductive, women, care, rig...   
4  [ukraine, putin, russia, ukrainian, war, russi...   
5  [climate, reduction, change, planet, crisis, e...   

                                 Representative_Docs  
0  [LIVE go build economy work us workers organiz...  
1  [Good morning ☀ Today would great day cancel s...  
2  [first time history 2 million migrants encount...  
3  ['m l

In [16]:
model.visualize_topics()

In [39]:
model.visualize_heatmap(top_n_topics=10)

- Topic 0: Student Debt
- Topic 1: Border
- Topic 2: Abortion
- Topic 3: Russia-Ukraine War
- Topic 4: Climate change
- Topic 5: Oil


In [29]:
model.visualize_hierarchy(top_n_topics=6)

In [34]:
model.visualize_barchart(top_n_topics=6)

### Topics Over Time

In [44]:
#https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html#visualization