In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from bertopic.representation import KeyBERTInspired
import umap
from umap import UMAP
from hdbscan import HDBSCAN
import re
import nltk
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import unicodedata


## 0. Preprocessing

In [2]:

# Load data from CSV
all_reddit = pd.read_csv("reddit/all-reddit-final.csv")

# Load stopwords list
with open("stopwords-en.txt", "r") as f:
    stopwords_list = f.read().splitlines()

# Additional custom stopwords
custom_stopwords = list(ENGLISH_STOP_WORDS) + stopwords_list + [
    "uk", "reddit", "subreddit", "subreddits", "sub", "post", "comment", "comments",
    "posts", "thread", "threads", "user", "users", "people", "person",
    "askuk", "moderator", "mod", "moderators", "question", "questions",
    "questioning", "questionings", "answer", "include", "including",
    "includes", "lot", "remove", "removed", "removes", "removing",
    "guide", "common", "messaging", "message", "messages", "appeal", 
    "discussion", "discussions", "discuss", "discussing", "discussed", 
    "diy", "survey", "surveys", "surveying", "subject", "issues", "overcome", 
    "rule", "rules", "read", "search", "engine", "ranting", "vent", "vents", 
    "vented", "venting", "advertising", "title", "light-hearted", "shitposts", 
    "catch-all", "check", "sidebar", "specialised", "specialisation", "delete", 
    "vent", "explicitly", "throwaway", "genuine", "non-genuine", "wikitravel", 
    "tourist", "tripadvisor", "google", "visiting", "suggestions", "suggestion", "submission",
    "submissions", "submit", "submitting", "submitted", "submitter", "easily", "online", 
    "figure", "repetitive", "closed", "answered", "answers", "mark", "ensure", "understand", "gt", "link", "paywall"]


# Preprocessing function to clean text
import re
import unicodedata

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove words with both letters and numbers
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    
    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)
    
    # Remove specific patterns (e.g., '#', 'â', 'î', 'û')
    text = re.sub(r'#', '', text)
    text = re.sub(r'[âîû]', '', text)
    
    # Normalize unicode characters to NFKD form and remove diacritics
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')
    
    # Remove any remaining non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Remove words with 2 or fewer characters
    text = ' '.join([word for word in text.split() if len(word) > 2])
    
    return text

# Apply preprocessing to each comment
all_reddit['clean_body'] = all_reddit['body'].apply(preprocess_text)

# Convert the cleaned text to a list
texts = all_reddit['clean_body'].tolist()


In [4]:
# write all_reddit to a csv file
all_reddit.to_csv("reddit_cleaned.csv", index=False)

## 1. Precalculate Embeddings

In [4]:
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import numpy as np
import torch

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Check if a GPU is available and if so, use it
if torch.cuda.is_available():
    embedding_model = embedding_model.to(torch.device("cuda"))

# Function to process a batch
def process_batch(batch):
    return embedding_model.encode(batch, batch_size=len(batch), show_progress_bar=False)

# Set batch size
batch_size = 64

# Generate batches
def generate_batches(texts, batch_size):
    for i in range(0, len(texts), batch_size):
        yield texts[i:i + batch_size]

# Prepare the output array
n = len(texts)
embeds = np.zeros((n, embedding_model.get_sentence_embedding_dimension()))

# Use ThreadPoolExecutor to parallelize the batch processing
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(process_batch, batch): (i, i + len(batch)) for i, batch in enumerate(generate_batches(texts, batch_size))}
    
    for future in tqdm(as_completed(futures), total=len(futures)):
        i_start, i_end = futures[future]
        embeds[i_start:i_end, :] = future.result()

# Check embeddings shape
print(embeds.shape)


100%|██████████| 258/258 [02:36<00:00,  1.65it/s]

(16459, 384)





In [15]:
# save embeddings
np.save('reddit-embeddings.npy', embeds)

In [4]:
embeds = np.load('reddit-embeddings.npy')
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

## 2. Preventing stochastic behaviour

In [5]:
umap_model = UMAP(n_neighbors=5, n_components=3, min_dist=0.1, metric='cosine', random_state=42)

## 3. Controlling number of topics

In [6]:
hdbscan_model = HDBSCAN(min_cluster_size=50, min_samples=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

## 4. Improving Default Representation

In [7]:
vectorizer_model = CountVectorizer(stop_words=custom_stopwords, min_df=2, ngram_range=(1, 2))

## 5. KeyBERT Representation

In [8]:
# Create your representation model
representation_model = KeyBERTInspired()

## 6. Training

In [9]:
reddit_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    

    # Hyperparameters
    top_n_words=15,
    verbose=True,
    calculate_probabilities=True
)

topics, probs = reddit_model.fit_transform(texts, embeds)

2024-06-28 13:51:03,494 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-06-28 13:51:18,896 - BERTopic - Dimensionality - Completed ✓
2024-06-28 13:51:18,897 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-28 13:51:26,445 - BERTopic - Cluster - Completed ✓
2024-06-28 13:51:26,450 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-28 13:51:39,602 - BERTopic - Representation - Completed ✓


## Checkpoint: Save base model

In [12]:
# Save model
reddit_model.save("base_reddit_model", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)


## Reload base model

In [195]:
# load reddit model
reddit_model = BERTopic.load("base_reddit_model", embedding_model=embedding_model)

In [196]:
# base model topics
reddit_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5352,-1_migrants_immigration_immigrants_visas,"[migrants, immigration, immigrants, visas, asy...",
1,0,432,0_migrants_asylum seekers_refugees_migrant,"[migrants, asylum seekers, refugees, migrant, ...",
2,1,401,1_migrants_asylum seekers_immigration_refugees,"[migrants, asylum seekers, immigration, refuge...",
3,2,364,2_labour_brexit_migrants_tory,"[labour, brexit, migrants, tory, nhs, tories, ...",
4,3,269,3_citizenship_northern ireland_irish_ireland,"[citizenship, northern ireland, irish, ireland...",
...,...,...,...,...,...
106,105,51,105_brexit_nhs_voted brexit_brexiters,"[brexit, nhs, voted brexit, brexiters, immigra...",
107,106,51,106_asylum seekers_migrants_immigration_immigrate,"[asylum seekers, migrants, immigration, immigr...",
108,107,50,107_immigration britain_migrants_asylum seeker...,"[immigration britain, migrants, asylum seekers...",
109,108,50,108_immigrant british_asylum seekers_refugees_...,"[immigrant british, asylum seekers, refugees, ...",


Base model contain 110 topics, with 5352 outliers. We will deal with outliers first. Let's visualise how the intertopic distance map looks with the base model, so we can see if tweaking improves the model.

In [200]:
# Visualize topics from first iteration
fig = reddit_model.visualize_topics()

fig.update_layout(title_text='Base Model')

fig.update_traces(marker=dict(color='rgba(255, 0, 0, 0.6)', 
                              line=dict(color='DarkSlateGrey', width=2)))

fig.show()

## 7. Outlier Reduction

In [201]:
# Use the "c-TF-IDF" strategy with a threshold
new_topics = reddit_model.reduce_outliers(texts, topics, strategy="c-tf-idf", threshold=0.3)

# Step 2: Apply the distributions strategy with a threshold
new_topics1 = reddit_model.reduce_outliers(texts, new_topics, strategy="distributions", threshold=0.1)

reddit_model.update_topics(texts, topics=new_topics1, vectorizer_model=vectorizer_model)

100%|██████████| 6/6 [00:03<00:00,  1.50it/s]


In [202]:
reddit_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1024,-1_settlement_foreigner_car_time,"[settlement, foreigner, car, time, visa, forei...",
1,0,456,0_asylum_country_women_refugees,"[asylum, country, women, refugees, immigration...",
2,1,413,1_visa_immigration_schengen_time,"[visa, immigration, schengen, time, country, l...",
3,2,368,2_visa_imgs_labour_time,"[visa, imgs, labour, time, immigration, govern...",
4,3,275,3_visa_british_citizenship_foreigners,"[visa, british, citizenship, foreigners, time,...",
...,...,...,...,...,...
106,105,124,105_brexit_voted_trade_political,"[brexit, voted, trade, political, vote, voters...",
107,106,93,106_court_settlement_court settlement_ethnicity,"[court, settlement, court settlement, ethnicit...",
108,107,51,107_asylum_immigration_housing_migrants,"[asylum, immigration, housing, migrants, immig...",
109,108,60,108_wfh_office_hours_week,"[wfh, office, hours, week, meetings, time, asy...",


There are now 1024 outliers. Outliers are outliers for a reason so I'm not going to force all outliers to a topic, lest my topics become less coherent

In [204]:
# Visualize topics from second iteration
fig = reddit_model.visualize_topics()

fig.update_layout(title_text='Model After Outlier Reduction')

fig.update_traces(marker=dict(color='rgba(255, 0, 0, 0.6)', 
                              line=dict(color='DarkSlateGrey', width=2)))

# Show the plot
fig.show()

In [205]:
topic_allocation_110 = pd.DataFrame(reddit_model.get_topic_info())
# save to csv
topic_allocation_110.to_csv('topic-allocation-110.csv', index=False)

## 8. Topic Reduction

In [206]:
topic_allocation_110

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1024,-1_settlement_foreigner_car_time,"[settlement, foreigner, car, time, visa, forei...",
1,0,456,0_asylum_country_women_refugees,"[asylum, country, women, refugees, immigration...",
2,1,413,1_visa_immigration_schengen_time,"[visa, immigration, schengen, time, country, l...",
3,2,368,2_visa_imgs_labour_time,"[visa, imgs, labour, time, immigration, govern...",
4,3,275,3_visa_british_citizenship_foreigners,"[visa, british, citizenship, foreigners, time,...",
...,...,...,...,...,...
106,105,124,105_brexit_voted_trade_political,"[brexit, voted, trade, political, vote, voters...",
107,106,93,106_court_settlement_court settlement_ethnicity,"[court, settlement, court settlement, ethnicit...",
108,107,51,107_asylum_immigration_housing_migrants,"[asylum, immigration, housing, migrants, immig...",
109,108,60,108_wfh_office_hours_week,"[wfh, office, hours, week, meetings, time, asy...",


### 8.1. Visa topics

In [118]:
# get visa topics

visa_topics = []
for index, row in topic_allocation_110.iterrows():
    # Check if 'visa' is the first word in the 'Representation' list
    if 'visa' in row['Representation']:
        # Append the topic number to the list (unless it is -1)
        visa_topics.append(row['Topic']) if row['Topic'] != -1 else None
    

In [120]:
# merge visa topics
reddit_model.merge_topics(texts, visa_topics)

In [121]:
reddit_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1024,-1_settlement_time_foreigner_visa,"[settlement, time, foreigner, visa, car, immig...",[never accept liability and unless you want hi...
1,0,5510,0_visa_immigration_job_time,"[visa, immigration, job, time, country, pay, b...",[the department for education dfe pushing for ...
2,1,468,1_citizenship_british_born_citizen,"[citizenship, british, born, citizen, british ...","[actually got british citizenship now, you wer..."
3,2,456,2_asylum_country_immigration_refugees,"[asylum, country, immigration, refugees, women...",[love that people people support migrating cro...
4,3,425,3_rights_human rights_human_israel,"[rights, human rights, human, israel, palestin...",[reminder that amnesty international and human...
...,...,...,...,...,...
71,70,55,70_nhs_food_immigration_health,"[nhs, food, immigration, health, limits, unlim...",[worked for big dairy the midlands sometime ar...
72,71,55,71_material_country_expat_immigration,"[material, country, expat, immigration, partne...",[check out the sponsor list from the govt webs...
73,72,54,72_living_cost_job_immigration,"[living, cost, job, immigration, citizenship, ...",[this one for your asylum seeker status not pr...
74,73,53,73_israel_jews_potatoes_haganah,"[israel, jews, potatoes, haganah, cps, jewish,...",[these aren any specific order but are just wh...


Now at 75 topics

In [122]:
topic_allocation_75 = pd.DataFrame(reddit_model.get_topic_info())
# save to csv
topic_allocation_75.to_csv('topic-allocation-75.csv', index=False)

### 8.2. Multiculturalism topics

In [125]:
# Multiculturalism Topics

multiculturalism_topics = []
for index, row in topic_allocation_75.iterrows():
    if 'multiculturalism' in row['Representation']:
        # Append the topic number to the list (unless it is -1)
        multiculturalism_topics.append(row['Topic']) if row['Topic'] != -1 else None
    

In [126]:
multiculturalism_topics

[22, 23, 53, 54]

In [127]:
# merge multiculturalism topics
reddit_model.merge_topics(texts, multiculturalism_topics)


In [128]:
reddit_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1024,-1_settlement_time_foreigner_visa,"[settlement, time, foreigner, visa, car, immig...",[never accept liability and unless you want hi...
1,0,5510,0_visa_immigration_time_job,"[visa, immigration, time, job, country, pay, b...",[the department for education dfe pushing for ...
2,1,468,1_citizenship_british_born_citizen,"[citizenship, british, born, citizen, british ...","[actually got british citizenship now, you wer..."
3,2,456,2_asylum_country_immigration_refugees,"[asylum, country, immigration, refugees, women...",[love that people people support migrating cro...
4,3,432,3_immigration_white_multiculturalism_country,"[immigration, white, multiculturalism, country...",[housing expensive result large population gro...
...,...,...,...,...,...
68,67,55,67_material_country_expat_immigration,"[material, country, expat, immigration, partne...",[check out the sponsor list from the govt webs...
69,68,55,68_nhs_food_immigration_health,"[nhs, food, immigration, health, limits, unlim...",[worked for big dairy the midlands sometime ar...
70,69,54,69_living_cost_job_immigration,"[living, cost, job, immigration, citizenship, ...",[this one for your asylum seeker status not pr...
71,70,53,70_israel_jews_potatoes_haganah,"[israel, jews, potatoes, haganah, jewish, cps,...",[these aren any specific order but are just wh...


Now at 72  topics

In [130]:
topic_allocation_72 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Representation']])
# save to csv
topic_allocation_72.to_csv('topic-allocation-72.csv', index=False)

### 8.3. Citizenship & British Identity/Culture topics

In [131]:
citizenship_topics = [1,24,55,63,69]
british_identity_topics = [5,8,54,59,64]

# merge both topics
reddit_model.merge_topics(texts, 
                          [citizenship_topics,
                           british_identity_topics])

In [132]:
reddit_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1024,-1_settlement_time_foreigner_visa,"[settlement, time, foreigner, visa, car, immig...",[said been convicted journalism after collapsi...
1,0,5510,0_visa_immigration_time_job,"[visa, immigration, time, job, country, pay, b...",[don know about miserable say more cynical rem...
2,1,890,1_immigration_country_foreigners_britain,"[immigration, country, foreigners, britain, br...",[exactly why move stay here awful you immigrat...
3,2,789,2_citizenship_british_born_english,"[citizenship, british, born, english, country,...",[you weren born here why would you want stay t...
4,3,456,3_asylum_country_immigration_refugees,"[asylum, country, immigration, refugees, women...",[holiday inn close where live one the hotels t...
...,...,...,...,...,...
60,59,59,59_bacon_south_african_south african,"[bacon, south, african, south african, aparthe...",[buy some english canadian bacon put common pe...
61,60,55,60_country_material_expat_immigration,"[country, material, expat, immigration, partne...",[check out the sponsor list from the govt webs...
62,61,55,61_nhs_immigration_food_health,"[nhs, immigration, food, health, limits, unlim...",[worked for big dairy the midlands sometime ar...
63,62,53,62_israel_jews_potatoes_haganah,"[israel, jews, potatoes, haganah, jewish, cps,...",[these aren any specific order but are just wh...


Now at 64 topics

In [134]:
topic_allocation_64 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Count', 'Representation']])
# save to csv
topic_allocation_64.to_csv('topic-allocation-64.csv', index=False)

### 8.4. Human Rights & War topics

In [135]:
# Human Rights Topics

humanrights_topics = []
for index, row in topic_allocation_64.iterrows():
    if 'human rights' in row['Representation']:
        # Append the topic number to the list (unless it is -1)
        humanrights_topics.append(row['Topic']) if row['Topic'] != -1 else None
    

In [136]:
humanrights_topics

[5, 21, 33, 51, 53]

In [137]:
# war topics
russia_topics = [19,45,55]
israel_topics = [15,29,47]

In [138]:
# merge topics
reddit_model.merge_topics(texts, 
                          [humanrights_topics,
                           russia_topics,
                           israel_topics]
                           )

Now at 55 topics

In [140]:
topic_allocation_55 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Count', 'Representation']])
# save to csv
topic_allocation_55.to_csv('topic-allocation-55.csv', index=False)

### 8.5. Merging more Israel-Palestine topics

In [141]:
refined_israel_topics = [6,25,33,45,46,54]

# merge
reddit_model.merge_topics(texts, refined_israel_topics)

Now at 51 topics

In [143]:
topic_allocation_51 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Count', 'Representation']])
# save to csv
topic_allocation_51.to_csv('topic-allocation-51.csv', index=False)

### 8.6. Asylum, Housing & NHS topics 

In [148]:
asylum_topics = [5,7,12,18,20,50]
housing_topics = [8,11,17,37]
nhs_topics = [32,49]

# merge
reddit_model.merge_topics(texts, 
                          [asylum_topics,
                           housing_topics,
                           nhs_topics])


Now at 42 topics

In [151]:
topic_allocation_42 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Count', 'Representation']])
# save to csv
topic_allocation_42.to_csv('topic-allocation-42.csv', index=False)

### 8.7. Merging more multiculturalism topics & Expat topics

In [152]:
refined_multiculturalism_topics = [7,41]
expat_topics = [9,11,21]

# merge
reddit_model.merge_topics(texts, 
                          [refined_multiculturalism_topics,
                           expat_topics])

Now at 39 topics

In [156]:
topic_allocation_39 = pd.DataFrame(reddit_model.get_topic_info()[['Topic', 'Count', 'Representation']])
# save to csv
topic_allocation_39.to_csv('topic-allocation-39.csv', index=False)

### 8.8. Merging more NHS topics & Detaining topics

In [157]:
detain_topics = [12,13,18]
refined_nhs_topics = [11,14]

# merge
reddit_model.merge_topics(texts, 
                          [detain_topics,
                           refined_nhs_topics])

Final number of topics is 36

In [182]:
topic_allocation_final = pd.DataFrame(reddit_model.get_topic_info())
# save to csv
topic_allocation_final.to_csv('topic-allocation-final.csv', index=False)

In [218]:
fig = reddit_model.visualize_topics()

fig.update_layout(title_text='Model After Topic Reduction')

fig.update_traces(marker=dict(color='rgba(230, 10, 10, 0.9)',  
                              line=dict(color='DarkSlateGrey', width=2)))

fig.show()

In [180]:
fig.write_html("final-iteration.html")

## Checkpoint: Save final BERT

In [184]:
reddit_model.save("final_reddit_model", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

## Load Final Model

In [2]:
# load reddit model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
reddit_model = BERTopic.load("final_reddit_model", embedding_model=embedding_model)

### Custom labels

In [3]:
# create manual topic labels
reddit_topic_labels = {
    -1: "0: Outliers",
    0: "1: Visa and Immigration Advice",
    1: "2: Asylum Seeking Issues",
    2: "3: British Culture",
    3: "4: Human Rights",
    4: "5: Housing Issues and Refugees",
    5: "6: British Citizenship",
    6: "7: Israel–Hamas war",
    7: "8: Expat Experiences",
    8: "9: Multiculturalism in UK",
    9: "10: Police Detention",
    10: "11: International Medical Students and NHS",
    11: "12: Russian-Ukraine War",
    12: "13: Immigration and Labour",
    13: "14: Left-wing Immigration Politics",
    14: "15: Economic Impact of Immigration",
    15: "16: Brexit and Immigration",
    16: "17: Asylum Boat Arrivals",
    17: "18: Immigration Protests",
    18: "19: European Migration",
    19: "20: Visa and Living Costs",
    20: "21: Gary Lineker Controversy",
    21: "22: Home Office",
    22: "23: Immigration in London",
    23: "24: Court settlements",
    24: "25: Legal Immigration",
    25: "26: Mass immigration",
    26: "27: Criminality and Law Enforcement",
    27: "28: Immigration Enforcement",
    28: "29: Assimilation of Immigrants",
    29: "30: Career pathways for Immigrants",
    30: "31: Mental health",
    31: "32: Policing",
    32: "33: Child Marriage",
    33: "34: Immigration and Healthy Economy",
    34: "35: Remote work and Isolation",
    35: "36: South African Apartheid and Foreigners"
}
# assign topic labels
reddit_model.set_topic_labels(reddit_topic_labels)

In [4]:
fig = reddit_model.visualize_heatmap(n_clusters=20, custom_labels=True)
fig.update_layout(
    title_text='Cosine Similarity Matrix of Reddit Topics',
    coloraxis=dict(colorscale='Redor')
)
fig.show()

In [5]:
# save plotly figure
fig.write_html("reddit-similarity.html")

In [None]:
fig = reddit_model.visualize_heatmap(n_clusters=20, custom_labels=True)
fig.update_layout(
    title_text='Cosine Similarity Matrix of Reddit Topics',
    coloraxis=dict(colorscale='Redor')
)
fig.show()

# Reddit topic assignments

In [170]:
all_reddit['topic'] = reddit_model.topics_

In [220]:
# load final news model
news_model = BERTopic.load("final_news_model", embedding_model=embedding_model)

In [221]:
# Extract topic representations from the news model
news_topics = news_model.get_topics()

# Extract topic representations from the Reddit model
reddit_topics = reddit_model.get_topics()
