In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
import umap
import re
import nltk
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import unicodedata

## 0. Pre-processing

In [None]:

# Load the raw news articles from CSV
all_news = pd.read_csv("news/all-news-final.csv")

# Load the external stopword list (one entry per line). The encoding is given
# explicitly so the result does not depend on the platform's default locale,
# and blank / whitespace-only lines are skipped so they never become stopwords.
with open("stopwords-en.txt", "r", encoding="utf-8") as f:
    stopwords_list = [word for word in (line.strip() for line in f) if word]

# Combine scikit-learn's English stopwords, the external list and the
# corpus-specific term "uk"; dict.fromkeys drops duplicates while keeping
# first-seen order.
custom_stopwords = list(dict.fromkeys(list(ENGLISH_STOP_WORDS) + stopwords_list + ["uk"]))


# Preprocessing function to clean text
import re
import unicodedata

def preprocess_text(text):
    """Normalise a raw text snippet into lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase; remove URLs; remove every token that contains
    a digit (this also covers pure numbers); remove '#' and the stray
    characters 'â', 'î', 'û'; strip diacritics; replace anything that is not
    an ASCII letter, digit or whitespace with a space; collapse whitespace.

    Parameters
    ----------
    text : str, None or float
        Raw text. Missing values (``None`` or NaN) are treated as empty text;
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing is left or the input is missing.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove every token containing a digit. A purely numeric token also
    # matches this pattern, so no separate "remove numbers" pass is needed.
    text = re.sub(r'\b\w*\d\w*\b', '', text)

    # Remove '#' and the stray characters 'â', 'î', 'û'. This has to happen
    # before diacritics are stripped below, otherwise they would be folded
    # into plain 'a', 'i', 'u' and stay in the text.
    # NOTE(review): these look like mis-decoded punctuation - confirm against the data.
    text = re.sub(r'#', '', text)
    text = re.sub(r'[âîû]', '', text)

    # Decompose to NFKD and drop combining marks (category 'Mn'), e.g. 'é' -> 'e'
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')

    # Replace any remaining character that is not an ASCII letter, digit or whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Collapse runs of whitespace and trim both ends
    text = ' '.join(text.split())

    return text

# Clean every article summary and keep the result next to the raw columns
all_news['clean_body'] = all_news['summary'].map(preprocess_text)

# Plain Python list of the cleaned summaries, in row order
texts = all_news['clean_body'].tolist()


In [None]:
# Write all_news (including the clean_body column) to CSV without the index,
# so the cleaned data can be reloaded from file instead of being recomputed
all_news.to_csv("news_cleaned.csv", index=False)

In [None]:
# Reload the cleaned data. A summary that was cleaned down to an empty string
# is written as an empty CSV field and read back as NaN (a float), so missing
# values are turned back into "" to keep every entry of `texts` a string.
all_news = pd.read_csv("news_cleaned.csv")
all_news['clean_body'] = all_news['clean_body'].fillna('').astype(str)
texts = all_news['clean_body'].tolist()

## 1. Precalculate Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
# Encode all cleaned texts once up front; the resulting vectors are handed to
# BERTopic later instead of being recomputed on every fit
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(
    texts,
    show_progress_bar=True,
)

In [None]:
# Save the embeddings to disk so they can be loaded with np.load instead of
# being re-encoded
np.save('news-embeddings.npy', embeddings)

In [16]:
import numpy as np
# Restore the cached document vectors from disk ...
embeddings = np.load('news-embeddings.npy')

# ... and instantiate the sentence encoder that is passed to BERTopic below
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

## 2. Preventing stochastic behaviour

In [None]:
from umap import UMAP
# A fixed random_state makes the dimensionality reduction repeatable between runs
umap_model = UMAP(
    n_neighbors=15,
    n_components=3,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)

## 3. Controlling number of topics

In [17]:
from hdbscan import HDBSCAN
# min_cluster_size is the knob used here to control how many topics come out
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

## 4. Improving Default Representation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Topic words: unigrams and bigrams, minus the custom stopwords, keeping only
# terms with a document frequency of at least 2
vectorizer_model = CountVectorizer(
    stop_words=custom_stopwords,
    min_df=2,
    ngram_range=(1, 2),
)

## 5. Training

In [None]:
from bertopic import BERTopic
# Assemble the topic model from the components configured in the cells above
news_model = BERTopic(
    # Hyperparameters
    top_n_words=15,
    calculate_probabilities=True,
    verbose=True,

    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)

# Fit on the cleaned texts, passing the precomputed embeddings alongside them
topics, probs = news_model.fit_transform(texts, embeddings)

In [None]:
# Show the topic overview of the freshly fitted model as the cell output
news_model.get_topic_info()

## 6. Outlier Reduction

In [None]:
# Reduce outliers in two passes: first reassign outlier documents with the
# c-TF-IDF strategy (threshold 0.10), then run the result through the
# "probabilities" strategy using the probabilities returned by fit_transform.
new_topics = news_model.reduce_outliers(
    texts,
    topics,
    strategy="c-tf-idf",
    threshold=0.10,
)
new_topics1 = news_model.reduce_outliers(
    texts,
    new_topics,
    probabilities=probs,
    strategy="probabilities",
)

# Update the topic representations for the new assignments. top_n_words is
# passed explicitly because update_topics otherwise falls back to its own
# default and overwrites the value the model was configured with.
news_model.update_topics(
    texts,
    topics=new_topics1,
    top_n_words=news_model.top_n_words,
    vectorizer_model=vectorizer_model,
)

In [None]:
# Export the model's topic info table to CSV (index column omitted)
news_topics = pd.DataFrame(data=news_model.get_topic_info())
news_topics.to_csv(
    'news-topics.csv',
    index=False,
)

In [None]:
# Show the topic overview again, now reflecting the updated topic assignments
news_model.get_topic_info()

## Checkpoint: Save Final BERTopic Model

In [None]:
# Save the final model as safetensors, including the c-TF-IDF matrix.
# With safetensors serialization, save_embedding_model is meant to be a string
# pointing to the Hugging Face model, so the model name is passed rather than
# the in-memory SentenceTransformer object.
news_model.save(
    "final_news_model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)

## Load Final Model

In [3]:
# Load the saved news topic model, supplying the sentence encoder explicitly
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
news_model = BERTopic.load(
    "final_news_model",
    embedding_model=embedding_model,
)

In [9]:
# Intertopic distance map.
# NOTE(review): custom_labels=True relies on labels registered through
# set_topic_labels, which happens in the "Custom labels" cell further down -
# confirm that cell has been run before this one.
fig = news_model.visualize_topics(custom_labels=True)

# Replace the default title
fig.update_layout(title_text='Intertopic Distance Map of Immigration News Topics')

# Restyle the bubbles: translucent teal fill with a dark outline
bubble_style = {
    'color': 'rgba(0, 128, 128, 0.6)',
    'line': {'color': 'DarkSlateGrey', 'width': 2},
}
fig.update_traces(marker=bubble_style)

# Render the figure
fig.show()

In [10]:
# Save the current figure (`fig`) as a standalone HTML file
fig.write_html("news_intertopic_map.html")

### Custom labels

In [6]:
# Hand-written labels keyed by topic id (0-25); list position == topic id,
# and each label carries a 1-based display number as its prefix.
news_topic_labels = dict(enumerate([
    "1: Rwanda Court Ruling",
    "2: Asylum Seekers Accommodation",
    "3: Taliban Convictions",
    "4: Braverman Immigration Policy",
    "5: Migration Record Figures",
    "6: Rishi Sunak Politics",
    "7: Illegal Migration Legislation",
    "8: Migrants Crossing",
    "9: RAF Scampton Asylum",
    "10: Migrant Boat Tragedies",
    "11: Bibby Stockholm Barge",
    "12: EU Migration Deal",
    "13: UK Police Crimes",
    "14: Protests at Hotels",
    "15: Migrant Smuggling Gangs",
    "16: Gary Lineker Controversy",
    "17: Foreign Worker Visas",
    "18: Asylum Claims Backlog",
    "19: Missing Unaccompanied Children",
    "20: Legionella on Barge",
    "21: Portland Asylum Barge",
    "22: Stopping Migrant Boats",
    "23: Emmanuel Macron Politics",
    "24: Foreign University Students",
    "25: European Human Rights",
    "26: Archbishop on Migration",
]))

# Register the labels on the model so plots called with custom_labels=True use them
news_model.set_topic_labels(news_topic_labels)

In [4]:
# Hierarchical clustering of the topics, drawn with the custom labels
fig = news_model.visualize_hierarchy(custom_labels=True)

# Centred, top-anchored title in a larger font
fig.update_layout(
    title=dict(
        text="Hierarchical Clustering of News Topics",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
        font=dict(size=20),
    )
)
fig.show()

In [11]:
# Topic similarity heatmap with n_clusters=5, using the custom labels
fig = news_model.visualize_heatmap(
    n_clusters=5,
    custom_labels=True,
)

# Retitle the figure and switch the colour axis to the 'Darkmint' scale
fig.update_layout(
    title_text='Cosine Similarity Matrix of News Topics',
    coloraxis={'colorscale': 'Darkmint'},
)
fig.show()

In [12]:
# Save the current plotly figure (`fig`) as a standalone HTML file
fig.write_html("news-similarity.html")

There is a dark square at the bottom right showing a cluster of similar topics. Let's visualise these more clearly.

In [11]:
# Similarity heatmap with n_clusters=6 and the custom labels.
# NOTE(review): no `topics=` subset is passed, so this call covers every topic
# and is identical to the call in the "UK and EU Migration Policies" cell; the
# surrounding text suggests a single cluster was meant - confirm.
fig = news_model.visualize_heatmap(
    n_clusters=6,
    custom_labels=True,
)

# Title, 'Darkmint' colour scale, and no colour bar
fig.update_layout(
    title_text='Similarity of News Topics on Migrants',
    coloraxis={'colorscale': 'Darkmint', 'showscale': False},
)

# Print each similarity value (two decimals) inside its cell
fig.update_traces(
    texttemplate='%{z:.2f}',
    textfont={'size': 10},
)

fig.show()

A similar cluster exists for topics on UK and EU migration policies.

In [5]:
# Similarity heatmap with n_clusters=6 and the custom labels.
# NOTE(review): no `topics=` subset is passed, so this call covers every topic
# and is identical to the call in the "News Topics on Migrants" cell; the
# surrounding text suggests a single cluster was meant - confirm.
fig = news_model.visualize_heatmap(
    n_clusters=6,
    custom_labels=True,
)

# Title, 'Darkmint' colour scale, and no colour bar
fig.update_layout(
    title_text='Similarity of News Topics on UK and EU Migration Policies',
    coloraxis={'colorscale': 'Darkmint', 'showscale': False},
)

# Print each similarity value (two decimals) inside its cell
fig.update_traces(
    texttemplate='%{z:.2f}',
    textfont={'size': 10},
)

fig.show()