# Topic modelling using BERTopic

## Libraries/data required

In [36]:
# IMPORTS
from bertopic import BERTopic
import os
import pandas as pd
import spacy

In [37]:
# Load the cleaned article summaries; the "date" column is parsed as datetimes
df = pd.read_csv(
    "data/articles_summary_cleaned.csv",
    parse_dates=["date"],
)

# Plain Python list of every article summary, used as input for the text steps below
docs = df["summary"].to_list()

print(df.shape)  # (number of rows, number of columns)

df.head()  # Preview the first 5 rows

(18520, 5)


Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125
1,The article discusses the military actions tak...,2011-07-03,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011-06-30,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011-07-04,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125


In [38]:
# Load the spaCy English pipeline
# (install it once by typing 'python -m spacy download en_core_web_sm' in a terminal)
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    Parameters:
        text: a string to run through the loaded spaCy pipeline `nlp`

    Return: a single string of the token lemmas joined by single spaces
    """
    return " ".join(token.lemma_ for token in nlp(text))


# Lemmatize every summary and convert it to lowercase.
# nlp.pipe processes the texts as a batched stream and yields the results in input
# order, which avoids the per-call overhead of running nlp() once per document.
lemmatized_documents = [
    " ".join(token.lemma_ for token in doc).lower()
    for doc in nlp.pipe(docs)
]

## Fitting BERTopic

This might take a while on a CPU. In the background, a pre-trained language model (the sentence embedder) converts each article summary into a vector in a semantic space. We then cluster the summaries in this space.

In [39]:
# Fit a new model only when no saved model is found on disk; otherwise reuse the saved one
if not os.path.exists('southsudan_model'):
    # Initialize the BERTopic model
    bertopic = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
    )

    # Fit the model on the lemmatized summaries, then store it for later runs
    bertopic.fit_transform(lemmatized_documents)
    bertopic.save("southsudan_model")
else:
    bertopic = BERTopic.load('southsudan_model')

Batches: 100%|██████████| 579/579 [08:16<00:00,  1.17it/s]
2023-09-11 11:21:40,775 - BERTopic - Transformed documents to Embeddings
2023-09-11 11:21:50,277 - BERTopic - Reduced dimensionality
2023-09-11 11:23:04,770 - BERTopic - Clustered reduced embeddings


In [40]:
# The model is modular and several of its steps are random, which hinders reproducibility.
# One way to counter this is to fix the random state of the dimensionality reduction step,
# as shown below; feel free to explore a different approach.

from bertopic import BERTopic
from umap import UMAP

# UMAP configured with a fixed seed
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# A BERTopic model that uses the seeded UMAP instance
topic_model = BERTopic(umap_model=umap_model)

## Interactive visualization of the vector space

As you can see, documents with related topics are close in the space.

In [None]:
# Interactive plot of the documents in the vector space.
# NOTE(review): the model is fitted on `lemmatized_documents`, while the raw `docs`
# are passed here — confirm that this is intended.
bertopic.visualize_documents(docs) # Create a plot of the topics, this may take a while

### Creating smaller topics

Within our list of topics, we find the topics that are semantically closest to a list of keywords for each of the following themes:

"Hunger", "Refugees", "Humanitarian", "Conflict", "Natural disaster", and "Agriculture".

**Feel free to change this approach!**

In [41]:
# We create a function that ranks the topics most relevant to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, threshold, top_n=10):
    '''
    Retrieve the topics most relevant to the provided (list of) keyword(s)


    Parameters:
        bertopic_model: a (fitted) BERTopic model object; only its
                    find_topics(keyword) method is used, which must return a pair
                    (topic ids, relevancy values)

        keywords:   a string containing one or multiple keywords to match against,

                    This can also be a list in the form of ['keyword(s)', 'keyword(s)', ...]

                    In this case find_topics is called once per list element and the
                    results of all elements are pooled before ranking.

                    !!!
                    Take care that this method only considers the relevancy per inputted keyword(s)
                    and not the relevancy to the combined list of keywords.

                    In other words, topics that appear in the output might be significantly related to a
                    particular element in the list of keywords but not so to any other element,

                    while topics that do not appear in the output might be significantly related to the
                    combined list of keywords but not much to any of the keyword(s) in particular.
                    !!!

        threshold:  the minimum relevancy a topic must reach for at least one keyword
                    to be included in the output

        top_n:      an integer indicating the maximum number of relevant topics to be
                    retrieved (default 10); pass None to return every topic that
                    reaches the threshold


        Return: a list of at most top_n (topic_id, relevancy) tuples, sorted on descending
                relevancy, where relevancy is the highest value found for that topic
                across all keywords
    '''

    # If a single string is provided convert it to list type
    if isinstance(keywords, str):
        keywords = [keywords]

    best_relevancy = {} # Highest relevancy seen so far for every topic that reaches the threshold

    for keyword in keywords: # Iterate through list of keywords

        # Find the topics related to the current keyword(s)
        topic_ids, relevancies = bertopic_model.find_topics(keyword)

        for topic_id, relevancy in zip(topic_ids, relevancies):
            # Keep a topic only if it reaches the threshold, and keep its best score
            # when several keywords match the same topic
            if relevancy >= threshold and relevancy > best_relevancy.get(topic_id, float("-inf")):
                best_relevancy[topic_id] = relevancy

    # Sort the unique topics on DESCENDING ORDER of relevancy
    ranked_topics = sorted(best_relevancy.items(), key=lambda item: item[1], reverse=True)

    return ranked_topics[:top_n] # Return a list of the top_n unique relevant topics

In [42]:
# Find the (at most 10) topics whose relevancy to any hunger-related keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "hunger", "starvation", "malnutrition", "food insecurity", "poverty",
        "famine", "food shortage", "nutrient deficiency", "undernourishment",
        "hunger relief", "food assistance", "food distribution", "hunger crisis",
        "world hunger", "hunger statistics", "hunger causes",
        "global food insecurity", "food access", "hunger advocacy", "food banks",
        "hunger awareness", "hunger programs", "food aid", "hunger solutions",
        "child malnutrition", "food assistance programs", "hunger eradication",
        "hunger in developing countries", "hunger and health",
        "hunger and education",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["hunger"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

8 0.6458866
96 0.6431589
67 0.6257351
226 0.5694127
173 0.5279678
47 0.5115141


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,173,8_food_million_famine_fao,"[food, million, famine, fao, hunger, insecurit...",[the article discuss the major food security a...
96,36,96_malnutrition_child_unicef_nutrition,"[malnutrition, child, unicef, nutrition, acute...",[the article discuss the joint effort of unice...
67,48,67_wfp_food_programme_million,"[wfp, food, programme, million, assistance, wo...",[the article discuss the u.s. government 's an...
226,13,226_ration_wfp_cut_food,"[ration, wfp, cut, food, refugee, kilocalorie,...",[the article discuss the decision by the world...
173,20,173_wfp_airdrop_food_metric,"[wfp, airdrop, food, metric, assistance, progr...",[the article discuss wfp 's response to the ur...
47,65,47_agriculture_agricultural_farmer_food,"[agriculture, agricultural, farmer, food, fore...",[the article discuss the need for cooperation ...


In [43]:
# Find the (at most 10) topics whose relevancy to any refugee-related keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "refugees", "displaced", "asylum seekers", "forced migration",
        "internally displaced", "refugee crisis", "migrant camps",
        "humanitarian aid", "migration policy", "refugee resettlement", "UNHCR",
        "immigration", "refugee rights", "conflict displacement", "war refugees",
        "refugee assistance", "displaced populations", "refugee status",
        "refugee camps", "cross-border displacement", "forced displacement",
        "migration crisis", "refugee protection", "stateless",
        "refugee integration", "refugee healthcare", "migration challenges",
        "displaced children", "refugee support", "host communities",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["refugees"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

5 0.7638098
12 0.7199073
155 0.70956254
129 0.6873833
88 0.6820332
134 0.6795865
219 0.6763356
89 0.6558311
53 0.65565604
137 0.63402426


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,212,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the $ 600 million aid ple...
12,149,12_refugee_uganda_district_settlement,"[refugee, uganda, district, settlement, flee, ...",[the article discuss the experience of a south...
155,22,155_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, provide, ap...",[the article discuss the urgent need for fundi...
129,27,129_refugee_unhcr_ethiopia_camp,"[refugee, unhcr, ethiopia, camp, number, gambe...",[the article discuss the increase number of so...
88,39,88_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, reli...",[the article discuss the worsen situation of c...
134,27,134_kakuma_refugee_camp_kenya,"[kakuma, refugee, camp, kenya, unhcr, influx, ...",[the article discuss the increase number of re...
219,14,219_israel_israeli_asylum_migrant,"[israel, israeli, asylum, migrant, seeker, dep...",[the article discuss the deadline of march 31 ...
89,39,89_refugee_yida_unhcr_camp,"[refugee, yida, unhcr, camp, arrival, nuba, ko...",[the article discuss the increase in aid opera...
53,61,53_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the killing of at least f...
137,27,137_refugee_nile_water_blue,"[refugee, nile, water, blue, unhcr, supply, up...",[the article discuss the un refugee agency 's ...


In [45]:
# Find the (at most 10) topics whose relevancy to any humanitarian-related keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "humanitarian", "aid", "assistance", "relief", "charity", "support",
        "philanthropy", "compassion", "volunteer", "donation", "humanity",
        "crisis", "disaster", "emergency", "refugees", "displaced", "vulnerable",
        "disadvantaged", "migrant", "rescue", "peace", "conflict", "war",
        "healthcare", "shelter", "food", "water", "education", "protection",
        "rehabilitation", "development", "NGO", "non-profit", "international",
        "cooperation", "human rights", "community", "resilience", "global",
        "sustainability", "outreach", "equality", "hope", "solidarity",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["humanitarian"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

12 0.65837014
200 0.64613986
5 0.64079696
88 0.64049846
219 0.6390035
129 0.6382911
7 0.6244055
134 0.6136804
53 0.602668
155 0.5943496


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,149,12_refugee_uganda_district_settlement,"[refugee, uganda, district, settlement, flee, ...",[the article discuss the experience of a south...
200,17,200_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[the article discuss a press briefing with the...
5,212,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the $ 600 million aid ple...
88,39,88_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, reli...",[the article discuss the worsen situation of c...
219,14,219_israel_israeli_asylum_migrant,"[israel, israeli, asylum, migrant, seeker, dep...",[the article discuss the deadline of march 31 ...
129,27,129_refugee_unhcr_ethiopia_camp,"[refugee, unhcr, ethiopia, camp, number, gambe...",[the article discuss the increase number of so...
7,178,7_human_right_rights_crime,"[human, right, rights, crime, violation, abuse...",[the article discuss the celebration of human ...
134,27,134_kakuma_refugee_camp_kenya,"[kakuma, refugee, camp, kenya, unhcr, influx, ...",[the article discuss the increase number of re...
53,61,53_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the killing of at least f...
155,22,155_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, provide, ap...",[the article discuss the urgent need for fundi...


In [46]:
# Find the (at most 10) topics whose relevancy to any conflict-related keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "conflict", "fighting", "murder", "violence", "warfare", "combat",
        "bloodshed", "hostilities", "struggle", "battles", "killings", "homicide",
        "war", "armed conflict", "deadly force", "combatants", "casualties",
        "aggression", "civil war", "genocide", "atrocity", "tension", "clashes",
        "massacre", "war crimes", "assault", "aggressors", "conflict resolution",
        "peacekeeping", "conflict zones", "conflict management", "armed struggle",
        "conflict prevention", "conflict escalation", "conflict mediation",
        "conflict transformation", "interpersonal conflict", "gang violence",
        "domestic violence", "crime", "criminal activity",
        "homicide investigation", "murder trial",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["conflict"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

17 0.7015362
220 0.6935157
144 0.64998674
52 0.6469631
181 0.64094657
25 0.5552026
0 0.5429848
114 0.5346274
101 0.5325159
245 0.5209693


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,130,17_unmiss_mission_un_mandate,"[unmiss, mission, un, mandate, peacekeeping, p...",[the article discuss the extension and revisio...
220,14,220_infantry_china_battalion_peacekeeping,"[infantry, china, battalion, peacekeeping, 700...",[the article discuss the deployment of a 700 -...
144,25,144_troop_tanzania_deployment_peacekeeping,"[troop, tanzania, deployment, peacekeeping, se...",[the article discuss tanzania 's potential dep...
52,62,52_police_rwanda_officer_contingent,"[police, rwanda, officer, contingent, formed, ...",[the article discuss that 170 rwanda national ...
181,19,181_rwanda_rdf_mission_deployment,"[rwanda, rdf, mission, deployment, unit, peace...",[the article discuss the visit of un peacekeep...
25,101,25_arm_embargo_weapon_impose,"[arm, embargo, weapon, impose, sanction, counc...",[the article discuss the decision of the unite...
0,437,0_jonglei_murle_yau_lou,"[jonglei, murle, yau, lou, pibor, nuer, disarm...",[the article discuss an attack by armed lou - ...
114,30,114_ethnic_nuer_dinka_machar,"[ethnic, nuer, dinka, machar, conflict, kiir, ...",[the article discuss the worsen humanitarian s...
101,35,101_gunfire_heavy_juba_guard,"[gunfire, heavy, juba, guard, clash, fighting,...","[the article discuss heavy gunfire in juba , s..."
245,11,245_talk_bor_rebel_machar,"[talk, bor, rebel, machar, jonglei, ababa, add...",[the article discuss peace talk between repres...


In [47]:
# Find the (at most 10) topics whose relevancy to any natural-disaster keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "natural disaster", "disaster", "catastrophe", "emergency", "crisis",
        "calamity", "hurricane", "tornado", "earthquake", "flood", "tsunami",
        "volcano", "wildfire", "drought", "typhoon", "avalanche", "landslide",
        "heatwave", "severe weather", "climate change", "disaster relief",
        "emergency response", "disaster preparedness", "disaster recovery",
        "evacuation", "humanitarian aid", "victims", "damage assessment",
        "emergency management", "natural hazards", "rescue operations",
        "emergency shelter", "risk reduction", "natural disaster recovery",
        "environmental disaster", "extreme events", "disaster resilience",
        "disaster response teams", "emergency services", "community resilience",
        "disaster mitigation", "disaster-prone areas", "emergency evacuation",
        "natural disaster statistics",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["natural disaster"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

5 0.7638098
88 0.6820332
155 0.6728723
53 0.65565604
235 0.63344085
27 0.5466782
151 0.5143827


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,212,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the $ 600 million aid ple...
88,39,88_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, reli...",[the article discuss the worsen situation of c...
155,22,155_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, provide, ap...",[the article discuss the urgent need for fundi...
53,61,53_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the killing of at least f...
235,12,235_ton_metric_transport_sorghum,"[ton, metric, transport, sorghum, corridor, hu...",[the article discuss the arrival of 18 truck c...
27,92,27_flood_flooding_affect_rain,"[flood, flooding, affect, rain, people, water,...",[the article discuss the flooding situation in...
151,23,151_kenyans_evacuation_kenyan_flight,"[kenyans, evacuation, kenyan, flight, evacuate...",[the article discuss the evacuation of kenyans...


In [50]:
# Find the (at most 10) topics whose relevancy to any agriculture-related keyword is at least 0.5
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "agriculture", "farming", "crop yields", "agricultural practices",
        "smallholder farmers", "sustainable farming", "soil fertility",
        "irrigation", "crop rotation", "farm management",
        "agricultural extension", "livestock farming", "agribusiness",
        "seed varieties", "agricultural technology", "farm productivity",
        "farm mechanization", "agroecology", "subsistence farming",
        "food production",
    ],
    threshold=0.5,
)

# Separate list holding only the topic IDs
topic_ids = [matched_id for matched_id, _score in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True when the topic assigned to the article is one of the relevant topics
df["agriculture"] = [assigned in topic_ids for assigned in bertopic.topics_]

# View the Count, Name, Representation, and Representative Docs for the relevant topics
(
    bertopic.get_topic_info()
    .set_index('Topic')
    .loc[topic_ids]
)

47 0.6531219
171 0.65120435
167 0.51932395
195 0.5187984


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
47,65,47_agriculture_agricultural_farmer_food,"[agriculture, agricultural, farmer, food, fore...",[the article discuss the need for cooperation ...
171,20,171_livestock_animal_cattle_disease,"[livestock, animal, cattle, disease, veterinar...",[the article discuss the movement restriction ...
167,21,167_cattle_raid_raider_warrap,"[cattle, raid, raider, warrap, mayom, county, ...",[the article discuss the recovery of 56 herd o...
195,17,195_cattle_rustler_rustle_steal,"[cattle, rustler, rustle, steal, lamwo, raid, ...",[the article discuss how cattle rustle at poro...


In [52]:
# Re-read the source articles so the saved file starts from the original columns
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])

# Boolean topic columns created in the cells above
topic_columns = ["hunger", "refugees", "humanitarian", "conflict", "natural disaster", "agriculture"]

# The flags are attached by row index instead of by merging on the "summary" text:
# a merge on "summary" pairs every row with every row that has the same text, so
# repeated summaries would duplicate rows in the result.
# An index join is only valid while both frames hold the same articles in the same
# order, so verify that before joining.
if not original_df["summary"].equals(df["summary"]):
    raise ValueError(
        "df is no longer row-aligned with data/articles_summary_cleaned.csv; "
        "re-run the notebook from the top before saving"
    )

# Combine article summaries with the newly created features
df = original_df.join(df[topic_columns])

df.to_csv("data/articles_topics.csv", index=False) # Save DataFrame to data/articles_topics.csv

In [None]:
# Count the articles that were not flagged for any of the topic categories.
# All six category columns are included; checking only a subset would count articles
# flagged solely as "natural disaster" or "agriculture" as uncategorised.
category_columns = ["hunger", "refugees", "humanitarian", "conflict", "natural disaster", "agriculture"]
uncategorised = ~df[category_columns].any(axis=1) # True where every category flag is False

print(len(df)) # Total number of articles
print(int(uncategorised.sum())) # Number of articles without any category

There are a lot of articles that do not get sorted into any of the categories. So feel free to change or expand this approach!