In [67]:
from bertopic import BERTopic
import os
import pandas as pd
import spacy
from umap import UMAP

In [68]:
# NOTE(review): the original note here read "Running time is around 8 mins";
# it presumably refers to the processing that follows rather than to this read — confirm.
# Read the cleaned article summaries, parsing the "date" column as datetimes.
df = pd.read_csv(
    "data/articles_summary_cleaned.csv",
    parse_dates=["date"],
)
# One summary per article, as a plain Python list
docs = df["summary"].to_list()

In [70]:
# Load the spaCy English language model.
# Install it first by running 'python -m spacy download en_core_web_sm' in a terminal.
nlp = spacy.load("en_core_web_sm")


def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is run through the module-level `nlp` pipeline and the token
    lemmas are joined with single spaces. Casing is left as produced by the
    pipeline; callers lowercase the result themselves if they need to.
    """
    return " ".join(token.lemma_ for token in nlp(text))


# Lemmatize and lowercase every summary.
# nlp.pipe streams the texts through the pipeline in batches and yields the
# parsed documents in input order, which avoids the per-call overhead of
# invoking nlp() once for each document.
lemmatized_documents = [
    " ".join(token.lemma_ for token in parsed_doc).lower()
    for parsed_doc in nlp.pipe(docs)
]


In [71]:
# Single source of truth for where the fitted model is cached on disk
MODEL_PATH = "models/southsudan_model_2"

if os.path.exists(MODEL_PATH):
    # A fitted model is already cached: reuse it instead of refitting
    bertopic = BERTopic.load(MODEL_PATH)
else:
    # UMAP is stochastic; without a fixed random_state every refit can produce
    # different topics and topic IDs. These are the same UMAP settings this
    # notebook defines for `umap_model`, including the seed.
    seeded_umap = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    )
    bertopic = BERTopic(
        language="english",
        calculate_probabilities=True,
        verbose=True,
        umap_model=seeded_umap,
    )

    # Fit on the lemmatized summaries, then cache the result for later runs
    bertopic.fit_transform(lemmatized_documents)
    bertopic.save(MODEL_PATH)

In [72]:
# Seeded UMAP reducer wrapped in a fresh, unfitted BERTopic instance.
# NOTE(review): neither `umap_model` nor `topic_model` is referenced by the
# cells visible below, which all work with `bertopic` — confirm whether this
# cell is still needed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
topic_model = BERTopic(umap_model=umap_model)

In [73]:
# We create a function to calculate a list of the top n topics related to (a) given keyword(s)

def get_relevant_topics(bertopic_model, keywords, threshold, top_n=10):
    """Rank a model's topics by their relevancy to one or more keywords.

    Each keyword is passed to ``bertopic_model.find_topics``. Every
    (topic_id, relevancy) pair whose relevancy is at least ``threshold`` is
    kept. A topic matched by several keywords is reported once, with the
    highest relevancy it obtained.

    Parameters
    ----------
    bertopic_model : object
        Anything exposing ``find_topics(keyword)`` that returns a pair
        ``(topic_ids, relevancies)`` of equal-length sequences.
    keywords : str or iterable of str
        A single keyword/phrase, or several of them.
    threshold : float
        Minimum relevancy (inclusive) for a topic to be kept.
    top_n : int or None, default 10
        Maximum number of topics returned; ``None`` returns all of them.

    Returns
    -------
    list of tuple
        ``(topic_id, relevancy)`` tuples sorted by descending relevancy.
    """
    # Accept a bare string (including str subclasses) as a one-element list
    if isinstance(keywords, str):
        keywords = [keywords]

    # topic_id -> highest relevancy seen so far across all keywords
    best_relevancy = {}

    for keyword in keywords:
        # find_topics is called with the keyword only, so how many candidate
        # topics come back per keyword is decided by the model's own default.
        topic_ids, relevancies = bertopic_model.find_topics(keyword)

        for topic_id, relevancy in zip(topic_ids, relevancies):
            if relevancy < threshold:
                continue
            if topic_id not in best_relevancy or relevancy > best_relevancy[topic_id]:
                best_relevancy[topic_id] = relevancy

    # Most relevant first; slicing with None keeps the whole list
    ranked = sorted(best_relevancy.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [74]:
# Rank topics against a list of refugee- and displacement-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "refugees", "displaced", "asylum seekers", "forced migration",
        "internally displaced", "refugee crisis", "migrant camps",
        "humanitarian aid", "migration policy", "refugee resettlement",
        "UNHCR", "immigration", "refugee rights", "conflict displacement",
        "war refugees", "refugee assistance", "displaced populations",
        "refugee status", "refugee camps", "cross-border displacement",
        "forced displacement", "migration crisis", "refugee protection",
        "stateless", "refugee integration", "refugee healthcare",
        "migration challenges", "displaced children", "refugee support",
        "host communities",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["refugees"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

5 0.76618934
8 0.71858823
162 0.7111849
195 0.69244885
204 0.685431
123 0.6751723
92 0.6746706
54 0.6608417
103 0.6516859
152 0.63317364


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,193,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the commitment of the uni...
8,147,8_refugee_uganda_district_flee,"[refugee, uganda, district, flee, settlement, ...",[the article discuss the experience of a south...
162,21,162_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, appeal, urg...",[the article discuss the urgent need for fundi...
195,16,195_refugee_ethiopia_camp_unhcr,"[refugee, ethiopia, camp, unhcr, number, host,...",[the article discuss the international organiz...
204,15,204_israel_israeli_asylum_migrant,"[israel, israeli, asylum, migrant, seeker, dep...",[the article discuss an incident where egyptia...
123,29,123_kakuma_camp_refugee_kenya,"[kakuma, camp, refugee, kenya, kenyan, influx,...",[the article discuss the increase number of re...
92,39,92_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, affe...",[the article discuss the worsen situation of c...
54,61,54_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the disappearance of six ...
103,34,103_yida_refugee_unhcr_camp,"[yida, refugee, unhcr, camp, nuba, arrival, mo...",[the article discuss the increase in aid opera...
152,23,152_refugee_water_nile_unhcr,"[refugee, water, nile, unhcr, blue, supply, se...",[the article discuss unhcr 's emergency aid ai...


In [75]:
# Rank topics against a list of hunger- and food-insecurity-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "hunger", "starvation", "malnutrition", "food insecurity", "poverty",
        "famine", "food shortage", "nutrient deficiency", "undernourishment",
        "hunger relief", "food assistance", "food distribution",
        "hunger crisis", "world hunger", "hunger statistics", "hunger causes",
        "global food insecurity", "food access", "hunger advocacy",
        "food banks", "hunger awareness", "hunger programs", "food aid",
        "hunger solutions", "child malnutrition", "food assistance programs",
        "hunger eradication", "hunger in developing countries",
        "hunger and health", "hunger and education",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["hunger"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

110 0.6535412
4 0.64521635
56 0.6132921
42 0.516693


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
110,32,110_malnutrition_unicef_child_nutrition,"[malnutrition, unicef, child, nutrition, acute...",[the article discuss the joint effort of unice...
4,197,4_food_fao_million_famine,"[food, fao, million, famine, hunger, insecurit...",[the article discuss the effort of fao to scal...
56,61,56_wfp_food_programme_assistance,"[wfp, food, programme, assistance, world, mill...",[the article discuss australia 's aud 1.25 mil...
42,76,42_agriculture_agricultural_farmer_food,"[agriculture, agricultural, farmer, food, farm...",[the article discuss the commitment of south s...


In [76]:
# Rank topics against a broad list of humanitarian-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "humanitarian", "aid", "assistance", "relief", "charity", "support",
        "philanthropy", "compassion", "volunteer", "donation", "humanity",
        "crisis", "disaster", "emergency", "refugees", "displaced",
        "vulnerable", "disadvantaged", "migrant", "rescue", "peace",
        "conflict", "war", "healthcare", "shelter", "food", "water",
        "education", "protection", "rehabilitation", "development", "NGO",
        "non-profit", "international", "cooperation", "human rights",
        "community", "resilience", "global", "sustainability", "outreach",
        "equality", "hope", "solidarity",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["humanitarian"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

8 0.658209
195 0.6500483
204 0.64625365
5 0.6364937
9 0.6325885
214 0.6323221
92 0.62714773
54 0.60549974
123 0.6044884
162 0.6009162


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,147,8_refugee_uganda_district_flee,"[refugee, uganda, district, flee, settlement, ...",[the article discuss the experience of a south...
195,16,195_refugee_ethiopia_camp_unhcr,"[refugee, ethiopia, camp, unhcr, number, host,...",[the article discuss the international organiz...
204,15,204_israel_israeli_asylum_migrant,"[israel, israeli, asylum, migrant, seeker, dep...",[the article discuss an incident where egyptia...
5,193,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the commitment of the uni...
9,142,9_human_right_rights_violation,"[human, right, rights, violation, abuse, repor...",[the article discuss the call by various human...
214,14,214_lanzer_toby_coordinator_humanitarian,"[lanzer, toby, coordinator, humanitarian, mr, ...",[the article discuss a press briefing with the...
92,39,92_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, affe...",[the article discuss the worsen situation of c...
54,61,54_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the disappearance of six ...
123,29,123_kakuma_camp_refugee_kenya,"[kakuma, camp, refugee, kenya, kenyan, influx,...",[the article discuss the increase number of re...
162,21,162_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, appeal, urg...",[the article discuss the urgent need for fundi...


In [77]:
# Rank topics against a list of conflict- and violence-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "conflict", "fighting", "murder", "violence", "warfare", "combat",
        "bloodshed", "hostilities", "struggle", "battles", "killings",
        "homicide", "war", "armed conflict", "deadly force", "combatants",
        "casualties", "aggression", "civil war", "genocide", "atrocity",
        "tension", "clashes", "massacre", "war crimes", "assault",
        "aggressors", "conflict resolution", "peacekeeping", "conflict zones",
        "conflict management", "armed struggle", "conflict prevention",
        "conflict escalation", "conflict mediation",
        "conflict transformation", "interpersonal conflict", "gang violence",
        "domestic violence", "crime", "criminal activity",
        "homicide investigation", "murder trial",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["conflict"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

20 0.6992495
232 0.692268
135 0.6604829
171 0.64638436
58 0.6454412
247 0.6106866
111 0.5891198
67 0.5406675
98 0.5382053
161 0.53095514


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20,101,20_unmiss_mandate_mission_peacekeeping,"[unmiss, mandate, mission, peacekeeping, un, c...",[the article discuss the un security council p...
232,12,232_battalion_infantry_peacekeeping_china,"[battalion, infantry, peacekeeping, china, chi...",[the article discuss the deployment of a 700 -...
135,27,135_rwanda_mission_unit_deployment,"[rwanda, mission, unit, deployment, peacekeepi...",[the article discuss the four - day official v...
171,20,171_troop_tanzania_deployment_peacekeeping,"[troop, tanzania, deployment, peacekeeping, se...",[the article discuss tanzania 's potential dep...
58,60,58_police_rwanda_officer_contingent,"[police, rwanda, officer, contingent, formed, ...",[the article discuss the return of a formed po...
247,11,247_genocide_conscience_holocaust_museum,"[genocide, conscience, holocaust, museum, scho...",[the article discuss the warning issue by u.n....
111,32,111_disarmament_jonglei_disarm_gun,"[disarmament, jonglei, disarm, gun, cattle, co...",[the article discuss the disarmament exercise ...
67,52,67_rebel_bentiu_army_unity,"[rebel, bentiu, army, unity, spla, capital, at...",[the article discuss the south sudanese army p...
98,35,98_gunfire_guard_heavy_juba,"[gunfire, guard, heavy, juba, clash, president...",[the article discuss fresh fighting near the p...
161,21,161_dinka_nuer_ethnic_machar,"[dinka, nuer, ethnic, machar, kiir, riek, conf...",[the article discuss the call from internally ...


In [78]:
# Rank topics against a list of natural-disaster- and emergency-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "natural disaster", "disaster", "catastrophe", "emergency", "crisis",
        "calamity", "hurricane", "tornado", "earthquake", "flood", "tsunami",
        "volcano", "wildfire", "drought", "typhoon", "avalanche", "landslide",
        "heatwave", "severe weather", "climate change", "disaster relief",
        "emergency response", "disaster preparedness", "disaster recovery",
        "evacuation", "humanitarian aid", "victims", "damage assessment",
        "emergency management", "natural hazards", "rescue operations",
        "emergency shelter", "risk reduction", "natural disaster recovery",
        "environmental disaster", "extreme events", "disaster resilience",
        "disaster response teams", "emergency services",
        "community resilience", "disaster mitigation", "disaster-prone areas",
        "emergency evacuation", "natural disaster statistics",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["natural disaster"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

5 0.76618934
162 0.6764333
92 0.6746706
54 0.6608417
237 0.6193409
23 0.5424818
140 0.53392893
207 0.5226085


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,193,5_million_aid_humanitarian_assistance,"[million, aid, humanitarian, assistance, fundi...",[the article discuss the commitment of the uni...
162,21,162_unhcr_refugee_funding_million,"[unhcr, refugee, funding, million, appeal, urg...",[the article discuss the urgent need for fundi...
92,39,92_humanitarian_pibor_jonglei_amos,"[humanitarian, pibor, jonglei, amos, aid, affe...",[the article discuss the worsen situation of c...
54,61,54_worker_aid_humanitarian_killing,"[worker, aid, humanitarian, killing, maban, at...",[the article discuss the disappearance of six ...
237,12,237_ton_metric_corridor_transport,"[ton, metric, corridor, transport, sorghum, de...",[the article discuss the arrival of 18 truck c...
23,92,23_flood_flooding_affect_rain,"[flood, flooding, affect, rain, people, displa...",[the article discuss the flooding situation in...
140,26,140_kenyans_evacuation_evacuate_kenyan,"[kenyans, evacuation, evacuate, kenyan, citize...",[the article discuss the safe evacuation of ke...
207,14,207_bentiu_condition_camp_site,"[bentiu, condition, camp, site, living, diseas...",[the article discuss the living condition of p...


In [79]:
# Rank topics against a list of agriculture- and farming-related keywords.
# Only matches with relevancy >= 0.5 are kept; the helper returns at most ten.
relevant_topics = get_relevant_topics(
    bertopic_model=bertopic,
    keywords=[
        "agriculture", "farming", "crop yields", "agricultural practices",
        "smallholder farmers", "sustainable farming", "soil fertility",
        "irrigation", "crop rotation", "farm management",
        "agricultural extension", "livestock farming", "agribusiness",
        "seed varieties", "agricultural technology", "farm productivity",
        "farm mechanization", "agroecology", "subsistence farming",
        "food production",
    ],
    threshold=0.5,
)

# Topic IDs only, in the same order as relevant_topics
topic_ids = [pair[0] for pair in relevant_topics]

# Print one "topic_id relevancy" line per relevant topic
for topic_id, relevancy in relevant_topics:
    print(topic_id, relevancy)

# Boolean column: True where the article's assigned topic is one of the relevant topics
df["agriculture"] = [assigned_topic in topic_ids for assigned_topic in bertopic.topics_]

# Show Count, Name, Representation and Representative_Docs for the relevant topics
bertopic.get_topic_info().set_index('Topic').loc[topic_ids]

42 0.6576612
163 0.6567588
51 0.5180001


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,76,42_agriculture_agricultural_farmer_food,"[agriculture, agricultural, farmer, food, farm...",[the article discuss the commitment of south s...
163,21,163_livestock_animal_cattle_disease,"[livestock, animal, cattle, disease, veterinar...",[the article discuss how south sudan lead in t...
51,62,51_cattle_raid_warrap_raider,"[cattle, raid, warrap, raider, county, steal, ...",[the article discuss recent violence and cattl...


In [80]:
# Re-read the untouched article summaries so the saved file starts from the raw columns
original_df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])

# Boolean topic flags created in the cells above, in the order they are written out
feature_columns = ["hunger", "refugees", "humanitarian", "conflict", "natural disaster", "agriculture"]

# Keep one row of flags per distinct summary text. The merge key is the summary
# itself, so a summary occurring k times would otherwise match k rows on each
# side and come out as k*k rows, silently inflating the saved file.
topic_flags = df[["summary"] + feature_columns].drop_duplicates(subset="summary")

# Combine article summaries with the newly created features.
# validate="many_to_one" raises if the flag table still has duplicate keys,
# which guarantees the result has exactly one row per original article.
df = original_df.merge(
    topic_flags,
    how="left",
    on="summary",
    validate="many_to_one",
)

df.to_csv("data/articles_topics_2.csv", index=False) # Save DataFrame to articles_topics_2.csv