In [2]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bertopic.representation import PartOfSpeech, MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
import os
import pandas as pd
import numpy as np
import spacy
from umap import UMAP
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from keybert import KeyBERT
import logging
import matplotlib.pyplot as plt
from bertopic_merge import *

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [81]:
# Suppress the printing of log messages from the BERTopic
logging.getLogger('BERTopic').setLevel(logging.WARNING)

In [3]:
# Running time is around 8 mins
df = pd.read_csv("data/articles_summary_cleaned.csv", parse_dates=["date"])
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.strftime('%Y_%m')
docs = list(df.summary.values)

In [4]:
df

Unnamed: 0,summary,date,location_article,lat,lng
0,The article discusses the passing of the new C...,2011_07,Juba,4.859363,31.571250
1,The article discusses the military actions tak...,2011_07,Abyei,9.838551,28.486396
2,The article discusses the signing of a Framewo...,2011_06,Southern Kordofan,11.036544,30.895824
3,The article discusses the upcoming independenc...,2011_07,South Sudan,6.876992,31.306979
4,The article discusses the need for South Sudan...,2011_07,Juba,4.859363,31.571250
...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023_04,Maiwut Primary Health Care Centre,8.606200,33.924100
18516,The article discusses the bombing and forced e...,2023_04,Khartoum,15.500654,32.559899
18517,The article discusses how Prime Minister Abiy ...,2023_04,Addis Ababa,8.980603,38.757761
18518,The article discusses the collapse of a commer...,2023_04,Kampala International University,0.294360,32.603970


In [83]:
docs

['The article discusses the passing of the new Constitution of the Republic of South Sudan by its parliament before the July 7, 2011 deadline. The new Constitution includes the creation of a new country called the Republic of South Sudan, with its own coat of arms, national flag, motto, central bank, currency, and national anthem. It also highlights the right to citizenship in South Sudan and allows for dual citizenship. The new constitution also allows foreigners who have married South Sudanese citizens to apply and become citizens. The article also mentions the settlement of the border issues between the northern and southern Sudan by calling for a referendum in the contentious oil-rich Abyei region.',
 "The article discusses the military actions taken by Khartoum in the weeks leading up to independence for the Republic of South Sudan. The article suggests that the military actions are a result of the worst elements being fully in charge, and the intense economic distress in North Su

In [84]:
# Load the spacy language model
nlp = spacy.load("en_core_web_sm") # Remember to install this by typing 'python -m spacy download en_core_web_sm' in terminal
 
# Lemmatization on docs
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text

# Lemmatize and convert to lowercase
lemmatized_documents = [lemmatize_text(doc).lower() for doc in docs]


In [85]:
# Improves the topic representation
representation_model = PartOfSpeech("en_core_web_sm")

# Extract common vocab
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(lemmatized_documents)
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))

# Remove stopwords
stopwords = CountVectorizer(stop_words='english').get_stop_words()
custom_stopwords = ["article ", "discusses ", "south ", "sudan"]
all_stopwords = list(stopwords) + custom_stopwords

vectorizer_model = CountVectorizer(stop_words=all_stopwords, vocabulary=vocabulary)

# Reduce impact of frequent words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Dimension and cluster
umap_model = UMAP(n_components=3, n_neighbors=15, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=100,min_samples=10, gen_min_span_tree=True, prediction_data=True)

# Increase diversity
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(lemmatized_documents)
representation_model = MaximalMarginalRelevance(diversity=0.4)

topic_model = BERTopic(nr_topics = "auto", 
                      vectorizer_model = vectorizer_model,
                      representation_model = representation_model,
                      ctfidf_model=ctfidf_model,
                      embedding_model=sentence_model,
                      umap_model=umap_model, 
                      hdbscan_model=hdbscan_model,
                      n_gram_range=(1,4),
                      calculate_probabilities=True,
                      min_topic_size=10)

topics, ini_probs = topic_model.fit_transform(lemmatized_documents)

  idf = np.log((avg_nr_samples / df)+1)
  idf = np.log((avg_nr_samples / df)+1)


In [86]:
# Check if the "models" folder exists
if not os.path.exists("models"):
    # If it doesn't exist, create it
    os.makedirs("models")


topic_model.save("models/Jason_southsudan_model")

In [87]:
df['topics_auto'] = topics
df.to_csv("data/Jason_articles_with_classifications.csv", index=False) 
df

Unnamed: 0,summary,date,location_article,lat,lng,topics_auto
0,The article discusses the passing of the new C...,2011_07,Juba,4.859363,31.571250,-1
1,The article discusses the military actions tak...,2011_07,Abyei,9.838551,28.486396,-1
2,The article discusses the signing of a Framewo...,2011_06,Southern Kordofan,11.036544,30.895824,0
3,The article discusses the upcoming independenc...,2011_07,South Sudan,6.876992,31.306979,0
4,The article discusses the need for South Sudan...,2011_07,Juba,4.859363,31.571250,5
...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023_04,Maiwut Primary Health Care Centre,8.606200,33.924100,3
18516,The article discusses the bombing and forced e...,2023_04,Khartoum,15.500654,32.559899,-1
18517,The article discusses how Prime Minister Abiy ...,2023_04,Addis Ababa,8.980603,38.757761,-1
18518,The article discusses the collapse of a commer...,2023_04,Kampala International University,0.294360,32.603970,-1


In [88]:
df_both = bertopic_merge(df, 0.95, 0.1)
df_both

Unnamed: 0,group1,group2,avg_mean,p-adj_x,p-adj_y
11,5,6,0.0175,1.0,1.0
23,14,17,0.01965,1.0,1.0
1,-1,7,0.02035,1.0,1.0
4,2,7,0.02085,1.0,1.0
6,3,10,0.022,1.0,1.0
19,8,14,0.022,1.0,1.0
15,6,8,0.02885,1.0,1.0
0,-1,2,0.0336,1.0,1.0
21,9,12,0.0408,1.0,1.0
18,7,13,0.0415,1.0,1.0


In [89]:
topics_to_merge = [[2,10,13],[8,14,17], [4,5,11,15], [6,7]]
topic_model.merge_topics(lemmatized_documents, topics_to_merge)
df['topics_stat'] = topic_model.topics_
df.to_csv("data/Jason_articles_with_classifications.csv", index=False) 
df

  idf = np.log((avg_nr_samples / df)+1)


Unnamed: 0,summary,date,location_article,lat,lng,topics_auto,topics_stat
0,The article discusses the passing of the new C...,2011_07,Juba,4.859363,31.571250,-1,-1
1,The article discusses the military actions tak...,2011_07,Abyei,9.838551,28.486396,-1,-1
2,The article discusses the signing of a Framewo...,2011_06,Southern Kordofan,11.036544,30.895824,0,0
3,The article discusses the upcoming independenc...,2011_07,South Sudan,6.876992,31.306979,0,0
4,The article discusses the need for South Sudan...,2011_07,Juba,4.859363,31.571250,5,1
...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023_04,Maiwut Primary Health Care Centre,8.606200,33.924100,3,5
18516,The article discusses the bombing and forced e...,2023_04,Khartoum,15.500654,32.559899,-1,-1
18517,The article discusses how Prime Minister Abiy ...,2023_04,Addis Ababa,8.980603,38.757761,-1,-1
18518,The article discusses the collapse of a commer...,2023_04,Kampala International University,0.294360,32.603970,-1,-1


In [96]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6856,-1_peace_rebel_machar_kiir,"[peace, rebel, machar, kiir, conflict, sudanes...",[the article discuss the sudanese government '...
1,0,6779,0_abyei_peace_kiir_government,"[abyei, peace, kiir, government, agreement, ar...",[the article discuss south sudan 's president ...
2,1,1350,1_pipeline_export_production_fee,"[pipeline, export, production, fee, petroleum,...",[the article discuss how the khartoum governme...
3,2,903,2_lakes_aid_humanitarian_famine,"[lakes, aid, humanitarian, famine, agriculture...",[the article discuss the crisis in south sudan...
4,3,634,3_refugee_unhcr_000_uganda,"[refugee, unhcr, 000, uganda, influx, yida, ai...","[the article discuss the displacement of 35,00..."
5,4,570,4_education_unicef_arabic_examination,"[education, unicef, arabic, examination, recru...",[the article discuss the vulnerability of chil...
6,5,568,5_msf_cholera_outbreak_covid,"[msf, cholera, outbreak, covid, malaria, hospi...",[the article discuss a decline in new cholera ...
7,6,450,6_cup_chinese_qualifier_stadium,"[cup, chinese, qualifier, stadium, boda, ugand...",[the article discuss the popularity of footbal...
8,7,169,7_rape_empowerment_equality_victim,"[rape, empowerment, equality, victim, bangura,...",[the article discuss how the current conflict ...
9,8,133,8_eac_admission_join_arusha,"[eac, admission, join, arusha, tanzania, burun...",[the article discuss south sudan 's admission ...


In [90]:
# limit nr_topics
topic_model_10 = BERTopic(nr_topics = 10, 
                      vectorizer_model = vectorizer_model,
                      representation_model = representation_model,
                      ctfidf_model=ctfidf_model,
                      embedding_model=sentence_model,
                      umap_model=umap_model, 
                      hdbscan_model=hdbscan_model,
                      n_gram_range=(1,4),
                      calculate_probabilities=True,
                      min_topic_size=10)

topics_10, ini_probs_10 = topic_model_10.fit_transform(lemmatized_documents)

  idf = np.log((avg_nr_samples / df)+1)
  idf = np.log((avg_nr_samples / df)+1)


In [91]:
topic_model_10.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6856,-1_president_peace_rebel_article,"[president, peace, rebel, article, kiir, macha...",[the article discuss the sudanese government '...
1,0,6786,0_abyei_security_peace_kiir,"[abyei, security, peace, kiir, government, agr...",[the article discuss the ongoing conflict and ...
2,1,1815,1_refugee_health_aid_camp,"[refugee, health, aid, camp, humanitarian, msf...",[the article discuss the increase number of so...
3,2,1437,2_bank_pipeline_export_revenue,"[bank, pipeline, export, revenue, corruption, ...",[the article discuss south sudan 's plan to bu...
4,3,883,3_child_education_unicef_labour,"[child, education, unicef, labour, gender, min...",[the article discuss the fear express by unice...
5,4,215,4_cup_tournament_qualifier_stadium,"[cup, tournament, qualifier, stadium, cecafa, ...",[the article discuss south sudan 's confirmati...
6,5,158,5_lakes_rumbek_governor_clan,"[lakes, rumbek, governor, clan, county, reveng...",[the article discuss the death of colonel yol ...
7,6,132,6_dam_nile_irrigation_grand,"[dam, nile, irrigation, grand, ethiopia, ripar...",[the article discuss egypt 's support for the ...
8,7,130,7_beijing_yi_foreign_wang,"[beijing, yi, foreign, wang, oil, relation, em...",[the article discuss the chinese envoy for afr...
9,8,108,8_rwandan_peacekeeping_fpu_peacekeeper,"[rwandan, peacekeeping, fpu, peacekeeper, meda...",[the article discuss the united nations award ...


In [92]:
df['topics_10'] = topics_10
df.to_csv("data/Jason_articles_with_classifications.csv", index=False) 
df

Unnamed: 0,summary,date,location_article,lat,lng,topics_auto,topics_stat,topics_10
0,The article discusses the passing of the new C...,2011_07,Juba,4.859363,31.571250,-1,-1,-1
1,The article discusses the military actions tak...,2011_07,Abyei,9.838551,28.486396,-1,-1,-1
2,The article discusses the signing of a Framewo...,2011_06,Southern Kordofan,11.036544,30.895824,0,0,0
3,The article discusses the upcoming independenc...,2011_07,South Sudan,6.876992,31.306979,0,0,0
4,The article discusses the need for South Sudan...,2011_07,Juba,4.859363,31.571250,5,1,2
...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023_04,Maiwut Primary Health Care Centre,8.606200,33.924100,3,5,1
18516,The article discusses the bombing and forced e...,2023_04,Khartoum,15.500654,32.559899,-1,-1,-1
18517,The article discusses how Prime Minister Abiy ...,2023_04,Addis Ababa,8.980603,38.757761,-1,-1,-1
18518,The article discusses the collapse of a commer...,2023_04,Kampala International University,0.294360,32.603970,-1,-1,-1


In [93]:
# limit nr_topics
topic_model_7 = BERTopic(nr_topics = 7, 
                      vectorizer_model = vectorizer_model,
                      representation_model = representation_model,
                      ctfidf_model=ctfidf_model,
                      embedding_model=sentence_model,
                      umap_model=umap_model, 
                      hdbscan_model=hdbscan_model,
                      n_gram_range=(1,4),
                      calculate_probabilities=True,
                      min_topic_size=10)

topics_7, ini_probs_7 = topic_model_7.fit_transform(lemmatized_documents)

  idf = np.log((avg_nr_samples / df)+1)
  idf = np.log((avg_nr_samples / df)+1)


In [94]:
topic_model_7.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6856,-1_president_article_peace_south,"[president, article, peace, south, rebel, kiir...",[the article discuss the sudanese government '...
1,0,7777,0_article_south_government_peace,"[article, south, government, peace, security, ...",[the article discuss the continued conflict an...
2,1,1815,1_refugee_health_aid_unhcr,"[refugee, health, aid, unhcr, humanitarian, ca...",[the article discuss the displacement of over ...
3,2,1567,2_pipeline_export_corruption_port,"[pipeline, export, corruption, port, finance, ...",[the article discuss south sudan 's decision t...
4,3,215,3_tournament_qualifier_stadium_cecafa,"[tournament, qualifier, stadium, cecafa, coach...",[the article discuss the training and friendly...
5,4,158,4_lakes_rumbek_dhuol_governor,"[lakes, rumbek, dhuol, governor, clan, county,...",[the article discuss the death of colonel yol ...
6,5,132,5_dam_nile_irrigation_project,"[dam, nile, irrigation, project, ethiopia, gra...",[the article discuss egypt 's support for the ...


In [95]:
df['topics_7'] = topics_7
df.to_csv("data/Jason_articles_with_classifications.csv", index=False) 
df

Unnamed: 0,summary,date,location_article,lat,lng,topics_auto,topics_stat,topics_10,topics_7
0,The article discusses the passing of the new C...,2011_07,Juba,4.859363,31.571250,-1,-1,-1,-1
1,The article discusses the military actions tak...,2011_07,Abyei,9.838551,28.486396,-1,-1,-1,-1
2,The article discusses the signing of a Framewo...,2011_06,Southern Kordofan,11.036544,30.895824,0,0,0,0
3,The article discusses the upcoming independenc...,2011_07,South Sudan,6.876992,31.306979,0,0,0,0
4,The article discusses the need for South Sudan...,2011_07,Juba,4.859363,31.571250,5,1,2,2
...,...,...,...,...,...,...,...,...,...
18515,The article discusses the successful delivery ...,2023_04,Maiwut Primary Health Care Centre,8.606200,33.924100,3,5,1,1
18516,The article discusses the bombing and forced e...,2023_04,Khartoum,15.500654,32.559899,-1,-1,-1,-1
18517,The article discusses how Prime Minister Abiy ...,2023_04,Addis Ababa,8.980603,38.757761,-1,-1,-1,-1
18518,The article discusses the collapse of a commer...,2023_04,Kampala International University,0.294360,32.603970,-1,-1,-1,-1
