In [1]:
import spacy
import pandas as pd
import nl_core_news_lg
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()# Visualise inside a notebook

In [2]:
df = pd.read_csv('500_nl_data.csv')

In [3]:
df.head(5)

Unnamed: 0,_id,url,date,text,title
0,5ec8066f404b0dba70c1d842,https://vrtnws.be/p.A70oaym3k,2020-05-22T15:07:00.000Z,“Als je 26.000 inwoners hebt en amper 50 van h...,Gratis wifi in Zottegem verdwijnt: “Er zijn te...
1,5ec8066f656ebc99880e73b3,https://vrtnws.be/p.3kO00VKaP,2020-05-22T11:25:17.000Z,De vlucht was onderweg naar Karachi vanuit Lah...,"Vliegtuig neergestort in woonwijk in Pakistan,..."
2,5ec8067060653ee843c32706,https://www.nieuwsblad.be/cnt/dmf20200519_0496...,2020-05-20T01:25:00.000Z,Luchtvaartmaatschappij EasyJet is getroffen do...,Gegevens 9 miljoen EasyJet-klanten gestolen
3,5ec806708e00d85b900a3fd6,https://www.nieuwsblad.be/cnt/dmf20200519_0496...,2020-05-19T11:15:34.000Z,Vliegmaatschappij easyJet is getroffen door ee...,Easyjet heeft cyberaanval ondergaan: gegevens ...
4,5ec806708e00d85b900a3fd7,https://www.nieuwsblad.be/cnt/dmf20200518_0496...,2020-05-18T19:23:33.000Z,Proximus krijgt van de Gegevensbeschermingsaut...,Proximus krijgt recordboete van 50.000 euro vo...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     500 non-null    object
 1   url     500 non-null    object
 2   date    500 non-null    object
 3   text    500 non-null    object
 4   title   500 non-null    object
dtypes: object(5)
memory usage: 19.7+ KB


In [5]:
df_drop = df.drop(columns = ['_id','url', 'date', 'title'])

In [6]:
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    500 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [7]:
df_drop['text'] = df_drop['text'].astype(str)

In [8]:
# SpaCy model:
nlp = nl_core_news_lg.load()
# Tags I want to remove from the text
removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

tokens = []
for summary in nlp.pipe(df_drop['text']):
   proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
   tokens.append(proj_tok)

In [9]:
df_drop['tokens'] = tokens
df_drop['tokens']

0      [inwoner, maken, gebruik, publiek, durven, afv...
1      [vlucht, karachi, lahore, stad, provincie, pun...
2      [luchtvaartmaatschappij, easyjet, treffen, gro...
3      [vliegmaatschappij, easyjet, treffen, groot, d...
4      [proximus, krijgen, gegevensbeschermingsautori...
                             ...                        
495    [festivalzomer, water, vallen, organisator, be...
496    [vlagg, komen, aanvrag, beschikbaar, vlagg, vr...
497    [internet, circuleren, lijst, telenet, wachtwo...
498    [duidelijk, datum, stelen, belgisch, filiaal, ...
499    [jan, avonts, analist, vermogensbeheerder, leo...
Name: tokens, Length: 500, dtype: object

Create dictionary and corpus

In [10]:
# I will apply the Dictionary Object from Gensim, which maps each word to their unique ID:
dictionary = Dictionary(df_drop['tokens'])

In [11]:
print(dictionary)

Dictionary<18165 unique tokens: ['afschaffing', 'afvragen', 'beslissen', 'besparing', 'beëindigen']...>


In [12]:
#some of the IDs assigned to the tokens
print(dictionary.token2id)

{'afschaffing': 0, 'afvragen': 1, 'beslissen': 2, 'besparing': 3, 'beëindigen': 4, 'blind': 5, 'brecht': 6, 'burgerparticipatie': 7, 'buurtnetwerken': 8, 'cassiman': 9, 'contract': 10, 'databundel': 11, 'digitaal': 12, 'durven': 13, 'euro': 14, 'gaan': 15, 'gebruik': 16, 'geld': 17, 'goed': 18, 'goedkoper': 19, 'gratis': 20, 'herinvesteren': 21, 'inwoner': 22, 'jaar': 23, 'kol': 24, 'komen': 25, 'leverancier': 26, 'maken': 27, 'mogelijkheid': 28, 'nodig': 29, 'publiek': 30, 'schip': 31, 'sop': 32, 'sparen': 33, 'stad': 34, 'tarief': 35, 'uitrol': 36, 'voorzien': 37, 'vroegtijdig': 38, 'waard': 39, 'weghalen': 40, 'wifileverancier': 41, 'wifinetwerk': 42, 'zeggen': 43, 'zien': 44, 'zottegem': 45, 'alsof': 46, 'beeld': 47, 'bevatten': 48, 'blokkeren': 49, 'brand': 50, 'cockpit': 51, 'crash': 52, 'dos': 53, 'duidelijk': 54, 'gesprek': 55, 'karachi': 56, 'lahore': 57, 'landingsgestel': 58, 'lijken': 59, 'momenteel': 60, 'motor': 61, 'neerstorten': 62, 'noodsignaal': 63, 'oorzaak': 64, 'pil

In [39]:
#filter out low-frequency and high-frequency tokens, also limit the vocabulary to a max of 1000 words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [40]:
#construct the corpus using the dictionary from above and the doc2bow function.
#The function doc2bow() simply counts the number of occurrences of each distinct word,
#converts the word to its integer word id and returns the result as a sparse vector
corpus = [dictionary.doc2bow(doc) for doc in df_drop['tokens']]

Model building

In [41]:
#train the unsupervised machine learning model on the data. 
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

Calculating the coherence score using C_umass

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

Calculating the coherence score using C_v

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, texts = df_drop['tokens'], corpus=corpus, dictionary=dictionary, coherence='c_v')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

When looking at the coherence using the C_umass or C_v algorithm, the best is usually the max. Looking at the graphs I choose to go with 6 topics.

Optimal model

In [42]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, num_topics=9, workers = 4, passes=100)

Print and visualize topics

In [43]:
lda_model.print_topics(-1)

[(0,
  '0.020*"nieuw" + 0.020*"stad" + 0.020*"juni" + 0.016*"gent" + 0.013*"mens" + 0.013*"gemeente" + 0.012*"italië" + 0.012*"blijken" + 0.011*"zeggen" + 0.010*"geval"'),
 (1,
  '0.015*"zeggen" + 0.014*"china" + 0.013*"land" + 0.012*"zitten" + 0.012*"vaccin" + 0.011*"groot" + 0.011*"nieuw" + 0.010*"politiek" + 0.009*"goed" + 0.009*"snel"'),
 (2,
  '0.025*"zeggen" + 0.018*"miljard" + 0.015*"contactonderzoek" + 0.014*"commissie" + 0.014*"euro" + 0.013*"gba" + 0.013*"maken" + 0.011*"land" + 0.011*"databank" + 0.011*"europees"'),
 (3,
  '0.027*"league" + 0.026*"seizoen" + 0.023*"club" + 0.021*"augustus" + 0.021*"spelen" + 0.021*"juli" + 0.018*"ek" + 0.017*"september" + 0.015*"finale" + 0.015*"volgen"'),
 (4,
  '0.019*"mens" + 0.016*"amerikaans" + 0.016*"land" + 0.015*"vrouw" + 0.013*"trump" + 0.013*"maken" + 0.012*"vs" + 0.011*"groot" + 0.011*"goed" + 0.011*"zeggen"'),
 (5,
  '0.023*"app" + 0.019*"mens" + 0.016*"krijgen" + 0.015*"test" + 0.014*"contact" + 0.011*"gegeven" + 0.011*"informat

Where does a text belong to

In [44]:
df['text'][0]

'“Als je 26.000 inwoners hebt en amper 50 van hen maken gebruik van het publieke wifi-netwerk, dan moet je je durven afvragen of het sop de kolen waard is” zegt schepen Brecht Cassiman. “We zien ook dat de tarieven van databundels steeds goedkoper worden, er is een goede 4G-verbinding in Zottegem en er komt ook nog eens 5G aan. Dan moeten we ons afvragen of een gratis wifinetwerk wel nodig is.”\\n\\nDe stad besliste dus om het contract met de wifileverancier vroegtijdig te beëindigen. “In het contract was die mogelijkheid voorzien, de leverancier komt ook alle wifi-punten weghalen. Met de afschaffing van het wifinetwerk sparen we 17.500 euro per jaar uit. Dat is geen blinde besparing, maar dat geld gaan we herinvesteren in burgerparticipatie en de digitale uitrol van buurtnetwerken.'

In [45]:
lda_model[corpus][0]

[(0, 0.14762983), (6, 0.69980663), (8, 0.12223409)]

In [None]:
#According to our LDA model, the above text belongs to Topic 0 and 4.
#The article is 80% belonging to topic 4 (index 3) and 16% belonging to topic 0 (index 1).

Visualization

In [46]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

In [None]:
#path = os.path.join('.results')

In [None]:
#with open(path, 'wb') as f:
    #pickle.dump(LDAvis,f)

#with open(path, 'rb') as f:
    #LDAvis = pickle.load(f)

#pyLDAvis.save_html(LDAvis, './results' + '_spacy' + '.html')

In [None]:
# Load the SpaCy model
nlp = spacy.load('nl_core_news_lg')

# Tags I want to remove from the text
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', 'AUX', 'SCONJ', 'INTJ']

# Words I want to remove from the tokens
remove_words = ['al', 'als', 'een', 'om', 'het', 'de', 'dat', 'nog', 'ook']

tokens = []

for summary in nlp.pipe(df_drop['text']):
    proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha
                and token.text.lower() not in remove_words]
    tokens.append(proj_tok)