In [1]:
import spacy
import pandas as pd
import nl_core_news_lg
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()# Visualise inside a notebook

In [2]:
df = pd.read_csv('gegevens_data.csv')

In [3]:
df.head(3)

Unnamed: 0,_id,url,date,text,title
0,5ec8066f404b0dba70c1d842,https://vrtnws.be/p.A70oaym3k,2020-05-22T15:07:00.000Z,“Als je 26.000 inwoners hebt en amper 50 van h...,Gratis wifi in Zottegem verdwijnt: “Er zijn te...
1,5ec8066f656ebc99880e73b3,https://vrtnws.be/p.3kO00VKaP,2020-05-22T11:25:17.000Z,De vlucht was onderweg naar Karachi vanuit Lah...,"Vliegtuig neergestort in woonwijk in Pakistan,..."
2,5ec806707db77e056c19b7d9,https://www.nieuwsblad.be/cnt/dmf20200522_0496...,2020-05-22T14:02:00.000Z,Koksijde - Nu de tweedeverblijvers naar de kus...,Burgemeester Koksijde wil informatie van conta...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18190 entries, 0 to 18189
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     18190 non-null  object
 1   url     18190 non-null  object
 2   date    18190 non-null  object
 3   text    18190 non-null  object
 4   title   18190 non-null  object
dtypes: object(5)
memory usage: 710.7+ KB


In [5]:
df_drop = df.drop(columns = ['_id','url', 'date', 'title'])

In [6]:
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18190 entries, 0 to 18189
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18190 non-null  object
dtypes: object(1)
memory usage: 142.2+ KB


In [7]:
df_drop['text'] = df_drop['text'].astype(str)

In [8]:
# SpaCy model:
nlp = nl_core_news_lg.load()
# Tags I want to remove from the text
removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

tokens = []
for summary in nlp.pipe(df_drop['text']):
   proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
   tokens.append(proj_tok)

In [9]:
df_drop['tokens'] = tokens
df_drop['tokens']

0        [inwoner, maken, gebruik, publiek, durven, afv...
1        [vlucht, karachi, lahore, stad, provincie, pun...
2        [koksijde, tweedeverblijver, kust, trekken, ma...
3        [luchtvaartmaatschappij, easyjet, treffen, gro...
4        [vliegmaatschappij, easyjet, treffen, groot, d...
                               ...                        
18185    [vrouw, starten, borstvoeding, blijken, nieuw,...
18186    [vrouw, starten, borstvoeding, blijken, nieuw,...
18187    [europees, invoer, vloeibaar, maken, aardgas, ...
18188    [parket, initiatief, onderzoek, starten, delen...
18189    [grappig, bedoelen, meme, barbie, oppenheimer,...
Name: tokens, Length: 18190, dtype: object

Create dictionary and corpus

In [10]:
# I will apply the Dictionary Object from Gensim, which maps each word to their unique ID:
dictionary = Dictionary(df_drop['tokens'])

In [11]:
print(dictionary)

Dictionary<126968 unique tokens: ['afschaffing', 'afvragen', 'beslissen', 'besparing', 'beëindigen']...>


In [12]:
#some of the IDs assigned to the tokens
print(dictionary.token2id)



In [13]:
#filter out low-frequency and high-frequency tokens, also limit the vocabulary to a max of 1000 words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [14]:
#construct the corpus using the dictionary from above and the doc2bow function.
#The function doc2bow() simply counts the number of occurrences of each distinct word,
#converts the word to its integer word id and returns the result as a sparse vector
corpus = [dictionary.doc2bow(doc) for doc in df_drop['tokens']]

Model building

In [15]:
#train the unsupervised machine learning model on the data. 
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

Calculating the coherence score using C_umass

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

Calculating the coherence score using C_v

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, texts = df_drop['tokens'], corpus=corpus, dictionary=dictionary, coherence='c_v')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

When looking at the coherence using the C_umass or C_v algorithm, the best is usually the max. Looking at the graphs I choose to go with 6 topics.

Optimal model

In [16]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, num_topics=5, workers = 4, passes=100)

Print and visualize topics

In [17]:
lda_model.print_topics(-1)

[(0,
  '0.188*"probleem" + 0.094*"tijdelijk" + 0.093*"technisch" + 0.076*"ondervinden" + 0.074*"raadplegen" + 0.070*"dataleverancier" + 0.070*"koersinformatie" + 0.058*"russisch" + 0.033*"oekraïne" + 0.032*"rusland"'),
 (1,
  '0.071*"procent" + 0.030*"aantal" + 0.023*"jaar" + 0.022*"dag" + 0.021*"gemiddeld" + 0.020*"mens" + 0.019*"week" + 0.016*"blijken" + 0.016*"cijfer" + 0.015*"stijgen"'),
 (2,
  '0.015*"mens" + 0.014*"land" + 0.010*"zeggen" + 0.010*"vaccin" + 0.009*"europees" + 0.009*"groot" + 0.008*"komen" + 0.008*"nieuw" + 0.008*"maken" + 0.007*"goed"'),
 (3,
  '0.021*"bedrijf" + 0.020*"euro" + 0.017*"jaar" + 0.014*"miljoen" + 0.011*"amerikaans" + 0.010*"groot" + 0.010*"zeggen" + 0.009*"maken" + 0.009*"krijgen" + 0.008*"nieuw"'),
 (4,
  '0.017*"jaar" + 0.016*"komen" + 0.014*"zeggen" + 0.011*"zien" + 0.011*"maken" + 0.010*"staan" + 0.010*"mens" + 0.009*"vinden" + 0.009*"krijgen" + 0.009*"goed"')]

Where does a text belong to

In [18]:
df['text'][0]

'“Als je 26.000 inwoners hebt en amper 50 van hen maken gebruik van het publieke wifi-netwerk, dan moet je je durven afvragen of het sop de kolen waard is” zegt schepen Brecht Cassiman. “We zien ook dat de tarieven van databundels steeds goedkoper worden, er is een goede 4G-verbinding in Zottegem en er komt ook nog eens 5G aan. Dan moeten we ons afvragen of een gratis wifinetwerk wel nodig is.”\\n\\nDe stad besliste dus om het contract met de wifileverancier vroegtijdig te beëindigen. “In het contract was die mogelijkheid voorzien, de leverancier komt ook alle wifi-punten weghalen. Met de afschaffing van het wifinetwerk sparen we 17.500 euro per jaar uit. Dat is geen blinde besparing, maar dat geld gaan we herinvesteren in burgerparticipatie en de digitale uitrol van buurtnetwerken.'

In [19]:
lda_model[corpus][0]

[(3, 0.74666), (4, 0.22561944)]

In [None]:
#According to our LDA model, the above text belongs to Topic 0 and 4.
#The article is 80% belonging to topic 4 (index 3) and 16% belonging to topic 0 (index 1).

Visualization

In [20]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

In [None]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

In [None]:
#path = os.path.join('.results')

In [None]:
#with open(path, 'wb') as f:
    #pickle.dump(LDAvis,f)

#with open(path, 'rb') as f:
    #LDAvis = pickle.load(f)

#pyLDAvis.save_html(LDAvis, './results' + '_spacy' + '.html')

In [None]:
# Load the SpaCy model
nlp = spacy.load('nl_core_news_lg')

# Tags I want to remove from the text
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', 'AUX', 'SCONJ', 'INTJ']

# Words I want to remove from the tokens
remove_words = ['al', 'als', 'een', 'om', 'het', 'de', 'dat', 'nog', 'ook']

tokens = []

for summary in nlp.pipe(df_drop['text']):
    proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha
                and token.text.lower() not in remove_words]
    tokens.append(proj_tok)