In [1]:
import spacy
import pandas as pd
import nl_core_news_lg
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()# Visualise inside a notebook

In [2]:
df = pd.read_csv('nl_1000.csv')

In [3]:
df.head(2)

Unnamed: 0,_id,url,date,text,title
0,5ec8066f656ebc99880e73ae,https://vrtnws.be/p.xV5bZwAjN,2020-05-22T17:00:00.000Z,"Zonder al te technisch te worden, werkt het al...",KU Leuven boekt hoopvolle resultaten in zoekto...
1,5ec8066fac7298b94a3cf3b9,https://vrtnws.be/p.EeR17mpDp,2020-05-22T03:56:28.000Z,Het federale Overlegcomité heeft een handleidi...,Liveblog: Jongerenkampen mogen doorgaan deze z...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     1000 non-null   object
 1   url     1000 non-null   object
 2   date    1000 non-null   object
 3   text    1000 non-null   object
 4   title   1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [5]:
df_drop = df.drop(columns = ['_id','url', 'date', 'title'])

In [6]:
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [7]:
df_drop['text'] = df_drop['text'].astype(str)

In [None]:
# Load the SpaCy model
nlp = spacy.load('nl_core_news_lg')

# Tags I want to remove from the text
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', 'AUX', 'SCONJ', 'INTJ']

# Words I want to remove from the tokens
remove_words = ['al', 'als', 'een', 'om', 'het', 'de', 'dat', 'nog', 'ook']

tokens = []

for summary in nlp.pipe(df_drop['text']):
    proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha
                and token.text.lower() not in remove_words]
    tokens.append(proj_tok)


In [8]:
# SpaCy model:
nlp = nl_core_news_lg.load()
# Tags I want to remove from the text
removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

tokens = []
for summary in nlp.pipe(df_drop['text']):
   proj_tok = [token.lemma_.lower() for token in summary if token.pos_ not in removal and not token.is_stop and token.is_alpha]
   tokens.append(proj_tok)

In [9]:
df_drop['tokens'] = tokens
df_drop['tokens']

0      [technisch, werken, volgen, januari, genetisch...
1      [federaal, overlegcomité, handleiding, organis...
2      [gendefect, mucopatiënt, last, abnormaal, taai...
3              [brit, zoeken, nieuw, hond, vinden, hond]
4      [lang, vergadering, minister, weyts, onderwijs...
                             ...                        
995    [max, verstappen, wachten, oostenrijk, echt, w...
996    [ga, mens, volgen, dag, twee, verblijf, kust, ...
997    [promotor, jobfixers, belgian, rally, champion...
998    [jaar, seijas, el, bulli, sluiting, toprestaur...
999    [digitaal, ongeletterdheid, gaan, hand, hand, ...
Name: tokens, Length: 1000, dtype: object

Create dictionary and corpus

In [10]:
# I will apply the Dictionary Object from Gensim, which maps each word to their unique ID:
dictionary = Dictionary(df_drop['tokens'])

In [11]:
print(dictionary)

Dictionary<20445 unique tokens: ['afweer', 'bekend', 'code', 'coronavirus', 'gebruik']...>


In [12]:
#some of the IDs assigned to the tokens
print(dictionary.token2id)

{'afweer': 0, 'bekend': 1, 'code': 2, 'coronavirus': 3, 'gebruik': 4, 'geel': 5, 'genetisch': 6, 'heel': 7, 'hopen': 8, 'immuniteit': 9, 'januari': 10, 'lang': 11, 'leggen': 12, 'maken': 13, 'neyts': 14, 'nieuw': 15, 'ontstaan': 16, 'opbouwen': 17, 'plakken': 18, 'publiceren': 19, 'spuit': 20, 'stuk': 21, 'team': 22, 'technisch': 23, 'vaccin': 24, 'volgen': 25, 'werken': 26, 'activiteit': 27, 'begeleider': 28, 'beslissen': 29, 'blijven': 30, 'buurtspeeltuinen': 31, 'contactbubbel': 32, 'contactbubbels': 33, 'doorgaan': 34, 'expert': 35, 'federaal': 36, 'goedkeuren': 37, 'handleiding': 38, 'jaar': 39, 'jeugdactiviteit': 40, 'jeugdhuis': 41, 'jong': 42, 'jongeren': 43, 'juli': 44, 'kind': 45, 'lucht': 46, 'maximaal': 47, 'mei': 48, 'open': 49, 'openen': 50, 'opstarten': 51, 'organisatie': 52, 'oud': 53, 'overleg': 54, 'overlegcomité': 55, 'overnachting': 56, 'persoon': 57, 'speelpleinwerkingen': 58, 'veiligheidsafstand': 59, 'zogenaamd': 60, 'aankunnen': 61, 'abnormaal': 62, 'bedragen': 

In [13]:
#filter out low-frequency and high-frequency tokens, also limit the vocabulary to a max of 1000 words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [14]:
#construct the corpus using the dictionary from above and the doc2bow function.
#The function doc2bow() simply counts the number of occurrences of each distinct word,
#converts the word to its integer word id and returns the result as a sparse vector
corpus = [dictionary.doc2bow(doc) for doc in df_drop['tokens']]

Model building

In [15]:
#train the unsupervised machine learning model on the data. 
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers = 4, passes=10)

Calculating the coherence score using C_umass

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

Calculating the coherence score using C_v

In [None]:
topics = []
score = []
for i in range(1,20,1):
   lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=10, num_topics=i, workers = 4, passes=10, random_state=100)
   cm = CoherenceModel(model=lda_model, texts = df_drop['tokens'], corpus=corpus, dictionary=dictionary, coherence='c_v')
   topics.append(i)
   score.append(cm.get_coherence())
_=plt.plot(topics, score)
_=plt.xlabel('Number of Topics')
_=plt.ylabel('Coherence Score')
plt.show()

When looking at the coherence using the C_umass or C_v algorithm, the best is usually the max. Looking at the graphs I choose to go with 6 topics.

Optimal model

In [16]:
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, num_topics=5, workers = 4, passes=100)

Print and visualize topics

In [17]:
lda_model.print_topics(-1)

[(0,
  '0.019*"mens" + 0.017*"zeggen" + 0.013*"week" + 0.013*"gaan" + 0.012*"groot" + 0.012*"winkel" + 0.012*"dag" + 0.012*"nieuw" + 0.009*"procent" + 0.009*"komen"'),
 (1,
  '0.021*"club" + 0.017*"jaar" + 0.017*"seizoen" + 0.017*"politie" + 0.017*"man" + 0.016*"gaan" + 0.014*"spelen" + 0.011*"nieuw" + 0.010*"euro" + 0.010*"maken"'),
 (2,
  '0.017*"zeggen" + 0.016*"open" + 0.016*"minister" + 0.015*"nieuw" + 0.014*"komen" + 0.013*"gaan" + 0.010*"blijven" + 0.010*"partij" + 0.009*"water" + 0.009*"mens"'),
 (3,
  '0.021*"jaar" + 0.018*"gaan" + 0.013*"maken" + 0.013*"komen" + 0.012*"goed" + 0.012*"zeggen" + 0.011*"groot" + 0.010*"weergave" + 0.010*"staan" + 0.009*"tijd"'),
 (4,
  '0.028*"procent" + 0.025*"euro" + 0.023*"miljoen" + 0.023*"jaar" + 0.022*"bedrijf" + 0.010*"coronacrisis" + 0.010*"komen" + 0.010*"groot" + 0.009*"miljard" + 0.009*"maand"')]

Where does a text belong to

In [18]:
df['text'][0]

'Zonder al te technisch te worden, werkt het als volgt: In januari is de genetische code van het coronavirus gepubliceerd. Ook de genetische code van het gele koorts-vaccin is al langer bekend. Het team van Neyts plakt een stukje van de genetische code van het coronavirus in de code van dat vaccin tegen gele koorts.\\n\\n"Zo hopen we dat we met dat nieuwe vaccin afweer opbouwen tegen het coronavirus én dat het mogelijk zal zijn om met één spuitje een heel lange immuniteit te doen ontstaan", legt Neyts uit. Ze maken daarvoor gebruik van de CRISPR-techniek.'

In [19]:
lda_model[corpus][0]

[(0, 0.47982663), (3, 0.4946258)]

In [None]:
#According to our LDA model, the above text belongs to Topic 0 and 4.
#The article is 80% belonging to topic 4 (index 3) and 16% belonging to topic 0 (index 1).

Visualization

In [20]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

In [None]:
#path = os.path.join('.results')

In [None]:
#with open(path, 'wb') as f:
    #pickle.dump(LDAvis,f)

#with open(path, 'rb') as f:
    #LDAvis = pickle.load(f)

#pyLDAvis.save_html(LDAvis, './results' + '_spacy' + '.html')