# LDA Topic Modelling

In [None]:
#natural language toolkit for stopwords

import nltk
nltk.download("stopwords")

import numpy as np
import json
import glob

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# validation
from gensim.models import CoherenceModel

#spacy
import spacy
# you may have to run in a terminal: python3 -m spacy download en_core_web_sm
from nltk.corpus import stopwords

#visualizing the data
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

#suppress some useless warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /home/ckoss/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [None]:
import sqlite3

import pandas as pd

import src

# Read the complete `abstracts` table from the example database into a DataFrame.
db = sqlite3.connect(src.PATH / "data/example.db")
try:
    cur = db.cursor()
    rows = cur.execute("""select * from abstracts""").fetchall()
    # cur.description has one entry per result column; item 0 of each is the column name
    df = pd.DataFrame(rows, columns=[c[0] for c in cur.description])
finally:
    # Release the database handle even when the query fails
    db.close()
df.head()

Unnamed: 0,pk_abstracts,fk_items,text
0,24957473,1451557,Both psychosis and anhedonia have been associated to some extent with striatal functioning. The current study examined wheth...
1,24958118,5420251,This article introduces a new approach for evaluating replication results. It combines effect-size estimation with hypothesi...
2,24958343,6761399,"Major depressive disorder (MDD) is a devastating disease affecting over 300 million people worldwide, and costing an estimat..."
3,24959375,13397708,Posttraumatic stress disorder (PTSD) is associated with regional alterations in brain structure and function that are hypoth...
4,24961506,26129139,This article offers a new model for bulimia nervosa (BN) that explains both the initial impulsive nature of binge eating and...


In [None]:
# English stopword list from NLTK.
# The list is fetched through `nltk.corpus` rather than through the imported
# `stopwords` name: this assignment rebinds `stopwords` to a plain list, so a
# second run of `stopwords.words(...)` in this cell would raise AttributeError.
# NOTE(review): no code visible in this notebook filters tokens with this list.
stopwords = nltk.corpus.stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Lemmatize and Clean text

In [None]:
# Keep only tokens with the selected part-of-speech tags and reduce them to their lemma

def lemmatization(texts, allowed_postags=("NOUN", "ADJ")):  # add "VERB", "ADV" to keep more
    """Lemmatize every text, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of strings (here: the `text` column of the DataFrame).
        allowed_postags: spaCy coarse POS tags (`token.pos_`) to keep.
            The default is an immutable tuple so it cannot be mutated
            between calls.

    Returns:
        A list with one string per input text: the lemmas of the kept
        tokens joined by single spaces.
    """
    # parser and NER are disabled to reduce expensive computation
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []

    for text in tqdm(texts):
        doc = nlp(text)
        # lemma_ = lemmatized (root) form of the token
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(lemmas))
    return texts_out

lemmatized_texts = lemmatization(df['text'])


100%|██████████| 5557/5557 [01:37<00:00, 56.79it/s]


In [None]:
# Show the raw abstract of the first record
print(df['text'][0])

Both psychosis and anhedonia have been associated to some extent with striatal functioning. The current study examined whether either psychosis risk or social anhedonia was associated with performance on 3 tasks related to striatal functioning. Psychosis risk participants had extremely elevated Perceptual Aberration/Magical Ideation (PerMag) scores (n = 69), with 43% of psychosis risk participants also having semistructured interview-assessed psychotic-like experiences which further heightens their risk of psychotic disorder (Chapman, Chapman, Kwapil, Eckblad, & Zinser, 1994). Compared with both extremely elevated social anhedonia (n = 60) and control (n = 68) groups, the PerMag group exhibited poorer performance on 2 of the striatum-related tasks, the Weather Prediction Task (WPT) and the Learned Irrelevance Paradigm, but not on Finger Tapping. In addition, PerMag participants with psychotic-like experiences were especially impaired on the WPT. Overall, this study arguably provides th

In [None]:
# Lemmatized version of the first abstract (bare expression, shown as the cell output)
lemmatized_texts[0]

'psychosis extent striatal functioning current study psychosis risk social performance task striatal functioning psychosis risk participant elevated score % psychosis risk participant interview psychotic like experience risk psychotic disorder elevated social anhedonia control n group group poor performance task addition permag participant psychotic like experience study first evidence psychosis risk social performance task activation associative striatum useful behavioral measure psychosis risk'

In [None]:
def gen_words(texts):
    """Tokenize each text with gensim's simple_preprocess.

    Returns a list holding one token list per input text.
    """
    # deacc=True removes accents from the tokens
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in tqdm(texts)
    ]

data_words = gen_words(lemmatized_texts)

100%|██████████| 5557/5557 [00:02<00:00, 2603.23it/s]


In [None]:
# Token list of the first document after simple_preprocess
print(data_words[0])

['psychosis', 'extent', 'striatal', 'functioning', 'current', 'study', 'psychosis', 'risk', 'social', 'performance', 'task', 'striatal', 'functioning', 'psychosis', 'risk', 'participant', 'elevated', 'score', 'psychosis', 'risk', 'participant', 'interview', 'psychotic', 'like', 'experience', 'risk', 'psychotic', 'disorder', 'elevated', 'social', 'anhedonia', 'control', 'group', 'group', 'poor', 'performance', 'task', 'addition', 'permag', 'participant', 'psychotic', 'like', 'experience', 'study', 'first', 'evidence', 'psychosis', 'risk', 'social', 'performance', 'task', 'activation', 'associative', 'striatum', 'useful', 'behavioral', 'measure', 'psychosis', 'risk']


# Reduce words to numbers

In [None]:
# Dictionary that maps every unique token to an integer id
id2word = corpora.Dictionary(data_words)

# Bag-of-words per document: a list of (token_id, count) pairs
corpus = [id2word.doc2bow(tokens) for tokens in tqdm(data_words)]

print(corpus[0])

100%|██████████| 5557/5557 [00:00<00:00, 18481.49it/s]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2), (11, 1), (12, 1), (13, 2), (14, 2), (15, 1), (16, 2), (17, 1), (18, 3), (19, 3), (20, 1), (21, 1), (22, 6), (23, 3), (24, 6), (25, 1), (26, 3), (27, 2), (28, 1), (29, 2), (30, 3), (31, 1)]





In [None]:
# Print the tokens stored under the first ten dictionary ids
for token_id in range(10):
    print(token_id, id2word[token_id])

0 activation
1 addition
2 anhedonia
3 associative
4 behavioral
5 control
6 current
7 disorder
8 elevated
9 evidence


# First simple Model

In [None]:
# Hyperparameters of the first, simple LDA model
lda_kwargs = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,  # just an initial guess
    random_state=100,
    # NOTE(review): presumably "update the model after every chunk" — confirm in the gensim docs
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_kwargs)

In [None]:
# Compute the value gensim reports from log_perplexity for the training corpus
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# NOTE(review): gensim documents log_perplexity as returning a per-word
# likelihood bound (perplexity = 2^(-bound)), not the perplexity itself.
# If so, a higher (less negative) printed value means a better fit — confirm
# against the gensim docs before comparing models on this number.

# Compute Coherence Score (c_v measure) from the tokenized texts
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


# The lower the perplexity, the better the model (see the note above: the
# printed value is presumably the bound, where higher is better).
# The higher the topic coherence, the more human-interpretable the topics are.


Perplexity:  -13.360019566719181

Coherence Score:  0.3734046178530506


# Visualize and Analyze the Model

In [None]:
# Turn on pyLDAvis notebook output and prepare the visualization for the
# current model, corpus and dictionary.
pyLDAvis.enable_notebook()
# NOTE(review): R=30 presumably limits the number of terms shown per topic and
# mds="mmds" selects the layout method — confirm in the pyLDAvis docs.
lda_viz = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# Bare expression: shown as the cell output
lda_viz

  default_term_info = default_term_info.sort_values(


In [None]:
# Do we see little overlap between the circles (topics)? Why?
# Do the top words represent their topics well?

# Topic 4: "more", "when", "less" -> not good representatives of a topic

# 2nd try - Make the Model better

# Boost word Representation

In [None]:
# BIGRAMS and TRIGRAMS
# Learn two-word phrases on the token lists, then learn phrases on the
# bigram-merged documents so that a bigram plus a neighbouring word can be joined.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Lazily yield each document with the bigram model applied."""
    return (bigram[doc] for doc in texts)

def make_trigrams(texts):
    """Lazily yield each raw document with the bigram, then the trigram model applied."""
    return (trigram[bigram[doc]] for doc in texts)

# make_bigrams returns a one-shot generator; store the result as a list so
# that data_bigrams is still usable after the trigram pass has iterated it.
data_bigrams = list(make_bigrams(data_words))
# data_bigrams already went through the bigram model, so only the trigram
# model is applied here (make_trigrams would run the bigram model a second time).
data_bigrams_trigrams = [trigram[doc] for doc in data_bigrams]

In [None]:
# First document after the bigram/trigram phrase merging
print(data_bigrams_trigrams[0])

['psychosis', 'extent', 'striatal', 'functioning', 'current', 'study', 'psychosis', 'risk', 'social', 'performance', 'task', 'striatal', 'functioning', 'psychosis', 'risk', 'participant', 'elevated', 'score', 'psychosis', 'risk', 'participant', 'interview', 'psychotic', 'like', 'experience', 'risk', 'psychotic_disorder', 'elevated', 'social', 'anhedonia', 'control', 'group', 'group', 'poor', 'performance', 'task', 'addition', 'permag', 'participant', 'psychotic', 'like', 'experience', 'study', 'first', 'evidence', 'psychosis', 'risk', 'social', 'performance', 'task', 'activation', 'associative', 'striatum', 'useful', 'behavioral', 'measure', 'psychosis', 'risk']


In [None]:
# Second document after the bigram/trigram phrase merging
print(data_bigrams_trigrams[1])

['article', 'new', 'approach', 'replication', 'result', 'effect', 'size', 'estimation', 'hypothesis', 'testing', 'extent', 'replication', 'result', 'consistent', 'effect', 'size', 'big', 'detectable', 'original', 'study', 'approach', 'replication', 'finding', 'benefit', 'unsuccessful', 'replication_attempt', 'study', 'noisy', 'effect', 'different', 'true', 'finding', 'underpowered', 'replication', 'compelling', 'inference', 'general', 'revisited', 'replication', 'particular']


In [None]:
# Third document after the bigram/trigram phrase merging
print(data_bigrams_trigrams[2])

['major_depressive', 'disorder', 'devastating', 'disease', 'people', 'productivity', 'health_care', 'wealth', 'research', 'understanding', 'therapy', 'effective', 'symptom', 'disease', 'recent', 'clinical', 'preclinical', 'study', 'genetic', 'screening', 'transgenic', 'rodent', 'major', 'role', 'gene', 'central', 'expression', 'receptor', 'protein', 'individual', 'risk', 'gene', 'brain', 'tissue', 'organism', 'immediate', 'long_term', 'response', 'social', 'environmental', 'stressor', 'primary', 'contributor', 'review', 'current', 'state', 'knowledge', 'physiology', 'occurrence', 'symptom', 'review', 'finding', 'multiple', 'laboratory', 'part', 'symposium', 'topic', 'annual', 'meeting', 'idea', 'datum', 'review', 'great', 'progress', 'past', 'few', 'decade', 'understanding', 'pathway', 'novel', 'treatment', 'detection', 'method', 'disorder', 'right']


In [None]:
# Fourth document after the bigram/trigram phrase merging
print(data_bigrams_trigrams[3])

['posttraumatic_stress_disorder', 'ptsd', 'regional', 'alteration', 'brain', 'structure', 'function', 'symptom', 'cognitive', 'deficit', 'disorder', 'first', 'systematic', 'meta_analysis', 'neurocognitive', 'outcome', 'broad_range', 'cognitive', 'domain', 'profile', 'cognitive', 'deficit', 'clinical', 'factor', 'study', 'characteristic', 'report', 'datum', 'study', 'participant', 'trauma', 'comparison', 'participant', 'healthy', 'comparison', 'participant', 'trauma', 'exposure', 'effect', 'size', 'estimate', 'effect', 'meta_analysis', 'cognitive', 'domain', 'attention', 'memory', 'executive_function', 'verbal', 'learning', 'verbal', 'memory', 'visual', 'learning', 'visual', 'memory', 'language', 'speed', 'information', 'processing', 'visuospatial', 'ability', 'significant', 'neurocognitive', 'effect', 'magnitude', 'large', 'effect', 'size', 'verbal', 'learning', 'speed', 'information', 'processing', 'attention', 'memory', 'verbal', 'memory', 'effect', 'size', 'estimate', 'large', 'trea

In [None]:
# TF-IDF removal: drop terms with a low TF-IDF weight from every document.
# Drawback: may delete important but frequent words.
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)
texts = data_bigrams_trigrams
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # TF-IDF weight below which a term is dropped; good threshold?
words = []  # tokens that were dropped, one entry per document they were dropped from
words_missing_in_tfidf = []

for i, bow in enumerate(corpus):
    # (token_id, weight) pairs for this document; computed once and reused
    weights = tfidf[bow]
    tfidf_ids = {token_id for token_id, _ in weights}
    low_value_words = [token_id for token_id, weight in weights if weight < low_value]
    # Ids present in the bag-of-words for which the TF-IDF model returned no weight.
    # Computed BEFORE recording the dropped words, so `words` reflects the
    # current document instead of the previous iteration's leftovers.
    words_missing_in_tfidf = [token_id for token_id, _ in bow if token_id not in tfidf_ids]
    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[token_id] for token_id in drops)
    # Set membership keeps the per-document filter linear in the document size
    drop_ids = set(drops)
    # Replacing the element at index i does not change the list length,
    # so it is safe while enumerating
    corpus[i] = [pair for pair in bow if pair[0] not in drop_ids]

# 2nd Model with n_grams and tfidf

In [None]:
# Hyperparameters of the second LDA model (n-gram corpus after TF-IDF filtering)
lda_kwargs = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,  # just an initial guess
    random_state=100,
    # NOTE(review): presumably "update the model after every chunk" — confirm in the gensim docs
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_kwargs)

In [None]:
# Compute the value gensim reports from log_perplexity for the filtered corpus
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# NOTE(review): gensim documents log_perplexity as returning a per-word
# likelihood bound (perplexity = 2^(-bound)), not the perplexity itself.
# If so, a higher (less negative) printed value means a better fit — confirm
# against the gensim docs before comparing models on this number.

# Compute Coherence Score (c_v measure) from the n-gram token lists
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


# The lower the perplexity, the better the model (see the note above: the
# printed value is presumably the bound, where higher is better).
# The higher the topic coherence, the more human-interpretable the topics are.


Perplexity:  -14.78157699443581

Coherence Score:  0.3709570962275056


In [None]:
# Turn on pyLDAvis notebook output and prepare the visualization for the
# current model, corpus and dictionary.
pyLDAvis.enable_notebook()
# NOTE(review): R=30 presumably limits the number of terms shown per topic and
# mds="mmds" selects the layout method — confirm in the pyLDAvis docs.
lda_viz = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# Bare expression: shown as the cell output
lda_viz

  default_term_info = default_term_info.sort_values(


In [None]:
# less overlap, but a lot of non-representative words