# LDA Visualisation for Topics around Lunatic Asylums

This notebook uses pyLDAvis to visualise topics.

In [1]:
!pip install pyLDAvis==2.1.2



In [2]:
import pandas as pd
import numpy as np
import re
import spacy
nlp = spacy.load("en_core_web_sm")

from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)

import nltk
nltk.download('wordnet')
# show_pyldavis filters tokens with stopwords.words('english'); without this
# corpus NLTK raises LookupError on a machine that has never downloaded it.
nltk.download('stopwords')

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models

import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tashfeen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def show_pyldavis(docs, num_topics, passes=20, random_state=None):
    """Fit an LDA model on ``docs`` and return its pyLDAvis display object.

    Pipeline: lowercase + gensim stopword removal -> spaCy tokenisation ->
    punctuation stripping -> WordNet lemmatisation, short-token and NLTK
    stopword filtering -> bigram detection -> gensim LDA -> pyLDAvis.

    Parameters
    ----------
    docs : iterable of str
        Raw document texts, one string per document.
    num_topics : int
        Number of LDA topics to fit.
    passes : int, optional
        Number of passes over the corpus during training (default 20).
    random_state : int or None, optional
        Forwarded to ``LdaModel`` so a run can be reproduced; ``None`` keeps
        gensim's default (unseeded) behaviour.

    Returns
    -------
    The object returned by ``pyLDAvis.display`` for the fitted model.

    Raises
    ------
    ValueError
        If no document has any token left after cleaning.
    """
    docs = [remove_stopwords(doc.lower()) for doc in docs]

    # Tokenise with spaCy, then re-join so gensim can strip the punctuation.
    stripped_docs = [
        strip_punctuation(' '.join(str(tok) for tok in nlp(doc))) for doc in docs
    ]
    raw_tokens = [text.split(" ") for text in stripped_docs if len(text) > 2]

    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    # The cleaned tokens must be collected into a new list: rebinding the loop
    # variable (as the previous version did) silently threw the result away,
    # leaving empty strings and unlemmatised words in the model input.
    cleaned_tokens = []
    for doc_tokens in raw_tokens:
        lemmas = [
            lemmatizer.lemmatize(word.strip())
            for word in doc_tokens
            if len(word.strip()) > 2
        ]
        kept = [word for word in lemmas if word not in english_stopwords]
        if kept:
            cleaned_tokens.append(kept)

    if not cleaned_tokens:
        raise ValueError("no tokens left after preprocessing; cannot fit LDA")

    bigram = Phrases(cleaned_tokens, min_count=5, threshold=2, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    bigram_token = [bigram_phraser[sent] for sent in cleaned_tokens]

    # Dictionary and bag-of-words corpus are built from the bigram-merged tokens.
    dictionary = gensim.corpora.Dictionary(bigram_token)
    corpus = [dictionary.doc2bow(text) for text in bigram_token]

    lda = models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )
    viz = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    return pyLDAvis.display(viz)

In [4]:
# Load the corpus and discard the 'Unnamed: 0' column stored in the CSV.
df = pd.read_csv('df.csv').drop(columns=['Unnamed: 0'])
df

Unnamed: 0,text
0,gtx \r\nsi the 110 phi price\r\nstatistical re...
1,triennial report of the lunatic asylums under ...
2,report on the lunatic asylums under the govern...
3,indian hemp drugs commission vol vi evidence o...
4,leprosy and its control in the bombay presiden...
...,...
60,9 no 1053 a proceedings of the honble the li...
61,report on the working of the micro biological ...
62,annual administration and progress report on t...
63,annual administration and progress report on t...


In [5]:
# Picking 4 articles (rows 0-3). Each one is cleaned the same way: digits are
# removed, whitespace is collapsed and single-character words are dropped.
a, b, c, d = [
    ' '.join(
        word
        for word in ''.join(
            ch for ch in df['text'].iloc[row] if not ch.isdigit()
        ).split()
        if len(word) > 1
    )
    for row in range(4)
]

In [8]:
%%time
show_pyldavis(list([a,b,c,d]), 15)

CPU times: user 1min 22s, sys: 7.66 s, total: 1min 30s
Wall time: 1min 10s


In [7]:
def formatText(text):
    """Return ``text`` with digits removed and single-character words dropped.

    Every digit character is deleted first, then the text is split on
    whitespace; words longer than one character are re-joined with single
    spaces.
    """
    without_digits = ''.join(ch for ch in text if not ch.isdigit())
    words = without_digits.split()
    return ' '.join(word for word in words if len(word) > 1)

In [10]:
# Clean rows 4-7 and visualise a 15-topic model fitted on them.
a, b, c, d = [formatText(df['text'].iloc[row]) for row in range(4, 8)]

show_pyldavis([a, b, c, d], 15)

In [11]:
# Clean rows 8-11 and visualise a 15-topic model fitted on them.
a, b, c, d = [formatText(df['text'].iloc[row]) for row in range(8, 12)]

show_pyldavis([a, b, c, d], 15)

In [12]:
# Clean rows 12-15 and visualise a 15-topic model fitted on them.
a, b, c, d = [formatText(df['text'].iloc[row]) for row in range(12, 16)]

show_pyldavis([a, b, c, d], 15)

In [13]:
# Clean rows 16-19 and visualise a 15-topic model fitted on them.
a, b, c, d = [formatText(df['text'].iloc[row]) for row in range(16, 20)]

show_pyldavis([a, b, c, d], 15)

In [14]:
# Clean rows 20-23 and visualise a 15-topic model fitted on them.
a, b, c, d = [formatText(df['text'].iloc[row]) for row in range(20, 24)]

show_pyldavis([a, b, c, d], 15)