In [2]:
# Import Libraries

# We need NLTK and Gensim for LDA Topic Modelling
from nltk import word_tokenize, pos_tag

import gensim
from gensim import matutils, models
from gensim import corpora

import scipy.sparse
import os # to access files for cleaning
from collections import Counter # to count word occurance
import re # Regix to remove punctuation from strings I split
from shutil import copyfile # For copying clean files
from sklearn.feature_extraction.text import CountVectorizer # For creating document-term matrix & excluding stop words
from sklearn.feature_extraction import text # For getting stop words
from wordcloud import WordCloud # For creating word clouds
from textblob import TextBlob # For sentiment analysis
import numpy as np # For dataframe analysis
import pandas as pd # For dataframe analysis
import matplotlib.pyplot as plt # For graphs
import seaborn as sns # For graphs
%matplotlib inline

### Cleaning and Pre-processing Corpus

In [3]:
# take all the text files and compile into one textList
textList = []
for dirname, _, filenames in os.walk('./textfiles'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        myfile = os.path.join(dirname, filename)
        with open(myfile, 'rb') as fopen:
            q = fopen.read().decode('ISO-8859-1')
            textList.append(q) 

In [4]:
# create a dataframe of the corpus
df = pd.DataFrame(textList)
df.head()

Unnamed: 0,0
0,"CHOLERA IN INDIA, 1862 TO 1881. BENGAL PROVINC..."
1,CHOLERA IN SOUTHERN INDIA. A RECORD OF THE PRO...
2,A REPORT OF AN INVESTIGATION INTO THE CAUSES O...
3,"From Commissioner, Benares Division, to Secret..."
4,"No. 1111 (Sanitary), dated Ootacamund, the 6th..."


In [5]:
df.columns = ['text'] #rename col
df.head()

Unnamed: 0,text
0,"CHOLERA IN INDIA, 1862 TO 1881. BENGAL PROVINC..."
1,CHOLERA IN SOUTHERN INDIA. A RECORD OF THE PRO...
2,A REPORT OF AN INVESTIGATION INTO THE CAUSES O...
3,"From Commissioner, Benares Division, to Secret..."
4,"No. 1111 (Sanitary), dated Ootacamund, the 6th..."


In [6]:
df.reset_index(level=0, inplace=True)
df

Unnamed: 0,index,text
0,0,"CHOLERA IN INDIA, 1862 TO 1881. BENGAL PROVINC..."
1,1,CHOLERA IN SOUTHERN INDIA. A RECORD OF THE PRO...
2,2,A REPORT OF AN INVESTIGATION INTO THE CAUSES O...
3,3,"From Commissioner, Benares Division, to Secret..."
4,4,"No. 1111 (Sanitary), dated Ootacamund, the 6th..."
5,5,?AN INQUIRY INTO THE CIRCUMSTANCES ATTENDING...
6,6,ACCOUNT OF PLAGUE ADMINISTRATION IN THE BOMBAY...


In [7]:
def cleanTextInDf(mystring):
    mystring = mystring.lower() #Text normalization: make string lowercase
    mystring = re.sub(r'[^\w\s]','', mystring) #Text normalization: remove punctuation
    mystring = re.sub('\[.*?\]', '', mystring) #Text normalization: remove text in square brackets
    mystring = re.sub('https?://\S+|www\.\S+', '', mystring) #Text normalization: remove links
    mystring = re.sub('\n', '', mystring) #Text normalization: 
    mystring = re.sub('\w*\d\w*', '', mystring)#Text normalization: 
    
    return mystring

In [8]:
# Tokenize and Lemmatize the text (NLP standardisation methods)
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(mystring):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(mystring)]

In [9]:
# apply the cleanTextDf function
cleanText = lambda text: cleanTextInDf(text) # Lambda function applies to all cells in a column
cleandf = pd.DataFrame(df.text.apply(cleanText)) # .apply() the function to all cells
df['text'] = cleandf['text']
df

Unnamed: 0,index,text
0,0,cholera in india to bengal province to and...
1,1,cholera in southern india a record of the prog...
2,2,a report of an investigation into the causes o...
3,3,from commissioner benares division to secretar...
4,4,no sanitary dated ootacamund the october fr...
5,5,an inquiry into the circumstances attending ...
6,6,account of plague administration in the bombay...


### Topic Modelling

- It is a process to automatically identify topics present, and to derive hidden patterns exhibited by a text corpus. 

- Topic Modelling is an unsupervised approach used for finding and observing the bunch of words (called “topics”) in large clusters of texts. 

- Topics can be defined as “a repeating pattern of co-occurring terms in a corpus”. A good topic model should result in – “health”, “doctor”, “patient”, “hospital” for a topic – Healthcare, and “farm”, “crops”, “wheat” for a topic – “Farming”.

- Topic Models are very useful for the purpose for document clustering, organizing large blocks of textual data, information retrieval from unstructured text and feature selection. 

Resources:
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [10]:
# Tokenize corpus and Filter nouns and adjectives from the corpus (POS tagging)
def partsOfSpeechFilter(text):
    isNounAdj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nounsAdj = [word for (word, pos) in pos_tag(tokenized) if isNounAdj(pos)] 
    return ' '.join(nounsAdj)

### 1. Preparing Document-Term Matrix

To run any mathematical model on text corpus, it is a good practice to convert it into a matrix representation. LDA model looks for repeating term patterns in the entire DT matrix.

- “gensim” is a clean and beautiful library to handle text data. It is scalable, robust and efficient.
- convert a corpus into a document-term matrix

In [21]:
 def BritishIndiaTopicsOverTime(country):
    dfNa = pd.DataFrame(df.text.apply(partsOfSpeechFilter))
    dfNa.set_index('index', inplace=True, drop=True)
    vectorizerNA = CountVectorizer(stop_words='english')
    dataVectorizerNA = vectorizerNA.fit_transform(dfPos.text)
    dataDtmNA = pd.DataFrame(dataVectorizerNA.toarray(), columns = vectorizerNA.get_feature_names())
    dataDtmNA.index = dfPos.index
    corpusNA = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dataDtmNA.transpose()))
    id2wordNA = dict((v, k) for k, v in vectorizerNA.vocabulary_.items()) #create a dictionary
    ldaNA = models.LdaModel(corpus=corpusNA, num_topics=5, id2word=id2wordNA, passes=80) #build LDA model
    return [ldaNA.print_topics(), ldaNA, corpusNA] #print keywords in topics

In [23]:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [25]:
# Apply parts of speech filter to filter out nouns and adjectives 
dfPos = pd.DataFrame(df.text.apply(partsOfSpeechFilter)) 


In [26]:
# Get the document term matrix
vectorizerNA = CountVectorizer(stop_words='english')
dataVectorizerNA = vectorizerNA.fit_transform(dfPos.text)
dataDtmNA = pd.DataFrame(dataVectorizerNA.toarray(), columns = vectorizerNA.get_feature_names())
dataDtmNA # This is the document term matrix

Unnamed: 0,________,____________,______________,aad,aas,aazar,ab,abad,abandonment,abate,...,zillah,zinc,zizyphus,zomindaries,zomindary,zone,zoogla,zool,zoological,zulus
0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,8,0,0,1,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,2,0,...,0,0,0,0,0,2,2,2,1,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,1,0,0,1,14,2,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Map ID to words
corpusNA = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dataDtmNA.transpose()))
id2wordNA = dict((v, k) for k, v in vectorizerNA.vocabulary_.items())

### 2. Running LDA Model

In [28]:
# Run the LDA model

ldaNA = models.LdaModel(corpus=corpusNA, num_topics=5, id2word=id2wordNA, passes=80)
ldaNA.print_topics()

[(0,
  '0.039*"cholera" + 0.017*"district" + 0.015*"deaths" + 0.011*"year" + 0.010*"total" + 0.010*"disease" + 0.008*"cases" + 0.008*"march" + 0.008*"april" + 0.008*"case"'),
 (1,
  '0.024*"plague" + 0.015*"cases" + 0.009*"bombay" + 0.007*"medical" + 0.007*"case" + 0.007*"government" + 0.006*"hospital" + 0.006*"houses" + 0.006*"persons" + 0.006*"officer"'),
 (2,
  '0.016*"disease" + 0.013*"cases" + 0.010*"leprosy" + 0.009*"district" + 0.008*"lepers" + 0.007*"case" + 0.007*"number" + 0.006*"years" + 0.006*"surgeon" + 0.004*"treatment"'),
 (3,
  '0.000*"plague" + 0.000*"cases" + 0.000*"case" + 0.000*"cholera" + 0.000*"time" + 0.000*"district" + 0.000*"deaths" + 0.000*"bombay" + 0.000*"people" + 0.000*"town"'),
 (4,
  '0.015*"cholera" + 0.010*"water" + 0.008*"barracks" + 0.007*"outbreak" + 0.007*"regiment" + 0.007*"hussars" + 0.006*"men" + 0.005*"hospital" + 0.005*"secunderabad" + 0.005*"bazaar"')]

### Topics

1. cholera distric deaths year total diease cases march april case - total cases and deaths by Cholera, mention of March & April.
2. plague cases bombay medical case government hospital houses persons officer - plague cases in Bombay, mention of the government. 
3. disease cases leprosy district lepers case number years surgeon treatment - number of cases of leprosy, mention of surgery treatment.
4. plague cases case cholera time district deaths bombay people town - cases and deaths by cholera and the plague in Bombay|
5. cholera water barracks outbreak regiment hussars men hospital secunderabad bazaar - perhaps an outbreak of cholera in the water barracks of the The Royal Hussars men in Secunderabad Bazaar.

(The Royal Hussars was a cavalry regiment of the British Army)