regular expression help: https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285

### Import Required Libraries

In [None]:
#General
from pprint import pprint

# Data cleaning
import re # RegEx for regular expression

# Preprocess
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
stemmer = SnowballStemmer("english") # Choose a language
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

# Topic modelling LDA
import gensim
from gensim import corpora, models

# Visualisation
import pyLDAvis.gensim

# custom
from mab_text_utils import MabTextUtils

### Load Data

In [None]:
# Root directory of the corpora; entries of `fins` are joined onto it by
# plain string concatenation, so it must end with a path separator.
BASE = 'D:\\ResearchDataGtx1060\\RRR_Data\\CorporaToAnalysis\\'
fins = [
    'Flower1907\\Flower1907.txt',
    'BarnardDavis\\BarnardDavis.txt',
]
# Index into `fins` selecting which corpus file gets loaded.
track = 0

In [None]:
# Read the selected corpus file into memory as one UTF-8 decoded string.
corpus_path = BASE + fins[track]
full_text = ''
with open(corpus_path, mode='r', encoding='utf8') as corpus_file:
    full_text = corpus_file.read()

### Clean Text

In [None]:
def clearn_text_for_lda(full_text):
    """Normalise raw corpus text and split it into cleaned paragraphs.

    Non-ASCII characters, digits and punctuation are dropped, a single
    newline is treated as a page-wrap and replaced by a space, a run of two
    or more newlines is treated as a paragraph break, and the text is
    lowercased.

    Args:
        full_text: The raw corpus text as one string.

    Returns:
        A list of paragraph strings, each stripped of surrounding
        whitespace. Entries may be empty strings (e.g. for empty input or a
        trailing paragraph break).
    """
    # Keep only 7-bit ASCII characters.
    cleaned = ''.join(filter(lambda ch: ord(ch) < 128, full_text))
    # Drop runs of digits.
    cleaned = re.sub(r'\d+', '', cleaned)
    # Strip typographic quotes (these are non-ASCII, so the filter above has
    # normally removed them already).
    for curly_quote in ('”', '“', '‘', '’'):
        cleaned = cleaned.replace(curly_quote, '')

    # Rewrite rules applied strictly in this order. '<p>' is a temporary
    # paragraph marker that survives the punctuation removal step.
    rewrite_rules = (
        (r' +', ' '),                    # collapse repeated spaces
        (r'(\n )+', '\n'),               # drop the space following a newline
        (r'(\n){2,}', '<p>'),            # blank line(s) -> paragraph marker
        (r'(<p>+\W?\w? <p>*)+', '<p>'),  # merge markers split by stray chars
        (r'\n', ' '),                    # remaining newlines are page wraps
        (r'[^\w\s<>]', ''),              # remove punctuation, keep < and >
        (r'( <p>)+', '<p>'),             # drop space(s) before a marker
        (r' +', ' '),                    # collapse repeated spaces again
    )
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Turn paragraph markers into real newlines, then discard any angle
    # brackets that were not part of a marker.
    cleaned = cleaned.replace('<p>', '\n')
    cleaned = re.sub(r'[<>]', '', cleaned)

    paragraphs = cleaned.lower().split('\n')
    return [paragraph.strip() for paragraph in paragraphs]

# Clean the loaded corpus. Despite its name, `result_text` is a list of
# paragraph strings (one entry per paragraph), not a single string.
result_text = clearn_text_for_lda(full_text)
#print(result_text)

In [None]:
# Notebook cell output: show the list of cleaned paragraphs.
result_text

### Preprocess

In [None]:
def lemmatize_stemming(text):
    """Lemmatize *text* as a verb, then stem the lemma.

    Uses a fresh ``WordNetLemmatizer`` with ``pos='v'`` followed by the
    module-level ``stemmer``; returns the stemmer's output.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

In [None]:
def preprocess(text):
    """Tokenize *text* and return its lemmatized, stemmed content tokens.

    Tokens come from ``gensim.utils.simple_preprocess``; only tokens longer
    than three characters that are not in gensim's ``STOPWORDS`` are kept,
    and each kept token is passed through ``lemmatize_stemming``.

    Returns:
        A list of processed tokens (possibly empty), in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in stopwords
    ]

In [None]:
# Select a paragraph to preview after preprocessing.

para_sample = result_text[40]
print('original document: ')
# Show the raw space-separated words of the selected paragraph.
words = para_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
# Preprocess the same paragraph that was printed above; this previously
# referenced `doc_sample`, which is never defined in this notebook.
print(preprocess(para_sample))

In [None]:
# Preprocess every paragraph, then keep only those that still contain tokens.
# `pp_parag2ori_para[i]` is the index in `result_text` of the paragraph that
# produced `pp_paragraphs[i]`.
map_result_text = [preprocess(paragraph) for paragraph in result_text]
kept = [(pos, tokens) for pos, tokens in enumerate(map_result_text) if tokens]
pp_parag2ori_para = [pos for pos, _ in kept]
pp_paragraphs = [tokens for _, tokens in kept]

pp_paragraphs

### Convert the paragraphs into Bag of Words representation

In [None]:
# Build the gensim Dictionary (token id <-> token mapping) from the
# preprocessed paragraphs in "pp_paragraphs".
dictionary = gensim.corpora.Dictionary(pp_paragraphs)

# Sanity check: print the first 11 entries yielded by the dictionary.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

In [None]:
# The string below is a note displayed by the cell; it describes the
# thresholds of the `filter_extremes` call that follows.
'''
You can filter out tokens that appear in
    1. less than 15 documents (absolute number) or
    2. more than 0.5 documents (fraction of total corpus size, not absolute number).
    3. after the above two steps, keep only the first 100,000 most frequent tokens.
'''
# Filtering is currently disabled, so the full vocabulary is used downstream.
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
# Convert each preprocessed paragraph to its bag-of-words vector, then show
# the vector of paragraph 3410 as the cell output.
bow_corpus = list(map(dictionary.doc2bow, pp_paragraphs))
bow_corpus[3410]

In [None]:
# Preview Bag Of Words for our sample preprocessed document.
# NOTE: despite its name, `bow_doc_4310` holds the vector of paragraph 3410.

bow_doc_4310 = bow_corpus[3410]
for token_id, freq in bow_doc_4310:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], freq))

### Convert the paragraphs into TF-IDF representation

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus and apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print the TF-IDF vector of paragraph 3410 (the same paragraph as the BoW preview).
pprint(corpus_tfidf[3410])

### LDA using Bag of Words

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus (2 passes, 2 worker processes).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [None]:
# List every topic of the bag-of-words model together with its weighted terms.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

### LDA using TF-IDF

In [None]:
# Train a second 10-topic LDA model, this time on the TF-IDF weighted corpus
# (2 passes, 4 worker processes).
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [None]:
# List every topic of the TF-IDF model together with its weighted terms.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(topic_id, topic_terms))

### Visualise topics using pyLDAvis

#### Notes on the visualisation

Saliency: a measure of how much the term tells you about the topic.

Relevance: a weighted average of the log probability of the word given the topic and the log of its lift, i.e. the probability of the word given the topic divided by the word's overall probability in the corpus.

The size of the bubble measures the importance of the topics, relative to the data.

First, we look at the most salient terms, i.e. the terms that tell us the most about what is going on relative to the topics.

In [None]:
# Prepare and show the interactive pyLDAvis view for the bag-of-words model.
# sort_topics=False presumably keeps the model's own topic order so ids match
# the printed topics -- TODO confirm.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
# Same visualisation for the TF-IDF model; this rebinds `lda_display`.
lda_display = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

### Let us look at the topic distribution of individual paragraphs.

In [None]:
# Rank the topics assigned to paragraph 4310, highest score first.
# NOTE(review): the TF-IDF-trained model is queried with a bag-of-words
# vector here, and earlier previews used index 3410 -- confirm both are intended.
ranked_topics = sorted(
    lda_model_tfidf[bow_corpus[4310]],
    key=lambda pair: pair[1],
    reverse=True,
)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
def get_top_topic_rel(para_idx):
    """Return the best-scoring topic for one preprocessed paragraph.

    Args:
        para_idx: Index into ``bow_corpus``.

    Returns:
        A ``(topic_idx, rel)`` tuple: the topic with the highest score that
        ``lda_model_tfidf`` assigns to the paragraph, and that score.

    Raises:
        IndexError: If the model returns no topics for the paragraph.
    """
    topic_scores = lda_model_tfidf[bow_corpus[para_idx]]
    ranked = sorted(topic_scores, key=lambda pair: pair[1], reverse=True)
    best_topic, best_score = ranked[0]
    return best_topic, best_score

# Example: top topic and its score for paragraph 4310 (shown as cell output).
para_id = 4310
get_top_topic_rel(para_id)

In [None]:
# Dominant topic id for every preprocessed paragraph, in corpus order.
para2topic = [
    get_top_topic_rel(para_idx)[0]
    for para_idx in range(len(pp_paragraphs))
]

# Print the cleaned original text of each paragraph whose dominant topic is 7.
for ori_idx, p2t in zip(pp_parag2ori_para, para2topic):
    if p2t == 7:
        print(result_text[ori_idx])