### Topic Modelling Demo Code

#### Things I want to do -
- Identify a package to build / train LDA model
- Use visualization to explore Documents -> Topics Distribution -> Word distribution

In [None]:
!pip install pyLDAvis --user

In [3]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import pyLDAvis.gensim

# Text Preprocessing and model building
from gensim.corpora import Dictionary
import nltk
from nltk.stem import WordNetLemmatizer
import re
# Iteratively read files
import glob
import os

# For displaying images in ipython
from IPython.display import HTML, display

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [4]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14.0, 8.7)
#warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

<h2>Latent Dirichlet Allocation</h2>
<h3>From Documents -- DTM -- LDA Model</h3>

Topic modeling aims to automatically summarize large collections of documents to facilitate organization and management, as well as search and recommendations. At the same time, it can enable the understanding of documents to the extent that humans can interpret the descriptions of topics

<img src="images/lda2.png" alt="lda" style="width:60%">
<img src="images/docs_to_lda.png" alt="ldaflow" style="width:100%">

### Load Data

In [5]:
# User defined function to read and store bbc data from multipe folders
def load_data(folder_names,root_path):
    fileNames = [path + '/' + 'bbc' +'/'+ folder + '/*.txt' for path,folder in zip([root_path]*len(folder_names),
                                                                               folder_names )]
    doc_list = []
    tags = folder_names
    for docs in fileNames:
        #print(docs)
        #print(type(docs))
        doc = glob.glob(docs) # glob method iterates through the all the text documents in a folder
        for text in doc:
            with open(text, encoding='latin1') as f:
                topic = docs.split('/')[-2]

                lines = f.readlines()
                heading = lines[0].strip()
                body = ' '.join([l.strip() for l in lines[1:]])
                doc_list.append([topic, heading, body])
        print("Completed loading data from folder: %s"%topic)
    
    print("Completed Loading entire text")
    
    return doc_list

In [6]:
folder_names = ['business','entertainment','politics','sport','tech']
docs = load_data(folder_names = folder_names, root_path = os.getcwd())

Completed loading data from folder: business
Completed loading data from folder: entertainment
Completed loading data from folder: politics
Completed loading data from folder: sport
Completed loading data from folder: tech
Completed Loading entire text


In [7]:
docs = pd.DataFrame(docs, columns=['Category', 'Heading', 'Article'])
print(docs.head())
print('\nShape of data is {}\n'.format(docs.shape))

   Category                            Heading  \
0  business    UK economy facing 'major risks'   
1  business  Aids and climate top Davos agenda   
2  business   Asian quake hits European shares   
3  business   India power shares jump on debut   
4  business    Lacroix label bought by US firm   

                                             Article  
0   The UK manufacturing sector will continue to ...  
1   Climate change and the fight against Aids are...  
2   Shares in Europe's leading reinsurers and tra...  
3   Shares in India's largest power producer, Nat...  
4   Luxury goods group LVMH has sold its loss-mak...  

Shape of data is (2225, 3)



### Extract Raw Corpus

In [8]:
articles = docs.Article.tolist()

In [None]:
print(type(articles))
print(articles[0:2])

In [10]:
wordnet_lemmatizer = WordNetLemmatizer()

### Preprocessing of Raw Text

In [11]:
from nltk.corpus import stopwords
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [None]:
# nltk.download('stopwords')

In [13]:
stopwords = stopwords.words('english')

In [16]:
# Method to preprocess my raw data
def preprocessText(x):
    temp = x.lower()
    temp = re.sub(r'[^\w]', ' ', temp)
    temp = nltk.word_tokenize(temp)
    temp = [wordnet_lemmatizer.lemmatize(w) for w in temp]
    temp = [word for word in temp if word not in stopwords ]
    return temp

### Stemming
readily - !ily --> read
volley  -  !y --> volle

### Lemmetaization Statistical method of reducing words to root / base form -
volley --> volley


In [17]:
articles_final = [preprocessText(article) for article in articles]

In [None]:
articles_final[0:2]

### Transformation of Preprocessed text into Vector form using Gensim

In [19]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(articles_final)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [20]:
print(dictionary)

Dictionary(3101 unique tokens: ['12', '18', '2', '2003', '2004']...)


In [21]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in articles_final]

In [22]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 3101
Number of documents: 2225


### Train LDA model using Gensim

In [28]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 7
chunksize = 2000
passes = 10
# iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token
# print(id2word)

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
#     iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

### Model exploration: Top K words in each topic

In [29]:
import pprint

In [30]:
# Print the Keyword in the 10 topics
pprint.pprint(model.print_topics(num_words= 20))
doc_lda = model[corpus]

[(0,
  '0.023*"mobile" + 0.020*"phone" + 0.013*"people" + 0.010*"technology" + '
  '0.009*"gadget" + 0.009*"could" + 0.008*"use" + 0.008*"one" + '
  '0.006*"digital" + 0.006*"new" + 0.006*"make" + 0.006*"mr" + 0.006*"player" '
  '+ 0.006*"pc" + 0.006*"chip" + 0.006*"used" + 0.006*"service" + '
  '0.005*"computer" + 0.005*"market" + 0.005*"handset"'),
 (1,
  '0.012*"software" + 0.011*"people" + 0.010*"user" + 0.009*"system" + '
  '0.008*"search" + 0.008*"virus" + 0.008*"firm" + 0.008*"computer" + '
  '0.008*"microsoft" + 0.008*"program" + 0.008*"file" + 0.007*"site" + '
  '0.007*"security" + 0.007*"information" + 0.007*"website" + 0.007*"online" + '
  '0.007*"mail" + 0.007*"internet" + 0.006*"many" + 0.006*"new"'),
 (2,
  '0.011*"best" + 0.010*"film" + 0.008*"first" + 0.008*"award" + 0.007*"world" '
  '+ 0.007*"one" + 0.006*"last" + 0.006*"two" + 0.005*"win" + 0.005*"top" + '
  '0.005*"time" + 0.005*"second" + 0.004*"three" + 0.004*"four" + 0.004*"u" + '
  '0.004*"england" + 0.004*"new"

### Model Visualization using PyLDAvis

In [31]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary=dictionary)
vis

### Assign Topic Model Numbers to original Data Frame as Column

In [32]:
# Assigns the topics to the documents in corpus
lda_corpus = model[corpus]

In [33]:
topics = []

for doc in lda_corpus:
    temp_id = []
    temp_score = []
    for doc_tuple in doc:
        temp_id.append(doc_tuple[0])
        temp_score.append(doc_tuple[1])
    index = np.argmax(temp_score)
    topics.append(temp_id[index])

In [34]:
docs["Topic_num"] = topics

In [36]:
docs.head(n= 40)

Unnamed: 0,Category,Heading,Article,Topic_num
0,business,UK economy facing 'major risks',The UK manufacturing sector will continue to ...,3
1,business,Aids and climate top Davos agenda,Climate change and the fight against Aids are...,3
2,business,Asian quake hits European shares,Shares in Europe's leading reinsurers and tra...,3
3,business,India power shares jump on debut,"Shares in India's largest power producer, Nat...",3
4,business,Lacroix label bought by US firm,Luxury goods group LVMH has sold its loss-mak...,3
5,business,Insurance bosses plead guilty,Another three US insurance executives have pl...,3
6,business,Turkey-Iran mobile deal 'at risk',Turkey's investment in Iran's mobile industry...,3
7,business,Parmalat to return to stockmarket,"Parmalat, the Italian dairy company which wen...",3
8,business,WorldCom director admits lying,The former chief financial officer at US tele...,3
9,business,Ebbers denies WorldCom fraud,Former WorldCom chief Bernie Ebbers has denie...,3
