### Topic Modelling Demo Code

#### Things I want to do -
- Identify a package to build / train LDA model
- Use visualization to explore Documents -> Topics Distribution -> Word distribution

In [1]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import pyLDAvis.gensim

# Text Preprocessing and model building
from gensim.corpora import Dictionary
import nltk
from nltk.stem import WordNetLemmatizer
import re
# Iteratively read files
import glob
import os

# For displaying images in ipython
from IPython.display import HTML, display

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14.0, 8.7)
#warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

### Load Data

In [3]:
# User defined function to read and store bbc data from multipe folders
def load_data(folder_names,root_path):
    fileNames = [path + '/' + 'bbc' +'/'+ folder + '/*.txt' for path,folder in zip([root_path]*len(folder_names),
                                                                               folder_names )]
    doc_list = []
    tags = folder_names
    for docs in fileNames:
        #print(docs)
        #print(type(docs))
        doc = glob.glob(docs) # glob method iterates through the all the text documents in a folder
        for text in doc:
            with open(text, encoding='latin1') as f:
                topic = docs.split('/')[-2]

                lines = f.readlines()
                heading = lines[0].strip()
                body = ' '.join([l.strip() for l in lines[1:]])
                doc_list.append([topic, heading, body])
        print("Completed loading data from folder: %s"%topic)
    
    print("Completed Loading entire text")
    
    return doc_list

In [4]:
folder_names = ['business','entertainment','politics','sport','tech']
docs = load_data(folder_names = folder_names, root_path = os.getcwd())

Completed loading data from folder: business
Completed loading data from folder: entertainment
Completed loading data from folder: politics
Completed loading data from folder: sport
Completed loading data from folder: tech
Completed Loading entire text


In [7]:
docs = pd.DataFrame(docs, columns=['Category', 'Heading', 'Article'])
print(docs.head())
print('\nShape of data is {}\n'.format(docs.shape))

   Category                            Heading  \
0  business    UK economy facing 'major risks'   
1  business  Aids and climate top Davos agenda   
2  business   Asian quake hits European shares   
3  business   India power shares jump on debut   
4  business    Lacroix label bought by US firm   

                                             Article  
0   The UK manufacturing sector will continue to ...  
1   Climate change and the fight against Aids are...  
2   Shares in Europe's leading reinsurers and tra...  
3   Shares in India's largest power producer, Nat...  
4   Luxury goods group LVMH has sold its loss-mak...  

Shape of data is (2225, 3)



### Extract Raw Corpus

In [8]:
articles = docs.Article.tolist()

In [None]:
print(type(articles))
print(articles[0:2])

In [11]:
wordnet_lemmatizer = WordNetLemmatizer()

### Preprocessing of Raw Text

In [25]:
# Method to preprocess my raw data
def preprocessText(x):
    temp = x.lower()
    temp = re.sub(r'[^\w]', ' ', temp)
    temp = nltk.word_tokenize(temp)
    temp = [wordnet_lemmatizer.lemmatize(w) for w in temp]
    return temp

In [26]:
articles_final = [preprocessText(article) for article in articles]

In [None]:
articles_final[0:2]

### Transformation of Preprocessed text into Vector form using Gensim

In [28]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(articles_final)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [30]:
print(dictionary)

Dictionary(3202 unique tokens: ['12', '18', '2', '2003', '2004']...)


In [32]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in articles_final]

In [34]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 3202
Number of documents: 2225


### Train LDA model using Gensim

In [35]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 10
# iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
#     iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

### Model exploration: Top K words in each topic

In [77]:
# Print the Keyword in the 10 topics
pprint(model.print_topics(num_words= 20))
doc_lda = model[corpus]

[(0,
  '0.009*"market" + 0.009*"than" + 0.008*"u" + 0.007*"chip" + 0.007*"1" + '
  '0.007*"more" + 0.006*"new" + 0.006*"company" + 0.006*"technology" + '
  '0.006*"world" + 0.006*"china" + 0.005*"3" + 0.005*"growth" + 0.005*"ibm" + '
  '0.005*"cell" + 0.005*"about" + 0.005*"million" + 0.005*"firm" + '
  '0.005*"could" + 0.005*"2"'),
 (1,
  '0.014*"game" + 0.012*"more" + 0.012*"music" + 0.011*"mobile" + '
  '0.009*"player" + 0.009*"technology" + 0.008*"tv" + 0.007*"dvd" + '
  '0.007*"gadget" + 0.007*"digital" + 0.007*"new" + 0.007*"high" + '
  '0.007*"video" + 0.007*"device" + 0.006*"than" + 0.006*"can" + '
  '0.006*"people" + 0.006*"sony" + 0.006*"mr" + 0.005*"market"'),
 (2,
  '0.013*"people" + 0.009*"or" + 0.009*"can" + 0.008*"user" + 0.008*"more" + '
  '0.008*"phone" + 0.007*"service" + 0.007*"software" + 0.007*"system" + '
  '0.006*"network" + 0.006*"firm" + 0.006*"computer" + 0.006*"net" + '
  '0.006*"one" + 0.006*"could" + 0.006*"technology" + 0.006*"about" + '
  '0.006*"site" + 

### Model Visualization using PyLDAvis

In [45]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary=dictionary)
vis

### Assign Topic Model Numbers to original Data Frame as Column

In [47]:
# Assigns the topics to the documents in corpus
lda_corpus = model[corpus]

In [69]:
topics = []

for doc in lda_corpus:
    temp_id = []
    temp_score = []
    for doc_tuple in doc:
        temp_id.append(doc_tuple[0])
        temp_score.append(doc_tuple[1])
    index = np.argmax(temp_score)
    topics.append(temp_id[index])

In [74]:
docs["Topic_num"] = topics

In [75]:
docs.head()

Unnamed: 0,Category,Heading,Article,Topic_num
0,business,UK economy facing 'major risks',The UK manufacturing sector will continue to ...,0
1,business,Aids and climate top Davos agenda,Climate change and the fight against Aids are...,4
2,business,Asian quake hits European shares,Shares in Europe's leading reinsurers and tra...,0
3,business,India power shares jump on debut,"Shares in India's largest power producer, Nat...",0
4,business,Lacroix label bought by US firm,Luxury goods group LVMH has sold its loss-mak...,4
