# Topic Modeling, LDA

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm_notebook

import nltk
import spacy

import gensim
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

In [2]:
import warnings
# Ignore every DeprecationWarning raised from this point on in the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load Datasets

In [3]:
# Local copy of the newsgroups dataset, obtained from
# https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json
df_news = pd.read_json('./datasets/news_groups.json')
df_news.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [4]:
# Distinct newsgroup labels, computed once and reused for both the count and the listing.
topic_names = df_news.target_names.unique()
print(f'TOPIC EXAMPLES ({len(topic_names)} Topics):\n')
print(', '.join(topic_names))

TOPIC EXAMPLES (20 Topics):

rec.autos, comp.sys.mac.hardware, rec.motorcycles, misc.forsale, comp.os.ms-windows.misc, alt.atheism, comp.graphics, rec.sport.baseball, rec.sport.hockey, sci.electronics, sci.space, talk.politics.misc, sci.med, talk.politics.mideast, soc.religion.christian, comp.windows.x, comp.sys.ibm.pc.hardware, talk.politics.guns, talk.religion.misc, sci.crypt


## Preprocess Datasets

In [None]:
from nltk.corpus import stopwords

In [5]:
# NLTK's English stop words followed by a few extra tokens to filter out of this corpus.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

#### 1) Removing e-mail addresses, newline characters, and single quotes

In [6]:
content = df_news['content'].values.tolist()

# The regex patterns are raw strings: '\S' and '\s' are not valid Python string
# escapes, so the non-raw spelling triggers an invalid-escape warning on recent
# Python versions even though the resulting pattern is the same.

# removing e-mail addresses (any whitespace-delimited token containing '@',
# plus at most one trailing whitespace character)
cleaned_content = [re.sub(r'\S*@\S*\s?', '', sentence) for sentence in content]

# collapsing newlines and any other run of whitespace into a single space
cleaned_content = [re.sub(r'\s+', ' ', sentence) for sentence in cleaned_content]

# removing single quotes (a literal character, so no regex is needed)
cleaned_content = [sentence.replace("'", "") for sentence in cleaned_content]

In [7]:
# Store the cleaned text as a new column and show it beside the raw text.
df_news['cleaned_content'] = cleaned_content
df_news[['content', 'cleaned_content']].head()

Unnamed: 0,content,cleaned_content
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,From: (wheres my thing) Subject: WHAT car is t...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,From: (Guy Kuo) Subject: SI Clock Poll - Final...
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,From: (Irwin Arnstein) Subject: Re: Recommenda...
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,From: (Tsung-Kun Chen) Subject: ** Software fo...
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,From: (Don A.B. Lindbergh) Subject: Diamond SS...


#### 2) Tokenizing each sentence and removing punctuation, unnecessary characters, and stop words

In [None]:
from gensim.utils import simple_preprocess

In [8]:
def tokenize_sentences(sentences):
    """Lazily tokenize documents and drop stop words.

    Each item of ``sentences`` is converted with ``str`` and tokenized by
    ``simple_preprocess`` with ``deacc=True``; tokens present in the
    module-level ``stop_words`` are then removed.

    Args:
        sentences: iterable of documents (wrapped in a notebook progress bar).

    Yields:
        list: the remaining tokens of one document, in their original order.
    """
    # Set membership is constant-time; testing against a list would rescan
    # every stop word for every token.
    stop_word_set = set(stop_words)

    for sentence in tqdm_notebook(sentences):
        # deacc=True strips accent marks from the tokens.
        tokens = simple_preprocess(str(sentence), deacc=True)
        yield [word for word in tokens if word not in stop_word_set]

In [9]:
# Materialize the generator into a list holding one token list per document.
tokenized_sentences = list(tokenize_sentences(cleaned_content))

HBox(children=(IntProgress(value=0, max=11314), HTML(value='')))




In [10]:
# Store the token lists as a new column and show them beside the raw text.
df_news['tokenized_content'] = tokenized_sentences
df_news[['content', 'tokenized_content']].head()

Unnamed: 0,content,tokenized_content
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,"[wheres, thing, car, nntp, posting, host, rac,..."
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,"[guy, kuo, si, clock, poll, final, call, summa..."
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,"[irwin, arnstein, recommendation, duc, summary..."
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,"[tsung, kun, chen, software, forsale, lots, nn..."
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,"[lindbergh, diamond, ss, win, mouse, cursor, o..."


#### 3) Making Bigram & Trigram Words

In [11]:
# create the model first
bigram = gensim.models.Phrases(tokenized_sentences, min_count=5, threshold=100) # higher threshold fewer phrases
trigram = gensim.models.Phrases(bigram[tokenized_sentences], threshold=100)

bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

KeyboardInterrupt: 

In [None]:
def make_bigram_words(text):
    """Apply the module-level ``bigram_model`` to every item of ``text``.

    Iterates ``text`` behind a notebook progress bar and returns a list with
    one transformed item per input item, in the same order.
    """
    merged_documents = []
    for tokens in tqdm_notebook(text):
        merged_documents.append(bigram_model[tokens])
    return merged_documents
def make_trigram_words(text):
    """Apply ``bigram_model`` and then ``trigram_model`` to every item of ``text``.

    Iterates ``text`` behind a notebook progress bar and returns a list with
    one transformed item per input item, in the same order.
    """
    merged_documents = []
    for tokens in tqdm_notebook(text):
        with_bigrams = bigram_model[tokens]
        merged_documents.append(trigram_model[with_bigrams])
    return merged_documents

In [None]:
# Run every tokenized document through the bigram model.
bigram_words = make_bigram_words(tokenized_sentences)

# Show the first document's tokens after the bigram pass.
print('BIGRAM EXAMPLES:\n')
print(', '.join(bigram_words[0]))

In [None]:
# Run every tokenized document through the bigram and then the trigram model.
# NOTE(review): the lemmatization step visible in this notebook is fed
# bigram_words, not trigram_words — confirm whether this result is used later.
trigram_words = make_trigram_words(tokenized_sentences)

# Show the first document's tokens after the trigram pass.
print('TRIGRAM EXAMPLES:\n')
print(', '.join(trigram_words[0]))

#### 4) Lemmatizing Words

In [None]:
def lemmatize_words(text, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Args:
        text: iterable of token lists; each list is joined with single spaces
            and parsed by spaCy as one document.
        allowed_postags: collection of coarse POS tags (compared against
            ``token.pos_``) whose tokens are kept. The default is an immutable
            tuple so it cannot be modified between calls.

    Returns:
        list: one list of lemma strings per input document, in input order.

    Raises:
        OSError: if no English spaCy model can be loaded.
    """
    # The parser and NER components are disabled; only POS tags and lemmas are read below.
    try:
        nlp = spacy.load('en', disable=['parser', 'ner'])
    except OSError:
        # The 'en' shortcut name is not available on spaCy v3+; fall back to
        # the full package name of the small English model.
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    lemmatized_words = []

    for sentence in tqdm_notebook(text):
        doc = nlp(' '.join(sentence))
        lemmatized_words.append([word.lemma_ for word in doc if word.pos_ in allowed_postags])

    return lemmatized_words

In [None]:
# Lemmatize the bigram-merged documents using the default POS filter.
lemmatized_words = lemmatize_words(bigram_words)

In [None]:
# Show the lemmas kept for the first document.
print('LEMMATIZATION EXAMPLES:\n')
print(', '.join(lemmatized_words[0]))

#### 5) Creating the dictionary and corpus

In [None]:
import gensim.corpora as corpora

In [None]:
# Build a gensim Dictionary from the lemmatized documents; it is indexed by
# integer id to recover a token and provides doc2bow for encoding documents.
index2word = corpora.Dictionary(lemmatized_words)

#### 6) Preparing text features using bag of words

In [None]:
# Encode every lemmatized document as a bag of words using the dictionary.
features = [index2word.doc2bow(text) for text in tqdm_notebook(lemmatized_words)]

In [None]:
# Preview the first ten (id, frequency) pairs of the first document,
# rendered as "<token> <frequency>".
first_doc_pairs = features[0][:10]
features_look = [f'{index2word[index]} {frequency}' for index, frequency in first_doc_pairs]
print(', '.join(features_look))

## Build LDA Topic Model

---