In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/michael/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## The Data

In [3]:
import pandas as pd
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

<img src='https://www.machinelearningplus.com/wp-content/uploads/2018/03/Inferring-Topic-from-Keywords-1024x666.png' img/>

## Data Pre-processing

In [4]:
stemmer = SnowballStemmer('english')

In [5]:
def lemmatize_stemming(text):
    word = WordNetLemmatizer().lemmatize(text, pos='v')
#     print('token',word)
    return stemmer.stem(word)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
df.iloc[0].values[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

### Select a document to preview after preprocessing.

In [7]:
doc_sample = df.iloc[0].values[0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)\nSubject:', 'WHAT', 'car', 'is', 'this!?\nNntp-Posting-Host:', 'rac3.wam.umd.edu\nOrganization:', 'University', 'of', 'Maryland,', 'College', 'Park\nLines:', '15\n\n', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'I', 'saw\nthe', 'other', 'day.', 'It', 'was', 'a', '2-door', 'sports', 'car,', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/\nearly', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'doors', 'were', 'really', 'small.', 'In', 'addition,\nthe', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body.', 'This', 'is', '\nall', 'I', 'know.', 'If', 'anyone', 'can', 'tellme', 'a', 'model', 'name,', 'engine', 'specs,', 'years\nof', 'production,', 'where', 'this', 'car', 'is', 'made,', 'history,', 'or', 'whatever', 'info', 'you\nhave', 'on', 'this', 'funky', 'looking', 'car,', 'please', 'e-mail.\n\nThanks,\n-', 

In [8]:
processed_docs = df['content'].map(preprocess)
processed_docs[:10]

0        [lerxst, thing, subject, nntp, post, host, org...
1        [guykuo, carson, washington, subject, clock, p...
10       [irwin, cmptrc, lonestar, irwin, arnstein, sub...
100      [tchen, magnus, ohio, state, tsung, chen, subj...
1000     [dabl, lindbergh, subject, diamond, mous, curs...
10000    [dseg, robert, loper, subject, nntp, post, hos...
10001    [kimman, magnus, ohio, state, richard, subject...
10002    [kwilson, casbah, acn, kirtley, wilson, subjec...
10003    [subject, innoc, death, penalti, bobb, vice, r...
10004    [livesey, solntz, livesey, subject, genocid, c...
Name: content, dtype: object

### Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


### Filter out tokens that appear in:

- less than 15 documents (absolute number) 
- more than 0.5 documents (fraction of total corpus size, not absolute number).
- after the above two steps, keep only the first 100000 most frequent tokens.

In [10]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


### Gensim doc2bow

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [11]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(11, 1),
 (20, 1),
 (26, 1),
 (30, 1),
 (138, 1),
 (142, 3),
 (244, 1),
 (271, 5),
 (403, 2),
 (593, 1),
 (598, 1),
 (630, 1),
 (657, 1),
 (701, 1),
 (815, 1),
 (836, 1),
 (877, 1),
 (921, 1),
 (985, 1),
 (1046, 1),
 (1388, 1),
 (1397, 1),
 (1568, 1),
 (1617, 1),
 (1685, 1),
 (1839, 2),
 (1897, 1),
 (2004, 1),
 (2502, 1),
 (2662, 1),
 (2709, 1),
 (2847, 1),
 (2957, 1),
 (3303, 1),
 (4583, 1),
 (4806, 1),
 (5079, 2),
 (5763, 1)]

#### Preview Bag Of Words for our sample preprocessed document.

In [12]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 11 ("host") appears 1 time.
Word 20 ("nntp") appears 1 time.
Word 26 ("spec") appears 1 time.
Word 30 ("univers") appears 1 time.
Word 138 ("state") appears 1 time.
Word 142 ("window") appears 3 time.
Word 244 ("richard") appears 1 time.
Word 271 ("program") appears 5 time.
Word 403 ("silver") appears 2 time.
Word 593 ("secur") appears 1 time.
Word 598 ("true") appears 1 time.
Word 630 ("gate") appears 1 time.
Word 657 ("econom") appears 1 time.
Word 701 ("high") appears 1 time.
Word 815 ("support") appears 1 time.
Word 836 ("meet") appears 1 time.
Word 877 ("correct") appears 1 time.
Word 921 ("task") appears 1 time.
Word 985 ("current") appears 1 time.
Word 1046 ("major") appears 1 time.
Word 1388 ("oper") appears 1 time.
Word 1397 ("promis") appears 1 time.
Word 1568 ("dept") appears 1 time.
Word 1617 ("server") appears 1 time.
Word 1685 ("user") appears 1 time.
Word 1839 ("multi") appears 2 time.
Word 1897 ("expect") appears 1 time.
Word 2004 ("assur") appears 1 time.
Word 250

### TF-IDF
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [13]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.16531831488632115),
 (1, 0.1678553823993299),
 (2, 0.15020052155842978),
 (3, 0.28581344134222897),
 (4, 0.11439130765694391),
 (5, 0.1516798683845623),
 (6, 0.3893283888350302),
 (7, 0.16890780449478127),
 (8, 0.12279484913135752),
 (9, 0.2634573656114004),
 (10, 0.16446021701525967),
 (11, 0.041907566184917734),
 (12, 0.13943742962367772),
 (13, 0.053228561863657146),
 (14, 0.17840678372959912),
 (15, 0.1614581045972935),
 (16, 0.10182359643822467),
 (17, 0.23339500537007382),
 (18, 0.1622034571062096),
 (19, 0.28046098400184816),
 (20, 0.04264750560541665),
 (21, 0.18494250912032378),
 (22, 0.14867573439400095),
 (23, 0.15971285457776704),
 (24, 0.18156677399111007),
 (25, 0.14477104789605966),
 (26, 0.20972103713602588),
 (27, 0.19290536120043997),
 (28, 0.08432238906696132),
 (29, 0.08377240739564006),
 (30, 0.04441833427243015),
 (31, 0.13721646110054028),
 (32, 0.08417832496773713)]


## Running LDA using Bag of Words

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

For each topic, we will explore the words occuring in that topic and its relative weight.

In [15]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"think" + 0.007*"know" + 0.007*"peopl" + 0.007*"wire" + 0.006*"articl" + 0.006*"like" + 0.006*"point" + 0.005*"problem" + 0.004*"question" + 0.004*"time"
Topic: 1 
Words: 0.007*"like" + 0.007*"columbia" + 0.006*"articl" + 0.006*"nntp" + 0.006*"host" + 0.005*"univers" + 0.005*"think" + 0.005*"gari" + 0.004*"know" + 0.004*"peopl"
Topic: 2 
Words: 0.006*"like" + 0.005*"server" + 0.005*"think" + 0.005*"unit" + 0.004*"appear" + 0.004*"state" + 0.004*"articl" + 0.004*"look" + 0.004*"window" + 0.004*"know"
Topic: 3 
Words: 0.008*"articl" + 0.007*"say" + 0.006*"like" + 0.006*"peopl" + 0.006*"good" + 0.005*"know" + 0.005*"think" + 0.005*"time" + 0.005*"come" + 0.004*"year"
Topic: 4 
Words: 0.008*"articl" + 0.008*"like" + 0.007*"peopl" + 0.007*"univers" + 0.007*"think" + 0.006*"know" + 0.005*"year" + 0.005*"right" + 0.004*"time" + 0.004*"host"
Topic: 5 
Words: 0.006*"peopl" + 0.006*"articl" + 0.006*"like" + 0.005*"think" + 0.005*"mail" + 0.005*"year" + 0.004*"good" + 0.004

### Running LDA using TF-IDF


In [16]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"window" + 0.002*"file" + 0.002*"dyer" + 0.002*"alaska" + 0.002*"sale" + 0.002*"program" + 0.002*"card" + 0.002*"know" + 0.002*"problem" + 0.002*"like"
Topic: 1 Word: 0.003*"buffalo" + 0.002*"virginia" + 0.002*"jesus" + 0.002*"peopl" + 0.002*"bontchev" + 0.002*"univers" + 0.002*"hamburg" + 0.002*"cramer" + 0.002*"articl" + 0.002*"brian"
Topic: 2 Word: 0.002*"psuvm" + 0.002*"drive" + 0.002*"window" + 0.002*"univers" + 0.002*"know" + 0.002*"card" + 0.002*"caltech" + 0.002*"think" + 0.002*"game" + 0.002*"iastat"
Topic: 3 Word: 0.003*"card" + 0.003*"scsi" + 0.003*"wire" + 0.002*"driver" + 0.002*"access" + 0.002*"diamond" + 0.002*"univers" + 0.002*"hook" + 0.002*"christian" + 0.002*"video"
Topic: 4 Word: 0.004*"sandvik" + 0.003*"drive" + 0.002*"card" + 0.002*"netcom" + 0.002*"david" + 0.002*"appl" + 0.002*"clipper" + 0.002*"kent" + 0.002*"softwar" + 0.002*"inform"
Topic: 5 Word: 0.003*"hulman" + 0.002*"window" + 0.002*"file" + 0.002*"simm" + 0.002*"univers" + 0.002*"wid

### Performance evaluation by classifying sample document using LDA Bag of Words model
We will check where our test document would be classified.

In [17]:
processed_docs[4310]

['graham',
 'toal',
 'gtoal',
 'gtoal',
 'subject',
 'hard',
 'drive',
 'secur',
 'target',
 'origin',
 'gtoal',
 'pizzabox',
 'demon',
 'keyword',
 'entropi',
 'nntp',
 'post',
 'host',
 'pizzabox',
 'demon',
 'repli',
 'graham',
 'toal',
 'gtoal',
 'gtoal',
 'organ',
 'cuddlehog',
 'anonym',
 'line',
 'articl',
 'kean',
 'write',
 'matter',
 'fact',
 'random',
 'file',
 'disk',
 'reason',
 'special',
 'purpos',
 'hardwar',
 'take',
 'long',
 'time',
 'generat',
 'good',
 'random',
 'bit',
 'program',
 'crank',
 'coupl',
 'bit',
 'minut',
 'pretti',
 'conserv',
 'time',
 'need',
 'sound',
 'like',
 'use',
 'program',
 'interest',
 'post',
 'sourc']

In [18]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6391569375991821	 
Topic: 0.010*"program" + 0.007*"work" + 0.007*"window" + 0.006*"need" + 0.006*"like" + 0.006*"card" + 0.006*"imag" + 0.006*"univers" + 0.006*"time" + 0.005*"articl"

Score: 0.2436474859714508	 
Topic: 0.020*"file" + 0.017*"window" + 0.005*"think" + 0.005*"know" + 0.005*"list" + 0.005*"time" + 0.005*"server" + 0.005*"host" + 0.005*"mail" + 0.004*"articl"

Score: 0.09947734326124191	 
Topic: 0.017*"game" + 0.013*"team" + 0.009*"play" + 0.009*"year" + 0.008*"univers" + 0.007*"hockey" + 0.006*"player" + 0.006*"season" + 0.006*"host" + 0.006*"nntp"


### Performance evaluation by classifying sample document using LDA TF-IDF model.

In [73]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.691240668296814	 
Topic: 0.003*"window" + 0.003*"file" + 0.002*"encrypt" + 0.002*"chip" + 0.002*"clipper" + 0.002*"peopl" + 0.002*"program" + 0.002*"sandvik" + 0.002*"think" + 0.002*"know"

Score: 0.2920892536640167	 
Topic: 0.003*"game" + 0.002*"drive" + 0.002*"sale" + 0.002*"card" + 0.002*"thank" + 0.002*"univers" + 0.002*"window" + 0.002*"know" + 0.002*"driver" + 0.002*"need"


### Testing model on unseen document


In [74]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.7749394774436951	 Topic: 0.007*"articl" + 0.006*"peopl" + 0.005*"know" + 0.005*"think" + 0.004*"like"
Score: 0.025014860555529594	 Topic: 0.008*"chip" + 0.007*"encrypt" + 0.005*"know" + 0.005*"inform" + 0.005*"data"
Score: 0.02500915713608265	 Topic: 0.008*"say" + 0.007*"peopl" + 0.006*"know" + 0.006*"come" + 0.005*"go"
Score: 0.025007275864481926	 Topic: 0.008*"armenian" + 0.007*"articl" + 0.006*"peopl" + 0.006*"right" + 0.006*"univers"
Score: 0.025006413459777832	 Topic: 0.009*"think" + 0.008*"know" + 0.008*"peopl" + 0.007*"christian" + 0.006*"jesus"
Score: 0.025005873292684555	 Topic: 0.007*"articl" + 0.006*"state" + 0.005*"nntp" + 0.005*"univers" + 0.005*"host"
Score: 0.025005169212818146	 Topic: 0.009*"team" + 0.008*"year" + 0.007*"think" + 0.007*"game" + 0.007*"know"
Score: 0.02500431053340435	 Topic: 0.015*"window" + 0.009*"like" + 0.009*"articl" + 0.006*"host" + 0.006*"know"
Score: 0.025004137307405472	 Topic: 0.013*"drive" + 0.008*"articl" + 0.007*"univers" + 0.007*"s

## Compute Model Perplexity and Coherence Score
Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.  Topic coherence score, in particular, has been more helpful.

In [80]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.720291271228724

Coherence Score:  0.4186846136526364


https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
    https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24