In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [4]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [5]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:2])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ',
 'From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for '
 'SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: '
 'shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 '
 'NNTP-Posting-Host: carson.u.washington

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

[['where', 's', 'thing', 'car', 'nntp_post', 'host', 'rac_wam', 'umd', 'organization', 'university', 'maryland_college', 'park', 'line', 'wonder', 'anyone', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'whatev', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'report', 'keyword', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'line', 'nntp_post', 'host', 'carson_washington', 'fair', 'number', 'brave', 'soul', 'upgrade', 'clock', 'oscillator', 'share', 'experience', 'poll', 'send', 'brief', 'message', 'detail', 'experience', 'procedure', 'top', 'sp

In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]]


In [11]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [23]:
enumerate(lda_model[corpus])

<enumerate at 0x7f8342e9aab0>

In [15]:
sent_topics_df = pd.DataFrame()

In [16]:
# def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row in enumerate(ldamodel[corpus]):
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
#             else:
#                 break
#     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

#     # Add original text to the end of the output
#     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
#     return(sent_topics_df)


# df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
# df_topic_sents_keywords.head(10)



In [28]:
 for i, row in enumerate(lda_model[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        pprint(row)

[(7, 0.64467794),
 (18, 0.085144386),
 (13, 0.069755636),
 (6, 0.04737547),
 (15, 0.047276575),
 (17, 0.026379324),
 (12, 0.013462496),
 (2, 0.010394949)]
[(15, 0.5084771),
 (7, 0.22333898),
 (13, 0.06803319),
 (18, 0.063513406),
 (2, 0.028726906),
 (12, 0.021970697),
 (17, 0.019905198),
 (14, 0.018316045)]
[(7, 0.65665025),
 (15, 0.060899448),
 (4, 0.060131248),
 (3, 0.047700837),
 (13, 0.03697864),
 (18, 0.036348462),
 (19, 0.017608158),
 (11, 0.01570526),
 (2, 0.010481014)]
[(15, 0.2842912),
 (8, 0.18096356),
 (2, 0.17059924),
 (7, 0.11683393),
 (9, 0.087553345),
 (1, 0.065002464),
 (18, 0.023475686),
 (13, 0.018920485),
 (6, 0.01858585)]
[(7, 0.4416731),
 (2, 0.21326183),
 (18, 0.11259749),
 (12, 0.05105455),
 (15, 0.045396604),
 (13, 0.038323283),
 (6, 0.0120469555)]
[(7, 0.550711),
 (11, 0.12809527),
 (13, 0.07690785),
 (18, 0.061367273),
 (1, 0.051186148),
 (2, 0.03190013),
 (6, 0.023568815),
 (17, 0.02236253),
 (15, 0.013428039),
 (19, 0.010450006)]
[(7, 0.3194291),
 (2, 0.1793

[(5, 0.43255597),
 (13, 0.29683742),
 (18, 0.13244598),
 (12, 0.053024773),
 (7, 0.03990614)]
[(17, 0.3683524),
 (13, 0.22833297),
 (15, 0.1088354),
 (11, 0.08834856),
 (7, 0.073335044),
 (18, 0.048714396),
 (12, 0.026377713)]
[(18, 0.3897424),
 (13, 0.23242505),
 (7, 0.16912416),
 (17, 0.08270585),
 (15, 0.032763157),
 (2, 0.0130422395),
 (6, 0.011346427)]
[(2, 0.3393061),
 (13, 0.22728884),
 (7, 0.16753566),
 (18, 0.13791607),
 (15, 0.036943436),
 (0, 0.01606211)]
[(8, 0.41672832),
 (19, 0.19395442),
 (7, 0.14892259),
 (13, 0.068277344),
 (18, 0.065428235),
 (11, 0.02463634),
 (14, 0.0153623875),
 (15, 0.015244388)]
[(14, 0.38705662),
 (7, 0.21644232),
 (16, 0.08582009),
 (18, 0.07564819),
 (13, 0.062586926),
 (3, 0.03425777),
 (15, 0.032869793),
 (12, 0.024489198),
 (8, 0.014225688),
 (2, 0.014004608),
 (6, 0.011091693)]
[(8, 0.24456069),
 (7, 0.19199684),
 (18, 0.1506486),
 (12, 0.11298739),
 (13, 0.0853437),
 (15, 0.061670337),
 (4, 0.04351274),
 (6, 0.022118773),
 (5, 0.021266855

[(7, 0.2357034),
 (14, 0.16673215),
 (18, 0.147155),
 (1, 0.13762367),
 (9, 0.078053765),
 (13, 0.05232604),
 (16, 0.051190037),
 (2, 0.032928444),
 (5, 0.019874847),
 (6, 0.016850544),
 (19, 0.01656727),
 (15, 0.015099082)]
[(9, 0.24804586),
 (7, 0.20082386),
 (15, 0.16330013),
 (2, 0.13821118),
 (8, 0.08341),
 (18, 0.04457938),
 (13, 0.040038016),
 (3, 0.018911975)]
[(7, 0.21340401),
 (19, 0.14753956),
 (2, 0.14324877),
 (13, 0.13078406),
 (5, 0.08607861),
 (18, 0.06258714),
 (15, 0.042584028),
 (10, 0.04081234),
 (8, 0.02819869),
 (11, 0.023658725),
 (1, 0.02213628),
 (16, 0.016026719),
 (6, 0.011325461)]
[(16, 0.38418242),
 (7, 0.21146995),
 (13, 0.084290646),
 (2, 0.08022762),
 (3, 0.06690902),
 (18, 0.042678777),
 (15, 0.031085745),
 (12, 0.012866888),
 (6, 0.012725845)]
[(14, 0.32320413),
 (7, 0.2016222),
 (3, 0.12192103),
 (15, 0.05129474),
 (18, 0.048860323),
 (11, 0.04624518),
 (9, 0.04007276),
 (19, 0.039889175),
 (10, 0.032891184),
 (13, 0.027896332),
 (6, 0.021927072)]
[(1

[(7, 0.27211466),
 (15, 0.21103752),
 (16, 0.13305654),
 (2, 0.11376828),
 (18, 0.10761751),
 (13, 0.06455322),
 (8, 0.036786035),
 (9, 0.017229415),
 (12, 0.010332803)]
[(2, 0.5629974),
 (7, 0.15254249),
 (18, 0.10273837),
 (15, 0.044760708),
 (1, 0.024535732),
 (13, 0.023197457),
 (4, 0.022596002),
 (16, 0.018723186),
 (12, 0.013110989)]
[(14, 0.35885495),
 (7, 0.16045144),
 (17, 0.089694984),
 (18, 0.07607065),
 (13, 0.070004545),
 (9, 0.06530534),
 (12, 0.043256745),
 (15, 0.024070185),
 (2, 0.018523721),
 (6, 0.015964393),
 (16, 0.0114366645),
 (8, 0.01117845),
 (19, 0.011095501)]
[(7, 0.33863324),
 (2, 0.1760677),
 (15, 0.15809095),
 (18, 0.105228454),
 (13, 0.049953543),
 (6, 0.045009118),
 (12, 0.016437273),
 (9, 0.012200637),
 (16, 0.012118588),
 (8, 0.011196824),
 (19, 0.011125137),
 (17, 0.010909126),
 (14, 0.010127515)]
[(15, 0.29472348),
 (7, 0.2705939),
 (18, 0.13353801),
 (9, 0.111740254),
 (13, 0.03951217),
 (16, 0.0359894),
 (12, 0.016311271),
 (2, 0.015239556),
 (6, 0

[(13, 0.6254428),
 (17, 0.13066277),
 (18, 0.085248515),
 (7, 0.06722418),
 (1, 0.019332878),
 (8, 0.018632317),
 (15, 0.014061666)]
[(7, 0.36661902),
 (1, 0.16599622),
 (12, 0.12972143),
 (8, 0.08487132),
 (13, 0.084624894),
 (15, 0.062216535),
 (18, 0.03985372),
 (9, 0.02147621)]
[(7, 0.39425033),
 (18, 0.22114776),
 (15, 0.091161616),
 (13, 0.09079855),
 (4, 0.06539734),
 (17, 0.05100046),
 (12, 0.02773351),
 (6, 0.024490999)]
[(7, 0.3374266),
 (18, 0.21867856),
 (13, 0.13317843),
 (5, 0.053379714),
 (17, 0.040543076),
 (19, 0.039123274),
 (14, 0.038848795),
 (9, 0.03838185),
 (4, 0.018578356),
 (15, 0.017903684),
 (2, 0.015138356),
 (12, 0.012526939)]
[(7, 0.32880175),
 (19, 0.16022591),
 (18, 0.11251475),
 (13, 0.079606734),
 (17, 0.0770588),
 (8, 0.048023563),
 (9, 0.034208614),
 (6, 0.033850912),
 (0, 0.027949784),
 (15, 0.022956736),
 (2, 0.015780361),
 (5, 0.012755203),
 (14, 0.011165589),
 (16, 0.010646297),
 (1, 0.010509553)]
[(7, 0.535053),
 (18, 0.09643468),
 (2, 0.0892828

[(13, 0.36297935),
 (18, 0.1712615),
 (7, 0.10339699),
 (19, 0.06106776),
 (1, 0.060507502),
 (15, 0.051826075),
 (8, 0.046910852),
 (6, 0.029283188),
 (17, 0.028475301),
 (16, 0.025316104),
 (2, 0.015987022),
 (12, 0.010897998)]
[(7, 0.33176172),
 (18, 0.30266333),
 (13, 0.096713156),
 (15, 0.08582144),
 (16, 0.042317737),
 (4, 0.03493074),
 (5, 0.019802611),
 (2, 0.013295171),
 (12, 0.010254973),
 (6, 0.010196818)]
[(12, 0.35039708),
 (7, 0.2532141),
 (18, 0.13219114),
 (15, 0.06151599),
 (13, 0.061047487),
 (14, 0.045870498),
 (2, 0.012914469),
 (6, 0.011044684)]
[(12, 0.3075779),
 (7, 0.17934632),
 (13, 0.14372231),
 (18, 0.13269047),
 (5, 0.08741963),
 (19, 0.02642216),
 (15, 0.019379359),
 (6, 0.01898392),
 (2, 0.01375406),
 (17, 0.012423286)]
[(7, 0.22509849),
 (13, 0.20379108),
 (14, 0.117019035),
 (5, 0.09605049),
 (18, 0.09135602),
 (12, 0.065683745),
 (15, 0.047993828),
 (17, 0.039464783),
 (19, 0.030477555),
 (2, 0.015659375),
 (6, 0.012282062),
 (9, 0.010609558)]
[(4, 0.28

KeyboardInterrupt: 

In [24]:
for i, row in enumerate(lda_model[corpus]):
    new_row = sorted(row, key=lambda x: (x[1]), reverse=True)
    if i < 5:
        pprint(new_row)

TypeError: '<' not supported between instances of 'int' and 'tuple'