In [1]:
import pandas as pd
# Load the newsgroup posts (JSON) directly from the public GitHub copy.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

# List the distinct newsgroup labels, then preview the first few rows.
print(df['target_names'].unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [5]:
import re

# Clean-up patterns, compiled once. Raw strings are used so that "\S" and "\s"
# reach the regex engine verbatim instead of relying on invalid string escapes
# (a DeprecationWarning since Python 3.6 and a SyntaxWarning since Python 3.12).
EMAIL_RE = re.compile(r'\S*@\S*\s?')   # any token containing "@" plus at most one trailing whitespace char
WHITESPACE_RE = re.compile(r'\s+')     # runs of whitespace, newlines included


def clean_document(text):
    """Return `text` with e-mail addresses and single quotes removed.

    Steps, applied in this order:
      1. delete every e-mail-like token (anything containing "@");
      2. collapse each run of whitespace (including newlines) to one space;
      3. delete single-quote characters.

    Args:
        text: the raw document as a string.

    Returns:
        The cleaned document as a single-line string.
    """
    text = EMAIL_RE.sub('', text)
    text = WHITESPACE_RE.sub(' ', text)
    return text.replace("'", "")


# Convert the `content` column to a list and clean every document in one pass.
data = [clean_document(doc) for doc in df.content.values.tolist()]
print(data[:1])


['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ']


In [7]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

def sent_to_words(sentences):
    """Lazily tokenize each document into a list of word tokens.

    Every item of `sentences` is converted with `str()` and handed to
    `gensim.utils.simple_preprocess` with `deacc=True`. Per the gensim
    documentation that call lowercases and tokenizes the text, and
    `deacc=True` strips accent marks from the tokens; punctuation is
    discarded by the tokenizer itself.

    Args:
        sentences: iterable of documents (any objects convertible via `str`).

    Yields:
        One list of tokens per input document, in input order.
    """
    for document in sentences:
        tokens = gensim.utils.simple_preprocess(str(document), deacc=True)
        yield tokens

# Materialise the lazy tokenizer output so it can be indexed and reused below.
data_words = [*sent_to_words(data)]

# Peek at the first tokenized document.
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [9]:
# Build the vocabulary: maps every distinct token to an integer id.
id2word = corpora.Dictionary(data_words)

# The tokenized documents are used as the modelling texts unchanged.
texts = data_words

# Encode each document as a bag-of-words: a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the first encoded document.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 5), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 3), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 3), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 3), (49, 2), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 6), (70, 1), (71, 1), (72, 5), (73, 2), (74, 1), (75, 1), (76, 1), (77, 4), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1)]]


In [10]:
# Decode the first document's bag-of-words back into (word, count) pairs
# by looking each token id up in the dictionary.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('addition', 1),
  ('all', 1),
  ('anyone', 2),
  ('be', 1),
  ('body', 1),
  ('bricklin', 1),
  ('brought', 1),
  ('bumper', 1),
  ('by', 1),
  ('called', 1),
  ('can', 1),
  ('car', 5),
  ('college', 1),
  ('could', 1),
  ('day', 1),
  ('door', 1),
  ('doors', 1),
  ('early', 1),
  ('edu', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('from', 3),
  ('front', 1),
  ('funky', 1),
  ('have', 1),
  ('history', 1),
  ('host', 1),
  ('if', 2),
  ('il', 1),
  ('in', 1),
  ('info', 1),
  ('is', 3),
  ('it', 2),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('lines', 1),
  ('looked', 1),
  ('looking', 1),
  ('made', 1),
  ('mail', 1),
  ('maryland', 1),
  ('me', 1),
  ('model', 1),
  ('my', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp', 1),
  ('of', 3),
  ('on', 2),
  ('or', 1),
  ('organization', 1),
  ('other', 1),
  ('out', 1),
  ('park', 1),
  ('please', 1),
  ('posting', 1),
  ('production', 1),
  ('rac', 1),
  ('really', 1),
  ('rest', 1),
  ('saw', 1),
  ('separate', 1),
  ('s

In [12]:
# Train the LDA topic model. The settings are gathered in one dict so they can
# be read (and tuned) in one place; see the gensim LdaModel docs for details.
lda_params = dict(
    corpus=corpus,            # bag-of-words documents built above
    id2word=id2word,          # id -> token mapping used to label topics
    num_topics=20,            # one topic per newsgroup label in the dataset
    minimum_probability=0,    # do not filter out low-probability topics
    random_state=100,         # fixed seed for reproducible training
    update_every=1,           # online training: update the model after each chunk
    chunksize=100,            # documents per training chunk
    passes=10,                # full passes over the corpus
    alpha='auto',             # learn the document-topic prior from the data
    per_word_topics=True,     # also compute per-word topic assignments
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)


In [13]:
from pprint import pprint
# Show each topic as (topic_id, 'weight*"word" + ...') with its
# highest-weighted keywords (10 per topic in the output below).
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)

# Apply the trained model to the whole corpus to get per-document topic output.
doc_lda = lda_model[corpus]

[(0,
  '0.169*"windows" + 0.096*"dos" + 0.051*"ms" + 0.034*"os" + 0.018*"kit" + '
  '0.016*"animation" + 0.009*"derek" + 0.008*"evans" + 0.008*"developers" + '
  '0.006*"trident"'),
 (1,
  '0.057*"lines" + 0.056*"subject" + 0.055*"organization" + 0.054*"from" + '
  '0.035*"re" + 0.030*"university" + 0.029*"posting" + 0.028*"article" + '
  '0.027*"for" + 0.027*"writes"'),
 (2,
  '0.085*"card" + 0.044*"mb" + 0.036*"video" + 0.035*"ram" + 0.030*"bus" + '
  '0.025*"mouse" + 0.025*"scsi" + 0.024*"driver" + 0.022*"controller" + '
  '0.020*"mhz"'),
 (3,
  '0.083*"chips" + 0.018*"sam" + 0.014*"vw" + 0.009*"um" + 0.008*"ross" + '
  '0.001*"perot" + 0.000*"simm" + 0.000*"tl" + 0.000*"dram" + 0.000*"pu"'),
 (4,
  '0.047*"the" + 0.035*"and" + 0.031*"to" + 0.030*"for" + 0.020*"on" + '
  '0.019*"is" + 0.016*"with" + 0.013*"use" + 0.012*"or" + 0.011*"of"'),
 (5,
  '0.084*"car" + 0.034*"cars" + 0.034*"engine" + 0.026*"gay" + '
  '0.020*"spacecraft" + 0.020*"marriage" + 0.020*"vax" + 0.015*"sexual" + '