In [2]:
from IPython.display import HTML
css_str = '<style> \
.jp-Button path { fill: #616161;} \
text.terms { fill: #616161;} \
.jp-icon-warn0 path {fill: var(--jp-warn-color0);} \
.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} \
.jp-icon-brand0 path { fill: var(--jp-brand-color0);} \
text.terms { fill: #616161;} \
</style>'
display(HTML(css_str ))

In [3]:
#Import packages
import numpy as np
import pandas as pd
import re
from pprint import pprint
from numpy import random

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#Text-preprocessing (lemmatization) from spacy
import spacy

#Visualization tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  from imp import reload


In [4]:
#Stopwords from NLTK
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [5]:
#Import data 'Newsgroup'
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [6]:
#Convert to list and remove emails, newline, single quotes
data = df.content.values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
print(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ']


In [7]:
#Tokenize words by gensim's simple_preprocess()
def tokenization(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = list(tokenization(data))
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [8]:
#Remove stopwords by spacy
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

data_words_nostops = remove_stopwords(data_words)
print(data_words_nostops[:1])

[['wheres', 'thing', 'car', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'organization', 'university', 'maryland', 'college', 'park', 'lines', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']]


In [9]:
#Create Bigram by gensim's Phrases()
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

data_words_bigrams = make_bigrams(data_words_nostops)
print(data_words_bigrams[:1])

[['wheres', 'thing', 'car', 'nntp_posting', 'host', 'rac_wam', 'umd', 'organization', 'university', 'maryland_college', 'park', 'lines', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front_bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']]


In [10]:
#Lemmatization by spacy's en_core_web_sm, only keep noun, adj, vb, adv
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

nlp = spacy.load("en_core_web_sm")
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['s', 'thing', 'car', 'nntp_poste', 'host', 'umd', 'organization', 'park', 'line', 'wonder', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [11]:
#Create Dictionary (id2word), Dataset (texts), Document-Word frequency(corpus)
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

# View (Word_index, Word), Frequency
print([[((id, id2word[id]), freq) for id, freq in cp] for cp in corpus[:1]])

[[((0, 'addition'), 1), ((1, 'body'), 1), ((2, 'bricklin'), 1), ((3, 'bring'), 1), ((4, 'call'), 1), ((5, 'car'), 5), ((6, 'day'), 1), ((7, 'door'), 2), ((8, 'early'), 1), ((9, 'engine'), 1), ((10, 'enlighten'), 1), ((11, 'funky'), 1), ((12, 'history'), 1), ((13, 'host'), 1), ((14, 'info'), 1), ((15, 'know'), 1), ((16, 'late'), 1), ((17, 'lerxst'), 1), ((18, 'line'), 1), ((19, 'look'), 2), ((20, 'mail'), 1), ((21, 'make'), 1), ((22, 'model'), 1), ((23, 'name'), 1), ((24, 'neighborhood'), 1), ((25, 'nntp_poste'), 1), ((26, 'organization'), 1), ((27, 'park'), 1), ((28, 'production'), 1), ((29, 'really'), 1), ((30, 'rest'), 1), ((31, 's'), 1), ((32, 'see'), 1), ((33, 'separate'), 1), ((34, 'small'), 1), ((35, 'spec'), 1), ((36, 'sport'), 1), ((37, 'tellme'), 1), ((38, 'thank'), 1), ((39, 'thing'), 1), ((40, 'umd'), 1), ((41, 'wonder'), 1), ((42, 'year'), 1)]]


In [12]:
print('Number of documents: ' + str(len(corpus)))
print('Number of words: ' + str(len(id2word)))

Number of documents: 11314
Number of words: 50111


In [13]:
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]


In [19]:
num_documents = len(corpus)        
num_tokens = len(id2word)  
num_topics = 10               

alpha = 1 / num_topics         
beta = 1 / num_topics 

z_d_n = [[0 for _ in range(len(d))] for d in corpus]  # z_i_j
theta_d_z = np.zeros((num_documents, num_topics))
phi_z_w = np.zeros((num_topics, num_tokens))
n_d = np.zeros((num_documents))
n_z = np.zeros((num_topics))

In [20]:
## Initialize the parameters
# m: doc id
for d, doc in enumerate(corpus):  
    # n: id of word inside document, w: id of the word globally
    for n, w in enumerate(doc):
        # assign a topic randomly to words
        z_d_n[d][n] = n % num_topics
        # get the topic for word n in document m
        z = z_d_n[d][n]
        # keep track of our counts
        theta_d_z[d][z] += 1
        phi_z_w[z, w] += 1
        n_z[z] += 1
        n_d[d] += 1

In [21]:
for iteration in range(10):
    for d, doc in enumerate(corpus):
        for n, w in enumerate(doc):
            # get the topic for word n in document m
            z = z_d_n[d][n]
            w = w[0]

            # decrement counts for word w with associated topic z
            theta_d_z[d][z] -= 1
            phi_z_w[z, w] -= 1
            n_z[z] -= 1

            # sample new topic from a multinomial according to our formular
            p_d_t = (theta_d_z[d] + alpha) / (n_d[d] - 1 + num_topics * alpha)
            p_t_w = (phi_z_w[:, w] + beta) / (n_z + num_tokens * beta)
            p_z = p_d_t * p_t_w
            p_z /= np.sum(p_z)
            new_z = np.random.multinomial(1, p_z).argmax()

            # set z as the new topic and increment counts
            z_d_n[d][n] = new_z
            theta_d_z[d][new_z] += 1
            phi_z_w[new_z, w] += 1
            n_z[new_z] += 1

In [22]:
print(theta_d_z)
print('----------')
print(phi_z_w)

[[33.  0.  6. ...  2.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0. 14. 42. ...  0.  7.  2.]
 ...
 [ 5.  0.  3. ...  0.  0.  4.]
 [ 0.  8.  3. ...  0. 17.  0.]
 [ 0.  0. 23. ...  0.  0.  0.]]
----------
[[1.1200e+02 7.1351e+04 1.1093e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [7.0000e+00 7.0614e+04 1.0997e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.0000e+01 6.9688e+04 1.0986e+04 ... 0.0000e+00 0.0000e+00 1.0000e+00]
 ...
 [2.7000e+01 6.4799e+04 1.0530e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 6.3661e+04 1.0535e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 6.2819e+04 1.0290e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
