# Metadata

```yaml
Course:   DS 5001
Module:   08a Visualization
Topic:    Other Tools
Author:   R.C. Alvarado
Date:     23 March 2023
```

# Set Up

## Config

In [1]:
num_topics = 100
data_dir = "../data/newsgroups/20news-18828"

## Imports

In [2]:
import pandas as pd
import numpy as np
from gensim import corpora, models
from collections import defaultdict
import plotly_express as px
from glob import glob
import re 

# Import Data

In [3]:
def import_data():
    data = []
    for d in glob(data_dir+"/*"):
        label = d.split("/")[-1]
        print(label)
        for f in glob(d+"/*"):
            fid = f.split("/")[-1]
            flines = open(f, 'r', encoding="latin-1").read().split("\n")
            from_line = ':'.join(flines[0].split(':')[1:])
            subj_line = ':'.join(flines[1].split(':')[1:])
            data.append((fid, label, from_line, subj_line, ' '.join(flines[2:])))
    LIB = pd.DataFrame(data, columns=['doc_id','doc_label','doc_from', 'doc_subj', 'doc_content'])
    LIB.doc_id = LIB.doc_id.astype('int')
    LIB = LIB.set_index(['doc_label','doc_id'])
    return LIB

In [4]:
LIB = import_data()

talk.politics.mideast
rec.autos
comp.sys.mac.hardware
alt.atheism
rec.sport.baseball
comp.os.ms-windows.misc
rec.sport.hockey
sci.crypt
sci.med
talk.politics.misc
rec.motorcycles
comp.windows.x
comp.graphics
comp.sys.ibm.pc.hardware
sci.electronics
talk.politics.guns
sci.space
soc.religion.christian
misc.forsale
talk.religion.misc


In [5]:
LIB

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_from,doc_subj,doc_content
doc_label,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
talk.politics.mideast,75895,hm@cs.brown.edu (Harry Mamaysky),Heil Hernlem,In article <1993Apr14.125813.21737@ncsu.edu> ...
talk.politics.mideast,76248,waldo@cybernet.cse.fau.edu (Todd J. Dicker),Re: Israel's Expansion II,"ab4z@Virginia.EDU (""Andi Beyer"") writes: > F..."
talk.politics.mideast,76277,C.L.Gannon@newcastle.ac.uk (Space Cadet),"Re: To be exact, 2.5 million readers enlighte...",Andrew Varvel writes: > > > Serdar Argic >...
talk.politics.mideast,76045,shaig@Think.COM (Shai Guday),"Basil, opinions? (Re: Water on the brain)",In article <1993Apr15.204930.9517@thunder.mcr...
talk.politics.mideast,77197,ez000281@hamlet.ucdavis.edu (),Re: The Stage is Being Set,Srinivas Suder writes: >If the Haitian peopl...
...,...,...,...,...
talk.religion.misc,83934,porta@wam.umd.edu (David Palmer),Re: 14 Apr 93 God's Promise in 1 John 1: 7,In article <1qknu0INNbhv@shelley.u.washington...
talk.religion.misc,82812,decay@cbnewsj.cb.att.com (dean.kaflowitz),Re: Spreading Christianity (Re: Christian Ext...,"In article <C51puA.K2u@mailer.cc.fsu.edu>, dl..."
talk.religion.misc,84127,ekr@kyle.eitech.com (Eric Rescorla),"Re: What part of ""No"" don't you understand?",In article <1993Apr24.214843.10940@midway.uch...
talk.religion.misc,84315,"""David R. Sacco"" <dsav+@andrew.cmu.edu>",Re: ABORTION and private health coverage -- l...,On 21-Apr-93 in Re: ABORTION and private he.....


In [6]:
LIB.to_csv("../data/newsgroups/LIB.csv")

# Pre-Process the Gensim Way

Create a set of frequent words

In [7]:
stoplist = set('for a of the and to in is i that it you this be on are'.split(' '))

Lowercase each document, split it by white space, remove non-alphanumeric characters, and filter out stopwords

In [8]:
texts = [[re.sub(r"[\W_]+", "", word) for word in document.lower().split() if word not in stoplist]
         for document in LIB.doc_content.values]

Count word frequencies

In [9]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

Only keep words that appear more than once

In [10]:
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

Create a "dictionary," which associates a term string with a numeric identifier.

In [11]:
dictionary = corpora.Dictionary(processed_corpus)

Create the BOW corpus from the text using the dictionary.

In [12]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [31]:
# bow_corpus[0]

# Train models

## TFIDF

In [13]:
tfidf = models.TfidfModel(bow_corpus)

In [14]:
# tfidf[bow_corpus[5]]

## LDA

In [15]:
model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

In [16]:
model2 = models.HdpModel(bow_corpus, id2word=dictionary)

# Convert

## VOCAB

In [17]:
VOCAB = pd.DataFrame([(k, v) for k, v in dictionary.token2id.items()], columns=['term_str','term_id']) #.set_index('term_id')
VOCAB['n'] = VOCAB.term_str.map(lambda x: frequency[x])
VOCAB = VOCAB.set_index('term_id').sort_index()

In [18]:
VOCAB.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
17700,isolates,2
13222,lac,3
18841,imminent,22
62735,jyangsscvx1bitnet,2
35044,oquendos,3


## TFIDF

In [19]:
tfidf_data = []
for doc_id, doc in enumerate(bow_corpus):
    for term in tfidf[doc]:
        tfidf_data.append((doc_id, term[0], term[1]))
TFIDF = pd.DataFrame(tfidf_data, columns=['doc_id','term_id', 'tfidf']).set_index(['doc_id','term_id'])

In [20]:
TFIDF.tfidf.unstack(fill_value=0)

term_id,0,1,2,3,4,5,6,7,8,9,...,79154,79155,79156,79157,79158,79159,79160,79161,79162,79163
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.121893,0.042943,0.014431,0.066946,0.041293,0.013847,0.013055,0.054541,0.064667,0.011687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.039125,0.000000,0.000000,0.056313,0.035394,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.012670,0.000000,0.000000,0.000000,0.021386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.031419,0.014811,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.000000,0.000000,0.020391,0.000000,0.000000,0.000000,0.018447,0.000000,0.000000,0.049539,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18824,0.000000,0.000000,0.031593,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18825,0.000000,0.000000,0.000000,0.000000,0.000000,0.012472,0.000000,0.000000,0.000000,0.031579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18826,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## BOW

In [21]:
bow_data = []
for i, doc in enumerate(bow_corpus):
    for term in doc:
        bow_data.append((i, term[0], term[1]))
BOW = pd.DataFrame(bow_data, columns=['doc_id','term_id', 'n']).set_index(['doc_id','term_id'])     
DTM = BOW.n.unstack(fill_value=0)

In [32]:
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n
doc_id,term_id,Unnamed: 2_level_1
0,0,1
0,1,1
0,2,1
0,3,1
0,4,1


In [33]:
DTM.head()

term_id,0,1,2,3,4,5,6,7,8,9,...,79154,79155,79156,79157,79158,79159,79160,79161,79162,79163
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,2,0,0,3,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## LDA

### PHI

In [22]:
PHI = pd.DataFrame(model.get_topics()).T
PHI.index.name = 'term_id'

In [23]:
PHI

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000004,0.000007,1.209821e-07,0.000007,8.446320e-08,0.000006,0.000008,0.000008,0.000006,4.388106e-07,...,3.999588e-08,0.000006,6.887807e-07,0.000002,7.444429e-08,6.166574e-07,9.159745e-07,0.000002,7.711055e-07,0.000005
1,0.000020,0.000034,1.436248e-03,0.000046,9.286494e-07,0.000025,0.000026,0.000032,0.000018,8.289085e-05,...,2.190440e-05,0.000012,3.785224e-04,0.000009,1.471395e-03,6.170237e-04,2.884323e-05,0.000023,8.576413e-04,0.000012
2,0.000351,0.000369,2.993358e-03,0.001102,2.379572e-03,0.000974,0.000525,0.000290,0.000284,3.591591e-03,...,2.677884e-03,0.000186,4.007390e-03,0.001588,3.450958e-03,6.141013e-04,1.822793e-03,0.001482,3.273964e-03,0.001092
3,0.000004,0.000012,1.414892e-07,0.000009,4.547562e-05,0.000006,0.000010,0.000063,0.000014,1.592202e-06,...,4.810859e-05,0.000005,8.219026e-07,0.000003,1.853169e-04,1.632976e-05,1.333521e-06,0.000007,3.112860e-04,0.000006
4,0.000015,0.000060,9.328199e-04,0.000149,9.394193e-05,0.000059,0.000179,0.000013,0.000033,2.474635e-04,...,5.921134e-04,0.000052,4.065890e-04,0.000058,1.309366e-04,2.917023e-05,6.910593e-04,0.000736,1.616540e-04,0.000020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79159,0.000004,0.000007,1.209821e-07,0.000007,8.446320e-08,0.000006,0.000008,0.000008,0.000006,3.876675e-07,...,3.999588e-08,0.000005,4.806589e-07,0.000002,7.444429e-08,3.146401e-07,9.159745e-07,0.000001,1.640756e-07,0.000005
79160,0.000004,0.000007,1.209821e-07,0.000007,8.446320e-08,0.000006,0.000008,0.000008,0.000006,3.876675e-07,...,3.999588e-08,0.000005,4.806589e-07,0.000002,7.444429e-08,3.146401e-07,9.159745e-07,0.000001,1.640756e-07,0.000005
79161,0.000004,0.000007,1.209821e-07,0.000007,8.446320e-08,0.000006,0.000008,0.000008,0.000006,3.876675e-07,...,3.999588e-08,0.000005,4.806589e-07,0.000002,7.444429e-08,3.146401e-07,9.159745e-07,0.000001,1.640756e-07,0.000005
79162,0.000004,0.000007,1.209821e-07,0.000007,8.446320e-08,0.000006,0.000008,0.000008,0.000006,3.876675e-07,...,3.999588e-08,0.000005,4.806589e-07,0.000002,7.444429e-08,3.146401e-07,9.159745e-07,0.000001,1.640756e-07,0.000005


### THETA

In [24]:
theta_data = []
for doc_id, doc_bow in enumerate(bow_corpus):
    for topic in model.get_document_topics(doc_bow):
        theta_data.append((doc_id, topic[0], topic[1]))
THETA = pd.DataFrame(theta_data, columns=['doc_id', 'topic_id', 'topic_weight']).set_index(['doc_id','topic_id']).unstack(fill_value=0)

In [25]:
THETA

Unnamed: 0_level_0,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight
topic_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doc_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.000000,0.0,0.023057,0.0,0.0,0.0,0.0,0.068088,...,0.494118,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.307738,0.0
1,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.377174,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
2,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.328368,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
3,0.0,0.0,0.000000,0.0,0.047245,0.0,0.0,0.0,0.0,0.044121,...,0.096579,0.0,0.000000,0.000000,0.026952,0.0,0.0,0.015918,0.000000,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.157478,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.096272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
18824,0.0,0.0,0.000000,0.0,0.104636,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.032088,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
18825,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0
18826,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.174497,0.0,0.000000,0.000000,0.018394,0.0,0.0,0.000000,0.000000,0.0


### TOPIC

In [26]:
topic_data = []
for t in range(num_topics):
    for term_rank, term in enumerate(model.get_topic_terms(t)):
        term_id = term[0]
        topic_data.append((t, term_rank, dictionary.id2token[term_id]))

In [27]:
TOPIC = pd.DataFrame(topic_data, columns=['topic_id', 'term_rank', 'term_str'])\
    .set_index(['topic_id','term_rank']).term_str.unstack()

In [28]:
TOPIC.head(20)

term_rank,0,1,2,3,4,5,6,7,8,9
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,image,color,ho,formats,palette,size,outputs,replies,display,header
1,joy,canadian,bullets,rubber,rifle,censorship,onethird,sluggish,oversized,soul
2,have,my,with,or,,me,if,can,get,but
3,gays,seller,buffalo,lynn,suny,traded,cones,,peninsula,towel
4,god,jesus,christ,bible,lord,christians,he,we,christian,christianity
5,film,greg,instructions,allocation,festival,269,debris,712,drug,gathering
6,7th,beneficial,leak,init,til,income,tagged,,was,fines
7,102,hack,carpet,distinctions,commenting,killers,episode,143,sloan,architect
8,bds,export,japanese,intensive,transparent,inflated,gibson,straightforward,fuer,growth
9,,radio,air,by,tube,at,with,or,from,280
