# Metadata

```yaml
Course:   DS 5001
Module:   08a Visualization
Topic:    Other Tools
Author:   R.C. Alvarado
Date:     23 March 2023
```

# Set Up

## Config

In [186]:
num_topics = 100
data_dir = "../data/newsgroups/20news-18828"

## Imports

In [178]:
import pandas as pd
import numpy as np
from gensim import corpora, models, similarities, downloader
from collections import defaultdict
import plotly_express as px
from glob import glob
import re 

# Import Data

In [None]:
def import_data():
    global LIB
    data = []
    for d in glob(data_dir+"/*"):
        label = d.split("/")[-1]
        print(label)
        for f in glob(d+"/*"):
            fid = f.split("/")[-1]
            flines = open(f, 'r', encoding="latin-1").read().split("\n")
            from_line = ':'.join(flines[0].split(':')[1:])
            subj_line = ':'.join(flines[1].split(':')[1:])
            data.append((fid, label, from_line, subj_line, ' '.join(flines[2:])))
    LIB = pd.DataFrame(data, columns=['doc_id','doc_label','doc_from', 'doc_subj', 'doc_content'])
    LIB.doc_id = LIB.doc_id.astype('int')
    LIB = LIB.set_index(['doc_label','doc_id'])

In [10]:
# import_data()

In [177]:
LIB

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_from,doc_subj,doc_content
doc_label,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
talk.politics.mideast,75895,hm@cs.brown.edu (Harry Mamaysky),Heil Hernlem,In article <1993Apr14.125813.21737@ncsu.edu> ...
talk.politics.mideast,76248,waldo@cybernet.cse.fau.edu (Todd J. Dicker),Re: Israel's Expansion II,"ab4z@Virginia.EDU (""Andi Beyer"") writes: > F..."
talk.politics.mideast,76277,C.L.Gannon@newcastle.ac.uk (Space Cadet),"Re: To be exact, 2.5 million readers enlighte...",Andrew Varvel writes: > > > Serdar Argic >...
talk.politics.mideast,76045,shaig@Think.COM (Shai Guday),"Basil, opinions? (Re: Water on the brain)",In article <1993Apr15.204930.9517@thunder.mcr...
talk.politics.mideast,77197,ez000281@hamlet.ucdavis.edu (),Re: The Stage is Being Set,Srinivas Suder writes: >If the Haitian peopl...
...,...,...,...,...
talk.religion.misc,83934,porta@wam.umd.edu (David Palmer),Re: 14 Apr 93 God's Promise in 1 John 1: 7,In article <1qknu0INNbhv@shelley.u.washington...
talk.religion.misc,82812,decay@cbnewsj.cb.att.com (dean.kaflowitz),Re: Spreading Christianity (Re: Christian Ext...,"In article <C51puA.K2u@mailer.cc.fsu.edu>, dl..."
talk.religion.misc,84127,ekr@kyle.eitech.com (Eric Rescorla),"Re: What part of ""No"" don't you understand?",In article <1993Apr24.214843.10940@midway.uch...
talk.religion.misc,84315,"""David R. Sacco"" <dsav+@andrew.cmu.edu>",Re: ABORTION and private health coverage -- l...,On 21-Apr-93 in Re: ABORTION and private he.....


# Pre-Process the Gensim Way

Create a set of frequent words

In [255]:
stoplist = set('for a of the and to in is i that it you this be on are'.split(' '))

Lowercase each document, split it by white space, remove non-alphanumeric characters, and filter out stopwords

In [256]:
texts = [[re.sub(r"[\W_]+", "", word) for word in document.lower().split() if word not in stoplist]
         for document in LIB.doc_content.values]

Count word frequencies

In [257]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

Only keep words that appear more than once

In [258]:
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

Create a "dictionary," which associates a term string with a numeric identifier.

In [259]:
dictionary = corpora.Dictionary(processed_corpus)

Create the BOW corpus from the text using the dictionary.

In [260]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

# Train models

## TFIDF

In [261]:
tfidf = models.TfidfModel(bow_corpus)

In [262]:
# tfidf[bow_corpus[5]]

## LDA

In [264]:
model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

In [268]:
model2 = models.HdpModel(bow_corpus, id2word=dictionary)

# Convert

## VOCAB

In [332]:
VOCAB = pd.DataFrame([(k, v) for k, v in dictionary.token2id.items()], columns=['term_str','term_id']) #.set_index('term_id')
VOCAB['n'] = VOCAB.term_str.map(lambda x: frequency[x])
VOCAB = VOCAB.set_index('term_id').sort_index()

In [333]:
VOCAB.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
69230,72020037otterhplhpcom,2
9609,walsh,39
36795,ftpserver,2
22299,palo,32
68173,vlf,5


## TFIDF

In [270]:
tfidf_data = []
for doc_id, doc in enumerate(bow_corpus):
    for term in tfidf[doc]:
        tfidf_data.append((doc_id, term[0], term[1]))
TFIDF = pd.DataFrame(tfidf_data, columns=['doc_id','term_id', 'tfidf']).set_index(['doc_id','term_id'])

In [271]:
TFIDF.tfidf.unstack(fill_value=0)

term_id,0,1,2,3,4,5,6,7,8,9,...,79154,79155,79156,79157,79158,79159,79160,79161,79162,79163
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.121893,0.042943,0.014431,0.066946,0.041293,0.013847,0.013055,0.054541,0.064667,0.011687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.039125,0.000000,0.000000,0.056313,0.035394,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.012670,0.000000,0.000000,0.000000,0.021386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.031419,0.014811,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.000000,0.000000,0.020391,0.000000,0.000000,0.000000,0.018447,0.000000,0.000000,0.049539,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18824,0.000000,0.000000,0.031593,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18825,0.000000,0.000000,0.000000,0.000000,0.000000,0.012472,0.000000,0.000000,0.000000,0.031579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18826,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## BOW

In [272]:
bow_data = []
for i, doc in enumerate(bow_corpus):
    for term in doc:
        bow_data.append((i, term[0], term[1]))
BOW = pd.DataFrame(bow_data, columns=['doc_id','term_id', 'n']).set_index(['doc_id','term_id'])     
DTM = BOW.n.unstack(fill_value=0)

## LDA

### PHI

In [273]:
PHI = pd.DataFrame(model.get_topics()).T
PHI.index.name = 'term_id'

In [274]:
PHI

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000002,0.000005,0.000007,0.000006,3.926680e-07,9.037194e-08,0.000003,0.000003,1.037565e-06,0.000005,...,0.000003,0.000003,0.000006,0.000006,2.842222e-07,0.000007,6.112523e-07,0.000008,0.000004,0.000001
1,0.000037,0.000021,0.000059,0.000012,1.519050e-04,1.281187e-05,0.000348,0.000173,1.903097e-03,0.000036,...,0.000038,0.000008,0.000011,0.000053,2.107965e-05,0.000018,1.002845e-04,0.000086,0.000053,0.000348
2,0.001862,0.000372,0.000905,0.000600,4.738433e-03,3.264392e-03,0.000462,0.001488,5.709137e-04,0.000822,...,0.002160,0.000603,0.000247,0.001628,5.371002e-03,0.000496,2.616039e-03,0.000564,0.000593,0.001038
3,0.000004,0.000009,0.000034,0.000007,7.181044e-07,1.044096e-07,0.000005,0.000004,2.296212e-06,0.000007,...,0.000004,0.000003,0.000011,0.000013,9.330894e-07,0.000008,9.697133e-07,0.000009,0.000015,0.000010
4,0.000016,0.000079,0.000069,0.000009,3.706363e-04,9.896184e-04,0.000023,0.000077,1.454088e-04,0.000114,...,0.000313,0.000016,0.000011,0.000052,3.447248e-03,0.000051,2.602181e-04,0.000034,0.000141,0.000328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79159,0.000002,0.000005,0.000007,0.000006,3.926680e-07,7.700906e-08,0.000003,0.000002,9.675773e-07,0.000005,...,0.000003,0.000003,0.000005,0.000003,2.842222e-07,0.000007,6.112523e-07,0.000008,0.000004,0.000001
79160,0.000002,0.000005,0.000007,0.000006,3.926680e-07,7.700906e-08,0.000003,0.000002,9.675773e-07,0.000005,...,0.000003,0.000003,0.000005,0.000003,2.842222e-07,0.000007,6.112523e-07,0.000008,0.000004,0.000001
79161,0.000002,0.000005,0.000007,0.000006,3.926680e-07,7.700906e-08,0.000003,0.000002,9.675773e-07,0.000005,...,0.000003,0.000003,0.000005,0.000003,2.842222e-07,0.000007,6.112523e-07,0.000008,0.000004,0.000001
79162,0.000002,0.000005,0.000007,0.000006,3.926680e-07,7.700906e-08,0.000003,0.000002,9.675773e-07,0.000005,...,0.000003,0.000003,0.000005,0.000003,2.842222e-07,0.000007,6.112523e-07,0.000008,0.000004,0.000001


### THETA

In [275]:
theta_data = []
for doc_id, doc_bow in enumerate(bow_corpus):
    for topic in model.get_document_topics(doc_bow):
        theta_data.append((doc_id, topic[0], topic[1]))
THETA = pd.DataFrame(theta_data, columns=['doc_id', 'topic_id', 'topic_weight']).set_index(['doc_id','topic_id']).unstack(fill_value=0)

In [276]:
THETA

Unnamed: 0_level_0,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight,topic_weight
topic_id,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doc_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.064721,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.010944,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.000000,0.118939,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18823,0.000000,0.0,0.0,0.0,0.000000,0.037849,0.0,0.011216,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
18824,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
18825,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
18826,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.011771,0.000000,0.0,0.0,0.0,0.0,0.0


### TOPIC

In [277]:
topic_data = []
for t in range(num_topics):
    for term_rank, term in enumerate(model.get_topic_terms(t)):
        term_id = term[0]
        topic_data.append((t, term_rank, dictionary.id2token[term_id]))

In [278]:
TOPIC = pd.DataFrame(topic_data, columns=['topic_id', 'term_rank', 'term_str'])\
    .set_index(['topic_id','term_rank']).term_str.unstack()

In [279]:
TOPIC.head(20)

term_rank,0,1,2,3,4,5,6,7,8,9
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,page,turkey,books,tanks,atmospheric,theory,vol,fred,chuck,brightness
1,hudson,mon,apr,messageid,flyers,gmt,bonus,organization,guides,ist
2,radius,atmosphere,stamps,lions,partial,randomly,boast,mature,deposit,feeble
3,turbo,54,sparc,folly,packages,gates,taxes,416,hong,baltimore
4,please,email,shipping,send,me,thanks,any,info,sale,if
5,he,was,his,had,they,him,were,did,at,said
6,ca,gravity,infinite,compile,philadelphia,,static,privately,batman,prompt
7,list,mailing,davis,interrupt,insight,lists,slick,sale,added,ti
8,7,san,annual,,copies,vs,conference,washington,new,6
9,catholics,protestants,hebrews,reflects,attached,gardner,editorial,piano,novice,continuity
