In [1]:
from polo2 import PoloDb
import numpy as np
import pandas as pd

In [2]:
# The corpus slug and trial name together pick the database file to open.
corpus, trial = 'zuboff', '40topics'
polo_pub = "/Users/rca2t1/Dropbox/CODE/polo2-test/PUB"
# Layout used here: <polo_pub>/<corpus>/<corpus>-mallet-<trial>.db
db_file = "/".join([polo_pub, corpus, f"{corpus}-mallet-{trial}.db"])
db = PoloDb(db_file)

# Significance

In [3]:
n_terms = 7        # how many top-ranked words are kept per topic label
λ = .25            # weight on log P(w|t); the remaining (1 - λ) goes to the log-lift term
smoothing = .001   # added to every count so the logarithms below never see zero

word = db.get_table('word').set_index('word_id')
topicword = db.get_table('topicword').set_index(['word_id', 'topic_id'])
topic = db.get_table('topic').set_index('topic_id')

# Word-by-topic matrix: rows are word_id, columns are topic_id; pairs missing
# from the table are filled with 0 before smoothing.
# NOTE(review): droplevel(0) assumes topicword carries a single value column — confirm.
WTM = topicword.unstack(fill_value=0) + smoothing
WTM.columns = WTM.columns.droplevel(0)

# P(word | topic): every topic column divided by its own column total.
PWT = WTM / WTM.sum()

# P(word): each word's row total as a share of the grand total.
word_totals = WTM.T.sum()
PW = word_totals / word_totals.sum()

# Significance (topic rows x word columns):
#   λ * log P(w|t) + (1 - λ) * log(P(w|t) / P(w))
# This has the same form as the "relevance" score used by LDAvis.
# PW is a Series indexed by word_id, so the division aligns on the word columns.
SIG = λ * np.log(PWT.T) + (1 - λ) * np.log(PWT.T / PW)

# Same scores, word rows x topic columns, relabelled from word_id to word_str.
PHI = SIG.T.join(word.word_str).set_index('word_str')


def _top_sig_words(topic_id):
    """Return the n_terms highest-scoring words of PHI[topic_id], comma separated."""
    ranked = PHI[topic_id].sort_values(ascending=False)
    return ', '.join(ranked.head(n_terms).index)


# Build the column positionally from the index instead of topic.apply(axis=1).
# On a topic table with no rows, apply() hands back a DataFrame rather than a
# Series, and the column assignment then raises a ValueError.
topic['topic_words'] = [_top_sig_words(topic_id) for topic_id in topic.index]

In [4]:
def _rank_by_significance(row):
    """Comma-joined n_terms best words for the topic whose id is row.name."""
    scores = PHI[row.name].sort_values(ascending=False)
    return ', '.join(scores.head(n_terms).index)


# One label per topic, indexed like the topic table.
TOP_SIGS = topic.apply(_rank_by_significance, axis=1)

# TFIDF

In [5]:
# Term frequency: word_id rows by topic_id columns, 0 where a pair is absent.
TF = topicword.unstack(fill_value=0)
# "Document" frequency: in how many topics each word has a nonzero count.
DF = TF.astype(bool).sum(axis=1)
# Topics play the role of documents, so N is the number of topic columns.
N = TF.shape[1]
# Scale every word row by its inverse-frequency weight log2(N / DF).
TFIDF = TF.mul(np.log2(N / DF), axis=0)
TFIDF.columns = TFIDF.columns.droplevel(0)
# Relabel rows from word_id to the word string.
TFIDF = TFIDF.join(word.word_str, on='word_id')
TFIDF = TFIDF.set_index('word_str')

In [6]:
def _rank_by_tfidf(row):
    """Comma-joined n_terms best words, by TF-IDF, for the topic whose id is row.name."""
    weights = TFIDF[row.name].sort_values(ascending=False)
    return ', '.join(weights.head(n_terms).index)


# Same labelling as TOP_SIGS, but ranked by TF-IDF; the topic table itself is not modified.
TOP_TFIDF = topic.apply(_rank_by_tfidf, axis=1)

In [7]:
# Side-by-side view of the two labelling schemes, one row per topic.
pd.concat({'sig': TOP_SIGS, 'tfidf': TOP_TFIDF}, axis=1)

Unnamed: 0_level_0,sig,tfidf
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"facial, emotions, emotion, affective, picard, ...","emotions, facial, emotion, emotional, affectiv..."
1,"ready, personalization, territory, psychic, nu...","life, human, market, lives, nature, rendition,..."
2,"montes, black, fire, room, dulce, family, warm","home, house, room, family, montes, black, fire"
3,"years, insights, last, first, later, came, spread","years, first, time, world, later, many, across"
4,"verizon, news, cookies, button, disinformation...","facebook, verizon, news, users, content, compa..."
5,"million, billion, valley, silicon, president, ...","google, billion, company, million, silicon, va..."
6,"bank, couple, credit, kippings, agents, brown,...","credit, bank, rendering, couple, something, ag..."
7,"surplus, products, predictions, product, behav...","behavioral, surplus, products, prediction, pre..."
8,"ipod, protests, jobs, riots, london, police, l...","apple, ipod, want, protests, london, inequalit..."
9,"labor, industrial, twentieth, wealth, polanyi,...","labor, industrial, society, century, productio..."


# Run Function

In [45]:
from polo2 import PoloMallet, PoloConfig

In [None]:
# Run the library's own labelling for the corpus and trial chosen above.
config_file = "/".join((polo_pub, corpus, "config.ini"))
cfg = PoloConfig(config_file, slug=corpus)
pm = PoloMallet(cfg, trial)
# NOTE(review): presumably this stores topic_words in the trial database — confirm in polo2.
pm.add_topic_significance()

In [17]:
# Read the topic table back from the database and display its labels.
TOPIC = db.get_table('topic')
TOPIC['topic_words']

0     emotions, facial, emotion, emotional, affectiv...
1     life, human, market, lives, nature, rendition,...
2        home, house, room, family, montes, black, fire
3        years, first, time, world, later, many, across
4     facebook, verizon, news, users, content, compa...
5     google, billion, company, million, silicon, va...
6     credit, bank, rendering, couple, something, ag...
7     behavioral, surplus, products, prediction, pre...
8     apple, ipod, want, protests, london, inequalit...
9     labor, industrial, society, century, productio...
10    behavior, another, better, pattern, forms, app...
11    google, search, advertising, users, queries, p...
12    app, mobile, apps, android, phone, location, d...
13    percent, compared, found, countries, survey, h...
14    ubiquitous, data, world, reality, things, mach...
15    privacy, information, policies, personal, ftc,...
16    government, agencies, google, regulation, publ...
17    social, facebook, others, media, psycholog

# Get all the projects

In [None]:
import os

In [49]:
mallet_path = '/Users/rca2t1/opt/mallet/bin/mallet'
# The publication root comes from the environment; a missing POLO_PUB raises KeyError.
polo_pub = os.environ['POLO_PUB']
# Keep every entry whose name has no dot in it (drops dotfiles and names with extensions).
projects = list(filter(lambda entry: '.' not in entry, os.listdir(polo_pub)))

In [51]:
# Display the entries that passed the name filter above.
projects

['winereviews',
 'airbnb',
 'out',
 'newzy',
 'okcupid',
 'wosml',
 'aclu',
 'aitf',
 'zuboff',
 'aizwei',
 'anphoblacht',
 'novels']

In [53]:
# Sweep every project in name order and run the labelling for each of its trials.
for proj in sorted(projects):
    cfg_file = "/".join((polo_pub, proj, "config.ini"))
    cfg = PoloConfig(cfg_file, slug=proj)
    # Override the DEFAULT-section mallet_path on the loaded config object.
    cfg.ini['DEFAULT']['mallet_path'] = mallet_path
    for trial in cfg.ini:
        # Every section other than DEFAULT is treated as a trial.
        if trial != 'DEFAULT':
            print(proj, trial)
            try:
                pm = PoloMallet(cfg, trial)
                pm.add_topic_significance()
            except (FileNotFoundError, ValueError) as e:
                # Report the problem and carry on with the next trial.
                print(e)

winereviews trial1
Import file: trials/mallet-corpus.csv
winereviews 20topics
Import file: trials/mallet-corpus.csv
winereviews 30topics
Import file: trials/mallet-corpus.csv
winereviews 80topics
Import file: trials/mallet-corpus.csv
airbnb trial1
Import file: trials/mallet-corpus.csv
out trial1
[Errno 2] No such file or directory: '/Users/rca2t1/Dropbox/CODE/polo2/polo2/testing/out/corpus'
newzy trial1
Import file: trials/mallet-corpus.csv
newzy trial60
Import file: trials/mallet-corpus.csv
okcupid trial1
Import file: trials/mallet-corpus.csv
wosml trial1
Import file: trials/mallet-corpus.csv
aclu trial1
Import file: trials/mallet-corpus.csv
aclu trial40
Import file: trials/mallet-corpus.csv
aitf trial1
Import file: trials/mallet-corpus.csv
aitf trial60
Import file: trials/mallet-corpus.csv
zuboff trial1
Import file: trials/mallet-corpus.csv
zuboff 40topics
Import file: trials/mallet-corpus.csv
aizwei trial1
Import file: trials/mallet-corpus.csv
aizwei trial_20
Import file: trials/mal

ValueError: Columns must be same length as key