In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
#import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm_notebook
from functools import partial
#import mwparserfromhell

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

import guidedlda

In [2]:
pd.__version__

'0.25.3'

In [3]:
PATH_TO_DATA = "/scratch/nh1724/"

In [4]:
ls $PATH_TO_DATA

base_model_eta.pkl              [0m[38;5;27mInference-topic-model[0m/
base_model_guidedlda_all.pkl    results_lda_top.pth
base_model_guidedlda.pkl        results.pth
base_model_guidedlda_top.pkl    run-jupyter.sbatch
base_model.pkl                  slurm-6162026.out
en_outlinks_tokens_df.pkl       [38;5;27msquad[0m/
en_tokens_lem_stem_df.pkl       [48;5;10;38;5;21msquad-QA-char[0m/
features.pkl                    [38;5;27mtest[0m/
features_top.pkl                text_embed_en.pkl
graph_df.pkl                    [38;5;34mwikitext_tokenized_text_sections_outlinks_en.p[0m*
inference_project_teresa.ipynb


## Create dataframe

In [5]:
# Load the pickled article DataFrame. A context manager closes the file handle
# as soon as the load finishes instead of leaving it to the garbage collector.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(PATH_TO_DATA + "wikitext_tokenized_text_sections_outlinks_en.p", "rb") as wiki_file:
    wiki_df = pkl.load(wiki_file)

## LDA

In [6]:
# Text-normalisation dependencies and shared state for the LDA pipeline.
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; it is not obvious which names from it are used below.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator.
np.random.seed(57)
import nltk
# Fetch the WordNet corpus (the recorded output shows it was already up to date).
nltk.download('wordnet')
# Module-level normalisers used by lemmatize_stemming.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/nh1724/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def lemmatize_stemming(list_of_tokens):
    """Lemmatize every token as a verb, then stem it.

    Returns a new list with one normalised string per input token, in the
    same order as the input. Relies on the module-level `lemmatizer` and
    `stemmer` objects.
    """
    return [
        stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
        for token in list_of_tokens
    ]

In [8]:
# test
lemmatize_stemming("I did doing procrastination wolves cats".split())

['i', 'do', 'do', 'procrastin', 'wolv', 'cat']

In [9]:
# Replace every article's token list with its lemmatized + stemmed form.
# The column is overwritten with its own transform, so re-running this cell
# normalises already-normalised tokens again.
wiki_df["tokens"] = wiki_df["tokens"].apply(lemmatize_stemming)

In [10]:
# Expose the normalised tokens under an explicit column name as well; the
# `tokens` column itself was already overwritten by the previous cell.
wiki_df["tokens_lem_stem"] = wiki_df["tokens"]
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,sections_tokens,raw_outlinks,outlinks,tokens_lem_stem
0,Q6199,"[History_And_Society.History and society, Hist...","[anarch, anti, authoritarian, anti, authoritar...","[etymology, terminology, definition, history, ...","[[[Anti-authoritarianism|anti-authoritarian]],...","[Anti-authoritarianism, political philosophy, ...","[anarch, anti, authoritarian, anti, authoritar..."
1,Q38404,"[STEM.Medicine, STEM.Biology, History_And_Soci...","[autism, development, disord, character, diffi...","[characteristics, social, development, communi...","[[[Psychiatry]], [[Interpersonal relationship|...","[Psychiatry, Interpersonal relationship, commu...","[autism, development, disord, character, diffi..."
2,Q101038,"[STEM.Physics, STEM.Space, History_And_Society...","[sunlight, relat, various, surfac, condit, alb...","[terrestrial, albedo, white, sky, black, sky, ...",[[[File:Albedo-e hg.svg|thumb|upright=1.3|The ...,"[File:Albedo-e hg.svg, diffuse reflection, sun...","[sunlight, relat, various, surfac, condit, alb..."
3,Q173,[Geography.Americas],"[alabama, alabama, nicknam, northern, flicker,...","[etymology, history, pre, european, settlement...","[[[Coat of arms of Alabama|Coat of arms]], [[N...","[Coat of arms of Alabama, Northern flicker, Di...","[alabama, alabama, nicknam, northern, flicker,..."
4,Q41746,"[Culture.People, Geography.Europe, History_And...","[date, three, zero, zero, bc, achill, kill, et...","[etymology, birth, early, years, names, hidden...",[[[File:Achilles fighting against Memnon Leide...,[File:Achilles fighting against Memnon Leiden ...,"[date, three, zero, zero, bc, achill, kill, et..."


In [11]:
# Binarise each article's list of mid-level categories into a multi-label
# indicator matrix `y`; `mlb.classes_` holds the category name of each column.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(wiki_df.mid_level_categories)
print(f"Number of categories: {len(mlb.classes_)}, \n\n{mlb.classes_}")

Number of categories: 45, 

['Culture.Architecture' 'Culture.Arts' 'Culture.Broadcasting'
 'Culture.Crafts and hobbies' 'Culture.Entertainment'
 'Culture.Food and drink' 'Culture.Games and toys'
 'Culture.Internet culture' 'Culture.Language and literature'
 'Culture.Media' 'Culture.Music' 'Culture.People'
 'Culture.Performing arts' 'Culture.Philosophy and religion'
 'Culture.Sports' 'Culture.Visual arts' 'Geography.Africa'
 'Geography.Americas' 'Geography.Antarctica' 'Geography.Asia'
 'Geography.Bodies of water' 'Geography.Europe' 'Geography.Landforms'
 'Geography.Maps' 'Geography.Oceania' 'Geography.Parks'
 'History_And_Society.Business and economics'
 'History_And_Society.Education' 'History_And_Society.History and society'
 'History_And_Society.Military and warfare'
 'History_And_Society.Politics and government'
 'History_And_Society.Transportation' 'STEM.Biology' 'STEM.Chemistry'
 'STEM.Engineering' 'STEM.Geosciences' 'STEM.Information science'
 'STEM.Mathematics' 'STEM.Medicine' '

In [12]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(wiki_df, y, test_size=0.1, random_state=42) #all train, output features
#X_train.shape, X_test.shape

In [13]:
# Feature extraction runs on the whole corpus, so the "training" inputs are
# simply the full frame and the full label matrix (no held-out split).
X_train, y_train = wiki_df, y

In [14]:
#compute prior eta based on a subset of whole corpus
def init_eta(y, bow_corpus, dictionary, num_examples=1000):
    '''
    Estimate a normalised topic-by-word prior from labelled documents.

    For each of the first `num_examples` documents, the document's term
    counts are added to the row of every topic the document is labelled with.

    y: MultiLabelBinarizer array, (num_doc, num_topic)
    bow_corpus: bag of word corpus; each document is a sequence of
        (token_id, count) pairs
    dictionary: word vocab; its length sizes the word axis of the result
    num_examples: restrict on a smaller subset as initialization; clamped to
        the number of available documents

    Returns an array of shape (num_topic, len(dictionary)) whose entries sum
    to 1, or an all-zero array when no counts were accumulated.
    '''
    y = y.astype(int) #ensure integer type
    # Size the word axis by the vocabulary rather than by the number of
    # documents, so the result is correct even when the two differ.
    eta = np.zeros((y.shape[1], len(dictionary)))
    # Never index past the end of the corpus or the label matrix.
    num_examples = min(num_examples, len(bow_corpus), y.shape[0])
    for i in range(num_examples):
        doc_counts = dict(bow_corpus[i])
        if not doc_counts:
            continue  # empty document contributes nothing
        key_idx = np.array(list(doc_counts.keys())).astype(int) #ensure integer type
        counts = np.array(list(doc_counts.values()))
        # Add the document's counts to the rows of all topics it is labelled with.
        eta[np.ix_(y[i] == 1, key_idx)] += counts
    total = np.sum(eta)
    if total == 0:
        return eta  # avoid 0/0 -> NaN when nothing was accumulated
    return eta / total #normalization

In [15]:
# Build the vocabulary from the normalised token lists, then prune it in place.
dictionary = gensim.corpora.Dictionary(X_train.tokens)
# Per the gensim docs: no_below drops tokens seen in fewer than 15 documents,
# no_above drops tokens seen in more than 50% of documents, and keep_n caps
# the vocabulary at that many most frequent tokens. keep_n is tied to the
# number of documents here only so that the vocabulary size matches it.
dictionary.filter_extremes(no_below=15, no_above=0.5,keep_n=y_train.shape[0]) #keep same shape as num of examples to initialize the prior (can improve)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in X_train.tokens]

In [17]:
len(bow_corpus), len(dictionary)

(33823, 33823)

In [18]:
y_train.shape

(33823, 45)

In [92]:
eta0 = init_eta(y_train, bow_corpus,dictionary, num_examples=3000) #use small subset to guess seeded words

In [93]:
eta0.shape

(45, 33823)

## guidedLDA

In [94]:
#create seeded topic for GuidedLDA (list of list)
# Row i of eta0 scores every token id for topic i. init_eta indexes eta's
# columns with the bag-of-words token ids, so column j is token id j and the
# ids can be looked up in the dictionary directly.
n_top_words = 10
topic_words = []
for topic_scores in eta0:  # one row per topic, however many there are
    # argsort is ascending; the reversed slice takes the n_top_words best ids.
    top_ids = np.argsort(topic_scores)[:-(n_top_words+1):-1]
    topic_words.append([dictionary[token_id] for token_id in top_ids])

In [95]:
# Tabulate the seed words: one column per category, one row per rank.
df_topic = pd.DataFrame(topic_words, index=mlb.classes_).T
df_topic.head(10)

Unnamed: 0,Culture.Architecture,Culture.Arts,Culture.Broadcasting,Culture.Crafts and hobbies,Culture.Entertainment,Culture.Food and drink,Culture.Games and toys,Culture.Internet culture,Culture.Language and literature,Culture.Media,...,STEM.Geosciences,STEM.Information science,STEM.Mathematics,STEM.Medicine,STEM.Meteorology,STEM.Physics,STEM.Science,STEM.Space,STEM.Technology,STEM.Time
0,build,art,big,card,film,food,game,game,languag,music,...,island,librari,mean,cell,ice,energi,system,star,system,ndash
1,design,museum,brother,bank,anim,product,player,internet,write,media,...,earth,leibniz,algorithm,human,forc,nbsp,nbsp,earth,comput,day
2,door,collect,seri,debit,seri,milk,chess,search,word,game,...,miner,univers,distribut,effect,atmospher,electron,centuri,space,engin,american
3,centuri,bc,season,knot,dub,plant,franklin,user,book,film,...,water,copyright,frac,diseas,temperatur,physic,theori,orbit,oper,calendar
4,art,centuri,show,reserv,charact,produc,play,network,english,blue,...,nbsp,carnegi,data,studi,earth,forc,comput,system,design,month
5,jpg,jpg,mtv,feder,batman,process,hand,googl,publish,jazz,...,rock,columbia,normal,medic,effect,particl,color,light,data,english
6,architectur,file,music,currenc,comic,water,nintendo,irc,letter,advertis,...,carbon,book,statist,caus,corioli,system,scienc,nbsp,program,politician
7,wall,british,doctor,nickel,show,fruit,kasparov,video,centuri,news,...,ice,public,comput,system,water,atom,univers,moon,nbsp,player
8,museum,room,film,attle,releas,beer,video,meme,univers,communic,...,metal,dictionari,right,blood,rotat,mass,languag,observ,languag,footbal
9,file,ad,televis,system,award,chocol,tile,troll,film,mickey,...,iron,digit,peirc,organ,climat,univers,human,planet,air,author


In [85]:
# Map each seed word's token id to the index of the topic it seeds.
# A word listed under several topics ends up with the last of them, because
# later entries overwrite earlier ones for the same key.
seed_topics = {
    dictionary.token2id[word]: t_id
    for t_id, st in enumerate(topic_words)
    for word in st
}

In [27]:
#GuidedLDA with seed words init
#model = guidedlda.GuidedLDA(n_topics=45, n_iter=100, random_state=7, refresh=20)

In [45]:
# Collapse the 45 category indices into 4 broad groups by contiguous index
# range: [0, 16) -> 0, [16, 26) -> 1, [26, 32) -> 2, [32, 45) -> 3.
# These ranges line up with the Culture / Geography / History_And_Society /
# STEM prefixes in the ordering printed for mlb.classes_.
_group_bounds = [(0, 16), (16, 26), (26, 32), (32, 45)]
topic_map = {
    idx: group_id
    for group_id, (start, stop) in enumerate(_group_bounds)
    for idx in range(start, stop)
}

In [47]:
len(topic_map)

45

In [50]:
#GuidedLDA with only 4 broad topics
# Same seed words as the per-category seeds, but each word is assigned to the
# broad group of its category via topic_map. Repeated words keep the group of
# their last occurrence.
seed_topics_top = {
    dictionary.token2id[word]: topic_map[idx]
    for idx, st in enumerate(topic_words)
    for word in st
}

In [88]:
#create sparse input matrix from corpus
# Per the gensim docs, corpus2csc returns a terms x documents matrix, hence
# the transpose to documents x terms before it is passed to the model.
X = gensim.matutils.corpus2csc(bow_corpus,dtype=int).T

In [89]:
X.shape

(33823, 33823)

In [90]:
model = guidedlda.GuidedLDA(n_topics=45, n_iter=100, random_state=7, refresh=20)

In [96]:
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)
#model.fit(X, seed_topics=seed_topics_top, seed_confidence=0.15)

In [56]:
#save model
# The context manager flushes and closes the file even if pickling fails.
with open(PATH_TO_DATA + "base_model_guidedlda_all.pkl", "wb") as model_file:
    pkl.dump(model, model_file)

In [97]:
#load model: all the words, to create visualization
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(PATH_TO_DATA + "base_model_guidedlda_all.pkl", "rb") as model_file:
    model = pkl.load(model_file)

In [25]:
model.topic_word_.shape

(45, 33823)

In [26]:
#word topic assignment
# Print the n_top_words highest-probability words of every fitted topic.
n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    # Deliberately not named `topic_words`: that name holds the per-topic
    # seed-word lists read by the seed_topics cells, and rebinding it here
    # would break those cells when they are re-run.
    top_words_i = [dictionary[t] for t in np.argsort(topic_dist)[:-(n_top_words+1):-1]]
    print(str(i), top_words_i)

0 ['england', 'london', 'british', 'king', 'kingdom', 'royal', 'john', 'william', 'hous', 'english']
1 ['templ', 'centuri', 'build', 'jpg', 'king', 'architectur', 'file', 'stone', 'palac', 'ancient']
2 ['women', 'social', 'children', 'sexual', 'person', 'sex', 'group', 'child', 'age', 'femal']
3 ['bank', 'tax', 'currenc', 'financi', 'money', 'coin', 'account', 'rate', 'exchang', 'issu']
4 ['film', 'award', 'seri', 'charact', 'best', 'releas', 'star', 'role', 'actor', 'comic']
5 ['food', 'plant', 'product', 'fruit', 'produc', 'oil', 'grow', 'crop', 'seed', 'milk']
6 ['hand', 'game', 'wear', 'ball', 'often', 'chess', 'move', 'fire', 'usual', 'player']
7 ['user', 'window', 'network', 'servic', 'softwar', 'web', 'internet', 'version', 'releas', 'googl']
8 ['languag', 'word', 'english', 'write', 'speak', 'dialect', 'linguist', 'script', 'vowel', 'letter']
9 ['day', 'televis', 'media', 'report', 'news', 'broadcast', 'show', 'tv', 'channel', 'radio']
10 ['album', 'song', 'music', 'record', 'b

In [59]:
#document topics assignment
doc_topic = model.transform(X)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [60]:
# Persist the document-topic features; the context manager flushes and closes
# the file even if pickling fails.
with open(PATH_TO_DATA + "features.pkl", "wb") as features_file:
    pkl.dump(doc_topic, features_file)

In [34]:
from sklearn.metrics import precision_recall_fscore_support

def get_metrics_dict(y_true, y_pred):
    """
    Score predictions with macro- and micro-averaged precision, recall and F1.

    Returns a dict with the keys precision_macro, recall_macro, f1_macro,
    precision_micro, recall_micro and f1_micro (in that order), each value
    rounded to three decimal places.
    """
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average="macro")
    micro_scores = precision_recall_fscore_support(y_true, y_pred, average="micro")
    metric_names = (
        "precision_macro",
        "recall_macro",
        "f1_macro",
        "precision_micro",
        "recall_micro",
        "f1_micro",
    )
    # The fourth element of each result (support) is not reported.
    metric_values = tuple(macro_scores[:3]) + tuple(micro_scores[:3])
    n_digits = 3
    return {
        metric_name: round(value, n_digits)
        for metric_name, value in zip(metric_names, metric_values)
    }

In [35]:
#threshold: each doc topic sum to 1, threshold topic w/ prob > 1/num_topics
from copy import deepcopy  # no longer needed by this cell; kept in case later cells rely on the name
# Derive the uniform-probability threshold from the matrix itself so it stays
# correct when the model is fitted with a different number of topics.
n_topics = doc_topic.shape[1]
# The comparison already allocates a new array, so doc_topic is left untouched
# without an explicit copy.
y_pred = (doc_topic > 1 / n_topics).astype(int)

In [None]:
get_metrics_dict(y_train, y_pred)

### Create visualization

In [38]:
# Reload the saved document-topic features; the context manager closes the
# file handle once loading finishes.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(PATH_TO_DATA + "features.pkl", "rb") as features_file:
    doc_topic = pkl.load(features_file)

In [61]:
#due to the huge dataset and a small vocab, all-zero rows exist in the document-term matrix. This keeps only the valid documents.
# NOTE(review): `row_sum` is not defined in any cell shown in this notebook, so
# this cell raises NameError on a fresh kernel. Presumably it holds per-document
# row sums (e.g. of doc_topic) — confirm and define it before this line.
# NOTE(review): `== 1` is an exact comparison; if row_sum is floating point,
# rows that sum to 1 only up to rounding error are dropped as well.
valid_idx = np.where(row_sum==1)[0]

In [65]:
topic_word.shape, doc_topic[valid_idx].shape

((45, 33823), (2749, 45))

In [28]:
# Vocabulary terms ordered by token id, so position j lines up with column j
# of the topic-term matrix. Assumes token ids are contiguous 0..len-1, which
# gensim documents as the result of filter_extremes shrinking id gaps.
vocab = [dictionary[token_id] for token_id in range(len(dictionary))] #list of terms in the dictionary
# Accumulate corpus-wide term frequencies directly by token id. This avoids
# materialising a dense documents x vocabulary DataFrame and guarantees that
# the frequencies are in the same order as `vocab`.
term_counts = np.zeros(len(dictionary))
for doc in bow_corpus:
    for token_id, count in doc:
        term_counts[token_id] += count
vocab_tf = term_counts.tolist() #list of term frequencies

In [67]:
#calculate document lengths based on bow corpus
# A document's length is its total number of in-vocabulary tokens, i.e. the
# sum of its term counts; len(article) would only count distinct terms.
doc_lengths = np.array([sum(count for _, count in article) for article in bow_corpus])

In [68]:
#save results for visz:https://github.com/vi3k6i5/GuidedLDA/issues/23
# Collect the keyword arguments that are later unpacked into pyLDAvis.prepare;
# document-level entries are restricted to the rows selected by valid_idx.
visz = {}
visz['topic_term_dists'] = topic_word
visz['doc_topic_dists'] = doc_topic[valid_idx]
visz['doc_lengths'] = doc_lengths[valid_idx]
visz['vocab'] = vocab
visz['term_frequency'] = vocab_tf

In [69]:
# Persist the visualisation inputs; the context manager flushes and closes the
# file even if pickling fails.
with open(PATH_TO_DATA + "visz_all.pkl", "wb") as visz_file:
    pkl.dump(visz, visz_file)

In [None]:
#import visz data
import pyLDAvis
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(PATH_TO_DATA + "visz_all.pkl", "rb") as visz_file:
    data = pkl.load(visz_file)
# prepare the data
tef_vis_data = pyLDAvis.prepare(**data)

# save to HTML
pyLDAvis.save_html(tef_vis_data, "LDAvis_all.html")

# Render inline. This has to be the cell's final expression: a notebook only
# shows the value of the last expression, so calling display() mid-cell
# discards its result and nothing is rendered.
pyLDAvis.display(tef_vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
