In [4]:
# ============================== USER SETTINGS: EDIT BELOW ==============================
# Settings block borrowed from Sneha.

# ---- File paths and fields ----
# JSON-lines file holding the papers (3154 papers).
path_json = "../comp767_papers_sample.jsonl"
# Record fields to include in training.
fields = ["title", "abstract", "authors"]

# ---- Training parameters ----
# Number of topics (truthfully we want to see 13 topics).
num_topics = 10
# How many documents are processed at a time (2000 as the default here).
chunksize = 2000
# How often the model is trained on all the documents (20 as the default here).
passes = 20
# How often we iterate over each document (400 as the default here).
iterations = 400
# Don't evaluate model perplexity, it takes too much time.
eval_every = None
# ============================== END OF USER SETTINGS ==============================


import json #
import nltk # for preprocessing
nltk.download('wordnet')

from nltk.tokenize import RegexpTokenizer # for tokenization
from nltk.stem.wordnet import WordNetLemmatizer # for lemmatizing
from gensim.corpora import Dictionary # to construct dictionary
from gensim.models import LdaModel # to make LDA model
from pprint import pprint # print output in a readable way
from nltk.util import ngrams

import pyLDAvis.gensim

import numpy as np


# Load the corpus: one JSON object per line (JSON Lines).
# The file is streamed line by line instead of being materialised with
# readlines(), it is decoded as UTF-8 regardless of the platform default, and
# blank lines are ignored because json.loads would reject them.
with open(path_json, encoding="utf-8") as fp:
    papers = [json.loads(line) for line in fp if line.strip()]

# Seed NumPy's global random state so repeated runs start from the same state.
np.random.seed(767)

[nltk_data] Downloading package wordnet to /Users/baddie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def ident(z,*args):
    '''Pass-through preprocessor.

    A value whose type is exactly ``list`` is joined into one
    space-separated string; anything else is returned untouched.
    Extra positional arguments are accepted and ignored.
    '''
    if type(z) is list:
        return ' '.join(z)
    return z
    
def author_iden(z,*args):
    '''Identity preprocessor: return ``z`` exactly as received, ignoring any extra arguments.'''
    unchanged = z
    return unchanged

def add_ngrams(inpt_sentence, n=[1]):
    '''Append underscore-joined n-grams to a token list and return one string.

    Parameters
    ----------
    inpt_sentence : list of str or None
        Tokens of one document. The list itself is never modified.
    n : iterable of int, optional
        N-gram orders to add. ``None`` falls back to the default ``[1]``.
        Every order is built from the original tokens only, so asking for
        several orders never produces n-grams of previously added n-grams.

    Returns
    -------
    str
        The original tokens followed by the n-grams of each requested order,
        separated by single spaces; ``''`` when ``inpt_sentence`` is None.
    '''
    if inpt_sentence is None:
        return ''
    if n is None:
        n = [1]

    # Copy the input: extending the caller's list in place would leak the
    # added n-grams back into the caller's data.
    tokens = list(inpt_sentence)
    out = list(tokens)

    for size in n:
        # Sliding window of `size` consecutive tokens, i.e. the n-grams that
        # nltk.util.ngrams yields when no padding is requested.
        windows = zip(*(tokens[start:] for start in range(size)))
        out.extend('_'.join(gram) for gram in windows)

    return ' '.join(out)

def author_ngram(input_list, *args):
    '''Turn every author name into one lowercase token, with spaces replaced by underscores.

    Extra positional arguments are accepted and ignored.
    '''
    author_tokens = []
    for name in input_list:
        underscored = name.replace(' ', '_')
        author_tokens.append(underscored.lower())
    return author_tokens

def destroy_param(z,*args):
    '''Discard the input entirely: every call returns a new empty list.'''
    nothing = list()
    return nothing

def preprocess_data(all_docs, min_word_len=2,
                    title_pp=ident,arg_title=None,
                    abstract_pp=ident, arg_abstract=None,
                    author_pp=author_iden, arg_author=None):
    '''Turn raw paper records into one token list per paper.

    For each record the title and abstract are lemmatized word by word, run
    through their preprocessing hooks, lower-cased and tokenized into runs of
    word characters. The tokens returned by the author hook are appended
    afterwards. Purely numeric tokens and tokens with ``min_word_len``
    characters or fewer are dropped, and leading/trailing underscores are
    stripped from what remains.

    Parameters
    ----------
    all_docs : iterable of dict
        Records with 'title', 'abstract' and 'authors' entries. A field that
        is absent or None is treated as empty, so a paper without an abstract
        no longer contributes the literal token "none".
    min_word_len : int
        Tokens must be strictly longer than this to be kept.
    title_pp, abstract_pp, author_pp : callable
        Hooks called as ``hook(value, arg)``. The title hook receives the
        lemmatized title string, the abstract hook the list of lemmatized
        abstract words, the author hook the raw author list. The title and
        abstract hooks must return strings, the author hook a list.
    arg_title, arg_abstract, arg_author
        Second argument passed to the corresponding hook.

    Returns
    -------
    list of list of str
        One token list per input record, in input order.
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    ret_ar = []
    for doc in all_docs:
        raw_title = doc.get('title')
        raw_abstract = doc.get('abstract')
        raw_authors = doc.get('authors')

        # Missing fields become empty values instead of crashing or being
        # stringified into a spurious token.
        title_text = '' if raw_title is None else raw_title
        abstract_text = '' if raw_abstract is None else str(raw_abstract)
        authors = [] if raw_authors is None else raw_authors

        # title: lemmatize each space-separated word, keep it as one string
        title = ' '.join(lemmatizer.lemmatize(word) for word in title_text.split(' '))

        # abstract: lemmatize, then drop short words before the hook runs so
        # that anything the hook builds only uses the surviving words
        abstract = [lemmatizer.lemmatize(word) for word in abstract_text.split(' ')]
        abstract = [word for word in abstract if len(word) > min_word_len]

        # concatenate title and abstract text
        text = title_pp(title, arg_title) + ' \n ' + abstract_pp(abstract, arg_abstract).lower()

        # get rid of punctuation & tokenize; underscores count as word
        # characters, so underscore-joined n-grams stay single tokens
        representation = tokenizer.tokenize(text.lower()) + author_pp(authors, arg_author)

        # drop pure numbers (numbers inside words are kept) and keep only
        # tokens strictly longer than min_word_len characters
        representation = [token for token in representation
                          if not token.isnumeric() and len(token) > min_word_len]

        # Tokens are deliberately not lemmatized again at this point, which
        # would alter the n-gram tokens.
        representation = [token.strip('_') for token in representation]
        ret_ar.append(representation)

    return ret_ar


In [6]:
def _fit_lda_node(documents, num_topics, pp_args):
    '''Preprocess ``documents`` and train one LdaModel on them; return (model, corpus, dictionary).'''
    full_data = preprocess_data(documents, **pp_args)

    # constructs word to ID mapping
    dictionary = Dictionary(full_data)

    # filters out words that appear in fewer than 20 docs or in more than 50% of docs
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # transform to vectorized form to put in model
    corpus = [dictionary.doc2bow(doc) for doc in full_data]

    # Finds how many unique tokens we've found and how many docs we have
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    if len(dictionary) == 0:
        # Fail with a readable message instead of the KeyError that the
        # dictionary lookup below would raise.
        raise ValueError(
            'No tokens left after filter_extremes for a subset of %d documents; '
            'cannot train an LDA model on it.' % len(corpus))

    # index to word dictionary: the lookup is done only for its side effect,
    # it makes the dictionary build the id2token mapping read on the next line
    dictionary[0]
    id2word = dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )
    return model, corpus, dictionary


def hLDA(documents, layers, pp_args, ret=None, depth=0 ):
    '''Train a tree of LDA models by recursively splitting the documents.

    One LdaModel with ``layers[0]`` topics is trained on ``documents``. If
    more layers remain, every document is assigned to its highest-weighted
    topic and the function recurses on each topic's documents with
    ``layers[1:]``. Training hyper-parameters are read from the module-level
    ``chunksize``, ``iterations``, ``passes`` and ``eval_every``.

    Parameters
    ----------
    documents : list
        Raw records accepted by ``preprocess_data``.
    layers : list of int
        Number of topics at each depth, root first. Must not be empty.
    pp_args : dict
        Keyword arguments forwarded to ``preprocess_data``.
    ret : dict or None
        Accumulator mapping depth -> list of ``(model, corpus, dictionary)``.
        Created when None, which now also works for a single-layer call.
    depth : int
        Depth of the model trained by this call.

    Returns
    -------
    dict
        The accumulator, with this call's model and those of all sub-trees
        appended in depth-first order.

    Raises
    ------
    ValueError
        If ``layers`` is empty, or if a document subset has no vocabulary
        left after dictionary filtering.
    '''
    print(layers)

    if len(layers) == 0:
        raise ValueError('layers must contain at least one topic count')

    if ret is None:
        # One bucket per remaining depth, starting at the current one.
        ret = {depth + offset: [] for offset in range(len(layers))}

    model, corpus, dictionary = _fit_lda_node(documents, layers[0], pp_args)
    ret.setdefault(depth, []).append((model, corpus, dictionary))

    # leaves of tree
    if len(layers) == 1:
        return ret

    # list of list of documents in each topic
    topic_sep = [[] for _ in range(layers[0])]
    for i in range(len(corpus)):
        # first element of inference() holds the per-topic weights of the
        # document; the largest one decides its topic
        doc_topic = np.argmax(model.inference(corpus[i:i+1])[0])
        topic_sep[doc_topic].append(documents[i])

    # for each subtopic, we redo hLDA with the remaining layers
    for top_docs in topic_sep:
        ret = hLDA(top_docs, layers[1:], pp_args, ret=ret, depth=depth+1)

    return ret
        
        
    
    

In [7]:
# Preprocessing hooks shared by every run below: author names go through
# author_ngram and abstracts through add_ngrams with orders [3].
preprocess_args = dict(author_pp=author_ngram,
                       abstract_pp=add_ngrams,
                       arg_abstract=[3])


def _run_experiment(banner, layer_spec):
    '''Print the run label, then train a hierarchical LDA tree on ``papers`` for ``layer_spec``.'''
    print(banner)
    return hLDA(papers, layer_spec, pp_args=preprocess_args, ret=None, depth=0)


test_1 = _run_experiment('Test 1', [2, 13])

test_2 = _run_experiment('Test 2', [13, 2])

test_3 = _run_experiment('Test 3', [5, 5])

test_4 = _run_experiment('Test 4', [2, 10])

test_5 = _run_experiment('Test 5', [10, 2])


Test 1
[2, 13]
Number of unique tokens: 2700
Number of documents: 3154
[13]
Number of unique tokens: 1541
Number of documents: 1533
[13]
Number of unique tokens: 1494
Number of documents: 1621
Test 2
[13, 2]
Number of unique tokens: 2700
Number of documents: 3154
[2]
Number of unique tokens: 50
Number of documents: 96
[2]
Number of unique tokens: 130
Number of documents: 155
[2]
Number of unique tokens: 76
Number of documents: 127
[2]
Number of unique tokens: 358
Number of documents: 343
[2]
Number of unique tokens: 361
Number of documents: 381
[2]
Number of unique tokens: 294
Number of documents: 330
[2]
Number of unique tokens: 65
Number of documents: 128
[2]
Number of unique tokens: 57
Number of documents: 151
[2]
Number of unique tokens: 534
Number of documents: 547
[2]
Number of unique tokens: 3
Number of documents: 49
[2]
Number of unique tokens: 55
Number of documents: 152
[2]
Number of unique tokens: 257
Number of documents: 303
[2]
Number of unique tokens: 344
Number of docume

In [8]:
def eval_Hmodel(hMod, label):
    '''Score every model of a hierarchical LDA tree and export it to disk.

    For each model the average topic coherence is computed from
    ``top_topics``, a pyLDAvis page is written under
    ``./hierarchical_visualization/`` and the model is saved under
    ``./hierarchical_models/<label>/``. Both directories are created when
    they do not exist yet.

    Parameters
    ----------
    hMod : dict
        Tree as returned by ``hLDA``: depth -> list of
        ``(model, corpus, dictionary)`` tuples.
    label : str
        Name of the run, used in the output file names and model directory.

    Returns
    -------
    dict
        ``'ave_level_coherences'``: mean coherence per depth, in the
        iteration order of ``hMod`` (NaN for a depth without models).
        ``'ave_tree_coherence'``: mean of the per-depth means over the depths
        that have models (NaN when there is none).
    '''
    import os  # function-scope import: only needed for creating the output folders

    nan = float('nan')
    level_coherence = []   # one entry per depth, NaN for empty depths
    populated_levels = []  # per-depth means of the depths that have models

    for key in hMod.keys():
        all_coherences = []

        # go through each fork (model, corpus, dictionary) at this depth;
        # sub_index is the position of the fork within the depth
        for sub_index, (mod, corps, dic) in enumerate(hMod[key]):

            # find average coherence within the fork
            avg_topic_coherence = sum([t[1] for t in mod.top_topics(corps)]) / mod.num_topics
            all_coherences.append(avg_topic_coherence)

            # viz current model
            visualisation = pyLDAvis.gensim.prepare(mod, corps, dic)

            param_changes = 'depth_'+str(key)+'_topic_'+str(sub_index)+"_atc_"+str(round(avg_topic_coherence,4))

            full_output_path = "./hierarchical_visualization/hLDA_Visualization_" + label+ '_' + param_changes + ".html"
            os.makedirs(os.path.dirname(full_output_path), exist_ok=True)
            pyLDAvis.save_html(visualisation, full_output_path)

            model_path = "./hierarchical_models/"+label+"/LDA_" + param_changes + ".model"
            os.makedirs(os.path.dirname(model_path), exist_ok=True)
            mod.save(model_path)

        if all_coherences:
            ave_atc = sum(all_coherences)/len(all_coherences)
            populated_levels.append(ave_atc)
        else:
            # a depth without models has no defined coherence
            ave_atc = nan
        level_coherence.append(ave_atc)

    if populated_levels:
        tree_aatc = sum(populated_levels)/len(populated_levels)
    else:
        tree_aatc = nan

    return {'ave_level_coherences': level_coherence, 'ave_tree_coherence': tree_aatc}

In [9]:
# Score and export the [2, 13] tree (Test 1); output files are labelled 'test1'.
eval_Hmodel(test_1, 'test1')

{'ave_level_coherences': [-1.3321691344417008, -1.7692647396472396],
 'ave_tree_coherence': -1.5507169370444702}

In [10]:
# Score and export the [13, 2] tree (Test 2); output files are labelled 'test2'.
eval_Hmodel(test_2, 'test2')

{'ave_level_coherences': [-2.5281125972957343, -1.1508300254505417],
 'ave_tree_coherence': -1.839471311373138}

In [11]:
# Score and export the [5, 5] tree (Test 3); output files are labelled 'test3'.
eval_Hmodel(test_3, 'test3')

{'ave_level_coherences': [-1.5233495124080236, -1.4375305113037367],
 'ave_tree_coherence': -1.48044001185588}

In [12]:
# Score and export the [2, 10] tree (Test 4); output files are labelled 'test4'.
eval_Hmodel(test_4, 'test4')

{'ave_level_coherences': [-1.4558969042236125, -1.8511646120094434],
 'ave_tree_coherence': -1.653530758116528}

In [13]:
# Score and export the [10, 2] tree (Test 5); output files are labelled 'test5'.
eval_Hmodel(test_5, 'test5')

{'ave_level_coherences': [-1.8809799200481678, -1.1763031533491783],
 'ave_tree_coherence': -1.528641536698673}