In [134]:
import os
import pickle
import pandas as pd
import numpy as np
import nbimporter
import preprocessing # import Jupyter notebook
#from preprocessing import clean_pdf

import gensim.corpora as corpora
import collections


In [285]:
import tomotopy as tp
print(tp.isa) # prints 'avx2', 'avx', 'sse2' or 'none'

avx2


In [159]:
# Load full dataset
with open('./Dataset/merged/textdata_all.pkl', 'rb') as f:
    text_df = pickle.load(f)
text_df.reset_index(drop=True, inplace=True)

In [160]:
text_df.head(5)

Unnamed: 0,paper_id,abstract,key_words,body_text,whole_text,citations
0,18980380,This technical note studies Markov decision pr...,"[Distributional robustness, Markov decision p...","[{'section': 'II. PRELIMINARIES', 'text': 'Thr...","Throughout the technical note, we use capital ...",{'BIBREF0': {'title': 'Distributionally robust...
1,56031008,Abstract-The present study attempted to find o...,"[listening comprehension, pre-task activities...",[{'section': 'A. Listening Materials and Activ...,Morley (1991) has explained that in developing...,"{'BIBREF0': {'title': 'Listening', 'authors': ..."
2,88484504,"In this paper, we address robust design of sym...","[Downlink MU-MISO, imperfect CSI, symbolleve...",[{'section': 'II. SYSTEM AND UNCERTAINTY MODEL...,We consider an MU-MISO wireless broadcast chan...,{'BIBREF0': {'title': 'Convex optimization-bas...
3,88485902,ABSTRACT A kinematic equation of profiling flo...,"[Profiling float, depth control, low power c...","[{'section': 'I. INTRODUCTION', 'text': 'With ...",With the increase in the cognition of marine a...,{'BIBREF0': {'title': 'AUV buoyancy regulating...
4,204197524,We characterize practical optical signal recei...,"[Optical wireless communications, multi-stage...","[{'section': 'A. PMT Principle Review', 'text'...",The typical structure of a PMT is shown in Fig...,{'BIBREF0': {'title': 'A statistical non-linea...


In [165]:
docs_cleaned = clean_pdf(text_df)

changing index to paper_id
0.1245725154876709
0.1245725154876709
0.1245725154876709
0.1245725154876709
0.1245725154876709
0.1245725154876709
Tokenizing
53.112943172454834
53.112943172454834
53.112943172454834
Lemmatizing
53.112943172454834
Bag of Words Representation
length of dct before filter_extreme:  158730
length of dct after filter_extreme:  25185
159.25587630271912


In [169]:
print(len(docs_cleaned['ids']), len(docs_cleaned['corpus']))

4443 4443


## Cleaning sections

In [60]:
sections_df = process_sections(text_df)
sections_df.head(10)

In [269]:
sections_cleaned = clean_section(sections_df, file_name='section_level_kw', output_dir='./Dataset/cleaned/cs-med/',section_lvl = True)

changing index to paper_id
0.004000663757324219
0.004000663757324219
0.004000663757324219
0.004000663757324219
0.004000663757324219
0.004000663757324219
Tokenizing
5.905099868774414
5.905099868774414
5.905099868774414
Lemmatizing
5.906134366989136
Bag of Words Representation
length of dct before filter_extreme:  31656
length of dct after filter_extreme:  7942
14.717201471328735


In [270]:
corpus = sections_cleaned['corpus']
dct = sections_cleaned['dct']
paper_ids = sections_cleaned['ids']
docs = sections_cleaned['docs']

corpus_doc = docs_cleaned['corpus']
dct_doc = docs_cleaned['dct']
paper_ids_doc = docs_cleaned['ids']

In [255]:
# Calculate weights
weights = np.zeros((len(corpus), len(dct.cfs)))
for d_idx,d in enumerate(corpus):

    #print(d_idx)
    for w_idx, c in enumerate(d):
        
        # section length
        section_length = sum([x[1] for x in d])
        
        # Calculate section-level weight
        weight_section = (c[1] + 1) / (dct.cfs[w_idx] + section_length)
        
        # Calculate document-level weight
        paper_id = paper_ids[d_idx]
        ind = paper_ids_doc.index(paper_id)
        
        # document length
        document_length = sum([x[1] for x in corpus_doc[ind]])
        
        w_idx_doc = dct_doc.token2id[dct[w_idx]]
        
        count_doc = 0
        if w_idx_doc in dict(corpus_doc[ind]).keys():
            count_doc = dict(corpus_doc[ind])[w_idx_doc]
        
        
        weight_document = (count_doc + 1) / (dct_doc.cfs[w_idx_doc] + document_length)
        
        # Combined weight
        weight_combined = weight_section*weight_document
    
        weights[d_idx][w_idx] = weight_combined
weights = (weights-np.min(weights)) / (np.max(weights)-np.min(weights))

In [273]:
# Update gensim corpus with weighted values
for d_idx,d in enumerate(corpus):

    for w_idx, c in enumerate(d):
    
        corpus[d_idx][w_idx] = (c[0], float(c[1])*weights[d_idx][w_idx])

## LDA Model on Document level data

In [296]:
mdl = tp.LDAModel(k=20)
ndocs = len(docs_cleaned['docs'])
for i in range(ndocs):
    mdl.add_doc(docs_cleaned['docs'][i])

In [297]:
for i in range(0, 100, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=10))

mdl.summary()

Iteration: 0	Log-likelihood: -9.47663281550987
Iteration: 10	Log-likelihood: -8.86721221355413
Iteration: 20	Log-likelihood: -8.74557154302517
Iteration: 30	Log-likelihood: -8.687673267927476
Iteration: 40	Log-likelihood: -8.653538452790892
Iteration: 50	Log-likelihood: -8.627985088054904
Iteration: 60	Log-likelihood: -8.610567729187778
Iteration: 70	Log-likelihood: -8.598197550015616
Iteration: 80	Log-likelihood: -8.587164934505635
Iteration: 90	Log-likelihood: -8.577987917970056
Top 10 words of topic #0
[('network', 0.022928457707166672), ('model', 0.019273199141025543), ('training', 0.015577062033116817), ('layer', 0.014868954196572304), ('learning', 0.01240178570151329), ('image', 0.010084941983222961), ('input', 0.0096141891553998), ('feature', 0.008896850980818272), ('neural', 0.007194491568952799), ('method', 0.006826592143625021)]
Top 10 words of topic #1
[('frequency', 0.009286560118198395), ('cell', 0.007737402804195881), ('signal', 0.0074073923751711845), ('fig', 0.007169868

## LDA Model on Section level data

In [288]:
mdl_section = tp.LDAModel(k=20)
ndocs = len(sections_cleaned['docs'])
for i in range(ndocs):
    mdl_section.add_doc(sections_cleaned['docs'][i])

In [289]:
for i in range(0, 100, 10):
    mdl_section.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl_section.ll_per_word))

for k in range(mdl_section.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl_section.get_topic_words(k, top_n=10))

mdl_section.summary()

Iteration: 0	Log-likelihood: -9.079915884555385
Iteration: 10	Log-likelihood: -8.701078695364467
Iteration: 20	Log-likelihood: -8.564733159888059
Iteration: 30	Log-likelihood: -8.485860552421332
Iteration: 40	Log-likelihood: -8.437313880667375
Iteration: 50	Log-likelihood: -8.396943245529172
Iteration: 60	Log-likelihood: -8.368769717149952
Iteration: 70	Log-likelihood: -8.346425243730684
Iteration: 80	Log-likelihood: -8.323723462836833
Iteration: 90	Log-likelihood: -8.30783522650672
Top 10 words of topic #0
[('control', 0.04231368750333786), ('system', 0.03068280778825283), ('method', 0.018112976104021072), ('controller', 0.013630240224301815), ('tracking', 0.01278215553611517), ('state', 0.012388401664793491), ('model', 0.01217638049274683), ('target', 0.011812916025519371), ('wind', 0.011722049675881863), ('speed', 0.01126771792769432)]
Top 10 words of topic #1
[('student', 0.023132802918553352), ('test', 0.012613781727850437), ('learning', 0.011423846706748009), ('teacher', 0.009377

In [295]:
## Modify corpus: adding in weights

array([1216, 1094, 1151, ...,    1,    1,    1], dtype=uint64)

In [290]:
mdl_section.perplexity

4055.5241830036794

## LDA model with gensim

In [274]:
# Step 4: Train the LDA model
from gensim.models import LdaModel, LdaMulticore
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric
import logging

field = 'cs-med-kw-only'
num_topics = 5
model_dir= f'./models/{field}/k_{num_topics}/'
os.makedirs(model_dir,exist_ok=True)

model_file = model_dir + 'model'
# The filename is the file that will be created with the log.
# If the file already exists, the log will continue rather than being overwritten.
log_file = model_dir + 'model_callbacks.log'
logging.basicConfig(filename=log_file,
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.NOTSET)

perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
convergence_logger = ConvergenceMetric(logger='shell')
coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'c_v', texts = docs) # very compute intensive
coherence_umass_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'u_mass')

lda_model = LdaModel(corpus=corpus,
                         id2word=dct,
                         random_state=2020,
                         num_topics=num_topics,
                         passes=100,
                         chunksize=100,
#                          batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=150,
                         gamma_threshold=0.001,
                         per_word_topics=True,
                         callbacks=[perplexity_logger,
                                    convergence_logger,
#                                     coherence_cv_logger,
                                    coherence_umass_logger])

lda_model.save(model_file)

In [275]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.005*"model" + 0.003*"fig" + 0.003*"method" + 0.003*"power" + 0.003*"data" '
  '+ 0.003*"system" + 0.002*"value" + 0.002*"algorithm" + 0.002*"time" + '
  '0.002*"user"'),
 (1,
  '0.000*"epns" + 0.000*"elcc" + 0.000*"insar" + 0.000*"gasser" + 0.000*"gait" '
  '+ 0.000*"mtu" + 0.000*"career" + 0.000*"anns" + 0.000*"fibre" + '
  '0.000*"coregularization"'),
 (2,
  '0.000*"ambiguity" + 0.000*"mdp" + 0.000*"confidence" + '
  '0.000*"representable" + 0.000*"reward" + 0.000*"lifting" + '
  '0.000*"distributionally" + 0.000*"setc" + 0.000*"rich" + 0.000*"belong"'),
 (3,
  '0.000*"epns" + 0.000*"elcc" + 0.000*"insar" + 0.000*"gasser" + 0.000*"gait" '
  '+ 0.000*"mtu" + 0.000*"anns" + 0.000*"fibre" + 0.000*"career" + '
  '0.000*"coregularization"'),
 (4,
  '0.000*"epns" + 0.000*"elcc" + 0.000*"insar" + 0.000*"gasser" + 0.000*"gait" '
  '+ 0.000*"mtu" + 0.000*"anns" + 0.000*"fibre" + 0.000*"career" + '
  '0.000*"coregularization"')]


In [276]:
def get_top_documents(corpus,ids,topic):
    num_topics = 5
    topic_proportions = np.zeros(shape=(len(corpus),num_topics+1))
    topic_proportions[:,0] = ids
    response = lda_model.get_document_topics(corpus)
    for i,doc in enumerate(response):
        for t,prop in doc:
            topic_proportions[i][t+1] = prop 
    ind = topic_proportions[:,topic+1].argsort()[::-1]
    return topic_proportions[ind]