# Model Training

Load the cleaned document-level and section-level datasets.

In [1]:
import collections
import copy
import os
import pickle

import numpy as np
import pandas as pd

import train_helpers

In [2]:
# Load the pickled, cleaned document-level dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.

docs_cleaned_dir = './Dataset/cleaned/final/docs/'
docs_cleaned_file = os.path.join(docs_cleaned_dir, 'docs_cleaned.pkl')
with open(docs_cleaned_file, mode='rb') as docs_pkl:
    docs_cleaned = pickle.load(docs_pkl)

In [3]:
# Pull the individual components of the document-level dataset out of the loaded dict.
(docs_dct, docs_corpus, docs_counter,
 docs_docs, docs_ids, docs_key_words) = (
    docs_cleaned[key]
    for key in ('dct', 'corpus', 'counter', 'docs', 'ids', 'key_words')
)

In [4]:
# Load the pickled, cleaned section-level dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.

sections_cleaned_dir = './Dataset/cleaned/final/sections/'
sections_cleaned_file = os.path.join(sections_cleaned_dir, 'sections_cleaned.pkl')
with open(sections_cleaned_file, mode='rb') as sections_pkl:
    sections_cleaned = pickle.load(sections_pkl)

In [5]:
# Pull the individual components of the section-level dataset out of the loaded dict.
# The 'ids' entry is stored as paper_ids.
(sections_dct, sections_corpus, sections_counter,
 sections_docs, paper_ids) = (
    sections_cleaned[key]
    for key in ('dct', 'corpus', 'counter', 'docs', 'ids')
)
# Length of every entry in the sections corpus (used for the summary statistics).
sections_lens = [len(section) for section in sections_corpus]

In [6]:
# Report summary statistics: one "<value>  = <label>" line per statistic.
summary_rows = (
    (len(sections_dct), " = vocabulary size"),
    (len(docs_corpus), " = number of docs in corpus"),
    (len(sections_corpus), " = number of sections in corpus"),
    (round(np.mean(sections_lens), 3), " = mean len of sections after filtering"),
)
for value, label in summary_rows:
    print(value, label)

13165  = vocabulary size
4536  = number of docs in corpus
52020  = number of sections in corpus
114.974  = mean len of sections after filtering


In [7]:
# Calculate log weighting scheme
# V is the vocabulary size, i.e. the number of entries in the sections dictionary.
V = len(sections_dct)
# NOTE(review): log_weights lives in train_helpers and is not visible here;
# presumably it returns one weight per vocabulary term — confirm.
weights_log = train_helpers.log_weights(sections_dct, V)

In [9]:
# Calculate PMI weighting scheme
# The helper receives the whole cleaned-sections dict loaded from the pickle, not just the corpus.
# NOTE(review): "PMI" presumably means pointwise mutual information — confirm in train_helpers.
weights_pmi = train_helpers.pmi_weights(sections_cleaned)

normalizing


In [10]:
# Calculate High-res weighting scheme
# This scheme is the only one that takes both the document-level and the section-level datasets.
# NOTE(review): how the two datasets are combined is defined in train_helpers — confirm there.
weights_highres = train_helpers.highres_weights(docs_cleaned, sections_cleaned)

normalizing


In [11]:
# Create copies of the corpus
# One independent deep copy per weighting scheme, so the unweighted
# sections_corpus stays intact for the baseline ("None") run.
# `copy` comes from the notebook's top import cell.
sections_corpus_log = copy.deepcopy(sections_corpus)
sections_corpus_pmi = copy.deepcopy(sections_corpus)
sections_corpus_highres = copy.deepcopy(sections_corpus)

In [12]:
# Update the corpora based on the reweighting schemes.
# Each call starts from a fresh deep copy of the unweighted sections_corpus
# rather than from the previous value of its own output variable. This keeps
# the cell idempotent: re-running it never applies a weighting scheme on top
# of an already-reweighted corpus.
# NOTE(review): whether update_corpus mutates its first argument is not visible
# here; the copy makes the cell safe either way.
sections_corpus_log = train_helpers.update_corpus(
    copy.deepcopy(sections_corpus), weights_log, weight_type='log')
sections_corpus_pmi = train_helpers.update_corpus(
    copy.deepcopy(sections_corpus), weights_pmi, weight_type='pmi')
sections_corpus_highres = train_helpers.update_corpus(
    copy.deepcopy(sections_corpus), weights_highres, weight_type='highres')

## Model Training

### Document level analysis

In [54]:
# Inputs for the document-level run: a single (unweighted) corpus plus the
# matching dictionary, documents and ids.
corpuses, dct, docs, ids = [docs_corpus], docs_dct, docs_docs, docs_ids

In [55]:
# Train one model per (topic count, corpus) pair and record its scores.
# Rows of the result grids follow num_topics_vec, columns follow corpuses.
num_topics_vec = [5, 10, 20, 30, 40, 50, 60, 150]
grid_shape = (len(num_topics_vec), len(corpuses))
coherences = np.zeros(grid_shape)
perplexity = np.zeros(grid_shape)

for i, num_topics in enumerate(num_topics_vec):
    print("Number of topics = ", str(num_topics))
    for j, corpus in enumerate(corpuses):
        print(j)
        lda_model = train_helpers.train(corpus, dct, docs, ids, num_topics, field='cs-med')
        coherences[i, j] = train_helpers.avg_coherence(lda_model, corpus)
        # NOTE(review): the stored value is whatever log_perplexity returns (a
        # log-scale score, negative in the recorded output), not a perplexity
        # itself — confirm against the model class created in train_helpers.
        perplexity[i, j] = np.mean(lda_model.log_perplexity(corpus))
        print("Perplexity is " + str(perplexity[i, j]))

Number of topics =  60
0
Average topic coherence: -0.2333.
Perplexity is -11.720652654658071
Number of topics =  150
0
Average topic coherence: -0.2338.
Perplexity is -16.892127067065978


### Section level analysis

In [52]:
# Data Input
# Section-level run: the unweighted corpus followed by its log-, PMI- and
# high-res-reweighted variants, plus the matching dictionary, documents and ids.
corpuses = [
    sections_corpus,
    sections_corpus_log,
    sections_corpus_pmi,
    sections_corpus_highres,
]
dct, docs, ids = sections_dct, sections_docs, paper_ids

In [53]:
# Train one model per (topic count, corpus) pair and record its scores.
# Rows of the result grids follow num_topics_vec, columns follow corpuses.
num_topics_vec = [5, 10, 20, 30, 40, 50, 60, 150]
grid_shape = (len(num_topics_vec), len(corpuses))
coherences = np.zeros(grid_shape)
perplexity = np.zeros(grid_shape)

for i, num_topics in enumerate(num_topics_vec):
    print("Number of topics = ", str(num_topics))
    for j, corpus in enumerate(corpuses):
        print(j)
        lda_model = train_helpers.train(corpus, dct, docs, ids, num_topics, field='cs-med')
        coherences[i, j] = train_helpers.avg_coherence(lda_model, corpus)
        # NOTE(review): the stored value is whatever log_perplexity returns (a
        # log-scale score, negative in the recorded output), not a perplexity
        # itself — confirm against the model class created in train_helpers.
        perplexity[i, j] = np.mean(lda_model.log_perplexity(corpus))
        print("Perplexity is " + str(perplexity[i, j]))

Number of topics =  60
0
Average topic coherence: -1.0664.
Perplexity is -9.046540413737691
1
Average topic coherence: -1.8111.
Perplexity is -9.995494488001164
2
Average topic coherence: -1.0453.
Perplexity is -8.906661399146259
3
Average topic coherence: -1.0701.
Perplexity is -9.16886707884499
Number of topics =  150
0
Average topic coherence: -1.0758.
Perplexity is -9.890494410847174
1
Average topic coherence: -1.8980.
Perplexity is -11.014512987379394
2
Average topic coherence: -1.0489.
Perplexity is -9.941397016680234
3
Average topic coherence: -1.0719.
Perplexity is -10.265054689312224


In [22]:
# Persist the coherence grid: one row per entry of num_topics_vec, one column
# per weighting scheme. The column order must match the order of `corpuses`
# in the section-level input cell.
coherences_path = './results/coherences2.csv'
# Create the output directory first so a missing folder cannot throw away the
# results of the (long) training sweep.
os.makedirs(os.path.dirname(coherences_path), exist_ok=True)
coherences_df = pd.DataFrame(coherences, columns=['None', 'Log', 'PMI', 'Highres'])
coherences_df.to_csv(coherences_path)