In [1]:
################################################################ TO CHANGE ################################################################
# Stolen from Sneha
# FILE PATHS AND FIELDS
path_json = "comp767_papers_sample.jsonl"  #3154 papers
fields = ["title", "abstract", "authors"] # fields to include in training
# TRAINING PARAM
num_topics = 10 # truthfully we want to see 13 topics
chunksize = 2000 # how many docs are processed at a time set to 2000 as default
passes = 20 # how often the model is trained on all the docs set to 20 as default
iterations = 400 # how often do we iterate over each doc set to 400 as default
eval_every = None  # Don't evaluate model perplexity, takes too much time.
################################################################ TO CHANGE ################################################################


import json #
import nltk # for preprocessing
nltk.download('wordnet')

from nltk.tokenize import RegexpTokenizer # for tokenization
from nltk.stem.wordnet import WordNetLemmatizer # for lemmatizing
from gensim.corpora import Dictionary # to construct dictionary
from gensim.models import LdaModel # to make LDA model
from pprint import pprint # print output in a readable way
from nltk.util import ngrams

with open(path_json) as fp:
    papers = [json.loads(line) for line in fp.readlines()]


[nltk_data] Downloading package wordnet to /Users/baddie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing functions 

## Preprocessing Methods - Abstract
1. n-grams
1. remove articles + small words

## Preprocessing Methods - Authors
1. n-grams
1. Throwing out authors all together


In [2]:
def ident(z,*args):
    '''dummy identity function'''
    if (type(z) is not list):
        return z
    else:
        return ' '.join(z)
    
def author_iden(z,*args):
    return z 

def add_ngrams(inpt_sentence, n=1):
    
    if inpt_sentence is not None:
        
        out=inpt_sentence
        
        for i in range(n,1,-1):
            
            grams=ngrams(inpt_sentence, i)
            
            out.extend(['_'.join(x) for x in grams])
        
        return ' '.join(out)
    return ''

def author_ngram(input_list, *args):
    return [x.replace(' ', '_').lower() for x in input_list]

def destroy_param(z,*args):
    return []

def preprocess_data(all_docs, min_word_len=2,
                    title_pp=ident,arg_title=None,
                    abstract_pp=ident, arg_abstract=None,
                    author_pp=author_iden, arg_author=None):
    ret_ar=[]
    
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    for doc in all_docs:
        
        #title
        title= ' '.join([lemmatizer.lemmatize (x) for x in doc['title'].split(' ')])
        
        #abstract
        abstract= [lemmatizer.lemmatize (x) for x in str(doc['abstract']).split(' ')] #list
        
        # concatenate all strings 
        representation = title_pp(title,arg_title) + ' \n '+ abstract_pp(abstract,arg_abstract).lower()
        
        # get rid of punctuation & tokenize
        representation=tokenizer.tokenize(representation.lower()) + author_pp(doc['authors'],arg_author)
        
        # take out numbers (but not numbers within words)
        representation = [token for token in representation if not token.isnumeric()]

        # take out words that are at least 3 characters long character
        representation = [token for token in representation if len(token) > min_word_len] 

        # channge code here to not lemmatize ngrams
        #representation = [lemmatizer.lemmatize(token) for token in representation]

        representation=[x.strip('_') for x in representation]
        ret_ar.append(representation)
    
    return ret_ar


### Experiment 1 : 
Author ngram  + standard abstract

In [3]:
#corpus preprocessing
full_data=preprocess_data(papers,
                          author_pp= author_ngram )

#constructs word to ID mapping 
dictionary = Dictionary(full_data)

# filters out words that occur less than 20 times or are in more than 50% of docs
dictionary.filter_extremes(no_below=20, no_above=0.5)

# transform to vectorized form to put in model
corpus = [dictionary.doc2bow(doc) for doc in full_data]

# Finds how many unique tokens we've found and how many docs we have
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# index to word dictionary
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

# sum of topic coherences of all topics, divided by the number of topics
avg_topic_coherence = sum([t[1] for t in model.top_topics(corpus)]) / num_topics 
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(model.print_topics())


Number of unique tokens: 2395
Number of documents: 3154
Average topic coherence: -2.0674.
[(0,
  '0.029*"text" + 0.022*"analysis" + 0.014*"topic" + 0.012*"method" + '
  '0.011*"content" + 0.008*"model" + 0.007*"political" + 0.007*"word" + '
  '0.007*"approach" + 0.007*"news"'),
 (1,
  '0.029*"energy" + 0.016*"power" + 0.015*"system" + 0.010*"cost" + '
  '0.009*"environmental" + 0.008*"process" + 0.008*"design" + 0.008*"emission" '
  '+ 0.007*"water" + 0.006*"optimization"'),
 (2,
  '0.036*"disclosure" + 0.020*"firm" + 0.019*"environmental" + 0.016*"company" '
  '+ 0.016*"corporate" + 0.016*"carbon" + 0.015*"information" + '
  '0.013*"reporting" + 0.012*"financial" + 0.012*"study"'),
 (3,
  '0.054*"data" + 0.033*"learning" + 0.020*"machine" + 0.011*"can" + '
  '0.010*"method" + 0.009*"using" + 0.009*"based" + 0.008*"our" + '
  '0.007*"approach" + 0.007*"algorithm"'),
 (4,
  '0.024*"political" + 0.018*"party" + 0.014*"policy" + 0.011*"their" + '
  '0.009*"more" + 0.008*"public" + 0.008*"

### Experiment 2 : 
Author ngram  + Abstract ngram

In [4]:
#corpus preprocessing
full_data=preprocess_data(papers,
                          author_pp= author_ngram,
                          abstract_pp=add_ngrams,arg_abstract=3)

#constructs word to ID mapping 
dictionary = Dictionary(full_data)

# filters out words that occur less than 20 times or are in more than 50% of docs
dictionary.filter_extremes(no_below=20, no_above=0.5)

# transform to vectorized form to put in model
corpus = [dictionary.doc2bow(doc) for doc in full_data]

# Finds how many unique tokens we've found and how many docs we have
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# index to word dictionary
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

# sum of topic coherences of all topics, divided by the number of topics
avg_topic_coherence = sum([t[1] for t in model.top_topics(corpus)]) / num_topics 
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(model.print_topics())


Number of unique tokens: 5201
Number of documents: 3154
Average topic coherence: -1.8978.
[(0,
  '0.019*"model" + 0.015*"sst" + 0.014*"temperature" + 0.011*"enso" + '
  '0.011*"skill" + 0.011*"climate" + 0.009*"precipitation" + 0.009*"ocean" + '
  '0.009*"however" + 0.009*"seasonal"'),
 (1,
  '0.021*"disclosure" + 0.015*"carbon" + 0.013*"firm" + 0.012*"ghg" + '
  '0.009*"company" + 0.009*"firms" + 0.008*"csr" + 0.008*"companies" + '
  '0.008*"cdp" + 0.008*"information"'),
 (2,
  '0.022*"however" + 0.011*"term" + 0.011*"social" + 0.010*"study" + '
  '0.009*"non" + 0.009*"this_study" + 0.008*"based" + 0.008*"crisis" + '
  '0.007*"health" + 0.007*"long"'),
 (3,
  '0.014*"we" + 0.011*"party" + 0.010*"political" + 0.010*"in" + '
  '0.008*"however" + 0.007*"policy" + 0.007*"government" + 0.006*"in_the" + '
  '0.005*"public" + 0.005*"which"'),
 (4,
  '0.014*"model" + 0.013*"data" + 0.012*"we" + 0.011*"learning" + 0.010*"in" + '
  '0.009*"based" + 0.009*"network" + 0.008*"which" + 0.008*"metho

### Experiment 3 : 
No author  + Abstract ngram


In [5]:
#corpus preprocessing
full_data=preprocess_data(papers,
                          author_pp= destroy_param,
                          abstract_pp=add_ngrams,arg_abstract=3)

#constructs word to ID mapping 
dictionary = Dictionary(full_data)

# filters out words that occur less than 20 times or are in more than 50% of docs
dictionary.filter_extremes(no_below=20, no_above=0.5)

# transform to vectorized form to put in model
corpus = [dictionary.doc2bow(doc) for doc in full_data]

# Finds how many unique tokens we've found and how many docs we have
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# index to word dictionary
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

# sum of topic coherences of all topics, divided by the number of topics
avg_topic_coherence = sum([t[1] for t in model.top_topics(corpus)]) / num_topics 
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(model.print_topics())


Number of unique tokens: 5200
Number of documents: 3154
Average topic coherence: -1.9651.
[(0,
  '0.020*"first" + 0.016*"second" + 0.014*"climate" + 0.012*"we" + '
  '0.009*"however" + 0.009*"term" + 0.008*"here" + 0.007*"scale" + '
  '0.006*"long" + 0.006*"model"'),
 (1,
  '0.021*"data" + 0.016*"learning" + 0.015*"network" + 0.012*"time" + '
  '0.011*"in" + 0.011*"machine" + 0.008*"method" + 0.008*"based" + '
  '0.007*"which" + 0.007*"real"'),
 (2,
  '0.024*"change" + 0.019*"climate_change" + 0.015*"disclosure" + '
  '0.015*"climate" + 0.012*"carbon" + 0.009*"ghg" + 0.009*"firm" + '
  '0.007*"information" + 0.006*"findings" + 0.006*"education"'),
 (3,
  '0.042*"party" + 0.017*"parties" + 0.013*"right" + 0.009*"government" + '
  '0.009*"position" + 0.009*"political" + 0.008*"left" + 0.008*"iii" + '
  '0.007*"speech" + 0.007*"positions"'),
 (4,
  '0.025*"we" + 0.017*"data" + 0.014*"text" + 0.012*"analysis" + 0.009*"topic" '
  '+ 0.008*"however" + 0.008*"research" + 0.007*"our" + 0.007*"

The number of unique tokens not really changing may mean that the authors are not super important

### Experiment 4 : 
No author  + standard Abstract


In [6]:
#corpus preprocessing
full_data=preprocess_data(papers,
                          author_pp= destroy_param )

#constructs word to ID mapping 
dictionary = Dictionary(full_data)

# filters out words that occur less than 20 times or are in more than 50% of docs
dictionary.filter_extremes(no_below=20, no_above=0.5)

# transform to vectorized form to put in model
corpus = [dictionary.doc2bow(doc) for doc in full_data]

# Finds how many unique tokens we've found and how many docs we have
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# index to word dictionary
temp = dictionary[0] 
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

# sum of topic coherences of all topics, divided by the number of topics
avg_topic_coherence = sum([t[1] for t in model.top_topics(corpus)]) / num_topics 
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(model.print_topics())


Number of unique tokens: 2395
Number of documents: 3154
Average topic coherence: -1.9941.
[(0,
  '0.020*"party" + 0.017*"policy" + 0.014*"their" + 0.012*"political" + '
  '0.012*"more" + 0.011*"public" + 0.011*"government" + 0.008*"how" + '
  '0.008*"state" + 0.007*"group"'),
 (1,
  '0.034*"energy" + 0.014*"power" + 0.012*"emission" + 0.011*"environmental" + '
  '0.011*"cost" + 0.010*"system" + 0.008*"carbon" + 0.008*"gas" + '
  '0.008*"water" + 0.008*"cycle"'),
 (2,
  '0.031*"model" + 0.021*"learning" + 0.019*"data" + 0.013*"method" + '
  '0.013*"machine" + 0.010*"using" + 0.010*"can" + 0.010*"our" + '
  '0.009*"algorithm" + 0.009*"approach"'),
 (3,
  '0.032*"disclosure" + 0.026*"carbon" + 0.018*"firm" + 0.015*"environmental" '
  '+ 0.014*"corporate" + 0.014*"company" + 0.013*"information" + 0.012*"study" '
  '+ 0.011*"reporting" + 0.009*"financial"'),
 (4,
  '0.046*"climate" + 0.041*"change" + 0.013*"education" + 0.011*"study" + '
  '0.011*"their" + 0.008*"environmental" + 0.008*"res