In [39]:
import pandas as pd
import glob
import pycountry
import pickle
import re
from tqdm.notebook import tqdm
import gensim
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim import corpora
import spacy

In [2]:
# list of all bibtext files
file_list = glob.glob('bibtex_files/**/*.bib')
file_list

['bibtex_files/WRR/savedrecs.bib',
 'bibtex_files/WRR/savedrecs-29.bib',
 'bibtex_files/WRR/savedrecs-15.bib',
 'bibtex_files/WRR/savedrecs-14.bib',
 'bibtex_files/WRR/savedrecs-28.bib',
 'bibtex_files/WRR/savedrecs-16.bib',
 'bibtex_files/WRR/savedrecs-9.bib',
 'bibtex_files/WRR/savedrecs-8.bib',
 'bibtex_files/WRR/savedrecs-17.bib',
 'bibtex_files/WRR/savedrecs-13.bib',
 'bibtex_files/WRR/savedrecs-12.bib',
 'bibtex_files/WRR/savedrecs-10.bib',
 'bibtex_files/WRR/savedrecs-38.bib',
 'bibtex_files/WRR/savedrecs-39.bib',
 'bibtex_files/WRR/savedrecs-11.bib',
 'bibtex_files/WRR/savedrecs-34.bib',
 'bibtex_files/WRR/savedrecs-20.bib',
 'bibtex_files/WRR/savedrecs-3.bib',
 'bibtex_files/WRR/savedrecs-2.bib',
 'bibtex_files/WRR/savedrecs-21.bib',
 'bibtex_files/WRR/savedrecs-35.bib',
 'bibtex_files/WRR/savedrecs-23.bib',
 'bibtex_files/WRR/savedrecs-37.bib',
 'bibtex_files/WRR/savedrecs-36.bib',
 'bibtex_files/WRR/savedrecs-22.bib',
 'bibtex_files/WRR/savedrecs-26.bib',
 'bibtex_files/WRR/

In [3]:
# init storage
columns = ['DOI', 'Year', 'Journal', 'Title', 'Abstract', 'Affiliation']
corpus_df = pd.DataFrame(columns = columns)
corpus_df

Unnamed: 0,DOI,Year,Journal,Title,Abstract,Affiliation


In [4]:
# extract information from all bibtex entries -- store in dataframe

# loop through bib files
paper_count = -1
for working_file in tqdm(file_list):
    
    # read bibtex file by line
    with open(working_file) as fp:
        
        # read first line
        line = fp.readline()
        
        # loop until end of file
        while line:
            
            # read active line
            line = fp.readline()
            
            # find start of a new paper
            if line[:8] == '@article':
                paper_count = paper_count + 1
            
            # pull components
            if line[:5] == 'Title':
                title = line.split('{{')[1].split('}}')[0]
                corpus_df.loc[paper_count, 'Title'] = title
            if line[:7] == 'Journal':
                journal = line.split('{{')[1].split('}}')[0]
                corpus_df.loc[paper_count, 'Journal'] = journal
            if line[:8] == 'Abstract':
                text = line.split('{{')[1].split('\n')[0]
            
                # read until end of abstract (can be multi-line)
                end_of_abstract = False
                while not end_of_abstract:
                    line = fp.readline()
                    if len(line.split('}}')) == 1:
                        text = text + line.split('}}')[0].split('\n')[0]
                    else:
                        text = text + line.split('}}')[0].split('\n')[0]
                        end_of_abstract = True
                corpus_df.loc[paper_count, 'Abstract'] = text

            if line[:3] == 'DOI':
                doi = line.split('{{')[1].split('}}')[0]
                corpus_df.loc[paper_count, 'DOI'] = doi
            if line[:4] == 'Year':
                year = line.split('{{')[1].split('}}')[0]
                corpus_df.loc[paper_count, 'Year'] = year
            if line[:17] == 'Early Access Date':
                year = line.split('{{')[1].split('}}')[0][-4:]
                corpus_df.loc[paper_count, 'Year'] = year
            if line[:11] == 'Affiliation':
                affiliation = line.split('{{')[1].split('}}')[0]
                affiliation1 = affiliation.split('.')[0]
                affiliation2 = affiliation1.split(',')[-1]
                affiliation3 = affiliation2.split(' ')[-1]
                test_list = ['1','2','3','4','5','6','7','8','9','0']
  
                #Setting all the zipcodes to US of A!            
                if [ele for ele in test_list if(ele in affiliation3)]:
                    affiliation3 = 'USA'
                corpus_df.loc[paper_count, 'Affiliation'] = affiliation3
            
    # close file
    fp.close()

HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))




In [5]:
# pickle the output
corpus_df.to_pickle("data/raw_corpus.pkl")

In [6]:
# convert abstracts to strings
# required to use `re` commands
data = corpus_df['Abstract'].apply(str)

In [7]:
# manually clean the abstracts

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', text) for text in data]

# Remove new line characters
data = [re.sub('\s+', ' ', text) for text in data]

# Remove distracting single quotes
data = [re.sub("\'", "", text) for text in data]

# Hand clean everything we notice
data = [re.sub('ﬂ', 'fl', text) for text in data]
data = [re.sub('[ü←öä∀↓玉成‘’“”…©@-_/≤≥−–∂◦‡×γ∞∼±∞→⇔¨þ´≡íáˇý°φτττ¼.•˜ϕˆ˜°Á¯πłłβκßθıﬁ‰:;ψδ∈ρÞωÀηµεψ∝ðξα²®≈λσ¢·∗¤√]', '', text) for text in data]
data = [re.sub('fig', '', text) for text in data]
data = [re.sub('use', '', text) for text in data]
data = [re.sub('also', '', text) for text in data]
data = [re.sub('show', '', text) for text in data]
data = [re.sub('even', '', text) for text in data]
data = [re.sub('give', '', text) for text in data]
data = [re.sub('versant', '', text) for text in data]
data = [re.sub('statistique', '', text) for text in data]
data = [re.sub('des_crue', '', text) for text in data]
data = [re.sub('pour', '', text) for text in data]
data = [re.sub('alor', '', text) for text in data]
data = [re.sub('measurement', '', text) for text in data]
data = [re.sub('result', '', text) for text in data]
data = [re.sub('method', '', text) for text in data]
data = [re.sub('study', '', text) for text in data]
data = [re.sub('estimate', '', text) for text in data]
data = [re.sub('set', '', text) for text in data]
data = [re.sub('function', '', text) for text in data]
data = [re.sub('test', '', text) for text in data]
data = [re.sub('result', '', text) for text in data]
data = [re.sub('equation', '', text) for text in data]
data = [re.sub('mean', '', text) for text in data]
data = [re.sub('total', '', text) for text in data]
data = [re.sub('des_crue', '', text) for text in data]
data = [re.sub('mazone_obido', '', text) for text in data]
data = [re.sub('resultat', '', text) for text in data]
data = [re.sub('bido', '', text) for text in data]
data = [re.sub('debit', '', text) for text in data]
data = [re.sub('crue', '', text) for text in data]
data = [re.sub('include', '', text) for text in data]
data = [re.sub('faible', '', text) for text in data]
data = [re.sub('toujour', '', text) for text in data]
data = [re.sub('saturee', '', text) for text in data]
data = [re.sub('donnee', '', text) for text in data]
data = [re.sub('etude', '', text) for text in data]
data = [re.sub('rance', '', text) for text in data]
data = [re.sub('time', '', text) for text in data]
data = [re.sub('consist', '', text) for text in data]
data = [re.sub('update', '', text) for text in data]
data = [re.sub('opyright_ohn', '', text) for text in data]
data = [re.sub('copyright', '', text) for text in data]

data

['ransferable discharge permit () programs for controlling several pollutants may manage such pollutants as several individual commodities or as a single weighted sum of the various pollutants his paper s that the weighted sum permit program may be appropriate for managing pollutants that have an additive or a noninteractive effect on environmental quality owever, under this approach, administrators do not have direct control over the amount of each pollutant that is discharged, and environmental quality may be jeopardized unless the selected weighting factors induce a market equilibrium that satisfies environmental quality standards for the region stimates of the cost-effective weighting factors that would tend to satisfy environmental quality standards under such programs are developed here hese s require complete water quality information and, in the case of noninteractive pollutants, treatment costs n approach is described for estimating the cost-effective weighting factors for non

In [10]:
# change each document to a list of words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[1]]])

['ater', 'distribution', 'networks', 'can', 'fail', 'either', 'by', 'the', 'actual', 'demand', 'at', 'one', 'or', 'more', 'nodes', 'exceeding', 'the', 'design', 'demands', 'or', 'by', 'pipe', 'between', 'two', 'nodes', 'failing', 'he', 'implications', 'of', 'each', 'type', 'of', 'failure', 'can', 'be', 'assessed', 'by', 'the', 'shortfall', 'in', 'supply', 'cad', 'by', 'failure', 'together', 'with', 'the', 'probability', 'of', 'occurrence', 'of', 'the', 'and', 'can', 'be', 'represented', 'by', 'the', 'expected', 'volume', 'of', 'deficit', 'onverting', 'the', 'implications', 'of', 'the', 'two', 'failure', 'types', 'into', 'these', 'commensurate', 'units', 'permits', 'them', 'to', 'be', 'added', 'directly', 'to', 'single', 'ent', 'measure', 'of', 'reliability', 'he', 'assessment', 'of', 'shortfall', 'for', 'the', 'pipe', 'failure', 'mode', 'is', 'derived', 'from', 'the', 'observation', 'that', 'when', 'pipe_breaks', 'section', 'of', 'pipe', 'must', 'be', 'isolated', 'by', 'valves', 'to', 

In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [40]:
# NLTK Stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [41]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Must run this command separately, but only once: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized[:2]

[['ransferable', 'discharge', 'permit', 'program', 'control', 'several', 'pollutant', 'may', 'manage', 'pollutant', 'several', 'individual', 'commodity', 'single', 'weighted', 'sum', 'various', 'pollutant', 'paper', 'weight', 'sum', 'permit', 'program', 'may', 'appropriate', 'managing', 'pollutant', 'additive', 'noninteractive', 'effect', 'environmental', 'quality', 'owever', 'approach', 'administrator', 'direct', 'control', 'amount', 'pollutant', 'discharge', 'environmental', 'quality', 'may', 'jeopardize', 'select', 'weighting', 'factor', 'induce', 'market', 'equilibrium', 'satisfie', 'environmental', 'quality', 'standard', 'region', 'stimate', 'cost', 'effective', 'weighting', 'factor', 'would', 'tend', 'satisfy', 'environmental', 'quality', 'standard', 'program', 'develop', 'hese', 'require', 'complete', 'water', 'quality', 'information', 'case', 'noninteractive', 'pollutant', 'treatment', 'cost', 'approach', 'describe', 'estimate', 'cost', 'effective', 'weighting', 'factor', 'noni

In [42]:
# Save the lemmatized data
with open("data/data_lemmatized.pkl", 'wb') as f:
     pickle.dump(data_lemmatized, f)

In [43]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:2])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 3), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 2), (12, 5), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 2), (23, 1), (24, 5), (25, 1), (26, 1), (27, 4), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 3), (37, 3), (38, 1), (39, 1), (40, 1), (41, 3), (42, 1), (43, 8), (44, 1), (45, 4), (46, 1), (47, 5), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 2), (57, 1), (58, 1), (59, 4), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 2), (67, 2), (68, 4), (69, 1)], [(4, 1), (7, 1), (13, 5), (14, 1), (17, 1), (41, 2), (55, 1), (65, 1), (70, 2), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 3), (82, 2), (83, 1), (84, 1), (85, 3), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 2), (92, 1), (93, 1), (94, 2), (95, 4), (96, 1), (97, 1), (98, 2), (99, 2), (100, 1), (101, 4), (102, 1), (103, 

In [46]:
# Save the corpus and the id2word map
with open("data/cleaned_corpus.pkl", 'wb') as f:
     pickle.dump(corpus, f)
with open("data/id2word.pkl", 'wb') as f:
     pickle.dump(id2word, f)

In [45]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]

[[('achieve', 1),
  ('additive', 1),
  ('adequate', 1),
  ('administrator', 1),
  ('amount', 1),
  ('approach', 3),
  ('appropriate', 1),
  ('case', 2),
  ('certainty', 1),
  ('commodity', 1),
  ('complete', 1),
  ('control', 2),
  ('cost', 5),
  ('demand', 1),
  ('demonstrate', 1),
  ('demonstration', 1),
  ('describe', 1),
  ('determine', 1),
  ('develop', 1),
  ('direct', 1),
  ('discharge', 2),
  ('effect', 1),
  ('effective', 2),
  ('efficiency', 1),
  ('environmental', 5),
  ('equilibrium', 1),
  ('estimate', 1),
  ('factor', 4),
  ('hese', 1),
  ('individual', 1),
  ('induce', 1),
  ('information', 2),
  ('jeopardize', 1),
  ('manage', 1),
  ('managing', 1),
  ('market', 1),
  ('may', 3),
  ('noninteractive', 3),
  ('outcome', 1),
  ('owever', 1),
  ('paper', 1),
  ('permit', 3),
  ('phosphorus', 1),
  ('pollutant', 8),
  ('possible', 1),
  ('program', 4),
  ('protection', 1),
  ('quality', 5),
  ('ransferable', 1),
  ('region', 1),
  ('require', 1),
  ('satisfie', 1),
  ('satis