## Reading packages and data

In [17]:
# Load packages
import numpy as np
import pandas as pd
import json
import nltk
import io 
import os
import gzip
from tqdm import tqdm
from functools import partial 

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pickle

# spacy for lemmatization
import spacy

import itertools
import collections

import itertools as it
import re
import time
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# def read_pdf(file_name, start_line, end_line, ids):
#     papers_ids_text = []; abstract = []; body_text = []; whole_text = []

#     with open(file_name) as f:
#         for _ in range(start_line):
#             next(f)
#         index = 0
#         for line in f:
#             paper = json.loads(line)
# #             if index > end_line - start_line:
# #                 break
# #             index += 1
#             if paper['paper_id'] in ids:
#                 print('in')
#                 papers_ids_text.append(paper['paper_id'])
#                 if paper['abstract']:
#                     print(paper['abstract'])
#                     abstract.append(paper['abstract'][0]['text'])
#                 else: 
#                     abstract.append('')
#                 text = []
#                 full_text = ''
#                 if paper['body_text']:
#                     for entry in paper['body_text']:
#                         if entry['section'] and entry['text']:
#                             section = {key: entry[key] for key in ['section', 'text']}
#                             text.append(section)
#                             if full_text:
#                                 full_text = full_text + '\n' + entry['text']
#                             else:
#                                 full_text = entry['text']
#                     body_text.append(text)
#                     whole_text.append(full_text)
#                 else:
#                     body_text.append([])
#                     whole_text.append('')
                
#         textdata = pd.DataFrame({'paper_id': papers_ids_text, 'abstract': abstract, 'body_text': body_text, 'whole_text': whole_text})

#         return textdata

## Preprocessing

Uncompressing papers from fields we want, saving the metadata and pdf_parses as pickled dictionaries

In [19]:
def process_batch(metadata_file,pdf_file, fields=None, output_dir = './processed/'):
    # Go through metadata files to get relevant paper ids and titles
    ids = []; title = []; 
    # if file is compressed
    if metadata_file[-3:] == '.gz':
        output_file = metadata_file[:-3]
        gz = gzip.open(metadata_file, 'rb')
        f = io.BufferedReader(gz)
        f_out = open(output_file,'wb')
    else:
        f = open(metadata_file)
        f_out = None

    for line in tqdm(f.readlines()):
        paper = json.loads(line)
        if not fields:
            ids.append(paper['paper_id'])
            title.append(paper['title'])
            if f_out:
                f_out.write(line)
        elif paper['mag_field_of_study']:
            field_in = any([x in fields for x in paper['mag_field_of_study']])
            if field_in:
                ids.append(paper['paper_id'])
                title.append(paper['title'])
                if f_out:
                    f_out.write(line)
    f.close()
    if f_out:
        f_out.close()
    # create and save dataframe in output_dir/meta_df
    meta_df = pd.DataFrame({'ids':ids, 'titles':title})
    meta_df_dir = output_dir + 'meta_df/'
    os.makedirs(meta_df_dir, exist_ok=True)
    file_name_without_path_or_ext = metadata_file.split('/')[-1].split('.')[0]
    meta_df_file = meta_df_dir + file_name_without_path_or_ext + '.pkl'
    with open(meta_df_file, 'wb') as f:
        pickle.dump(meta_df, f)

    # get the pdfs         
    papers_ids_text = []; abstract = []; body_text = []; whole_text = []; key_words = [];            
    # if file is compressed
    if pdf_file[-3:] == '.gz':
        output_file = pdf_file[:-3]
        gz = gzip.open(pdf_file, 'rb')
        f = io.BufferedReader(gz)
        f_out = open(output_file,'wb')
    else:
        f = open(pdf_file)
        f_out = None
        
    for line in tqdm(f.readlines()):
        paper = json.loads(line)
        if paper['paper_id'] in ids:
            if f_out:
                f_out.write(line)
            papers_ids_text.append(paper['paper_id'])
            abstract_text = ''
            terms = []
            if paper['abstract']:
                abstract_text = paper['abstract'][0]['text']
                if len(paper['abstract'])>1:   
                    if paper['abstract'][1]['text'][:11].lower() == 'index terms':
                        terms = paper['abstract'][1]['text'][12:].split(',') #remove "Index Terms-" or "INDEX TERMS " from string    
            abstract.append(abstract_text) 
            key_words.append(terms)
            text = []
            full_text = ''
            if paper['body_text']:
                for entry in paper['body_text']:
                    if entry['section'] and entry['text']:
                        section = {key: entry[key] for key in ['section', 'text']}
                        text.append(section)
                        if full_text:
                            full_text = full_text + '\n' + entry['text']
                        else:
                            full_text = entry['text']
            body_text.append(text)
            whole_text.append(full_text)
    f.close()
    if f_out:
        f_out.close()
    # create and save dataframe in output_dir/text_df
    text_df = pd.DataFrame({'paper_id': papers_ids_text, 'abstract': abstract,'key_words': key_words,'body_text': body_text, 'whole_text': whole_text})
    text_df_dir = output_dir + 'text_df/'
    os.makedirs(text_df_dir, exist_ok=True)
    file_name_without_path_or_ext = pdf_file.split('/')[-1].split('.')[0]
    text_df_file = text_df_dir + file_name_without_path_or_ext + '.pkl'
    with open(text_df_file, 'wb') as f:
        pickle.dump(text_df, f)                


In [20]:
user = "m"
if user == "v":
    sample_data_dir = "/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/20200705v1/sample/"
    full_data_dir = "/Volumes/Extreme SSD/Library/SemanticScholar Data/20200705v1/full/"
else:
    full_data_dir = '20200705v1/full/'
    
metadata_dir = full_data_dir + 'metadata/'
pdf_parses_dir = full_data_dir + 'pdf_parses/'

In [21]:
files = range(1)
metadata = [metadata_dir + f'metadata_{i}.jsonl' for i in files]
pdfs = [pdf_parses_dir + f'pdf_parses_{i}.jsonl' for i in files]
fields = ['Computer Science']
print(metadata, pdfs)

['20200705v1/full/metadata/metadata_0.jsonl'] ['20200705v1/full/pdf_parses/pdf_parses_0.jsonl']


In [22]:
for batch in zip(metadata,pdfs):
    process_batch(batch[0], batch[1], fields)

100%|███████████████████████████████████████████████████████████████████████| 121562/121562 [00:02<00:00, 46556.71it/s]
100%|███████████████████████████████████████████████████████████████████████████| 51058/51058 [01:40<00:00, 505.87it/s]


In [23]:
with open('./processed/meta_df/metadata_0.pkl', 'rb') as f:
    meta_df = pickle.load(f)
with open('./processed/text_df/pdf_parses_0.pkl','rb') as f:
    text_df = pickle.load(f)

## Cleaning
Finer cleaning of data:
- remove stop words, high-frequency words, etc.

In [92]:
from nltk import pos_tag
from nltk.corpus import wordnet
def get_word_postag(word):
    #if pos_tag([word])[0][1].startswith('J'):
    #    return wordnet.ADJ
    #if pos_tag([word])[0][1].startswith('V'):
    #    return wordnet.VERB
    if pos_tag([word])[0][1].startswith('N'):
        #return wordnet.NOUN
        return True
    else:
        return False
        #return wordnet.ADJ
        #return wordnet.NOUN

from nltk.tokenize import word_tokenize
# Preprocessing: tokenize words
def tokenize(text):
    return(word_tokenize(text))


def sent_to_words(sentences):
    for sentence in sentences:
        return(gensim.utils.simple_preprocess(str(sentence), min_len=3,deacc=True))  # deacc=True removes punctuations

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Preprocessing: remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords]) 
    #return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Preprocessing: lemmatizing
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Preprocessing: remove short text
def find_longer_text(texts,k=200):
    return list(map(lambda x: len(x.split())>k,texts))
    
#     lengths = list(map(lambda x: len(x.split()), texts))
#     return [val >= k for val in lengths]
    #return [idx for idx, val in enumerate(lengths) if val >= k] 

# Preprocessing: alpha num
def keep_alphanum(words):
    #def isalphanum(word):
    #return word.isalnum()
    return filter(lambda word: word.isalnum(), words)
    #return [word for word in words if word.isalnum()]

# Preprocessing: keep nouns
def keep_nouns(words):
    return filter(get_word_postag, words)
    #return [word for word in words if get_word_postag(word) =='n']

# Preprocessing: keep words >= 3 in length
def keep_longer_words(words):
    return filter(lambda x: (len(x) >= 3), words)
    #return [word for word in words if len(word) >= 3]

# Preprocessing: lemmatize
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
def lemmatize(words):
    return (map(lm.lemmatize, words)) # removing list

# Preprocessing: stemming
from nltk.stem import PorterStemmer 
ps = PorterStemmer() 
def stemming(words):
    #return [ps.stem(word) for word in words]
    return map(ps.stem, words)

def remove_digits(words):
    return filter(lambda x: x.isalpha(), words)
    #return list(filter(lambda x: x.isalpha(), words))
#     return [word for word in words if word.isalpha()]

def merged(words):
    return ' '.join(word for word in words)        


In [25]:
Codes =['C', 'C++', 'Java', 'Python'] 
Codes = map(len,Codes)
selectors = [False, False, False, True] 
  
Best_Programming = itertools.compress(Codes, selectors) 
# x = list(map(len,Best_Programming) )
# print(x)
for each in Best_Programming: 
    print(each) 

6


In [101]:
def clean_pdf(text_df, file_name, output_dir='./cleaned/'):
    
    start = time.time()
    
    # Convert to list
    ids = text_df['paper_id'].values.tolist()
    contents = text_df['whole_text'].values.tolist()
    abstracts = text_df['abstract'].values.tolist()
    
    # Add abstract to text
    contents = [i + j for i, j in zip(contents, abstracts)]
    
    t = time.time()
    print(t-start)
    
    # Remove new line characters
    contents = (map(lambda x: re.sub('\s+', ' ', x), contents))
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: lower case text
    contents = (map(lambda x: x.lower(),contents))
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: keep alphanumeric
    contents = (map(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x), contents)) 
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: remove stand along numbers
    contents = (map(lambda x: re.sub(" \d+ ", " ", x), contents))

    t = time.time()
    print(t-start)
    
    # Preprocessing: remove stop words
    contents = (map(remove_stopwords, contents))
    
    t = time.time()
    print(t-start)
    
    contents = list(contents)
    
    # Preprocessing: remove short text
    inds = find_longer_text(contents)
    contents = (itertools.compress(contents, inds))
    ids = (itertools.compress(ids, inds))
    
    print(list(contents)[0])
    
    print('Tokenizing')
    
    # Tokenize words + remove punctuation
    word_list = (map(tokenize,contents))
#     word_list = [tokenize(article) for article in contents]

    t = time.time()
    print(t-start)
    
    # Remove numbers
    word_list = (map(remove_digits, word_list))
    
    t = time.time()
    print(t-start)
    
    # Keep longer words
#     word_list = [keep_longer_words(words) for words in  word_list]
    word_list = map(keep_longer_words,  word_list)
    
    t = time.time()
    print(t-start)
    
    print('Lemmatizing')
    
    # Preprocessing: lemmatize
    word_list = (map(lemmatize, word_list))
    
    print(list(word_list)[0])
    
    t = time.time()
    print(t-start)
    
    print('Bag of Words Representation')
    # Preprocessing: 
    dct = corpora.Dictionary()
    doc2bow = partial(dct.doc2bow,allow_update=True)
    corpus = map(doc2bow, word_list)
#     corpus = [dct.doc2bow(doc, allow_update=True) for doc in word_list]

    t = time.time()
    print(t-start)
    
    #dct.save(file_name+'.dict')
    word_list = list(word_list)
    word_list =  [item for sublist in word_list for item in sublist]
    counter=collections.Counter(word_list)
    print(type(corpus))
    os.makedirs(output_dir, exist_ok=True)
    output_file_name = output_dir + file_name + '_clean.pkl'
    with open(output_file_name, 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump({'dct': dct, 'corpus': list(corpus), 'counter': counter,'ids': ids, 'word_list': word_list}, f)
    

In [102]:
file_name = 'pdf_parses_0'
clean_pdf(text_df[0:1000],file_name)

0.003989219665527344
0.003989219665527344
0.003989219665527344
0.003989219665527344
0.003989219665527344
0.003989219665527344
throughout technical note use capital letters denote matrices bold face letters denote column vectors use e denote ith elementary vector length use r n denote nonnegative orthant r n c set joint probability distributions three random vectors b c ab c denotes set marginal distributions b use represent mixture distribution given two probability distributions f f bernoulli random variable x takes value wp p xf xf random variable follows distribution f wp p follows f wp p use n represent gaussian distribution mean variance finite markov decision process mdp defined 6tuple p r possibly infinite decision horizon ieee personal use permitted republicationredistribution requires ieee permission see httpwwwieeeorgpublicationsstandardspublicationsrightsindexhtml information 1 discount factor state set action set state assumed finite parameter p r transition probability exp

IndexError: list index out of range

In [69]:
cleaned_file= './cleaned/pdf_parses_0_clean.pkl'
with open(cleaned_file, 'rb') as f:
    cleaned_data = pickle.load(f)

In [None]:
cleaned_data['word_list']

In [55]:
def clean_sections(textdf, file_name):
    pass

In [52]:
# ids = selected_data['ids'].values
# pdf_file = pdf_parses_dir + 'pdf_parses_0.jsonl'
# textdf = read_pdf(pdf_file,0,100000, ids)
# textdf.head(5)

Unnamed: 0,paper_id,abstract,body_text,whole_text
0,18980380,This technical note studies Markov decision pr...,"[{'section': 'II. PRELIMINARIES', 'text': 'Thr...","Throughout the technical note, we use capital ..."
1,18981111,,[{'section': 'Exploration of Unknown Spaces by...,"ORLY LAHAV DAVID MIODUSER Tel Aviv University,..."
2,18981625,,[],
3,18982496,In this paper I discuss some constraints and i...,[{'section': 'Lack of Cooperation from Fellow ...,We normally take precautionary measures agains...
4,18983082,,[],


In [53]:
# sections = textdf['body_text'][0]
# section_titles = [x['section'] for x in sections]
# section_titles
# sections[0]
# sections[1]['section']

In [49]:
# import os
# cwd = os.getcwd()
# file_path = cwd + '\\Preprocessed\\0'
# output = clean_pdf(textdf, file_path)

Tokenizing
Lemmatizing
Bag of Words Representation


In [32]:
dct = output[0]
corpus = output[1]

In [43]:
word_list = cleaned_data[2]
word_list =  [item for sublist in word_list for item in sublist]

In [45]:
# Import the wordcloud library
from wordcloud import WordCloud

# Join the different processed titles together.
#long_string = ','.join(list(papers['paper_text_processed'].values))
long_string = ' '.join(word_list)

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
#wordcloud.generate(long_string)
#wordcloud.generate(word_list)
wordcloud.generate_from_frequencies(counter)

# Visualize the word cloud
wordcloud.to_image()

NameError: name 'counter' is not defined

In [None]:
# file_path = cwd + '\\Preprocessed\\'
# def process_pdf(file_name, batch_num, start_ind, end_ind, ids):
#     textdf = read_pdf(file_name, start_ind, end_ind, ids)
#     save_path = file_path + str(batchnum)
#     output = clean_pdf(textdf, save_path)
    

In [66]:
# def process_batch(batch_ind, batch_size=50000, field='Computer Science'):
#     file_name_meta = '20200705v1/full/metadata/metadata_' + str(batch_ind) + '.jsonl'
#     file_name_pdf = '20200705v1/full/pdf_parses/pdf_parses_' + str(batch_ind) + '.jsonl'
    
#     import os
#     cwd = os.getcwd()
#     file_path = cwd + '\\Preprocessed\\' 

#     start = time.time()
    
#     nlines = sum(1 for line in open(file_name_pdf))
#     batch_num = int(np.ceil(nlines / batch_size))
    
#     print('Processing metadata file', batch_ind)
#     selected_data = process_metadata(file_name_meta, field)
#     selected_ids = selected_data['ids'].values
    
#     with open(file_path+'metadata.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#         pickle.dump(selected_data, f)
    
#     t = time.time()
#     print(t-start)
    
#     for i in range(batch_num):
#         print('Processing pdfs batch Number: ', i)
#         line_nums = [batch_size*i, batch_size*(i+1)]
#         textdf = read_pdf(file_name_pdf,line_nums[0],line_nums[1], ids)  
        
#         t = time.time()
#         print(t-start)
        
#         print('Processing pdfs batch Number: ', i)

#         output = clean_pdf(textdf, file_path+str(i))
        
#         t = time.time()
#         print(t-start)

In [67]:
# process_batch(batch_ind=0, field='Computer Science')

Processing metadata file 0
35.497846364974976
Processing pdfs batch Number:  0


TypeError: can only concatenate str (not "int") to str

In [102]:
word_counts = sorted(dct.dfs.items(), key = lambda x: x[1], reverse=True)
top_ids = [x[0] for x in word_counts[0:100]]
top_words = [dct.id2token[x] for x in top_ids]

KeyError: 980

In [None]:
dct.filter_tokens(bad_ids=top_ids)

## Train LDA model

In [105]:
documents = corpus.get_texts()

AttributeError: 'list' object has no attribute 'get_texts'

In [None]:
# Step 4: Train the LDA model
from gensim.models import LdaModel, LdaMulticore
from gensim.test.utils import common_corpus

#perplexity_logger = PerplexityMetric(corpus=common_corpus, logger='shell')
#convergence_logger = ConvergenceMetric(logger='shell')
#coherence_cv_logger = CoherenceMetric(corpus=corpus, logger='shell', coherence = 'c_v', texts = documents)

lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=2020,
                         num_topics=10,
                         passes=1,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

# save the model
lda_model.save('lda_model.model')

# See the topics
lda_model.print_topics(-1)

In [84]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -0.6558824017161562


In [76]:
num_topics = 10

top_topics = lda_model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -0.6559.
[([(0.005903978, 'model'),
   (0.0056817126, 'data'),
   (0.0050683604, 'system'),
   (0.004683143, 'one'),
   (0.004589382, 'set'),
   (0.004308348, 'result'),
   (0.004308256, 'time'),
   (0.0041383095, 'used'),
   (0.0041151457, 'using'),
   (0.004082712, 'algorithm'),
   (0.003972761, 'two'),
   (0.0039018418, 'method'),
   (0.0038599866, 'also'),
   (0.0038234273, 'number'),
   (0.0033649916, 'value'),
   (0.0032020246, 'problem'),
   (0.0031827844, 'case'),
   (0.003104415, 'function'),
   (0.0029815945, 'use'),
   (0.0029531042, 'different')],
  -0.15817686393398864),
 ([(0.00148182, 'method'),
   (0.0013913988, 'ship'),
   (0.001329745, 'model'),
   (0.0010342445, 'set'),
   (0.001012543, 'user'),
   (0.0009968651, 'data'),
   (0.0008815918, 'one'),
   (0.00087165734, 'system'),
   (0.0008399095, 'using'),
   (0.00083453057, 'used'),
   (0.000791407, 'function'),
   (0.0007810815, 'algorithm'),
   (0.00077615393, 'value'),
   (0.0007526579, 'pr