## Reading packages and data

In [3]:
# Load packages
import numpy as np
import pandas as pd
import json
import nltk
import io 
import os
import gzip
from tqdm import tqdm
from functools import partial 
import copy
import sys

# Gensim
import gensim

import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pickle

# spacy for lemmatization
import spacy

import itertools
import collections

import itertools as it
import re
import time
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/virenbajaj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/virenbajaj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/virenbajaj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# def read_pdf(file_name, start_line, end_line, ids):
#     papers_ids_text = []; abstract = []; body_text = []; whole_text = []

#     with open(file_name) as f:
#         for _ in range(start_line):
#             next(f)
#         index = 0
#         for line in f:
#             paper = json.loads(line)
# #             if index > end_line - start_line:
# #                 break
# #             index += 1
#             if paper['paper_id'] in ids:
#                 print('in')
#                 papers_ids_text.append(paper['paper_id'])
#                 if paper['abstract']:
#                     print(paper['abstract'])
#                     abstract.append(paper['abstract'][0]['text'])
#                 else: 
#                     abstract.append('')
#                 text = []
#                 full_text = ''
#                 if paper['body_text']:
#                     for entry in paper['body_text']:
#                         if entry['section'] and entry['text']:
#                             section = {key: entry[key] for key in ['section', 'text']}
#                             text.append(section)
#                             if full_text:
#                                 full_text = full_text + '\n' + entry['text']
#                             else:
#                                 full_text = entry['text']
#                     body_text.append(text)
#                     whole_text.append(full_text)
#                 else:
#                     body_text.append([])
#                     whole_text.append('')
                
#         textdata = pd.DataFrame({'paper_id': papers_ids_text, 'abstract': abstract, 'body_text': body_text, 'whole_text': whole_text})

#         return textdata

## Preprocessing

Uncompressing papers from fields we want, saving the metadata and pdf_parses as pickled dictionaries

In [5]:
def process_metadata(metadata_file,fields=None, get_ids=None,output_dir = './processed/',put_in_op_dir=False):
    
    file_name_without_path_or_ext = metadata_file.split('/')[-1].split('.')[0]
    # Go through metadata files to get relevant paper ids and titles
    ids = []; title = []; field = [];
    # if file is compressed
    if metadata_file[-3:] == '.gz':
        if put_in_op_dir:
            output_file = output_dir + file_name_without_path_or_ext
        else:  
            output_file = metadata_file[:-3]
        gz = gzip.open(metadata_file, 'rb')
        f = io.BufferedReader(gz)
        f_out = open(output_file,'wb')
    else:
        f = open(metadata_file)
        f_out = None

    for line in tqdm(f.readlines()):
        paper = json.loads(line)
        if not fields:
            if not ids:
                ids.append(paper['paper_id'])
                title.append(paper['title'])
                field.append(paper['mag_field_of_study'])
                if f_out:
                    f_out.write(line)
            elif paper['paper_id'] in get_ids:
                ids.append(paper['paper_id'])
                title.append(paper['title'])
                field.append(paper['mag_field_of_study'])
                if f_out:
                    f_out.write(line)
        elif paper['mag_field_of_study']:
            field_in = any([x in fields for x in paper['mag_field_of_study']])
            if field_in:
                ids.append(paper['paper_id'])
                title.append(paper['title'])
                field.append(paper['mag_field_of_study'])
                if f_out:
                    f_out.write(line)
    f.close()
    if f_out:
        f_out.close()
    # create and save dataframe in output_dir/meta_df
    meta_df = pd.DataFrame({'ids':ids, 'titles':title, 'field': field})
    meta_df_dir = output_dir + 'meta_df/'
    os.makedirs(meta_df_dir, exist_ok=True)
    meta_df_file = meta_df_dir + file_name_without_path_or_ext + '.pkl'
    with open(meta_df_file, 'wb') as f:
        pickle.dump(meta_df, f)
    return meta_df_file


def process_pdf(meta_df_file, pdf_file,fields=None,get_ids=None, output_dir = './processed/'):
    
    # use meta data df to check ids
    with open(meta_df_file, 'rb') as f:
        meta_df = pickle.load(f)
    # get the pdfs 
    # lists to make pdf dataframe from 
    papers_ids_text = []; abstract = [] 
    body_text = []  # list of dicts (for each paper) with section, text, cite_spans
    whole_text = [] # list of strings (for each paper) of entire text in the body
    key_words = []  # key words mentioned with the abstract
    citations = [] 
    # if file is compressed
    if pdf_file[-3:] == '.gz':
        output_file = pdf_file[:-3]
        gz = gzip.open(pdf_file, 'rb')
        f = io.BufferedReader(gz)
        f_out = open(output_file,'wb')
    else:
        f = open(pdf_file)
        f_out = None
        
    for line in tqdm(f.readlines()):
        paper = json.loads(line)
        if paper['paper_id'] in meta_df['ids'].values: #ids defined in meta data df based on field
            if f_out: # if untaring
                f_out.write(line) # write that pdf (untarred pdf parse will only have selected papers)
            # get paper-id
            papers_ids_text.append(paper['paper_id'])
            abstract_text = ''
            terms = []
            # get abstract
            if paper['abstract']:
                abstract_text = paper['abstract'][0]['text']
                # get key_words in abstract
                if len(paper['abstract'])>1:   
                    if paper['abstract'][1]['text'][:11].lower() == 'index terms':
                        terms = paper['abstract'][1]['text'][12:].split(',') #remove "Index Terms-" or "INDEX TERMS " from string    
            abstract.append(abstract_text) 
            key_words.append(terms)
            # get sections and text from body
            text = []
            full_text = ''
            if paper['body_text']:
                for entry in paper['body_text']:
                    if entry['section'] and entry['text']:
                        section = {key: entry[key] for key in ['section', 'text', 'cite_spans']}
                        text.append(section)
                        if full_text: # why the if-else?
                            full_text = full_text + '\n' + entry['text']
                        else:
                            full_text = entry['text']
            body_text.append(text)
            whole_text.append(full_text)
            # get citations
            bib_entries = {}
            if paper['bib_entries']:
                bib_entries = paper['bib_entries']
            citations.append(bib_entries)
            
    f.close()
    if f_out:
        f_out.close()
    # create and save dataframe in output_dir/text_df
    text_df = pd.DataFrame({'paper_id': papers_ids_text, 'abstract': abstract,'key_words': key_words,'body_text': body_text, 'whole_text': whole_text,'citations':citations})
    text_df_dir = output_dir + 'text_df/'
    os.makedirs(text_df_dir, exist_ok=True)
    file_name_without_path_or_ext = pdf_file.split('/')[-1].split('.')[0]
    text_df_file = text_df_dir + file_name_without_path_or_ext + '.pkl'
    with open(text_df_file, 'wb') as f:
        pickle.dump(text_df, f)         
    return text_df_file       

In [6]:
def process_batch(metadata_file,pdf_file, fields=None, get_ids=None, output_dir = './processed/'):
    meta_df_file = process_metadata(metadata_file,fields,get_ids,output_dir)
    text_df_file = process_pdf(meta_df_file,pdf_file,fields,get_ids,output_dir)
    return meta_df_file,text_df_file


## Cleaning
Finer cleaning of data:
- remove stop words, high-frequency words, etc.

In [7]:
from nltk import pos_tag
from nltk.corpus import wordnet
def get_word_postag(word):
    #if pos_tag([word])[0][1].startswith('J'):
    #    return wordnet.ADJ
    #if pos_tag([word])[0][1].startswith('V'):
    #    return wordnet.VERB
    if pos_tag([word])[0][1].startswith('N'):
        #return wordnet.NOUN
        return True
    else:
        return False
        #return wordnet.ADJ
        #return wordnet.NOUN

from nltk.tokenize import word_tokenize
# Preprocessing: tokenize words
def tokenize(text):
    return(word_tokenize(text))


def sent_to_words(sentences):
    for sentence in sentences:
        return(gensim.utils.simple_preprocess(str(sentence), min_len=3,deacc=True))  # deacc=True removes punctuations

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# Preprocessing: remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords]) 
    #return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Preprocessing: lemmatizing
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Preprocessing: remove short text
def find_longer_text(texts,k=200):
    return list(map(lambda x: len(x.split())>k,texts))
    
#     lengths = list(map(lambda x: len(x.split()), texts))
#     return [val >= k for val in lengths]
    #return [idx for idx, val in enumerate(lengths) if val >= k] 

# Preprocessing: alpha num
def keep_alphanum(words):
    #def isalphanum(word):
    #return word.isalnum()
    return filter(lambda word: word.isalnum(), words)
    #return [word for word in words if word.isalnum()]

# Preprocessing: keep nouns
def keep_nouns(words):
    return filter(get_word_postag, words)
    #return [word for word in words if get_word_postag(word) =='n']

# Preprocessing: keep words >= 3 in length
def keep_longer_words(words):
    return list(filter(lambda x: (len(x) >= 3), words))
    #return [word for word in words if len(word) >= 3]

# Preprocessing: lemmatize
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
def lemmatize(words):
    return list(map(lm.lemmatize, words)) 

# Preprocessing: stemming
from nltk.stem import PorterStemmer 
ps = PorterStemmer() 
def stemming(words):
    #return [ps.stem(word) for word in words]
    return map(ps.stem, words)

def remove_digits(words):
    return list(filter(lambda x: x.isalpha(), words))
    #return list(filter(lambda x: x.isalpha(), words))
#     return [word for word in words if word.isalpha()]

def merged(words):
    return ' '.join(word for word in words)        


In [8]:
def clean_sections(textdf, file_name):
    pass

In [24]:
def ids_to_keep(meta_df,fields):
    pass
def clean_pdf(text_df, file_name, output_dir):
    
    start = time.time()
    

    # if index is not paper_id
    if text_df.index.name is None: 
        print('changing index to paper_id')
        text_df = text_df.set_index('paper_id')

    ids = text_df.index.values.astype(str)
        
    contents = text_df['whole_text'].values.tolist()
    abstracts = text_df['abstract'].values.tolist()
    
    # Add abstract to text
    contents = [i + j for i, j in zip(contents, abstracts)]
    
    t = time.time()
    print(t-start)
    
    # Remove new line characters
    contents = map(lambda x: re.sub('\s+', ' ', x), contents)
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: lower case text
    contents = map(lambda x: x.lower(),contents)
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: keep alphanumeric
    contents = map(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x), contents)
    
    t = time.time()
    print(t-start)
    
    # Preprocessing: remove stand along numbers
    contents = map(lambda x: re.sub(" \d+ ", " ", x), contents)

    t = time.time()
    print(t-start)
    
    # Preprocessing: remove stop words
    contents = map(remove_stopwords, contents)
    
    t = time.time()
    print(t-start)
    
    contents = list(contents)

    # Preprocessing: remove short text
    inds = find_longer_text(contents)
    contents = itertools.compress(contents, inds)
    ids = list(itertools.compress(ids, inds))
    
    key_words = text_df.loc[ids]['key_words'].values
    print('Tokenizing')
    
    # Tokenize words + remove punctuation
    tokenized_contents = map(tokenize,contents) # documents in BOW format
#     word_list = [tokenize(article) for article in contents]
    t = time.time()
    print(t-start)
    
    # Remove numbers
    tokenized_contents = map(remove_digits, tokenized_contents)
    
    
    t = time.time()
    print(t-start)
    
    # Keep longer words
#     word_list = [keep_longer_words(words) for words in  word_list]
    tokenized_contents = map(keep_longer_words,  tokenized_contents)
    
    
    t = time.time()
    print(t-start)
    
    print('Lemmatizing')
    
    # Preprocessing: lemmatize
    tokenized_contents = map(lemmatize, tokenized_contents)
    
#     print(list(word_list)[0])
    
    t = time.time()
    print(t-start)
    
    print('Bag of Words Representation')
    tokenized_contents = list(tokenized_contents)

    dct = corpora.Dictionary(tokenized_contents) # make dct before corpus
#     doc2bow = partial(dct.doc2bow,allow_update=True)
    
    print('length of dct before filter_extreme: ', len(dct))
    dct.filter_extremes() # using default params # filter dct before creating corpus
    print('length of dct after filter_extreme: ', len(dct))
    

    # Make corpus after any changes to dct
    corpus = list(map(lambda x: dct.doc2bow(x,allow_update=True), tokenized_contents))    
#     corpus = [dct.doc2bow(doc, allow_update=True) for doc in word_list]

    t = time.time()
    print(t-start)


    
    word_list =  [item for sublist in tokenized_contents for item in sublist]
    counter=collections.Counter(word_list)
  
    os.makedirs(output_dir, exist_ok=True)
    output_file_name = output_dir + file_name + '_clean.pkl'

    
    with open(output_file_name, 'wb') as f:  # Python 3: open(..., 'wb') 
        d = {'dct': dct, 'corpus': corpus, 'docs': tokenized_contents, 
             'counter': counter, 'ids': ids, 'word_list':word_list,
             'key_words':key_words}
        pickle.dump(d, f)
    return d

In [10]:
# def add_citations(text_df_file, pdf_file):
#    'Add citation column to current DF by parsing pdf_parse'
#     with open(text_df_file, 'rb') as f:
#         text_df = pickle.load(f)
        
#     citations = []
#     with open(pdf_file, 'rb') as f:
#         for line in tqdm(f.readlines()):
#             paper = json.loads(line)
#             # get citations
#             bib_entries = {}
#             if paper['bib_entries']:
#                 bib_entries = paper['bib_entries']
#             citations.append(bib_entries)
#     text_df['citations'] = citations
    
#     with open(text_df_file, 'wb') as f:
#         pickle.dump(text_df, f)

In [11]:
user = "v"
if user == "v":
    sample_data_dir = "/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/20200705v1/sample/"
    full_data_dir = "/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/"
else:
    full_data_dir = '20200705v1/full/'
    
metadata_dir = full_data_dir + 'metadata/'
pdf_parses_dir = full_data_dir + 'pdf_parses/'

In [12]:
with open('./all_ref_links.pkl', 'rb') as f:
    flat_links = pickle.load(f)

In [12]:
files = range(3,99)
metadata = [metadata_dir + f'metadata_{i}.jsonl.gz' for i in files]
pdfs = [pdf_parses_dir + f'pdf_parses_{i}.jsonl.gz' for i in files]
fields = []
get_ids = set(flat_links) # from cell that gets links of all refs
print(metadata, pdfs)

['/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_3.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_4.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_5.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_6.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_7.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1/full/metadata/metadata_8.jsonl.gz', '/Users/virenbajaj/Desktop/Columbia Fall 20/Graphical Models/project/HighResLDA/SemanticScholar Data/20200705v1

In [13]:
for batch in zip(metadata,pdfs):
    process_batch(batch[0], batch[1], get_ids=get_ids)

# for m in metadata:
#     process_metadata(m,fields=None, get_ids=set(get_ids),output_dir = './processed/refs/',put_in_op_dir=False)
    
# process_pdf('processed/meta_df/metadata_0.pkl', pdfs[0],fields,output_dir='./processed/')
# add_citations(text_df_file='process_pdf(meta_df_file,pdf_file,fields,output_dir)',
#             pdf_file ='/Volumes/Extreme SSD/Library/SemanticScholar Data/20200705v1/full/pdf_parses/pdf_parses_0.jsonl')

100%|██████████| 1365344/1365344 [00:33<00:00, 40770.38it/s]
100%|██████████| 309277/309277 [02:17<00:00, 2253.16it/s]
100%|██████████| 1365708/1365708 [00:22<00:00, 60964.16it/s]
100%|██████████| 309825/309825 [01:24<00:00, 3651.14it/s]
100%|██████████| 1368101/1368101 [00:21<00:00, 63685.69it/s]
100%|██████████| 310763/310763 [01:22<00:00, 3758.87it/s]
100%|██████████| 1363968/1363968 [00:22<00:00, 60153.39it/s]
100%|██████████| 310014/310014 [01:39<00:00, 3111.31it/s]
100%|██████████| 1363902/1363902 [00:30<00:00, 44197.17it/s]
100%|██████████| 310371/310371 [01:32<00:00, 3353.53it/s]
100%|██████████| 1366518/1366518 [00:24<00:00, 55236.24it/s]
100%|██████████| 310043/310043 [01:40<00:00, 3077.58it/s]
100%|██████████| 1366131/1366131 [00:21<00:00, 63628.83it/s]
100%|██████████| 310297/310297 [01:47<00:00, 2889.33it/s]
100%|██████████| 1365724/1365724 [00:25<00:00, 53901.96it/s]
100%|██████████| 311205/311205 [01:27<00:00, 3553.53it/s]
100%|██████████| 1364981/1364981 [00:20<00:00, 6

In [59]:
type(get_ids[0])

str

In [13]:
with open('./processed/meta_df/metadata_1.pkl', 'rb') as f:
    meta_df = pickle.load(f)
with open('./processed/text_df/pdf_parses_1.pkl','rb') as f:
    text_df = pickle.load(f)

In [33]:
text_df

Unnamed: 0,paper_id,abstract,key_words,body_text,whole_text,citations
0,77490118,,[],[],,{}
1,19013359,Distributed antenna systems (DASs) have been w...,"[Energy harvesting, distributed antennas, si...","[{'section': 'I. INTRODUCTION', 'text': 'Energ...",Energy harvesting (EH) traditionally refers to...,{'BIBREF0': {'title': 'Power management in ene...
2,4941008,,[],[],,{'BIBREF0': {'title': 'A proposal of a wireles...
3,55851928,,[],[],,{'BIBREF0': {'title': 'Cognitive Psychology an...
4,16104326,,[],[],,{'BIBREF0': {'title': 'Using genetic algorithm...
5,32329494,,[],[],,{'BIBREF0': {'title': 'Approximation of geopat...
6,2986285,We propose a blind interference alignment sche...,[],"[{'section': 'Introduction', 'text': 'Bandwidt...",Bandwidth is a precious resource for wireless ...,{'BIBREF0': {'title': 'On the compound MIMO br...
7,15719749,,[],[],,{'BIBREF0': {'title': 'An introduction to mill...
8,22087014,,[],[],,{'BIBREF0': {'title': 'An ultra-low-power prog...
9,12047073,Deformable part models (DPMs) and convolutiona...,[],"[{'section': 'Introduction', 'text': 'Part-bas...",Part-based representations are widely used in ...,{'BIBREF0': {'title': 'Poselets: Body part det...


In [25]:
clean_pdf(text_df,'refs1','./cleaned/')

changing index to paper_id
0.0018160343170166016
0.0019388198852539062
0.0019550323486328125
0.0019659996032714844
0.00197601318359375
0.0020110607147216797
Tokenizing
0.3639700412750244
0.3639969825744629
0.36401796340942383
Lemmatizing
0.364077091217041
Bag of Words Representation
length of dct before filter_extreme:  7294
length of dct after filter_extreme:  1186
1.2104439735412598


{'dct': <gensim.corpora.dictionary.Dictionary at 0x7ffa28adfa20>,
 'corpus': [[(0, 1),
   (1, 4),
   (2, 1),
   (3, 2),
   (4, 2),
   (5, 1),
   (6, 1),
   (7, 1),
   (8, 1),
   (9, 2),
   (10, 1),
   (11, 1),
   (12, 1),
   (13, 1),
   (14, 1),
   (15, 23),
   (16, 2),
   (17, 3),
   (18, 11),
   (19, 1),
   (20, 2),
   (21, 5),
   (22, 1),
   (23, 3),
   (24, 1),
   (25, 2),
   (26, 3),
   (27, 1),
   (28, 1),
   (29, 4),
   (30, 1),
   (31, 1),
   (32, 1),
   (33, 1),
   (34, 5),
   (35, 3),
   (36, 6),
   (37, 2),
   (38, 1),
   (39, 2),
   (40, 5),
   (41, 4),
   (42, 2),
   (43, 2),
   (44, 1),
   (45, 1),
   (46, 2),
   (47, 1),
   (48, 2),
   (49, 6),
   (50, 1),
   (51, 1),
   (52, 1),
   (53, 1),
   (54, 1),
   (55, 1),
   (56, 1),
   (57, 1),
   (58, 1),
   (59, 1),
   (60, 1),
   (61, 1),
   (62, 1),
   (63, 2),
   (64, 1),
   (65, 1),
   (66, 1),
   (67, 2),
   (68, 1),
   (69, 2),
   (70, 1),
   (71, 1),
   (72, 3),
   (73, 5),
   (74, 1),
   (75, 1),
   (76, 2),
   (77, 

In [26]:
with open('./cleaned/refs1_clean.pkl', 'rb') as f:
    text_df_ref = pickle.load(f)

In [28]:
dct = text_df_ref['dct']
corpus = text_df_ref['corpus']
counter = text_df_ref['counter']
ids = list(text_df_ref['ids'])
word_list = text_df_ref['word_list']
docs = text_df_ref['docs']
key_words = text_df_ref['key_words']

In [32]:
# documents = corpus.get_texts()
print(len(corpus))
print(len(list(ids)))
print(len(dct.dfs))
print(len(counter))
print(len(docs))
print(len(word_list))
print(len(key_words))

30
30
7294
7294
30
91535
30


In [12]:
text_df = text_df.set_index('paper_id') # SET INDEX TO PAPER ID TO INDEX INTO DF
text_df.index.name 

'paper_id'

In [13]:
# filter DF based on fields
def field_in(df, selected):
    'Filter df based on fields in selected'
    def f(fields):
        return any([x in selected for x in fields])
    mask = list(map(f,df.field))
    return df[mask]
# med_meta_df = field_in(meta_df,['Medicine'])
# cs_meta_df = field_in(meta_df, ['Computer Science'])

In [6]:
# med_text_df = text_df.filter(items=med_meta_df.ids.values, axis=0)
# cs_text_df = text_df.filter(items=cs_meta_df.ids.values, axis=0)

In [14]:
key_words = text_df['key_words'][text_df.key_words.str.len() > 0 ] 
text_df_kw = text_df.loc[key_words.index] # papers with key words

In [15]:
def get_links(paper_refs):
    'Retreive the paper_ids (links) from refrences of the paper'
    refs, links = [], []
    for key,value in paper_refs.items():
        if value['link']:
            links.append(value['link'])
            refs.append(key)
    return links, refs

all_citations = (text_df_kw.citations.values)
all_links=[]
all_refs=[]
for citation in all_citations:
    links,refs = get_links(citation)
    all_links.append(links)
    all_refs.append(refs)

In [38]:
# text_df['refs'] = all_refs
# text_df['links'] = all_links

In [16]:
flat_links = [item for sublist in all_links for item in sublist]
links_found = set(flat_links).intersection(set(meta_df.ids)) # links in current df

In [17]:
len(links_found), len(flat_links)

(101, 13982)

In [21]:
with open('./all_ref_links.pkl', 'wb') as f:
    pickle.dump(flat_links,f)

In [22]:
with open('./all_ref_links.pkl', 'rb') as f:
    flat_links = pickle.load(f)


['7229756',
 '57464058',
 '9166388',
 '10603007',
 '2474018',
 '8946639',
 '1537485',
 '710328',
 '10308849',
 '207242061',
 '59762877',
 '15546892',
 '6103434',
 '63859912',
 '16625241',
 '486400',
 '11547182',
 '18980380',
 '37925315',
 '18576331',
 '24341930',
 '144313853',
 '147628077',
 '145505861',
 '144719215',
 '7062633',
 '145274686',
 '55851928',
 '143510681',
 '140831078',
 '148549273',
 '143175031',
 '143649950',
 '140895009',
 '143306715',
 '15380903',
 '143824531',
 '58251482',
 '144949233',
 '143220167',
 '143638934',
 '141218400',
 '145597436',
 '144399211',
 '14164308',
 '24910066',
 '8501009',
 '14751117',
 '8101493',
 '10874446',
 '1982610',
 '45686218',
 '206640513',
 '8298556',
 '6729299',
 '122048051',
 '1401468',
 '14524347',
 '54330661',
 '8564904',
 '9712422',
 '17580611',
 '17623805',
 '5876787',
 '13449852',
 '12812620',
 '5502589',
 '8611043',
 '900938',
 '2838315',
 '61809129',
 '14181594',
 '4622422',
 '59720479',
 '14853970',
 '4623981',
 '51864516',
 '14

In [15]:
file_name = 'pdf_parses_0_kw'
# for field in fields:
d = clean_pdf(text_df_kw,file_name,output_dir=f'./cleaned/cs-med/')

0.5264949798583984
0.5274271965026855
0.5274591445922852
0.5274801254272461
0.5274980068206787
0.5279140472412109
Tokenizing
4.992978096008301
4.992990016937256
4.993922233581543
Lemmatizing
4.994757890701294
Bag of Words Representation
length of dct before filter_extreme:  34400
length of dct after filter_extreme:  6920
16.1564302444458


In [16]:
cleaned_file= './cleaned/cs-med/pdf_parses_0_kw_clean.pkl'
with open(cleaned_file, 'rb') as f:
    cleaned_data = pickle.load(f)

In [17]:
dct = cleaned_data['dct']
corpus = cleaned_data['corpus']
counter = cleaned_data['counter']
ids = list(cleaned_data['ids'])
word_list = cleaned_data['word_list']
docs = cleaned_data['docs']

In [63]:
# documents = corpus.get_texts()
print(len(corpus))
print(len(list(ids)))
print(len(dct.dfs))
print(len(counter))
print(len(docs))
print(len(word_list))

493
493
34400
34400
493
1279903


## TODO


In [None]:
# file_path = cwd + '\\Preprocessed\\'
# def process_pdf(file_name, batch_num, start_ind, end_ind, ids):
#     textdf = read_pdf(file_name, start_ind, end_ind, ids)
#     save_path = file_path + str(batchnum)
#     output = clean_pdf(textdf, save_path)
    

In [66]:
# def process_batch(batch_ind, batch_size=50000, field='Computer Science'):
#     file_name_meta = '20200705v1/full/metadata/metadata_' + str(batch_ind) + '.jsonl'
#     file_name_pdf = '20200705v1/full/pdf_parses/pdf_parses_' + str(batch_ind) + '.jsonl'
    
#     import os
#     cwd = os.getcwd()
#     file_path = cwd + '\\Preprocessed\\' 

#     start = time.time()
    
#     nlines = sum(1 for line in open(file_name_pdf))
#     batch_num = int(np.ceil(nlines / batch_size))
    
#     print('Processing metadata file', batch_ind)
#     selected_data = process_metadata(file_name_meta, field)
#     selected_ids = selected_data['ids'].values
    
#     with open(file_path+'metadata.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#         pickle.dump(selected_data, f)
    
#     t = time.time()
#     print(t-start)
    
#     for i in range(batch_num):
#         print('Processing pdfs batch Number: ', i)
#         line_nums = [batch_size*i, batch_size*(i+1)]
#         textdf = read_pdf(file_name_pdf,line_nums[0],line_nums[1], ids)  
        
#         t = time.time()
#         print(t-start)
        
#         print('Processing pdfs batch Number: ', i)

#         output = clean_pdf(textdf, file_path+str(i))
        
#         t = time.time()
#         print(t-start)

In [293]:
# process_batch(batch_ind=0, field='Computer Science')

In [132]:
# word_counts = sorted(dct.dfs.items(), key = lambda x: x[1], reverse=True)
# top_ids = [x[0] for x in word_counts[0:100]]
# top_words = [dct.id2token[x] for x in top_ids]

In [133]:
# dct.filter_tokens(bad_ids=top_ids)