## Reading packages and data

In [287]:
# Load packages
import numpy as np
import pandas as pd
import json
import nltk
# Fetch every NLTK resource this notebook uses: the tokenizer models, the
# POS tagger, WordNet, and the stop-word lists read by
# stopwords.words('english') in the stop-word removal step.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [288]:
# Load paper_ids, title and first field of study from the metadata shard
paper_ids = []
title = []
area = []
index = 0  # number of metadata records read (stays 0 for an empty file)
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's default locale encoding.
with open('20200705v1/full/metadata/metadata_0.jsonl', encoding='utf-8') as f:
    for index, line in enumerate(f, start=1):
        paper = json.loads(line)
        paper_ids.append(paper['paper_id'])
        # Keep only the first listed field of study; '' when none is given.
        fields = paper['mag_field_of_study']
        area.append(fields[0] if fields else '')
        title.append(paper['title'])

In [290]:
# Load abstract and body text for each parsed paper
papers_ids_text = []
abstract = []
body_text = []
whole_text = []
index = 0  # number of parse records read (stays 0 for an empty file)
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps
# the read independent of the platform's default locale encoding.
with open('20200705v1/full/pdf_parses/pdf_parses_0.jsonl', encoding='utf-8') as f:
    for index, line in enumerate(f, start=1):
        paper = json.loads(line)
        papers_ids_text.append(paper['paper_id'])
        # Only the first abstract entry is kept; '' when there is none.
        abstract.append(paper['abstract'][0]['text'] if paper['abstract'] else '')
        # Keep body entries that have both a section title and text.
        # A missing/empty body yields [] and '' without a separate branch.
        sections = [
            {'section': entry['section'], 'text': entry['text']}
            for entry in (paper['body_text'] or [])
            if entry['section'] and entry['text']
        ]
        body_text.append(sections)
        # Join once instead of growing a string entry by entry.
        whole_text.append('\n'.join(entry['text'] for entry in sections))

## Data cleaning

In [291]:
# Preprocessing: combine parsed text and metadata into one dataframe

# Metadata columns, one row per metadata record.
metadata = pd.DataFrame({
    'paper_id': paper_ids,
    'title': title,
    'area': area,
})
# Parsed-text columns, one row per parse record.
textdata = pd.DataFrame({
    'paper_id': papers_ids_text,
    'abstract': abstract,
    'body_text': body_text,
    'whole_text': whole_text,
})
# Left join: every parsed paper is kept; title/area come from the metadata.
textdata = pd.merge(textdata, metadata, how='left', on='paper_id')
textdata.head(5)

Unnamed: 0,paper_id,abstract,body_text,whole_text,title,area
0,77499681,The purpose of this study is to evaluate the e...,"[{'section': 'CONFLICTS OF INTEREST', 'text': ...",The authors have nothing to disclose.,Effects of Teriparatide Administration on Frac...,Medicine
1,94550656,,[],,The Approximate Analysis of Nonlinear Behavior...,Chemistry
2,94551239,,[],,Scanning probe memories – Technology and appli...,Chemistry
3,94551546,Ethanolamine (EA) or ethylenediamine (ED)-func...,"[{'section': 'INTRODUCTION', 'text': 'Gene the...",Gene therapy holds potential for treating many...,Gd(III) ion-chelated supramolecular assemblies...,Materials Science
4,94552339,,[],,Analytical Procedure for the Determination of ...,Chemistry


In [293]:
# Number of rows in the merged dataframe.
textdata.shape[0]

310736

In [292]:
# Preprocessing: convert the full text of every paper to lower case
textdata['whole_text'] = textdata['whole_text'].map(str.lower)

In [None]:
# Keep only papers whose full text has at least MIN_TOKENS tokens.
# NOTE(review): the previous comment said ">= 100 words" while the code
# filtered on 200 tokens; the executed threshold (200) is kept — confirm.
MIN_TOKENS = 200
textdata['split_text'] = [nltk.word_tokenize(x) for x in textdata['whole_text']]
text_length = [len(x) >= MIN_TOKENS for x in textdata['split_text']]
# .copy() yields an independent frame, so the column assignments in later
# cells do not write into a slice of the unfiltered frame.
textdata = textdata[text_length].copy()

In [230]:
# Preview the first five rows that survived the length filter.
textdata.iloc[:5]

Unnamed: 0,paper_id,abstract,body_text,whole_text,title,area,split_text
3,94551546,Ethanolamine (EA) or ethylenediamine (ED)-func...,"[{'section': 'INTRODUCTION', 'text': 'Gene the...",gene therapy holds potential for treating many...,Gd(III) ion-chelated supramolecular assemblies...,Materials Science,"[gene, therapy, holds, potential, for, treatin..."
11,159355456,The Government of India has presented an expan...,"[{'section': 'OUR MISSION', 'text': 'Our missi...",our mission is achieving and maintaining excel...,An update on model Ayush wellness clinic at pr...,Political Science,"[our, mission, is, achieving, and, maintaining..."
16,18980380,This technical note studies Markov decision pr...,"[{'section': 'II. PRELIMINARIES', 'text': 'Thr...","throughout the technical note, we use capital ...",Distributionally Robust Counterpart in Markov ...,Mathematics,"[throughout, the, technical, note, ,, we, use,..."
17,18980463,Although development of the adult Drosophila c...,[{'section': 'Embryonic development of the lar...,we followed bo pr development from specificati...,Adult and larval photoreceptors use different ...,Biology,"[we, followed, bo, pr, development, from, spec..."
19,18981111,,[{'section': 'Exploration of Unknown Spaces by...,"orly lahav david mioduser tel aviv university,...",Exploration of Unknown Spaces by People Who Ar...,Computer Science,"[orly, lahav, david, mioduser, tel, aviv, univ..."


In [231]:
from nltk import pos_tag
from nltk.corpus import wordnet
def get_word_postag(word):
    """Map a single word's POS tag to a WordNet part-of-speech constant.

    The word is tagged on its own (as a one-element token list), so the
    tagger sees no surrounding sentence context.

    Returns:
        wordnet.ADJ, wordnet.VERB or wordnet.NOUN when the tag starts with
        'J', 'V' or 'N' respectively; wordnet.ADJ for any other tag.
    """
    # Tag once and reuse the result for every check below.
    tag = pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    # Fallback for every other tag.
    return wordnet.ADJ
    

In [243]:
# Preprocessing: keep only alphanumeric tokens
def keep_alphanum(words):
    """Return the tokens for which ``isalnum()`` is true, in input order."""
    kept = []
    for token in words:
        if token.isalnum():
            kept.append(token)
    return kept

In [244]:
# Preprocessing: keep nouns
def keep_nouns(words):
    """Return the words that get_word_postag classifies as 'n', in order."""
    nouns = []
    for candidate in words:
        if get_word_postag(candidate) == 'n':
            nouns.append(candidate)
    return nouns

In [250]:
# Preprocessing: keep words >= 3 in length
def keep_longer_words(words, min_length=3):
    """Return the words whose length is at least ``min_length``.

    Args:
        words: iterable of strings.
        min_length: minimum number of characters a word must have to be
            kept; defaults to 3, the threshold this notebook uses.

    Returns:
        A new list with the qualifying words in their original order.
    """
    return [word for word in words if len(word) >= min_length]

In [251]:
# Preprocessing: stemming
from nltk.stem import PorterStemmer 
ps = PorterStemmer()


def stemming(words):
    """Return a list with the Porter stem of each word, in input order."""
    return list(map(ps.stem, words))

In [246]:
# Drop non-alphanumeric tokens, then keep only the tokens tagged as nouns.
textdata['nouns'] = [
    keep_nouns(keep_alphanum(tokens)) for tokens in textdata['split_text']
]

In [252]:
# Drop short tokens (keep_longer_words), then stem what remains.
textdata['nouns'] = [
    stemming(keep_longer_words(tokens)) for tokens in textdata['nouns']
]

In [254]:
# Preprocessing: flatten the per-paper noun lists into one corpus of words
corpus_all = [word for text in textdata['nouns'] for word in text]

In [255]:
# Preprocessing: drop English stop words from the corpus
# NOTE(review): the corpus words were stemmed in an earlier cell, while this
# stop list holds unstemmed forms, so stemmed stop words may slip through —
# confirm whether filtering should happen before stemming.
stop_words = set(stopwords.words('english'))
corpus = [word for word in corpus_all if word not in stop_words]

In [256]:
from  collections import Counter
def getUniqueWords(allWords):
    """Return a keys view of the distinct words, in first-seen order."""
    tally = Counter()
    # Feed a generator so the argument is always consumed element by element.
    tally.update(word for word in allWords)
    return tally.keys()

In [258]:
# Distinct words in the stop-word-filtered corpus.
unique_words = getUniqueWords(corpus)
# Cell output: how many distinct words there are.
len(unique_words)

8717

In [259]:
# Count how many times each noun occurs across all papers.
wordfreq = {}
for paper in textdata['nouns']:
    for noun in paper:
        wordfreq[noun] = wordfreq.get(noun, 0) + 1

In [284]:
import heapq
# The 10 keys of wordfreq with the largest counts, highest count first.
most_freq = heapq.nlargest(10, wordfreq, key=wordfreq.get)

In [285]:
# Display the most frequent nouns.
most_freq

['system',
 'studi',
 'data',
 'cell',
 'time',
 'case',
 'group',
 'model',
 'result',
 'patient']

In [272]:
def merged(words):
    """Join the words into one string, separated by single spaces."""
    separator = ' '
    return separator.join(words)

In [273]:
# One space-joined string of stemmed nouns per paper.
textdata['merged'] = list(map(merged, textdata['nouns']))

In [274]:
# Preview the first five rows, now including the nouns and merged columns.
textdata.iloc[:5]

Unnamed: 0,paper_id,abstract,body_text,whole_text,title,area,split_text,nouns,merged
3,94551546,Ethanolamine (EA) or ethylenediamine (ED)-func...,"[{'section': 'INTRODUCTION', 'text': 'Gene the...",gene therapy holds potential for treating many...,Gd(III) ion-chelated supramolecular assemblies...,Materials Science,"[gene, therapy, holds, potential, for, treatin...","[gene, therapi, diseas, cancer, diseas, gene, ...",gene therapi diseas cancer diseas gene therapi...
11,159355456,The Government of India has presented an expan...,"[{'section': 'OUR MISSION', 'text': 'Our missi...",our mission is achieving and maintaining excel...,An update on model Ayush wellness clinic at pr...,Political Science,"[our, mission, is, achieving, and, maintaining...","[mission, excel, healthcar, servic, system, me...",mission excel healthcar servic system medicin ...
16,18980380,This technical note studies Markov decision pr...,"[{'section': 'II. PRELIMINARIES', 'text': 'Thr...","throughout the technical note, we use capital ...",Distributionally Robust Counterpart in Markov ...,Mathematics,"[throughout, the, technical, note, ,, we, use,...","[note, use, capit, letter, denot, matric, bold...",note use capit letter denot matric bold face l...
17,18980463,Although development of the adult Drosophila c...,[{'section': 'Embryonic development of the lar...,we followed bo pr development from specificati...,Adult and larval photoreceptors use different ...,Biology,"[we, followed, bo, pr, development, from, spec...","[develop, specif, precursor, end, larval, life...",develop specif precursor end larval life molec...
19,18981111,,[{'section': 'Exploration of Unknown Spaces by...,"orly lahav david mioduser tel aviv university,...",Exploration of Unknown Spaces by People Who Ar...,Computer Science,"[orly, lahav, david, mioduser, tel, aviv, univ...","[lahav, david, miodus, tel, aviv, univers, sch...",lahav david miodus tel aviv univers school edu...


## Bag of Words Representation

In [286]:
# Alias for the same DataFrame object (no copy is made).
textdata_sample = textdata

In [275]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts restricted to the 1000 most frequent terms.
# The earlier call had a dangling `max_df =` with no value, which is a
# SyntaxError; max_df is therefore left at the library default.
cv = CountVectorizer(max_features=1000)

# Dense document-term count matrix: one row per paper in textdata['merged'].
X = cv.fit_transform(textdata['merged']).toarray()


In [277]:
# (number of documents, number of vocabulary terms)
X.shape

(82, 1000)