## Data Cleaning

In [19]:
import pandas as pd

# Load the pickled articles table and show it as the cell output.
# NOTE(review): unpickling executes arbitrary code; only load pickles produced by this project.
articles_pickle_path = 'pickles/articles_df.pkl'
articles_df = pd.read_pickle(articles_pickle_path)
articles_df

Unnamed: 0,paper_id,title,doi,abstract,body_text
0,0015023cc06b5362d332b3baf348d11567ca2fbb,The RNA pseudoknots in foot-and-mouth disease ...,doi.org/10.1101/2020.01.10.901801,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP..."
1,004f0f8bb66cf446678dc13cf2701feec4f36d76,Healthcare-resource-adjusted vulnerabilities t...,doi.org/10.1101/2020.02.11.20022111,,The 2019-nCoV epidemic has spread across China...
2,00d16927588fb04d4be0e6b269fc02f0d3c2aa7b,"Real-time, MinION-based, amplicon sequencing f...",doi.org/10.1101/634600,Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by..."
3,013d9d1cba8a54d5d3718c229b812d7cf91b6c89,Assessing spread risk of Wuhan novel coronavir...,doi.org/10.1101/2020.02.04.20020479,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p..."
4,01d162d7fae6aaba8e6e60e563ef4c2fca7b0e18,"TWIRLS, an automated topic-wise inference meth...",doi.org/10.1101/2020.02.24.20025437,Faced with the current large-scale public heal...,The sudden outbreak of the new coronavirus (SA...
...,...,...,...,...,...
13197,ff365ebbc0fc55476886b0abd129e227c1f8a527,Article focus Hip,http://dx.doi.org/10.1302/2046-3758.59.BJR-201...,We report a systematic review and metaanalysis...,Despite the fact that total hip arthroplasty (...
13198,ff7d49ac4008f60ef9c5a437e0d504dcefd1246f,,http://dx.doi.org/10.3201/eid1610.100840,,results of studies conducted in other countrie...
13199,ffb381668d93248759ca3855425e05722cb9f562,,http://dx.doi.org/10.3201/eid1108.050110,,H uman coronaviruses (HCoVs) were first record...
13200,ffd3a93b927e221ded4cf76536ad31bef2c74b89,Fatal Respiratory Infections Associated with R...,http://dx.doi.org/10.3201/eid1811.120607,During an outbreak of severe acute respiratory...,During an outbreak of severe acute respiratory...


In [20]:
import re
import string
import nltk

def clean_text(text):
    """Normalize raw article text into lowercase ASCII words separated by single spaces.

    The steps run in an order that lets every pattern see the characters it
    needs: URLs and bracketed/parenthesized spans are removed *before*
    punctuation is stripped, and whitespace (including newlines) is turned
    into spaces instead of being deleted so adjacent words are not fused.

    Parameters
    ----------
    text : str, None or float NaN
        Raw text. ``None`` and NaN are treated as "no text".

    Returns
    -------
    str
        Cleaned text. Leading/trailing single spaces may remain; tokens that
        contained a digit and all punctuation (hyphens included) are removed.
    """
    # Missing values have nothing to clean; return an empty document instead
    # of failing on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()
    # Newlines/tabs become spaces so words on adjacent lines stay separate.
    text = re.sub(r'\s+', ' ', text)
    # Drop URLs while their ':' and '/' characters are still present.
    text = re.sub(r'http\S+', '', text)
    # Drop bracketed (e.g. citation markers) and parenthesized spans.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    # Delete everything except ASCII letters, digits, spaces and hyphens.
    # The hyphen is kept for now so that "covid-19" loses only the "19" below.
    text = re.sub(r'[^a-z0-9 -]', '', text)
    # Drop every token that contains a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Hyphens are the only punctuation left; joining the parts matches the
    # previous behavior ("real-time" -> "realtime").
    text = text.replace('-', '')
    # Collapse the runs of spaces left behind by the removals above.
    text = re.sub(r' +', ' ', text)
    return text

In [21]:
# Clean each text column and keep the result as a one-column DataFrame
# (the column keeps the name of the source Series).
abstracts_clean = articles_df['abstract'].apply(clean_text).to_frame()
body_texts_clean = articles_df['body_text'].apply(clean_text).to_frame()

In [29]:
# Keep the identifying columns untouched and attach the cleaned text columns.
articles_df_clean = articles_df[['paper_id', 'title', 'doi']].assign(
    abstract=abstracts_clean['abstract'],
    body_text=body_texts_clean['body_text'],
)

In [30]:
# Persist the cleaned corpus to disk.
corpus_pickle_path = 'pickles/corpus.pkl'
articles_df_clean.to_pickle(corpus_pickle_path)

## Document-Term Matrix

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
import os
from itertools import accumulate

# Number of directory entries per source folder, preceded by a 0 so the running
# totals below start at row 0.
# NOTE(review): assumes the rows of the articles table are grouped by source in
# this same order and that each folder holds only paper files — TODO confirm.
lenghts = [0] + [
    len(os.listdir(source_dir))
    for source_dir in (
        'biorxiv_medrxiv/biorxiv_medrxiv/',
        'comm_use_subset/comm_use_subset/',
        'noncomm_use_subset/noncomm_use_subset/',
        'pmc_custom_license/pmc_custom_license/',
    )
]

# Running totals: the boundaries between consecutive sources.
length_indexes = list(accumulate(lenghts))

# One [start, stop] pair per source, built from neighbouring boundaries.
indexes = [list(bounds) for bounds in zip(length_indexes, length_indexes[1:])]

def papers_count_vectorizer(articles, column, indexes, sparse=False):
    """Build a document-term matrix for one contiguous slice of papers.

    Parameters
    ----------
    articles : pandas.DataFrame
        Table containing a ``paper_id`` column and the text column ``column``.
    column : str
        Name of the text column to vectorize.
    indexes : sequence
        ``indexes[0]`` and ``indexes[1]`` are the positional start (inclusive)
        and stop (exclusive) rows of the slice.
    sparse : bool, default False
        When True the counts are kept in pandas sparse columns instead of a
        dense array, which avoids materializing a documents x vocabulary
        array. The default keeps the original dense result.

    Returns
    -------
    pandas.DataFrame
        One row per paper (indexed by ``paper_id``), one column per
        vocabulary term, values are term counts.
    """
    start, stop = indexes[0], indexes[1]

    cv = CountVectorizer(stop_words='english')
    data_cv = cv.fit_transform(articles[column].iloc[start:stop])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
    terms = feature_names()

    paper_ids = pd.Index(articles['paper_id'].iloc[start:stop], name='paper_id')

    if sparse:
        return pd.DataFrame.sparse.from_spmatrix(data_cv, index=paper_ids, columns=terms)
    return pd.DataFrame(data_cv.toarray(), index=paper_ids, columns=terms)

In [25]:
# One abstract document-term matrix per [start, stop] row span.
abstracts_dtms = [
    papers_count_vectorizer(articles_df_clean, column='abstract', indexes=row_span)
    for row_span in indexes
]

In [26]:
# Display the document-term matrix built from the second row span in `indexes`.
abstracts_dtms[1]

Unnamed: 0_level_0,aa,aaa,aaag,aac,aad,aadsj,aadult,aag,aagarose,aaggaacagaacaagaagggaa,...,zymogens,zymograms,zymography,zymosan,zymoxin,zymoxins,zyview,zz,zzn,zzz
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000b7d1517ceebb34e1e3e817695b6de03e2fa78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00142f93c18b07350be89e96372d240372437ed9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0022796bb2112abd2e6423ba2d57751db06049fb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00326efcca0852dc6e39dc6b7786267e1bc4f194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00352a58c8766861effed18a4b079d1683fec2ec,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffee1423c1320d7070fe9a871224a468768a4c10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffef8194e52de95fe345db7dd12fe3185d786978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fff1e7b356f0d6cf7b28b019974833200e38f843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fff3678cfe3ce7a9ccae1e7becf17d5d71d1b54a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Source names, used as pickle file names; paired positionally with abstracts_dtms.
directories = [
    'biorxiv_medrxiv',
    'comm_use_subset',
    'noncomm_use_subset',
    'pmc_custom_license',
]

# Write each abstract document-term matrix to its own pickle.
for directory, dtm in zip(directories, abstracts_dtms):
    target_path = 'pickles/dtms/abstracts/{}.pkl'.format(directory)
    dtm.to_pickle(target_path)

In [28]:
# Source names, paired positionally with the [start, stop] row spans in `indexes`.
directories = [
    'biorxiv_medrxiv',
    'comm_use_subset',
    'noncomm_use_subset',
    'pmc_custom_license',
]

# Build and pickle the body-text document-term matrix for every source except
# 'comm_use_subset'.
# NOTE(review): the reason for skipping that source is not stated here —
# presumably the dense matrix is too large; confirm.
for directory, index in zip(directories, indexes):
    if directory != 'comm_use_subset':
        # Chained on purpose: no name keeps the large frame alive after it is written.
        papers_count_vectorizer(
            articles_df_clean, 'body_text', index
        ).to_pickle('pickles/dtms/body_texts/{}.pkl'.format(directory))