In [23]:
import os
import re
import json
import sys
import pandas as pd
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

In [2]:
# Dataset subsets to read, plus the accumulator that the loading cell fills
# with one (paper_id, title, abstract, text) tuple per paper.
dirs = [
    "biorxiv_medrxiv",
    "comm_use_subset",
    "noncomm_use_subset",
]
articles = list()

In [56]:
# Parse every paper JSON of every subset into (paper_id, title, abstract, text).
#
# `articles` is reset here so that re-running this cell does not append the
# same papers a second time (the list was previously only created in an
# earlier cell, which made this cell non-idempotent).
articles = []

for dir_name in dirs:  # `dir_name`, not `dir`, to avoid shadowing the builtin
    path = f"kaggle_data/{dir_name}/{dir_name}/pdf_json"
    # Only parse JSON files; anything else in the folder would break json.load.
    json_files = [name for name in os.listdir(path) if name.endswith(".json")]
    for file_ in tqdm(json_files):
        # Explicit encoding so parsing does not depend on the platform default.
        with open(os.path.join(path, file_), encoding="utf-8") as f:
            data = json.load(f)

        paper_id = data['paper_id']
        title = data['metadata']['title']

        # Each abstract paragraph is followed by a single newline.
        # NOTE(review): .get() tolerates a missing 'abstract' key -- confirm
        # whether any file in these subsets actually omits it.
        full_abstract = "".join(
            part['text'] + "\n" for part in data.get('abstract', [])
        )

        # Each body paragraph is followed by a blank line.
        full_text = "".join(
            text_part["text"] + "\n\n" for text_part in data["body_text"]
        )

        articles.append((paper_id, title, full_abstract, full_text))

100%|██████████| 1625/1625 [00:02<00:00, 627.75it/s]
100%|██████████| 9524/9524 [02:01<00:00, 78.53it/s]
100%|██████████| 2490/2490 [00:28<00:00, 86.54it/s] 


In [57]:
# One row per parsed paper tuple collected in `articles`.
df = pd.DataFrame(
    data=articles,
    columns=['Paper ID', 'Title', 'Abstract', 'Text'],
)

In [58]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Paper ID,Title,Abstract,Text
0,7fdd33661d188e3ea250e99f2752d705b5467ef9,Glycoinformatics approach for identifying targ...,COVID-19 outbreak is still threatening the pub...,The coronavirus spike protein (S) plays a key ...
1,ffbd7555a337706238c211197b221795e4e35146,Title: Estimation of COVID-2019 burden and pot...,,CC-BY-NC-ND 4.0 International license It is ma...
2,9a0c89a9b8ff3969d97b7cc4245489ff69f3a615,Development and external validation of a progn...,"Since the outbreak of COVID-19, there has been...",Background: COVID-19 pandemic has developed ra...
3,08911cdc65e71e6398ca79b46806e6c8b2b730ae,Epigenetic dysregulation of ACE2 and interfero...,medRxiv preprint immunology 2015; 156(1): 19-2...,Infection caused by SARS-CoV-2 can result in s...
4,bdaa40d95b82093f60a1c5ac8b798d67cef3a52b,A SARS-CoV-2 Vaccination Strategy Focused on P...,Here we propose a vaccination strategy for SAR...,The current SARS-CoV-2 pandemic has precipitat...


## Find all articles where the word 'incubation' is used

In [59]:
# Keep an independent copy of the papers whose body text matches 'incubation',
# ignoring case.
has_incubation = df['Text'].str.contains('incubation', flags=re.IGNORECASE)
df_incubation = df.loc[has_incubation].copy()

def how_many_mentioned(text: str, keyword="incubation"):
    """Count whole-word occurrences of ``keyword`` in ``text``.

    Matching is case-insensitive and ignores adjacent punctuation, so
    "Incubation", "incubation," and "(incubation)" are all counted. Longer
    words that merely contain the keyword (e.g. "preincubation") are not.

    Args:
        text: The text to search.
        keyword: The word to count; treated literally, not as a regex.

    Returns:
        The number of whole-word matches as an int. 0 for an empty keyword.
    """
    if not keyword:
        # An empty keyword can never equal a whitespace-delimited token.
        return 0
    # Lookarounds instead of \b so keywords that start or end with a non-word
    # character are still delimited correctly.
    pattern = re.compile(
        rf"(?<!\w){re.escape(keyword)}(?!\w)", flags=re.IGNORECASE
    )
    return len(pattern.findall(text))

# Number of times the substring "incubation" occurs in each lower-cased body
# text; progress_apply shows a progress bar while the column is computed.
df_incubation["Mentioned"] = df_incubation["Text"].progress_apply(
    lambda body: body.lower().count("incubation")
)

HBox(children=(IntProgress(value=0, max=4353), HTML(value='')))

In [62]:
# Number of rows in df_incubation.
df_incubation.shape[0]

4353

In [63]:
# Drop duplicate rows (same Text and Title).
# drop_duplicates returns a new frame, so the result is assigned back;
# otherwise the duplicates would still be present in the cells below.
df_incubation = df_incubation.drop_duplicates(subset=["Text", "Title"])
df_incubation.head(10)

Unnamed: 0,Paper ID,Title,Abstract,Text,Mentioned
10,73d80c8f5780d70bd8d343188c56e898e91557b6,Ca 2+ ions promote fusion of Middle East Respi...,Middle East respiratory syndrome coronavirus (...,Coronaviruses (CoVs) comprise a family of enve...,1
11,78e49fdb6f0aa9924a5b510341d52b618fff0ca6,A Multiscale and Comparative Model for Recepto...,The respiratory syndrome caused by a new type ...,The coronavirus disease 2019 has emerged at th...,2
12,214ef8154bf31571fcb97fd44b8403df7e208e80,Development and Evaluation of an AI System for...,Early detection of COVID-19 based on chest CT ...,"The new coronavirus disease, now known as COVI...",1
16,1eaa329f608055620a57e6273e9d1c409de1e9ee,Structure of the chromatin remodelling enzyme ...,ATP-dependent chromatin remodelling proteins r...,The extended family of ATPases related to the ...,1
17,49ac69f362c27acbc6de0c5cbb640267e7a1e797,Clinical features and outcomes of 221 patients...,Pan and ZY Peng are cocorresponding authors.\n...,"In late December 2019, an outbreak of acute re...",2
18,ecdbe4f84b2227c7cb8460bd851fa826982f8351,Can N95 respirators be reused after disinfecti...,The Coronavirus Disease 2019 pandemic has led ...,COVID-19 is an ongoing pandemic with nearly a ...,2
24,5cb61edbbbed7a03b791fe1628e1d68577a9a980,Importance of suppression and mitigation measu...,I employ a simple mathematical model of an epi...,"management options, and since some nations are...",2
26,94dd454a02cb13481f099002f164e1b4f1590cd7,Ultra-Low-Cost Integrated Silicon-based Transd...,Rapid screening and low-cost diagnosis play a ...,Despite the advancement of diagnostic technolo...,1
27,39037d14bf47bea9a4721c8f331da9661f18fef5,Transmission Dynamics of COVID-19 and Impact o...,In this work we construct a mathematical model...,"In mid-December 2019, the first cases of a hit...",3
29,1f783a5e029f80516169b19fe6e0dacc1e171f87,Metagenomic Nanopore sequencing of influenza v...,Influenza is a major global public health thre...,Influenza is a major global public health thre...,1


In [64]:
# Collect the day counts quoted in sentences that mention incubation.
# Exploratory preview: stops after the first article that pushes the list past
# 10 values.
day_pattern = re.compile(r" (\d{1,2}) day")  # e.g. " 14 day" -> "14"

incubation_times = []
for text in df_incubation["Text"]:
    # Naive sentence split on ". ".
    for sentence in text.split(". "):
        # Case-insensitive, matching how df_incubation was filtered, so that
        # sentences starting with "Incubation ..." are not skipped.
        if "incubation" not in sentence.lower():
            continue
        for match in day_pattern.finditer(sentence):
            print(match.group(0))
            print(sentence)

            incubation_times.append(float(match.group(1)))
    if len(incubation_times) > 10:
        print(incubation_times)
        break

 24 day
6 9 Based on a recent large-scale epidemiological survey, the latency period of the SARS-CoV-2 may extend up to 24 days, even though the proportion of patients with long incubation period is very small, but the medium incubation period remains short at 3 days
 3 day
6 9 Based on a recent large-scale epidemiological survey, the latency period of the SARS-CoV-2 may extend up to 24 days, even though the proportion of patients with long incubation period is very small, but the medium incubation period remains short at 3 days
 20 day
As the incubation period is around 3-4 days but can be as long as 20 days, along with the presence of asymptomatic carriers, the virus has been extremely difficult to contain 11 
 3 day
SARS-CoV-2 also uses the angiotensin converting enzyme II (ACE2) receptors like the SARS-CoV [15] .

The incubation period of COVID-19 can vary from 3 days to 14 days with a median of approximately 5 days [16] 
 14 day
SARS-CoV-2 also uses the angiotensin converting enzy

## Count words in abstract and text

In [42]:
# Whitespace-delimited word counts for the body text and the abstract.
# The body text is tokenised once and reused for both the total and the
# unique count.
text_tokens = df["Text"].str.split()
df["Text Word Count"] = text_tokens.str.len()
df["Abstract Word Count"] = df["Abstract"].str.split().str.len()
df["Text Unique Word Count"] = text_tokens.apply(lambda tokens: len(set(tokens)))

# Show a preview only: a bare `df.head()` mid-cell displays nothing, and a
# bare `df` dumps the whole frame.
df.head(10)

Unnamed: 0,Paper ID,Title,Abstract,Text,Text Word Count,Abstract Word Count,Text Unique Word Count
0,7fdd33661d188e3ea250e99f2752d705b5467ef9,Glycoinformatics approach for identifying targ...,COVID-19 outbreak is still threatening the pub...,The coronavirus spike protein (S) plays a key ...,1876,175,660
1,ffbd7555a337706238c211197b221795e4e35146,Title: Estimation of COVID-2019 burden and pot...,,CC-BY-NC-ND 4.0 International license It is ma...,982,0,375
2,9a0c89a9b8ff3969d97b7cc4245489ff69f3a615,Development and external validation of a progn...,"Since the outbreak of COVID-19, there has been...",Background: COVID-19 pandemic has developed ra...,3711,237,1166
3,08911cdc65e71e6398ca79b46806e6c8b2b730ae,Epigenetic dysregulation of ACE2 and interfero...,medRxiv preprint immunology 2015; 156(1): 19-2...,Infection caused by SARS-CoV-2 can result in s...,1660,50,533
4,bdaa40d95b82093f60a1c5ac8b798d67cef3a52b,A SARS-CoV-2 Vaccination Strategy Focused on P...,Here we propose a vaccination strategy for SAR...,The current SARS-CoV-2 pandemic has precipitat...,4175,151,1120
5,169886ed560d25fc250346b0eb02b9b5fa73e5f9,Immune Cell Profiling of COVID-19 Patients in ...,"COVID-19, caused by SARS-CoV-2, has recently a...","COVID-19, caused by severe acute respiratory s...",5033,403,1421
6,10a6153b5187c8c61c74afff8f87113422211d27,Inferring Timing of Infection Using Within-hos...,,"To avoid future outbreaks, governments impleme...",1350,0,565
7,f2fe314b042c723bfefde68185f25091cee87dec,Interaction of the spike protein RBD from SARS...,The spread of the COVID-19 caused by the SARS-...,The coronavirus SARS-CoV-2 (previously known a...,3019,258,989
8,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,Sequencing of the human IG light chain loci fr...,Germline variation at immunoglobulin gene (IG)...,Antibodies are essential components of the imm...,5478,200,1479
9,9c016d8d0a0e01eaa2f449c5c779a6ecce82ff17,Early Spread of SARS-Cov-2 in the Icelandic Po...,Limited data exist on how SARS-CoV-2 enters an...,Severe acute respiratory syndrome coronavirus ...,3734,205,1046


In [39]:
# Print a summary of df via DataFrame.info().
# NOTE(review): this cell's execution count (In [39]) is lower than the loading
# cell's (In [56]), so the summary below may describe an earlier state of `df`
# -- re-run top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1625 entries, 0 to 1624
Data columns (total 7 columns):
Paper ID                  1625 non-null object
Title                     1625 non-null object
Abstract                  1625 non-null object
Text                      1625 non-null object
Text Word Count           1625 non-null int64
Abstract Word Count       1625 non-null int64
Text Unique Word Count    1625 non-null int64
dtypes: int64(3), object(4)
memory usage: 88.9+ KB
