In [1]:
import pandas as pd
import numpy as np

from covid.data.constants import *
from covid.models.query_model import regexQueryDf, PatternGenerator
from covid.models.paperclassifier.frontpaperclassifier import FrontPaperClassifier

# Instantiate FrontPaperClassifier from the interest YAML file. In the cells shown
# here it is only used to retrieve keyword lists via fpc.get_keywords(...).
# NOTE(review): YAML_PATH is relative, so it resolves against the notebook's
# working directory.
YAML_PATH = '../covid/models/paperclassifier/interest.yaml'
fpc = FrontPaperClassifier(km_path=YAML_PATH)

  from pandas import Panel
[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


In [2]:
# Helper Functions

def generate_sample_df(df, n=50, from_year=2010):
    """Draw a reproducible, year-weighted random sample of rows from ``df``.

    Rows with a missing ``publish_time`` or published before ``from_year`` are
    discarded first. Each remaining row is weighted by the share of rows that
    were published in the same year (rounded to two decimals), and ``n`` rows
    are drawn without replacement using a fixed random seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``publish_time`` column and a ``title``
        column (non-null titles are what is counted per year).
    n : int, default 50
        Number of rows to sample.
    from_year : int or str, default 2010
        Earliest publication year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with an additional integer ``year`` column. The
        input frame is not modified.

    Notes
    -----
    * Because the shares are rounded to two decimals, a year holding less than
      roughly 0.5% of the rows gets weight 0 and is never sampled (the last
      year is the exception: it receives whatever remains up to 1).
    * NOTE(review): weighting every row by its year's share favours
      well-represented years more than a uniform draw would; a uniform draw
      already preserves year proportions in expectation. Kept as-is so that
      existing samples stay reproducible - confirm this is intended.
    * ``DataFrame.sample`` raises ``ValueError`` when ``n`` is larger than the
      number of rows with a non-zero weight.
    """
    # Select only dated papers published from/after the requested year.
    dated = df[df.publish_time.notnull()]
    recent = dated[dated.publish_time >= str(from_year)]

    # ``assign`` returns a new frame, so neither the caller's DataFrame nor a
    # slice of it is written to (avoids SettingWithCopyWarning).
    df_train = recent.assign(year=recent.publish_time.dt.year)

    # Share of rows per year, rounded to two decimals.
    weights = (df_train.groupby('year')['title'].count() / len(df_train)).round(2)

    # Give the last year the remainder so the weights sum to 1. Clamp at 0:
    # floating-point error (or rounding) can push the remainder slightly below
    # zero, and DataFrame.sample rejects negative weights.
    weights.iloc[-1] = max(0.0, 1 - weights.iloc[:-1].sum())

    series_of_weights = df_train.year.map(weights.to_dict())

    # Fixed seed keeps the sample reproducible across runs.
    return df_train.sample(n=n, weights=series_of_weights, random_state=100)


def save_as_doccano_txt(df, col, file_path):
    """Write the texts in ``df[col]`` to a plain-text file, one text per line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to export.
    col : str
        Name of the column whose values are written. Every value must be a
        string; a missing value (NaN) raises ``TypeError``.
    file_path : str
        Destination file. It is created or overwritten.

    Returns
    -------
    None

    Notes
    -----
    Each text is followed by a single newline and nothing else is inserted
    between texts, so no line starts with a stray separator space.
    NOTE(review): a text that itself contains a newline will span several
    lines in the output file.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (the texts may contain non-ASCII characters).
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(text + '\n' for text in df[col].tolist())

    print(f"Sample data saved as: {file_path}")
    

# Import Raw Metadata

In [3]:
# Base data directory. It is joined to file names by plain string concatenation
# below, so the trailing slash is required.
DATA_DIR = "../data/"
# Location of the raw metadata CSV, relative to DATA_DIR.
FILE_PATH = "raw/metadata.csv"
# Only these columns are loaded from the metadata CSV.
USECOLS = ['cord_uid', 'title', 'abstract', 'publish_time']

In [4]:
# Load the metadata restricted to USECOLS; publish_time is parsed as dates so it
# can be compared and its year extracted in generate_sample_df.
df = pd.read_csv(DATA_DIR + FILE_PATH, usecols=USECOLS, parse_dates=['publish_time'])
df.shape

(51078, 4)

# Filter Data

In [5]:
# Pattern built from coronavirus-related terms.
cp = PatternGenerator(words=['SARS-CoV-2', 'covid','coronavirus', 'covid-19'])
cp.generatePattern()

# Pattern built from the keywords the classifier returns for 'gender'.
rp = PatternGenerator(words=fpc.get_keywords('gender'))
rp.generatePattern()

# Both patterns are passed together to regexQueryDf in the next cell.
# NOTE(review): generatePattern()/getPattern() are project helpers not shown here;
# presumably they build and return a regex for the given words - confirm.
patterns = [rp.getPattern(),cp.getPattern()]

In [6]:
# Query the 'abstract' column with both patterns; the result is used as a row
# filter on df in the next cell.
# NOTE(review): presumably operatorPattern='AND' requires every pattern to match
# and operatorColumn='OR' accepts a match in any listed column - confirm against
# regexQueryDf, which is not shown here.
cond = regexQueryDf(df, ['abstract'], patterns, operatorColumn='OR', operatorPattern='AND')

100%|██████████| 51078/51078 [00:38<00:00, 1339.25it/s]
100%|██████████| 51078/51078 [00:36<00:00, 1411.45it/s]


In [7]:
# Keep only the rows selected by the regex query, then show the resulting shape.
df_filtered = df[cond]
df_filtered.shape

(402, 4)

# Create Sample

In [8]:
# Draw 50 of the filtered papers published from 2010 onwards (fixed seed inside
# generate_sample_df, so the sample is reproducible).
df_sample = generate_sample_df(df_filtered, n=50, from_year=2010)

In [9]:
# Preview the first sampled rows, including the 'year' column added by
# generate_sample_df.
df_sample.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,year
33179,xsgxd5sy,Clinical Characteristics of 74 Children with C...,Background: Severe acute respiratory syndrome ...,2020-03-23,2020
32774,v0ln3wfa,Clinical Features of COVID-19 Related Liver Da...,BACKGROUND: A recent outbreak of SARS-CoV-2 in...,2020-02-27,2020
32952,ohba2n2o,Clinical outcomes of 402 patients with COVID-2...,The SARS-CoV-2 outbreak is causing widespread ...,2020-03-10,2020
37469,p3g9nyl6,Laboratory Parameters in Detection of COVID-19...,INTRODUCTION: The role of laboratory parameter...,2020-04-04,2020
4533,sc2j2os0,Alterations in Nerve-Evoked Bladder Contractio...,BACKGROUND: Patients with neurodegenerative di...,2014-10-13,2014


# Export Sample as Text File for Doccano

In [10]:
# Write the sampled abstracts to a text file under DATA_DIR.
save_as_doccano_txt(df_sample, 
                    col='abstract', 
                    file_path = DATA_DIR + 'doccano_metadata.txt')

Sample data saved as: ../data/doccano_metadata.txt


# Read Annotated Sample

In [23]:
def read_doccano_json(file_path, label_to_name):
    """Load a doccano JSON-lines export and turn labelled spans into columns.

    Parameters
    ----------
    file_path : str
        Path to a JSON-lines file. Each record must provide ``id``, ``text``
        and ``annotations``; every annotation is a dict with ``label``,
        ``start_offset`` and ``end_offset``.
    label_to_name : dict
        Maps an annotation's ``label`` value to the name of the output column
        that receives the annotated text span. Several labels may share one
        column name.

    Returns
    -------
    pandas.DataFrame
        One row per record, with ``text`` renamed to ``abstract``, ``id`` and
        ``annotations`` dropped, and one column per distinct name in
        ``label_to_name``. A column holds the annotated substring, or NaN when
        the record has no annotation mapped to it. If a record has several
        annotations mapped to the same column, the last one wins.

    Raises
    ------
    KeyError
        If an annotation carries a label that is not a key of
        ``label_to_name``.
    """
    df = pd.read_json(file_path, lines=True)

    # One column per distinct target name. The columns are created with object
    # dtype because they will hold strings: assigning a string into a float64
    # (all-NaN) column is deprecated in recent pandas. ``np.nan`` is used
    # because the ``np.NaN`` alias no longer exists in NumPy 2.
    for name in dict.fromkeys(label_to_name.values()):
        df[name] = pd.Series(np.nan, index=df.index, dtype=object)

    # read_json yields a default 0..n-1 index here, so positions work as labels.
    for i in range(len(df)):
        text = df.loc[i, 'text']
        # annotations is a list of dicts, one per labelled span
        for d in df.loc[i, 'annotations']:
            col = label_to_name[d['label']]
            df.loc[i, col] = text[d['start_offset']: d['end_offset']]

    cols_to_drop = ['id', 'annotations']

    return df.rename({'text': 'abstract'}, axis=1)\
             .drop(cols_to_drop, axis=1)

In [24]:
# Maps doccano label ids to output column names; ids 0, 1 and 2 all feed 'target'.
# NOTE(review): id 3 is not mapped - read_doccano_json looks labels up with
# label_to_name[...], so an annotation carrying label 3 raises KeyError. Confirm
# that label 3 is unused in the export.
label_to_name = {0:'target', 1:'target', 2:'target', 4:'o', 5:'c'}

In [25]:
# Load the annotated export from DATA_DIR and display it with one column per
# mapped label name.
read_doccano_json(DATA_DIR+'file.json', label_to_name)

Unnamed: 0,abstract,meta,annotation_approver,target,o,c
0,Objectives: Comorbidities have significant ind...,{},,,,
1,OBJECTIVES: This study aimed to assess the aff...,{},,,,
2,Background: Super-spreading events were associ...,{},,,Super-spreading events were associated with th...,The super-spreading events were associated wit...
3,Objective: To evaluate the spectrum of comorbi...,{},,,,
4,Background A novel coronavirus (SARA-CoV-2) em...,{},,,,
5,INTRODUCTION: The role of laboratory parameter...,{},,,,
6,Based on publicly released data for 1212 patie...,{},,,,
7,BACKGROUND: Viral pathogens were more commonly...,{},,,,
8,Background Chest CT is used to assess the seve...,{},,,,
9,Objective To determine the correlation between...,{},,,,
