In [1]:
import os
import re
import sys
import json
import glob

import numpy as np
import pandas as pd

from pprint import pprint
from copy import deepcopy
from datetime import datetime
from tqdm.notebook import tqdm

### Global variables

In [2]:
root = "../data/"

design_list = [
    'retrospective cohort', 'cross-sectional case-control',
    'cross sectional case control', 'prevalence survey', 'systematic review ',
    ' meta-analysis', ' meta analysis', 'matched case-control',
    'matched case control', 'medical record review',
    'observational case series', 'time series analysis',
    'pseudo-randomized controlled trials',
    'pseudo randomized controlled trial', 'randomized controlled trials',
    'randomized controlled trial', 'retrospective analysis',
    'retrospective study', 'retrospective studies'
]

risk_factors = [
    'diabete', 'hypertension', 'heart disease', 'cancer', 'smoking',
    'history of lung', 'local climate', 'elderly', 'children',
    'immune compromised ', 'age deciles', 'race', 'ethnicity', 'education',
    'income', 'insurance', 'housing', 'immigration', 'prison inmate',
    'mental hospital inpatients', 'long-term care facility resident',
    'long term care'
    'health worker', 'first responder', 'hospital staff', 'nursing home',
    'prison', 'staff', 'pregnancy', 'baby', 'enfant', 'pulm', 'neonates'
]

covid_keywords = ['covid-19', 'hcov-19', 'cord-19' ,'2019-ncov', 'wuhan coronavirus', 'sars-cov-2', 'covid']

In [3]:
# Find all .json files in data(they're papers)
paper_filenames = glob.glob(f'{root}/**/*.json', recursive=True)

### Loading metadata 

In [4]:
metadata_df = pd.read_csv(
    root + 'metadata.csv',
    parse_dates=['publish_time'],
    keep_default_na=False,
    usecols=['title', 'sha', 'abstract', 'publish_time', 'journal'])

#### Filtering for papers published on 2019 and after

In [5]:
metadata_df = metadata_df[metadata_df['publish_time'] >= datetime(2019, 1, 1)]

#### Filtering for papers with references to covid-19

In [6]:
metadata_df = metadata_df[metadata_df['abstract'].apply(
    lambda x: any(key in x.lower() for key in covid_keywords))]

In [7]:
metadata_df.shape

(2599, 5)

In [8]:
relevant_papers = metadata_df['sha'].values

### Extract snippets with mentions of study designs

In [20]:
from tqdm.auto import tqdm
from transformers import *
from summarizer import Summarizer

I0413 03:03:51.195220  1796 file_utils.py:41] PyTorch version 1.2.0 available.


In [21]:
scibert_link = 'allenai/scibert_scivocab_uncased'
sci_config = AutoConfig.from_pretrained(scibert_link)
sci_config.output_hidden_states = True
sci_tokenizer = AutoTokenizer.from_pretrained(scibert_link)
sci_model = AutoModel.from_pretrained(scibert_link, config = sci_config)

I0413 03:04:31.188186  1796 filelock.py:274] Lock 2165495909848 acquired on C:\Users\Nikhil Budathoki\.cache\torch\transformers\199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b.lock
I0413 03:04:31.190179  1796 file_utils.py:479] https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/config.json not found in cache or force_download set to True, downloading to C:\Users\Nikhil Budathoki\.cache\torch\transformers\tmpgk1uxboa


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=313.0, style=ProgressStyle(description_…

I0413 03:04:31.446455  1796 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/config.json in cache at C:\Users\Nikhil Budathoki\.cache\torch\transformers\199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b
I0413 03:04:31.448450  1796 file_utils.py:492] creating metadata file for C:\Users\Nikhil Budathoki\.cache\torch\transformers\199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b
I0413 03:04:31.450445  1796 filelock.py:318] Lock 2165495909848 released on C:\Users\Nikhil Budathoki\.cache\torch\transformers\199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b.lock
I0413 03:04:31.453437  1796 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/




I0413 03:04:31.760656  1796 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/config.json from cache at C:\Users\Nikhil Budathoki\.cache\torch\transformers\199e28e62d2210c23d63625bd9eecc20cf72a156b29e2a540d4933af4f50bda1.79c4dd84b76a6991002b44cd58102c732c37aba834ad6401ddd6a89bd0ed809b
I0413 03:04:31.761653  1796 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=227845.0, style=ProgressStyle(descripti…

I0413 03:04:32.339347  1796 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/vocab.txt in cache at C:\Users\Nikhil Budathoki\.cache\torch\transformers\e3debd8fbdf40874753724814ee0520f612b577b26c8755bca485103b47cd3bc.60287becc5ab96d85a4bf377eb90feaf3b9c80d3b23e84311dccd3588f56d4fb
I0413 03:04:32.341344  1796 file_utils.py:492] creating metadata file for C:\Users\Nikhil Budathoki\.cache\torch\transformers\e3debd8fbdf40874753724814ee0520f612b577b26c8755bca485103b47cd3bc.60287becc5ab96d85a4bf377eb90feaf3b9c80d3b23e84311dccd3588f56d4fb
I0413 03:04:32.343340  1796 filelock.py:318] Lock 2165628011240 released on C:\Users\Nikhil Budathoki\.cache\torch\transformers\e3debd8fbdf40874753724814ee0520f612b577b26c8755bca485103b47cd3bc.60287becc5ab96d85a4bf377eb90feaf3b9c80d3b23e84311dccd3588f56d4fb.lock





I0413 03:04:32.947041  1796 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/vocab.txt from cache at C:\Users\Nikhil Budathoki\.cache\torch\transformers\e3debd8fbdf40874753724814ee0520f612b577b26c8755bca485103b47cd3bc.60287becc5ab96d85a4bf377eb90feaf3b9c80d3b23e84311dccd3588f56d4fb
I0413 03:04:32.948039  1796 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/added_tokens.json from cache at None
I0413 03:04:32.948039  1796 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/special_tokens_map.json from cache at None
I0413 03:04:32.949036  1796 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/tokenizer_config.json from cache at None
I0413 03:04:33.192878  1796 filelock.py:274] Lock 21656109694

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442221694.0, style=ProgressStyle(descri…

I0413 03:04:51.238825  1796 file_utils.py:489] storing https://s3.amazonaws.com/models.huggingface.co/bert/allenai/scibert_scivocab_uncased/pytorch_model.bin in cache at C:\Users\Nikhil Budathoki\.cache\torch\transformers\a4e19031683f34af5fd1c4cca73a3dbe33f8b9e50ad91ddf12ceac577b93c433.7587182ea55c40bf7fd0961c1176c31fa22558da2bf20c199874fa5a8ecb4613
I0413 03:04:51.240822  1796 file_utils.py:492] creating metadata file for C:\Users\Nikhil Budathoki\.cache\torch\transformers\a4e19031683f34af5fd1c4cca73a3dbe33f8b9e50ad91ddf12ceac577b93c433.7587182ea55c40bf7fd0961c1176c31fa22558da2bf20c199874fa5a8ecb4613
I0413 03:04:51.242815  1796 filelock.py:318] Lock 2165610969408 released on C:\Users\Nikhil Budathoki\.cache\torch\transformers\a4e19031683f34af5fd1c4cca73a3dbe33f8b9e50ad91ddf12ceac577b93c433.7587182ea55c40bf7fd0961c1176c31fa22558da2bf20c199874fa5a8ecb4613.lock
I0413 03:04:51.243814  1796 modeling_utils.py:507] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/allen




In [22]:
sci_model = Summarizer(custom_model=sci_model,custom_tokenizer=sci_tokenizer)

## Ditching analysis of all papers for analysis of relevant papers only

In [36]:
risk_cov_df = pd.read_json(root + 'risk_covid_join.json', orient='records')

In [37]:
risk_cov_df.shape

(246, 14)

In [40]:
risk_cov_df.dropna(subset = ['full_text'], inplace = True)

In [47]:
id_text = risk_cov_df.loc[:, ['doc_id', 'full_text']]

In [58]:
lengths = [len(text) for text in id_text['full_text'].values]

In [63]:
sample = id_text[id_text['full_text'].apply(lambda x: len(x) < 1000000 and len(x) > 800000)]['full_text'].iloc[0]

In [None]:
tqdm.pandas()
id_text['scibert_summary'] = id_text.progress_apply(lambda x:
                                                             sci_model(x['full_text'], ratio=0.25)
                                                             if len(x['full_text']) > 90000
                                                             else x['scibert_summary']
                                                                )        

In [76]:
id_text_updated = pd.DataFrame(columns =  id_text.columns)
for index, row in id_text.iterrows():
    print(index)
    if row['scibert_summary'] == "Too long":
        if len(row['full_text']) < 1000000:
            row['scibert_summary'] = sci_model(row['full_text'], ratio=0.25)
    id_text_updated.append(row, ignore_index=True)

0
1
2
6
7
8
12
13
14
18
20
21
23
24
26
27
28
29
30
33
36
38
39
40
41
42
44
45
48
50
52
53
54
56
57
61
62
63
64
66
67
68
69
70
71
72
73
74
76
77
78
79
80
81
84
85
89
90
91
92
93
94
96
97
98
100
101
102
104
107
108
109
110
111
112
113
114
115
116
119
121
122
123
124
125
127
128
129
130
135
136
137
138
141
142
144
145
146
147
149
151
152
154
155
157
159
160
161
162
164
172
174
175
177
178
182
183
184
185
190
191
193
194
195
196
197
199
202
203
205
206
208
209
210
211
212
213
215
217
218
219
224
228
229
230
232
233
234
235
236
239
240
242
244


In [77]:
id_text_updated.to_json('../../id_text_summarized.json', orient='records')

In [81]:
risk_cov_df.drop('scibert_summary', axis=1, inplace = True)

In [83]:
risk_cov_df = risk_cov_df.merge(id_text, on='doc_id')

In [85]:
risk_cov_df.to_json('risk_cov_join.json', orient='records')