# Imports

In [None]:
# Mount Google Drive inside the Colab runtime; every dataset path below lives under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import json
import re
import random

# Data prep

In [None]:
# Build an in-memory index of the SciFact corpus: doc_id -> full corpus record.
corpus = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/corpus.jsonl') as f_pdf:
    # corpus.jsonl holds one JSON document per line.
    for pdf_parse_dict in map(json.loads, f_pdf):
        corpus[pdf_parse_dict['doc_id']] = pdf_parse_dict
print("Corpus parsed.")

Corpus parsed.


In [None]:
# Sanity check: every abstract should be a list of sentences, not one flat string.
# Prints "e" once per offending document; no output means the check passed.
# (The record is not bound to the name `abs`, so the builtin abs() stays usable.)
for record in corpus.values():
    if isinstance(record['abstract'], str):
        print("e")

In [None]:
# Gather every SciFact claim that has evidence attached, from the train split and
# then the dev split. Claims whose 'evidence' field is empty are only counted.
claim_id_to_claim = []
claim_no_evidence = 0
num_evid = 0
claim_files = (
    # Retrieve all claims for the train set
    f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_train.jsonl',
    # Retrieve all claims from the dev set
    f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_dev.jsonl',
)
for claims_path in claim_files:
    with open(claims_path) as f_pdf:
        for claim_dict in map(json.loads, f_pdf):
            if not claim_dict['evidence']:
                claim_no_evidence += 1
                continue
            claim_id_to_claim.append(claim_dict)
print("Claims parsed.")
print("Evidence Num:", len(claim_id_to_claim))
print("No Evidence:", claim_no_evidence, "(these are NEIs from SciFact train and dev)")

Claims parsed.
Evidence Num: 693
No Evidence: 416 (these are NEIs from SciFact train and dev)


In [None]:
# Example claim with evidence: display one entry (index 19, chosen arbitrarily) to show the record layout.
claim_id_to_claim[19]

{'id': 133,
 'claim': 'Assembly of invadopodia is triggered by focal generation of phosphatidylinositol-3,4-biphosphate and the activation of the nonreceptor tyrosine kinase Src.',
 'evidence': {'16280642': [{'sentences': [3, 4], 'label': 'SUPPORT'}]},
 'cited_doc_ids': [38485364, 6969753, 17934082, 16280642, 12640810]}

In [None]:
# Load every citance together with its associated claims.
# Each list entry is a (citance_text, claims) tuple; " [n]" citation markers are
# stripped from the citance text before it is stored.
citances = []
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_with_citances.jsonl') as f_pdf:
    for citance_dict in map(json.loads, f_pdf):
        citance_text = re.sub(r' \[\d+\]', '', citance_dict['citance'])
        citances.append((citance_text, citance_dict['claims']))
print("Citances parsed.")
print('Len Citances:', len(citances))
# Notes:
# - Not every claim for every citance is actually in SciFact.
# - For a single citance, all support claims are supported by the same doc(s);
#   the same holds for NEI and for contradicting claims.
# - Our dataset should contain: # of citances with support evidence
#   + # of negations + # of SciFact NEI.

Citances parsed.
Len Citances: 585


In [None]:
# Each entry has the format (citance, one or more claims); show how many citances were loaded.
len(citances)

585

In [None]:
# Example citance: `citance_dict` still holds the last record parsed by the citance-loading loop.
citance_dict

{'s2orc_id': 4504146,
 'title': 'Outcome of the First wwPDB Hybrid/Integrative Methods Task Force Workshop.',
 'abstract': 'Structures of biomolecular systems are increasingly computed by integrative modeling that relies on varied types of experimental data and theoretical information. We describe here the proceedings and conclusions from the first wwPDB Hybrid/Integrative Methods Task Force Workshop held at the European Bioinformatics Institute in Hinxton, UK, on October 6 and 7, 2014. At the workshop, experts in various experimental fields of structural biology, experts in integrative modeling and visualization, and experts in data archiving addressed a series of questions central to the future of structural biology. How should integrative models be represented? How should the data and integrative models be validated? What data should be archived? How should the data and models be archived? What information should accompany the publication of integrative models?',
 'citation_paragrap

In [None]:
# Index SciFact evidence by a normalized form of the claim text (lower-cased, with
# '.', '!', '?', ',' and all whitespace removed) so claims can be matched by text.
claims_and_evidence = {}
for claim_entry in claim_id_to_claim:
    normalized_text = re.sub(r'[.!?,\s]', '', claim_entry['claim'].lower())
    # The normalized form is also stored back on the claim record itself.
    claim_entry['normalized_claim'] = normalized_text
    claims_and_evidence[normalized_text] = claim_entry['evidence']

In [None]:
# Read the negated citances from disk; later cells index `negations` by citance text.
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/data/negations.json') as f:
    negations = json.loads(f.read())
print("Citance negations parsed.")

Citance negations parsed.


In [None]:
# Map each cleaned citance text (citation markers like " [12]" removed) to the
# s2orc_id stored on its record.
citance_to_id_dict = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_with_citances.jsonl') as f_pdf:
    for citance_dict in map(json.loads, f_pdf):
        cleaned_citance = re.sub(r' \[\d+\]', '', citance_dict['citance'])
        citance_to_id_dict[cleaned_citance] = citance_dict['s2orc_id']

# Citance to Claim to Evidence matching

In [None]:
# Average number of claims per citance.
claims_per_citance = [len(entry[1]) for entry in citances]
print(sum(claims_per_citance) / len(claims_per_citance))

1.9401709401709402


In [None]:
# Spot check: for the second citance, report which of its claims match a
# normalized SciFact claim and which do not.
item = citances[1]
print(item)
for claim in item[1]:
    normal = re.sub(r'[.!?,\s]', '', claim['text'].lower())
    if normal in claims_and_evidence:
        print(normal)
    else:
        print("NOT IN:", normal)

('Biodegradable and biocompatible 0DBMs seem to be promising candidates to solve the problem, since they show great abilities to deliver the biomolecules in to cells , and some 0DBMs even show inductive properties themselves.', [{'text': '0-dimensional biomaterials lack inductive properties.', 'subject': '', 'is_negation': True}, {'text': '0-dimensional biomaterials show inductive properties.', 'subject': '', 'is_negation': False}])
NOT IN: 0-dimensionalbiomaterialslackinductiveproperties
NOT IN: 0-dimensionalbiomaterialsshowinductiveproperties


In [None]:
# Peek at the first (citance, claims) entry.
citances[0]

('Active caspase-11 then promotes pyroptosis, and could also participate in the activation of the NLRP3 inflammasome or regulate phagosome-lysosome fusion .',
 [{'text': ' Active caspase-11 participate in regulating phagosome-lysosome fusion.',
   'subject': 'caspase-11',
   'is_negation': False},
  {'text': 'Active caspase-11 participates in the activation of the NLRP3 inflammasome.',
   'subject': 'caspase-11',
   'is_negation': False},
  {'text': 'Active caspase-11 participates in the repression of the NLRP3 inflammasome.',
   'subject': 'caspase-11',
   'is_negation': True},
  {'text': 'Active caspase-11 protein promotes pyroptosis.',
   'subject': 'caspase-11',
   'is_negation': False},
  {'text': 'Active caspase-11 protein suppresss pyroptosis.',
   'subject': 'caspase-11',
   'is_negation': True}])

In [None]:
# Setup of SUPPORTS and CONTRADICTS
#
# For every citance claim whose normalized text matches a SciFact claim, each
# SUPPORT evidence entry produces two dataset items in `outer`:
#   * the citance itself, labelled SUPPORT against the evidence document, and
#   * the negated citance (from `negations`), labelled CONTRADICT against the
#     same document.
# Every evidence document used is also copied into `orpus`, with the first
# abstract sentence stored as its title.
outer = {}           # claim text (citance or negated citance) -> dataset item
orpus = {}           # doc_id -> {'title', 'abstract', 'doc_id'}
lines = []           # not used in the visible cells
failed_keys = []     # normalized citance claims with no matching SciFact claim
fk2boogaloo = 0      # SUPPORT entries skipped because the citance has no negation
id = 0               # running id given to negated items (NOTE: shadows the builtin `id`)
counter = 0          # how many times an existing `outer` entry was overwritten
citance_set = set()  # not used in the visible cells
# Loop through the (citance_text, claims) tuples
for citance in citances:
    citance_text = citance[0]
    # For each citance, loop through all claims
    for claim in citance[1]:
        claim['normalized'] = re.sub(r'[.!?,\s]', '', claim['text'].lower())
        # Skip if there is no matching claim
        if claim['normalized'] not in claims_and_evidence:
            failed_keys.append(claim['normalized'])
            continue

        # One pair of dicts per matched claim. They are refilled for every SUPPORT
        # entry, so the last SUPPORT entry of the claim determines their content.
        foo = {}
        bar = {}
        evidence_by_doc = claims_and_evidence[claim['normalized']]
        # For each associated claim, loop through all associated evidence
        for doc_id in evidence_by_doc:
            for sentences in evidence_by_doc[doc_id]:
                if sentences['label'] != 'SUPPORT':
                    continue
                evidence_doc_id = int(doc_id)

                # Actual citance. Only the label is stored; no 'sentences' key is emitted.
                foo['claim'] = citance_text
                foo['evidence'] = {evidence_doc_id: [{'label': 'SUPPORT'}]}
                foo['citance_id'] = citance_to_id_dict[citance_text]
                foo['doc_ids'] = [evidence_doc_id]

                # Negated citance. A citance without a negation is counted and
                # skipped; only a missing key is tolerated, any other error surfaces.
                try:
                    negated_text = negations[citance_text]
                except KeyError:
                    fk2boogaloo += 1
                    continue
                bar['claim'] = negated_text
                bar['id'] = id
                id += 1
                bar['evidence'] = {evidence_doc_id: [{'label': 'CONTRADICT'}]}
                bar['citance_id'] = citance_to_id_dict[citance_text]
                bar['doc_ids'] = [evidence_doc_id]

                # Store both items, counting every overwrite of an existing key.
                if citance_text in outer:
                    counter += 1
                outer[citance_text] = foo
                if negated_text in outer:
                    counter += 1
                outer[negated_text] = bar

                orpus[evidence_doc_id] = {
                    'title': corpus[evidence_doc_id]['abstract'][0],
                    'abstract': corpus[evidence_doc_id]['abstract'],
                    'doc_id': evidence_doc_id
                }
print(fk2boogaloo)
print(len(failed_keys))
print(len(outer.keys()))
print(counter)

0
688
76
340


# NEI GENERATION (SCIFACT)

In [None]:
# Reload the SciFact corpus as a doc_id -> record lookup for the NEI section.
corpus = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/corpus.jsonl') as f_pdf:
    for pdf_parse_dict in map(json.loads, f_pdf):
        corpus[pdf_parse_dict['doc_id']] = pdf_parse_dict
print("Corpus parsed.")

Corpus parsed.


In [None]:
# Rebuild `claim_id_to_claim` for NEI generation, overwriting the list built in the
# data-prep section. For each train claim, keep only the cited documents that are
# NOT listed as evidence; claims left with no such document are dropped.
claim_id_to_claim = []
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_train.jsonl') as f_pdf:
    for line in f_pdf:
        claim_dict = json.loads(line)
        # Cited doc ids that have no evidence entry for this claim.
        nei_ids = []
        # NOTE(review): the loop variable `id` shadows the builtin and stays bound
        # after this cell finishes; the NEI setup cell further down reads `id`.
        for id in claim_dict['cited_doc_ids']:
            # The id is stringified before the membership test against the evidence keys.
            if str(id) not in claim_dict['evidence']:
                nei_ids.append(id)
        if not nei_ids:
          continue
        # Repurpose the record: only the non-evidence docs remain and evidence is cleared.
        claim_dict['cited_doc_ids'] = nei_ids
        claim_dict['evidence'] = {}
        claim_id_to_claim.append(claim_dict)
# The same filtering for the dev split is currently disabled (commented out):
# with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_dev.jsonl') as f_pdf:
#     for line in f_pdf:
#         claim_dict = json.loads(line)
#         nei_ids = []
#         for id in claim_dict['cited_doc_ids']:
#             if str(id) not in claim_dict['evidence']:
#                 nei_ids.append(id)
#         if not nei_ids:
#           continue
#         claim_dict['cited_doc_ids'] = nei_ids
#         claim_dict['evidence'] = {}
#         claim_id_to_claim.append(claim_dict)
print("Claims parsed.")
print(len(claim_id_to_claim))

Claims parsed.
329


In [None]:
# Reload the (citance_text, claims) tuples from disk for the NEI section, with
# " [n]" citation markers stripped from the citance text.
citances = []
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_with_citances.jsonl') as f_pdf:
    for citance_dict in map(json.loads, f_pdf):
        stripped_citance = re.sub(r' \[\d+\]', '', citance_dict['citance'])
        citances.append((stripped_citance, citance_dict['claims']))
print("Citances parsed.")

Citances parsed.


In [None]:
# Rebuild the normalized-claim lookup for the NEI section. Here the values are the
# claim's 'cited_doc_ids' list, not its evidence dict.
claims_and_evidence = {}
for nei_claim in claim_id_to_claim:
    normalized_text = re.sub(r'[.!?,\s]', '', nei_claim['claim'].lower())
    nei_claim['normalized_claim'] = normalized_text
    claims_and_evidence[normalized_text] = nei_claim['cited_doc_ids']

In [None]:
# Rebuild the cleaned-citance-text -> s2orc_id lookup for the NEI section.
citance_to_id_dict = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/scifact_dataset/claims_with_citances.jsonl') as f_pdf:
    for citance_dict in map(json.loads, f_pdf):
        cleaned_citance = re.sub(r' \[\d+\]', '', citance_dict['citance'])
        citance_to_id_dict[cleaned_citance] = citance_dict['s2orc_id']

In [None]:
# Setup of NEI
#
# For every citance claim that matches a (train) SciFact claim, attach the claim's
# cited-but-not-evidence documents to the citance as NEI documents:
#   * a citance not yet in `outer` gets a new item with empty evidence;
#   * a citance already in `outer` has the new doc ids appended to its 'doc_ids'.
# Each attached document is also copied into `orpus`.
# Requires `outer` and `orpus` from the SUPPORT/CONTRADICT cell.
citances_to_id = {}  # not used in the visible cells
results = []         # not used in the visible cells
failed_keys = []     # never filled here; kept for the summary print below
fk2boogaloo = 0      # never incremented here; kept for the summary print below
num = 0              # number of doc ids attached across all citances
# Loop through the (citance_text, claims) tuples
for citance in citances:
    # For each citance, loop through all claims
    for claim in citance[1]:
        claim['normalized'] = re.sub(r'[.!?,\s]', '', claim['text'].lower())
        # Claims without a matching SciFact claim contribute nothing
        if claim['normalized'] not in claims_and_evidence:
            continue

        # De-duplicate on the CURRENT doc id. The previous check tested the
        # unrelated global `id` left over from another loop, which could skip
        # documents or let duplicates through.
        ids = []
        for doc_id in claims_and_evidence[claim['normalized']]:
            nei_doc_id = int(doc_id)
            if nei_doc_id in ids:
                continue
            ids.append(nei_doc_id)
            orpus[nei_doc_id] = {
                'title': corpus[nei_doc_id]['abstract'][0],
                'abstract': corpus[nei_doc_id]['abstract'],
                'doc_id': nei_doc_id
            }

        foo = {
            'claim': citance[0],
            'evidence': {},
            'citance_id': citance_to_id_dict[citance[0]],
            'doc_ids': ids,
        }
        if citance[0] not in outer:
            outer[citance[0]] = foo
            num += len(ids)
        else:
            # Merge into the existing item without duplicating doc ids
            existing_doc_ids = outer[citance[0]]['doc_ids']
            for new_doc_id in ids:
                if new_doc_id not in existing_doc_ids:
                    existing_doc_ids.append(new_doc_id)
                    num += 1


print(fk2boogaloo)
print(len(failed_keys))
print(num)

0
0
150


# Save dataset

In [None]:
# Flatten the claim-text -> item mapping into a list and report how many items it holds.
m = [*outer.values()]
print(len(m))

649


In [None]:
# Shuffle the dataset items with a fixed seed, then cut them into
# 70% train / 15% dev / 15% test.
random.seed(5)
m = list(outer.values())
random.shuffle(m)
split_1, split_2 = (int(fraction * len(m)) for fraction in (0.7, 0.85))
train, dev, test = m[:split_1], m[split_1:split_2], m[split_2:]

# Writing the splits and the reduced corpus to Drive is currently disabled.
# Uncomment the blocks below to (over)write the JSONL files.

# with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/data/dataset/train_NOSCIDEV.jsonl', "w") as f:
#     for item in train:
#         json_item = json.dumps(item)
#         f.write(json_item + "\n")

# with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/data/dataset/dev.jsonl', "w") as f:
#     for item in dev:
#         json_item = json.dumps(item)
#         f.write(json_item + "\n")

# with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/data/dataset/test.jsonl', "w") as f:
#     for item in test:
#         json_item = json.dumps(item)
#         f.write(json_item + "\n")

# with open('/content/gdrive/MyDrive/Independent study - Max & Carlos/data/dataset/corpus.jsonl', "w") as f:
#     for item in list(orpus.values()):
#         json_item = json.dumps(item)
#         f.write(json_item + "\n")

# [DEPRECATED] NEI Generation (non-SciFact)

In [None]:
# Setup of NEI (deprecated, non-SciFact variant)
#
# For up to 250 items in `outer`, query the Semantic Scholar Graph API for the
# references of the citing paper and attach one randomly chosen reference that has
# both a paperId and an abstract: its corpusId is appended to the item's
# 'doc_ids' and its sentence-split abstract is stored in `orpus`.
#
# NOTE(review): `requests` and `nlp` are not imported or defined in any visible
# cell, so this cell cannot run on a fresh kernel. `nlp` is called on the abstract
# and must return an object exposing `.sents` - confirm which pipeline was used.
failed_papers = 0
counter = 0
for key in outer:
    # Cap the number of API lookups
    if counter == 250:
        break
    counter += 1
    obj = outer[key]
    # find all cited papers from the citance paper
    response = requests.get("https://api.semanticscholar.org/graph/v1/paper/CorpusID:" + str(obj['citance_id']) + "/references?fields=abstract").json()
    try:
        papers = response['data']
    except (KeyError, TypeError):
        # The payload has no 'data' entry; count the failure and move on
        failed_papers += 1
        continue
    # randomly select a paper
    random.shuffle(papers)
    for paper in papers:
        # only consider references with a paper id and an abstract
        if not paper['citedPaper']['paperId'] or not paper['citedPaper']['abstract']:
            continue

        response = requests.get("https://api.semanticscholar.org/graph/v1/paper/" + paper['citedPaper']['paperId'] + "?fields=corpusId,abstract,title").json()
        try:
            outer[obj['claim']]['doc_ids'].append(response['corpusId'])
        except (KeyError, TypeError):
            # No usable 'corpusId' in the payload; try the next reference
            failed_papers += 1
            continue

        doc = nlp(response['abstract'])
        # Named so that the builtin abs() is not shadowed
        abstract_sentences = [sent.text for sent in doc.sents]
        orpus[response['corpusId']] = {
            'title': response['title'],
            'abstract': abstract_sentences,
            'doc_id': response['corpusId']
        }
        # One reference per item is enough
        break
print("Failed requests:", failed_papers)

Failed requests: 4
