# Imports


In [None]:
# Mount Google Drive inside the Colab runtime; the S2ORC sample files read
# later in this notebook are opened from paths under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import spacy
import json
!pip install rouge_score
from rouge_score import rouge_scorer
!pip install nltk
from nltk.translate.bleu_score import sentence_bleu
!pip install scientific_information_change
from scientific_information_change.estimate_similarity import SimilarityEstimator
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util
import re

# Generate contexts

In [None]:
### Code block adapted from https://github.com/allenai/s2orc
# Builds two module-level results for one S2ORC batch:
#   * paper_id_to_pdf_parse: paper ID -> full PDF parse dict
#   * citation_contexts: one record per inline citation whose cited paper is
#     also present (with abstract and body text) in this batch
# Inline citations whose ref_id cannot be resolved are collected in failed_keys.
# TODO: Loop through batch IDs to process all S2ORC data
BATCH_ID = 0
# create a lookup for the pdf parse based on paper ID
paper_id_to_pdf_parse = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/data_sample_from_s2orc/pdf_parses_{BATCH_ID}.jsonl', encoding='utf-8') as f_pdf:
    for line in f_pdf:
        pdf_parse_dict = json.loads(line)
        paper_id_to_pdf_parse[pdf_parse_dict['paper_id']] = pdf_parse_dict
print("PDFs parsed.")

# filter papers using metadata values
citation_contexts = []
# citing paper ID -> list of ref_ids that had no entry in that paper's bibliography
failed_keys = {}
with open(f'/content/gdrive/MyDrive/Independent study - Max & Carlos/data_sample_from_s2orc/metadata_{BATCH_ID}.jsonl', encoding='utf-8') as f_meta:
    for line in f_meta:
        metadata_dict = json.loads(line)
        paper_id = metadata_dict['paper_id']
        # Can add other filters here if needed
        if not metadata_dict['has_outbound_citations']:
            continue
        # get citation context (paragraphs)!
        if paper_id not in paper_id_to_pdf_parse:
            continue
        # (1) get the full pdf parse from the previously computed lookup dict
        pdf_parse = paper_id_to_pdf_parse[paper_id]
        # (2) pull out fields we need from the pdf parse, including bibliography & text
        bib_entries = pdf_parse['bib_entries']
        # If citing paper has no data, move on
        if not pdf_parse['abstract'] or not pdf_parse['body_text']:
            continue
        paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']
        # (3) loop over paragraphs, grabbing citation contexts
        for paragraph in paragraphs:
            # (4) loop over each inline citation in this paragraph
            for cite_span in paragraph['cite_spans']:
                # (5) each inline citation can be resolved to a bib entry
                ref_id = cite_span['ref_id']
                try:
                    cited_bib_entry = bib_entries[ref_id]
                except KeyError:
                    # Record the unresolved reference and skip this citation.
                    # Falling through would reuse the bib entry of the previous
                    # citation (or hit a NameError on the very first one) and
                    # attribute this paragraph to the wrong cited paper.
                    failed_keys.setdefault(paper_id, []).append(ref_id)
                    continue
                # (6) that bib entry *may* be linked to a S2ORC paper. if so, grab paragraph
                linked_paper_id = cited_bib_entry['link']
                # Skip unlinked entries and self-citations.
                if not linked_paper_id or linked_paper_id == paper_id:
                    continue
                # The cited paper must be in this batch and have usable text.
                cited_parse = paper_id_to_pdf_parse.get(linked_paper_id)
                if cited_parse is None or not cited_parse['abstract'] or not cited_parse['body_text']:
                    continue
                citation_contexts.append({
                    'citing_paper_id': paper_id,
                    'cited_paper_id': linked_paper_id,
                    'context': paragraph['text'],
                    'citation_mention_start': cite_span['start'],
                    'citation_mention_end': cite_span['end'],
                })
print("Citation contexts created")

PDFs parsed.
Citation contexts created


In [None]:
# # To create a text file in G drive with given content
# from pydrive.drive import GoogleDrive
# from pydrive.auth import GoogleAuth
# gauth = GoogleAuth()
# # Create local webserver and auto handles authentication.
# gauth.LocalWebserverAuth()
# # Create GoogleDrive instance with authenticated GoogleAuth instance.
# drive = GoogleDrive(gauth)
# # Create GoogleDriveFile instance
# file = drive.CreateFile({'title':'failed_keys.txt', 'mimeType':'text/plain'})
# Build one report line per citing paper: "<paper_id>: <ref> <ref> ... \n".
# The first six characters of every ref id are dropped (presumably the
# "BIBREF" prefix -- confirm against the data), and each id is followed by a
# single space.
content = "".join(
    paper_key
    + ": "
    + "".join(ref_id[6:] + " " for ref_id in failed_keys[paper_key])
    + "\n"
    for paper_key in failed_keys
)
print(content)
# file.SetContentString(content)
# file.Upload() # Upload file.

In [None]:
# Load ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled);
# used by similarity() when scoring_type == 'rouge'.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Load the IMS (information matching score) estimator; used by similarity()
# when scoring_type is 'ims' / 'information matching score'.
estimator = SimilarityEstimator()
# Install and load the spaCy pipeline used for language processing; the cells
# below use it to split paragraphs into sentences via doc.sents.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
nlp = spacy.load('en_core_sci_sm')
# Load SBERT model; used by similarity() when scoring_type == 'sbert'.
model = SentenceTransformer('nli-distilroberta-base-v2')

#### Open questions / notes:
#### - We may need a check that the cited paper's full PDF parse is present in
####   paper_id_to_pdf_parse before it is looked up.
#### - Paragraph objects may need the ['text'] indexing to get their string.
#### - Scoring should be limited to just a few papers while testing.

#### Terminology: the citation context comes from the secondary (citing) paper.
#### Terminology: windows of window_size sentences come from the primary (cited) paper.

### Hyperparameters
# Number of consecutive sentences per candidate window in the cited paper.
window_size = 2
# True: use the whole citing paragraph as the citation context.
# False: use context_size sentences around the citing sentence.
context_type_paragraph = False
# Number of sentences in the citation context (only used when context_type_paragraph is False).
context_size = 2
context_offset = -1 # Start of the context relative to the citing sentence; negative shifts left. With context_size = 2 and offset = -1, the context is the sentence preceding the citing sentence plus the citing sentence itself.
# Metric name passed to similarity(): 'rouge', 'bleu', 'ims' or 'sbert'.
scoring_type = 'sbert'
# Selection mode for sort_windows(): False keeps the top-k windows; True keeps
# every window whose score is >= k (k then acts as the score cut-off).
threshold = False
k = 4


In [None]:
# Score the similarity of two strings given a scoring method
def similarity(scoring_type, str1, str2):
    """Score how similar two strings are using the requested metric.

    Relies on the module-level `scorer` (ROUGE), `estimator` (IMS) and
    `model` (SBERT) objects being loaded before the call.

    Args:
        scoring_type: Metric name, case-insensitive. One of 'rouge', 'bleu',
            'ims' (alias 'information matching score') or 'sbert'.
        str1: First text. Used as the reference/target for ROUGE and BLEU.
        str2: Second text. Used as the candidate/prediction for ROUGE and BLEU.

    Returns:
        A single numeric score:
          * 'rouge': mean F-measure of ROUGE-1, ROUGE-2 and ROUGE-L (0 to 1).
          * 'bleu': sentence-level BLEU with uniform 1- to 4-gram weights (0 to 1).
          * 'ims': information matching score divided by 5 (the original
            comment describes the raw score as ranging from 0 to 5).
          * 'sbert': cosine similarity of the two sentence embeddings, which
            can be negative (range -1 to 1).

    Raises:
        ValueError: If `scoring_type` is not one of the supported names.
    """
    scoring_type = scoring_type.lower()
    if scoring_type == 'rouge':
        scores = scorer.score(str1, str2)
        return sum(scores[name].fmeasure for name in ('rouge1', 'rouge2', 'rougeL')) / 3
    if scoring_type == 'bleu':
        # sentence_bleu takes a *list of references*, each a list of tokens.
        # Passing str1.split() directly makes every word its own reference
        # and compares it character by character against the candidate tokens.
        return sentence_bleu([str1.split()], str2.split(), weights=(0.25, 0.25, 0.25, 0.25))
    if scoring_type in ('ims', 'information matching score'):
        # One sentence pair goes in, so exactly one score is expected back;
        # unwrap it so this branch returns a plain number like the others.
        # NOTE(review): assumes the result is an indexable sequence/array with
        # one entry per input pair -- confirm against the library.
        ims_scores = estimator.estimate_ims_array([str1], [str2])
        return float(ims_scores[0]) / 5
    if scoring_type == 'sbert':
        embeddings1 = model.encode([str1], convert_to_tensor=True)
        embeddings2 = model.encode([str2], convert_to_tensor=True)
        return util.cos_sim(embeddings1, embeddings2).item()
    # ValueError is a subclass of Exception, so existing broad handlers still work.
    raise ValueError("Please provide valid scoring_type")

# Sort windows and similarity scores, returning only scores/windows of interest (top-k or threshold)
def sort_windows(scores, windows, threshold, k):
    """Rank windows by score (highest first) and keep only the best ones.

    Neither input list is modified; new lists are returned.

    Args:
        scores: Similarity scores, one per window.
        windows: Candidate windows, parallel to `scores`.
        threshold: Selection mode. If truthy, keep every window whose score
            is >= `k`; otherwise keep the `k` highest-scoring windows.
        k: The score cut-off in threshold mode, or the number of windows to
            keep in top-k mode.

    Returns:
        A `(scores, windows)` tuple of two lists, sorted by descending score
        and still aligned with each other.
    """
    # Sort the pairs once so scores and windows cannot drift out of alignment.
    # Python's sort is stable, so tied scores keep their original order.
    ranked = sorted(zip(scores, windows), key=lambda pair: pair[0], reverse=True)
    if threshold:
        ranked = [pair for pair in ranked if pair[0] >= k]
    else:  # top-k
        ranked = ranked[:k]
    kept_scores = [score for score, _ in ranked]
    kept_windows = [window for _, window in ranked]
    return kept_scores, kept_windows

Batch 0: ~11k valid citation contexts, down from 3.5 million. This will improve when other batches are processed.

In [None]:
### TODO: Find distortion scores for all citations

# Only the first 20 citation contexts are scored while the pipeline is under
# development; slicing (instead of indexing) also copes with fewer than 20.
# To score everything, iterate over `citation_contexts` directly.
for cc in citation_contexts[:20]:

    ### (1) Find citation's context_window:
    # Find the citing paper key (secondary)
    citing_id = cc['citing_paper_id']
    # Extract citation context (context size is hyperparam)
    if context_type_paragraph:  # the whole citing paragraph is the context
        context_window = cc['context']
    else:  # the context is a run of sentences around the citing sentence
        # Collect PDF text of the citing paper
        pdf_parse = paper_id_to_pdf_parse[citing_id]
        paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']
        citing_sentences = []
        # Index (within citing_sentences) of the sentence holding the citation;
        # stays 0 if no paragraph/sentence matches.
        context_sent_index = 0
        citation_start_index = cc['citation_mention_start']
        for paragraph in paragraphs:  # This can (and maybe should...) be done in preprocessing block
            doc = nlp(paragraph['text'])
            is_citing_paragraph = paragraph['text'] == cc['context']
            for sent in doc.sents:
                # Half-open [start_char, end_char) test: a citation that begins
                # exactly at the first character of a sentence belongs to it.
                if is_citing_paragraph and sent.start_char <= citation_start_index < sent.end_char:
                    context_sent_index = len(citing_sentences)
                citing_sentences.append(sent.text)
        # Clamp the window start at 0. A negative slice start does not raise;
        # it counts from the end of the list and typically yields an empty
        # slice, i.e. an empty citation context.
        window_start = max(0, context_sent_index + context_offset)
        context_window = ' '.join(citing_sentences[window_start:window_start + context_size])

    ### (2) Generate array of all sentences in the cited paper
    cited_id = cc['cited_paper_id']
    pdf_parse = paper_id_to_pdf_parse[cited_id]
    paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']
    sentences = []
    for paragraph in paragraphs:
        doc = nlp(paragraph['text'])
        for sent in doc.sents:
            sentences.append(sent.text)

    ### (3) Score all windows in cited/primary
    cited_windows = []
    cited_scores = []
    # Remove citation notations from context (sentence [2] example)
    context_window = re.sub(r' \[\d+\]', '', context_window)
    # For each possible window in cited/primary:
    for window_index in range(len(sentences) - window_size + 1):
        # Extract window from cited paper and remove citation notation
        curr_window = ' '.join(sentences[window_index:window_index + window_size])
        curr_window = re.sub(r' \[\d+\]', '', curr_window)
        # Conduct similarity assessment
        score = similarity(scoring_type, context_window, curr_window)
        cited_windows.append(curr_window)
        cited_scores.append(score)

    ### (4) Determine windows in top-k/threshold
    cited_scores, cited_windows = sort_windows(cited_scores, cited_windows, threshold, k)
    print("\nCitation context: " + context_window)
    for window_score, window_text in zip(cited_scores, cited_windows):
        print(str(window_score) + ":\t" + window_text)

    ### (5) TODO: Find distortion scores for each window



Citation context: Furthermore, several mathematical models used to describe biological processes require a noise term to adequately model the behaviour of these processes. Cellular processes, such as transcription and translation, chromatin remodeling and pathwayspecific regulation, are sources of stochastic events leading to cell-to-cell variability , .
0.6957463026046753:	However, the oscillatory properties, such as phase and amplitude are dependent on the gene function and differ between tissues and experimental conditions . Oscillatory patterns of expression in major housekeeping genes responsible for the energy balance (PPAR) and basic transcription (TBP) are bound to impose the same patterns on all transcribed genes regardless of the volume of transcription.
0.6850059032440186:	Second, analysis of correlation with phase shift (also used to identify phase groups) confirms high correlation of nearly all profiles to common cosine curves. Third, living cells are known to have more t

Some outputs have no citation contexts

In [None]:
### TODO: Find distortion scores for all citations
# Debug version of the scoring pipeline for a single citation context; it also
# prints the citing and cited paper IDs.
cc = citation_contexts[0]
### (1) Find citation's context_window:
# Find the citing paper key (secondary)
citing_id = cc['citing_paper_id']
print("citing: " + str(citing_id))
# Extract citation context (context size is hyperparam)
if context_type_paragraph:  # the whole citing paragraph is the context
    context_window = cc['context']
else:  # the context is a run of sentences around the citing sentence
    # Collect PDF text of the citing paper
    pdf_parse = paper_id_to_pdf_parse[citing_id]
    paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']
    citing_sentences = []
    # Index (within citing_sentences) of the sentence holding the citation;
    # stays 0 if no paragraph/sentence matches.
    context_sent_index = 0
    citation_start_index = cc['citation_mention_start']
    for paragraph in paragraphs:  # This can (and maybe should...) be done in preprocessing block
        doc = nlp(paragraph['text'])
        is_citing_paragraph = paragraph['text'] == cc['context']
        for sent in doc.sents:
            # Half-open [start_char, end_char) test: a citation that begins
            # exactly at the first character of a sentence belongs to it.
            if is_citing_paragraph and sent.start_char <= citation_start_index < sent.end_char:
                context_sent_index = len(citing_sentences)
            citing_sentences.append(sent.text)
    # Clamp the window start at 0. A negative slice start does not raise; it
    # counts from the end of the list and typically yields an empty slice,
    # i.e. an empty citation context.
    window_start = max(0, context_sent_index + context_offset)
    context_window = ' '.join(citing_sentences[window_start:window_start + context_size])

### (2) Generate array of all sentences in the cited paper
cited_id = cc['cited_paper_id']
print("cited: " + str(cited_id))
pdf_parse = paper_id_to_pdf_parse[cited_id]
paragraphs = pdf_parse['abstract'] + pdf_parse['body_text']
sentences = []
for paragraph in paragraphs:
    doc = nlp(paragraph['text'])
    for sent in doc.sents:
        sentences.append(sent.text)

# print("\n\tCiting / Secondary Paper:")
# for s in citing_sentences:
#   print(s)

# print("\n\tCited / Primary Paper:")
# for s in sentences:
#   print(s)

### (3) Score all windows in cited/primary
cited_windows = []
cited_scores = []
# Remove citation notations such as " [2]" from the context so this cell
# scores the same text as the batch scoring loop does.
context_window = re.sub(r' \[\d+\]', '', context_window)
# For each possible window in cited/primary:
for window_index in range(len(sentences) - window_size + 1):
    # Extract window from cited paper and remove citation notation
    curr_window = ' '.join(sentences[window_index:window_index + window_size])
    curr_window = re.sub(r' \[\d+\]', '', curr_window)
    # Conduct similarity assessment
    score = similarity(scoring_type, context_window, curr_window)
    cited_windows.append(curr_window)
    cited_scores.append(score)

### (4) Determine windows in top-k/threshold
cited_scores, cited_windows = sort_windows(cited_scores, cited_windows, threshold, k)
print("\nCitation context: " + context_window)
for window_score, window_text in zip(cited_scores, cited_windows):
    print(str(window_score) + ":\t" + window_text)

### (5) TODO: Find distortion scores for each window


citing: 18980380
cited: 18980380

Citation context: Notice that a classical technique called "lifting" is used here: We introduce an auxiliary random vectorũ, so that some non-linear relationship can be modeled linearly. For example, a constraint on the variance can be modeled using this standard form (see [22, Example 2] ), which is otherwise impossible without the auxiliary variable.
1.0:	Notice that a classical technique called "lifting" is used here: We introduce an auxiliary random vectorũ, so that some non-linear relationship can be modeled linearly. For example, a constraint on the variance can be modeled using this standard form (see [22, Example 2] ), which is otherwise impossible without the auxiliary variable.
0.5523049645390071:	The set of joint distribution of (p s , r s ) is hence C s Δ = (ps ,rs)C s . Notice that a classical technique called "lifting" is used here: We introduce an auxiliary random vectorũ, so that some non-linear relationship can be modeled linearly.
0.5

In [None]:
# Sanity check: count the citation contexts whose citing and cited paper IDs
# are identical ("same") versus distinct ("different").
same = sum(
    1
    for entry in citation_contexts
    if entry['citing_paper_id'] == entry['cited_paper_id']
)
different = len(citation_contexts) - same
print("Different: " + str(different))
print("Same: " + str(same))

Different: 10106
Same: 702
