In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from transformers import RobertaConfig, RobertaModel,RobertaTokenizer
import torch
import torch.nn as nn
import json
import torch.nn.functional as F
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [None]:
# Set CUDA_LAUNCH_BLOCKING=1 in the kernel's environment via the IPython %env magic.
# NOTE(review): presumably enabled for debugging so CUDA errors surface at the
# failing call; confirm it is still wanted before long runs.
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

In [None]:
# Seed every RNG the notebook's libraries may draw from, using one shared value,
# so that Restart Kernel -> Run All reproduces the same numbers.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)  # torch keeps its own generator, independent of NumPy's

## Load agreement for 20 citing contexts

In [None]:
# Load the annotation file. Later cells read, for every top-level key:
#   annotation[key]['tsa']['citing_context']
#   annotation[key]['tsa']['citing_arxiv_id']
#   annotation[key]['tsa']['citing_context_list_idx']
#   annotation[key]['common_annotated_sentences']
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default (which would garble non-ASCII text on some systems).
with open('./annotations_agreement_20.json', 'r', encoding='utf-8') as file:
    annotation = json.load(file)

## Load cited papers

In [3]:
# Load the cited papers: a mapping whose values are iterated as sequences of
# sentence strings. The encoding is pinned to UTF-8 so that decoding does not
# depend on the platform's locale default.
with open('./cited_papers_20.json', 'r', encoding='utf-8') as file:
    cited_papers = json.load(file)

# Flatten all papers into a single list of whitespace-stripped sentences; this is
# the candidate pool that every citing context is scored against.
papers = [line.strip() for paper in cited_papers.values() for line in paper]

## Load tokenizer and model

In [None]:
# Tokenizer belonging to the pretrained 'bert-base-uncased' checkpoint; it must
# match the checkpoint used for the encoder so token ids line up.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [None]:
# NOTE(review): `configuration` is not referenced by any other visible cell, and it
# is a DistilBERT config while the tokenizer/model cells load 'bert-base-uncased'
# through from_pretrained. Looks like a leftover from an earlier experiment —
# confirm and remove.
configuration = DistilBertConfig()

In [None]:
# Load the pretrained encoder, then move it to the selected device
# (assigned back so the cell produces no model repr as output).
bert = BertModel.from_pretrained('bert-base-uncased')
bert = bert.to(DEVICE)

## Calculate cosine similarity of citing context and all the cited sentences

In [None]:
def rank_sentences(context, sentences=None, mask_padding=False):
    """Score cited sentences against a citing context by cosine similarity.

    Each text is tokenized (truncated/padded to 512 tokens), encoded with the
    module-level ``bert`` model, and reduced to one vector by averaging the
    last hidden state over the sequence dimension. The score of a sentence is
    the cosine similarity between its vector and the context's vector.

    Parameters
    ----------
    context : str
        The citing context.
    sentences : iterable of str, optional
        Candidate sentences to score. Defaults to the module-level ``papers``
        list, which is what the original implementation always used.
    mask_padding : bool, default False
        If False (original behaviour), the mean is taken over all 512
        positions, *including* padding positions, because the inputs are padded
        to ``max_length`` and pooled with a plain ``mean(dim=1)``.
        If True, padding positions are excluded using the attention mask.
        NOTE(review): the 0.90 decision threshold used later in this notebook
        was chosen on scores produced with the default; switching this on
        changes the score distribution and requires re-tuning the threshold.

    Returns
    -------
    dict
        ``{sentence: cosine_similarity}`` in iteration order of ``sentences``.
        Duplicate sentences collapse into a single key.
    """
    if sentences is None:
        sentences = papers

    def _embed(text):
        # Encode one text and pool it into a single (1, hidden) vector.
        encoded = tokenizer(
            text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
            padding='max_length',
        ).to(DEVICE)
        hidden = bert(**encoded).last_hidden_state
        if not mask_padding:
            return hidden.mean(dim=1)
        # Average only over real tokens: zero out padded positions and divide
        # by the number of unmasked tokens (clamped to avoid division by zero).
        mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    bert.eval()  # inference mode for the module (same as train(False))
    with torch.no_grad():
        context_vector = _embed(context)
        # NOTE(review): sentence vectors do not depend on `context`, so callers
        # that score many contexts recompute them on every call.
        score_dict = {}
        for sent in sentences:
            score_dict[sent] = F.cosine_similarity(_embed(sent), context_vector).item()
    return score_dict

In [None]:
# Score every cited sentence against each annotated citing context.
# Layout: {citing_context: {cited_sentence: cosine_similarity}}, with each inner
# dict ordered from highest to lowest similarity.
citing_score_for_all_sent = {}
for entry in annotation.values():
    context = entry['tsa']['citing_context']
    print(context)
    ranked_pairs = sorted(rank_sentences(context).items(), key=lambda pair: -pair[1])
    score_dict = dict(ranked_pairs)
    citing_score_for_all_sent[context] = score_dict

## Save score

In [None]:
# Persist the ranked scores so the evaluation cells can run without re-encoding.
with open('context_and_score_bert.json', mode='w') as score_file:
    json.dump(citing_score_for_all_sent, fp=score_file)

# citing_score_for_all_sent = {citing_context: {cited_sentence: cosine_similarity}}

## Results
Based on manual inspection, we set a threshold of 0.90 on the cosine similarity between the citation context and each sentence of the cited papers; a sentence whose similarity is at or above this threshold is predicted as positive.

In [None]:
# Map each citing context to its 'common_annotated_sentences' entry, used below
# as the positive (real) labels.
# The dict is created here: it was previously never initialised, so this cell
# raised NameError on a fresh kernel.
context_and_cited = {}
for entry in annotation.values():
    context = entry['tsa']['citing_context']
    common_sentences = entry['common_annotated_sentences']
    context_and_cited[context] = common_sentences

In [None]:
# Reload the saved scores: {citing_context: {cited_sentence: cosine_similarity}}.
with open('context_and_score_bert.json', mode='r') as score_file:
    context_and_score = json.load(score_file)

In [None]:
# Look-up tables used to attach provenance columns to every scored pair.

# citing context text -> arXiv id of the citing paper and the context's list index.
# NOTE(review): the key 'citing_arvix_id' is misspelled ("arvix"); it is kept
# unchanged because the CSV-building cell reads it under that exact name.
context_index = {}
for entry in annotation.values():
    tsa = entry['tsa']
    context_index[tsa['citing_context']] = {
        'citing_arvix_id': tsa['citing_arxiv_id'],
        'citing_context_list_idx': tsa['citing_context_list_idx'],
    }

# stripped cited sentence -> arXiv id of its paper and its position in that paper.
# cited_papers[arxiv_id] is a list of sentence strings (the `papers` list is built
# by iterating it the same way), so it must be enumerated directly. The previous
# nested loop iterated over the *characters* of each sentence, which keyed
# cited_index by single characters and made cited_index[sentence] fail.
# For a sentence repeated inside one paper the first position is kept; a sentence
# present in several papers is attributed to the last paper iterated.
cited_index = {}
for arxiv_id, sentences in cited_papers.items():
    first_position = {}
    for position, sentence in enumerate(sentences):
        first_position.setdefault(sentence.strip(), position)
    for sentence, position in first_position.items():
        cited_index[sentence] = {
            'cited_arxiv_id': arxiv_id,
            'cited_sentence_index': position,
        }


# Save the result to csv

In [None]:
# Flatten the scores into one row per (citing context, cited sentence) pair,
# attach provenance and labels, and write the table to CSV.
dataframe = defaultdict(list)
for context, score_dict in context_and_score.items():
    for sent, score in score_dict.items():
        citing_meta = context_index[context]
        cited_meta = cited_index[sent]
        row = {
            'citing_arvix_id': citing_meta['citing_arvix_id'],
            'citing_context_list_idx': citing_meta['citing_context_list_idx'],
            'citing_context': context,
            'cited_arxiv_id': cited_meta['cited_arxiv_id'],
            'cited_sentence_index': cited_meta['cited_sentence_index'],
            'cited_sentence': sent,
            'cosine_similarity': score,
            # Positive prediction when the similarity reaches the 0.90 threshold.
            'predicted_label': int(score >= 0.90),
            # The sentence is compared without its first and last six characters.
            # NOTE(review): presumably this strips wrapper markup around the
            # sentence text — confirm against the data format.
            'real_label': int(sent[6:-6] in context_and_cited[context]),
        }
        for column, value in row.items():
            dataframe[column].append(value)

df = pd.DataFrame(dataframe)
df.to_csv('result_cosine_similarity_bert_pre.csv')

# Evaluation

In [None]:
# Compare thresholded predictions with the annotated labels.
predicted_label = df['predicted_label'].values
real_label = df['real_label'].values

# Precision / recall / F-score / support: macro-averaged, micro-averaged, then
# per class (average=None).
for averaging in ('macro', 'micro', None):
    print(precision_recall_fscore_support(real_label, predicted_label, average=averaging))

target_names = ['class 0', 'class 1']
report = classification_report(real_label, predicted_label, target_names=target_names)
print(report)