In [25]:
import json
import re
from nltk import word_tokenize
import numpy as np
import scipy.sparse as sp
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
train_path = '/kaggle/input/anlp-project-data/train.json'
dev_path = '/kaggle/input/anlp-project-data/dev.json'
test_path = '/kaggle/input/anlp-project-data/test.json'

with open(train_path, 'r') as train_file:
    train_data = json.load(train_file)

In [27]:
def clean_str(str_):
    str_ = str_.replace('\n', ' ')
    str_ = re.sub(r'\\t', ' ', str_)
    str_ = re.sub(r'\\r', ' ', str_)
    str_ = re.sub(r'(.)\1{2,}', r'\1', str_)
    return str_.strip().lower()

def tokenize(str_):
    return ' '.join(word_tokenize(str_))

In [28]:
def clean_data(data):
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])
    return data

In [29]:
train_data = clean_data(train_data)

In [30]:
print(train_data['documents'][0])

{'id': 34, 'file_name': 'Annex E_Non-Disclosure and Confidentiality Agreement.pdf', 'text': "non-disclosure and confidentiality agreement this non-disclosure and confidentiality agreement ( “ agreement ” ) is made by and between : ( i ) the office of the united nations high commissioner for refugees , having its headquarters located at 94 rue de montbrillant , 1202 geneva , switzerland ( hereinafter “ unhcr ” or the “ discloser ” ) ; and ( ii ) _ , a company established in accordance with the laws of _ and having its principal offices located at _ ( hereinafter the “ bidder ” or the “ recipient ” ) . the discloser and recipient are also referred to collectively as the “ parties ” and individually as a “ party ” . recitals whereas in connection with rfp/2014/620 , request for proposal for the provision off-the-shelf soft-skill , it online and hr specific e-learning courses ( the “ rfp ” ) , it is advantageous to share certain data and information with the bidder participating in the rfp

In [31]:
hypothesis = {}
for key, value in train_data['labels'].items():
    hypothesis[key] = clean_str(value['hypothesis'])

In [32]:
print(len(hypothesis))

17


In [33]:
all_texts = [train_data["documents"][i]["text"] for i in range(len(train_data["documents"]))]

tfidf = TfidfVectorizer()

tfidf.fit(all_texts)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def compute_predictions_and_labels(data, tfidf_model, hypothesis_data):

    predicted_scores = []
    true_labels = []

    hypothesis_vectors = {key: tfidf_model.transform([text]) for key, text in hypothesis_data.items()}

    for doc_idx in tqdm(range(len(data["documents"]))):
        document = data["documents"][doc_idx]
        doc_text = document["text"]

        current_doc_pred = []
        current_doc_labels = []

        for hyp_key, hyp_value in hypothesis_data.items():

            annotation_data = document["annotation_sets"][0]["annotations"][hyp_key]
            if annotation_data["choice"] == "NotMentioned":
                continue

            relevant_spans = annotation_data["spans"]

            for span_idx, span_coords in enumerate(document["spans"]):
                start, end = span_coords

                span_text = doc_text[start:end]
                span_tfidf_vector = tfidf_model.transform([span_text])

                similarity_score = cosine_similarity(span_tfidf_vector, hypothesis_vectors[hyp_key])[0][0]
                current_doc_pred.append(similarity_score)

                current_doc_labels.append(1 if span_idx in relevant_spans else 0)

        predicted_scores.append(current_doc_pred)
        true_labels.append(current_doc_labels)

    return predicted_scores, true_labels

In [36]:
with open(test_path, 'r') as test_file:
    test_data = json.load(test_file)

test_data = clean_data(test_data)

In [37]:
Y_pred, Y_true = compute_predictions_and_labels(test_data, tfidf, hypothesis)

100%|██████████| 123/123 [01:56<00:00,  1.05it/s]


In [17]:
from sklearn.metrics import precision_recall_curve

def precision_at_80_recall(ypred, ytrue):
    precision, recall, thresholds = precision_recall_curve(ytrue, ypred)
    idx = (abs(recall - 0.8)).argmin()
    return precision[idx]

In [24]:
arr = []
for i in range(len(Y_true)):
    arr.append(precision_at_80_recall(Y_pred[i], Y_true[i]))

print("Precision @ 80\% recall: ", np.mean(np.array(arr)))

Precision @ 80\% recall:  0.05457481716615441


In [23]:
from sklearn.metrics import average_precision_score
def mean_average_precision(Y_pred, Y_true):
    average_prec_scores = []
    for i in range(len(Y_true)):
        average_prec_scores.append(average_precision_score(Y_true[i], Y_pred[i], average='micro'))
    return np.mean(average_prec_scores)

print("Mean Average Precision: ", mean_average_precision(Y_pred, Y_true))

Mean Average Precision:  0.37311921854054936
