In [1]:
import json
import re
from nltk import word_tokenize
import numpy as np
import scipy.sparse as sp
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_path = '/kaggle/input/anlp-project-data/train.json'
dev_path = '/kaggle/input/anlp-project-data/dev.json'
test_path = '/kaggle/input/anlp-project-data/test.json'

with open(train_path, 'r') as train_file:
    train_data = json.load(train_file)

In [3]:
def clean_str(str_):
    str_ = str_.replace('\n', ' ')
    str_ = re.sub(r'\\t', ' ', str_)
    str_ = re.sub(r'\\r', ' ', str_)
    str_ = re.sub(r'(.)\1{2,}', r'\1', str_)
    return str_.strip().lower()

def tokenize(str_):
    return ' '.join(word_tokenize(str_))

In [4]:
def clean_data(data):
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])
    return data

In [5]:
train_data = clean_data(train_data)

In [6]:
print(train_data['documents'][0])

{'id': 34, 'file_name': 'Annex E_Non-Disclosure and Confidentiality Agreement.pdf', 'text': "non-disclosure and confidentiality agreement this non-disclosure and confidentiality agreement ( “ agreement ” ) is made by and between : ( i ) the office of the united nations high commissioner for refugees , having its headquarters located at 94 rue de montbrillant , 1202 geneva , switzerland ( hereinafter “ unhcr ” or the “ discloser ” ) ; and ( ii ) _ , a company established in accordance with the laws of _ and having its principal offices located at _ ( hereinafter the “ bidder ” or the “ recipient ” ) . the discloser and recipient are also referred to collectively as the “ parties ” and individually as a “ party ” . recitals whereas in connection with rfp/2014/620 , request for proposal for the provision off-the-shelf soft-skill , it online and hr specific e-learning courses ( the “ rfp ” ) , it is advantageous to share certain data and information with the bidder participating in the rfp

In [7]:
hypothesis = {}
for key, value in train_data['labels'].items():
    hypothesis[key] = clean_str(value['hypothesis'])

In [8]:
print(len(hypothesis))

17


In [9]:
all_texts = [train_data["documents"][i]["text"] for i in range(len(train_data["documents"]))]

tfidf = TfidfVectorizer()

tfidf.fit(all_texts)

In [10]:
def prepare_data(data, tfidf, hypothesis,n_docs):

    X = []
    Y = []

    hypothesis_vectors = {}
    for key, val in hypothesis.items():
        hypothesis_vectors[key] = tfidf.transform([val])

    for i in range(min(n_docs, len(data["documents"]))):
        doc_text = data["documents"][i]["text"]

        for key, val in hypothesis.items():
            choice = data["documents"][i]["annotation_sets"][0]["annotations"][key]["choice"]
            if choice == "NotMentioned":
                continue

            spans_for_hypothesis = data["documents"][i]["annotation_sets"][0]["annotations"][key]["spans"]

            for j, span in enumerate(data["documents"][i]["spans"]):
                start_idx = span[0]
                end_idx = span[1]

                span_text = doc_text[start_idx:end_idx]
                span_vector = tfidf.transform([span_text])

                input_vec = sp.hstack([span_vector, hypothesis_vectors[key]])
                X += [input_vec]
                Y += [1 if j in spans_for_hypothesis else 0]
            
    return sp.vstack(X), Y

In [11]:
X_train, y_train = prepare_data(train_data, tfidf, hypothesis, n_docs=100)
print(np.shape(X_train))
print(np.shape(y_train))

(9941, 21054)
(9941,)


In [12]:
model = SVC(kernel='linear', probability=True, verbose = True)
model.fit(X_train, y_train)

[LibSVM]*.
*
optimization finished, #iter = 1403
obj = -319.999946, rho = 1.000453
nSV = 637, nBSV = 201
Total nSV = 637
.
*
optimization finished, #iter = 1939
obj = -325.999934, rho = 1.000581
nSV = 658, nBSV = 194
Total nSV = 658
*.
*
optimization finished, #iter = 1594
obj = -335.999942, rho = 1.000535
nSV = 621, nBSV = 208
Total nSV = 621
*.
*
optimization finished, #iter = 1916
obj = -341.805324, rho = 1.000511
nSV = 859, nBSV = 197
Total nSV = 859
*.
*
optimization finished, #iter = 1660
obj = -315.999957, rho = 1.000684
nSV = 638, nBSV = 195
Total nSV = 638
*..
*
optimization finished, #iter = 2105
obj = -409.999919, rho = -0.999645
nSV = 934, nBSV = 262
Total nSV = 934


In [13]:
with open(test_path, 'r') as test_file:
    test_data = json.load(test_file)

clean_data(test_data)
X_test, y_test = prepare_data(test_data, tfidf, hypothesis, n_docs=4)

In [14]:
y_pred = model.predict(X_test)

In [35]:
def average_precision(y_true, y_pred):
    indices = np.argsort(-y_pred)
    y_true_sorted = np.array(y_true)[indices]
    tp = np.cumsum(y_true_sorted)
    fp = np.cumsum(1 - y_true_sorted)
    total_relevant = np.sum(y_true_sorted)
    if total_relevant == 0:
        return 0
    precisions = tp / (tp + fp)
    recall = tp / total_relevant
    relevant_indices = np.where(y_true_sorted == 1)[0]
    return np.mean(precisions[relevant_indices])

def precision_at_recall(y_true, y_pred, target_recall=0.8):
    indices = np.argsort(-y_pred)
    y_true_sorted = np.array(y_true)[indices]
    tp = np.cumsum(y_true_sorted)
    fp = np.cumsum(1 - y_true_sorted)
    total_relevant = np.sum(y_true_sorted)
    recall = tp / total_relevant
    precision = tp / (tp + fp)
    recall_threshold_indices = np.where(recall >= target_recall)[0]
    if len(recall_threshold_indices) == 0:
        return None
    idx = recall_threshold_indices[0]
    return precision[idx]


In [36]:
print(np.shape(y_pred))

(3478,)


In [1]:
print("Precision @ 80\% recall: ", precision_at_recall(y_test, y_pred))
print("mean average precision: ", average_precision(y_test, y_pred))

Precision @ 80\% recall:  0.31518578352180937
mean average precision:  0.8257626500671198
