In [2]:
from tqdm.notebook import tqdm
import pandas as pd
pd.set_option("display.max_colwidth", None)
import json
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification

In [3]:
df = pd.read_csv("../data/valid_df.csv", keep_default_na=False)

In [4]:
df.count()

arg_id          3458
key_point_id    3458
label           3458
argument        3458
topic           3458
stance          3458
key_point       3458
dtype: int64

In [5]:
entailment_model = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
tokenizer = AutoTokenizer.from_pretrained(entailment_model)
model = AutoModelForSequenceClassification.from_pretrained(entailment_model).to("cuda:0")

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [73]:
############################################
#### THIS IS THE LEAVE ONE OUT CELL#########
############################################

MAX_LENGTH=256
import copy

def leave_one_out(hypothesis, premise):

    print(f"Hypothesis: {hypothesis}")
    print(f"Premise: {premise}\n")


    n = len(premise.split()) + len(hypothesis.split())

    true_score = compute_entailment(hypothesis, premise)["entail"]
    print(f"True entailment score: {true_score}\n")

    for i in range(n):

        premise_copy = copy.copy(premise).split()
        hypothesis_copy = copy.copy(hypothesis).split()

        # Drop word in the respective sequence
        index = None
        if i < len(premise_copy):
            dropped_word = premise_copy.pop(i)
            which = "premise"
        else:
            index = i - len(premise_copy) - 1
            dropped_word = hypothesis_copy.pop(index)
            which = "hypothesis"

        premise_copy = " ".join(premise_copy)
        hypothesis_copy = " ".join(hypothesis_copy)

        score = compute_entailment(hypothesis_copy, premise_copy)["entail"]

        print(f"Dropping word {index+1 if index else i+1} \"{dropped_word}\" in {which}.")
        print(f"Entailment score is: {score}")
        print(f"That's a difference of {true_score-score}.\n")

def compute_entailment(hypothesis, premise):
    tokenized_input_seq_pair = tokenizer.encode_plus(hypothesis, premise, max_length=MAX_LENGTH, return_token_type_ids=True, truncation=True)
    input_ids = torch.Tensor(tokenized_input_seq_pair['input_ids']).long().unsqueeze(0).cuda()
    token_type_ids = torch.Tensor(tokenized_input_seq_pair['token_type_ids']).long().unsqueeze(0).cuda()
    attention_mask = torch.Tensor(tokenized_input_seq_pair['attention_mask']).long().unsqueeze(0).cuda()
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=None)
    predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()
    entailment_prob = predicted_probability[0]
    neutral_prob = predicted_probability[1]
    contradiction_prob = predicted_probability[2]
    result = {'entail':entailment_prob, 'neutral':neutral_prob, 'contradict':contradiction_prob}
    return result


arg = """school uniforms cut down on bulling and keep everyone the same ."""
kp = """School uniform reduces bullying"""

arg_list = arg.split()
kp_list = kp.split()

leave_one_out(arg, kp)



Hypothesis: school uniforms cut down on bulling and keep everyone the same .
Premise: School uniform reduces bullying

True entailment score: 0.9782942533493042

Dropping word 1 "School" in premise.
Entailment score is: 0.9620389342308044
That's a difference of 0.016255319118499756.

Dropping word 2 "uniform" in premise.
Entailment score is: 0.9140735864639282
That's a difference of 0.06422066688537598.

Dropping word 3 "reduces" in premise.
Entailment score is: 0.05332595482468605
That's a difference of 0.9249682985246181.

Dropping word 4 "bullying" in premise.
Entailment score is: 0.834721565246582
That's a difference of 0.14357268810272217.

Dropping word 0 "." in hypothesis.
Entailment score is: 0.9770272970199585
That's a difference of 0.0012669563293457031.

Dropping word 6 "school" in hypothesis.
Entailment score is: 0.19731928408145905
That's a difference of 0.7809749692678452.

Dropping word 2 "uniforms" in hypothesis.
Entailment score is: 0.07608986645936966
That's a differe

In [45]:
print(compute_entailment_hyp_prem(arg, kp))
print(compute_entailment_sentence(arg+" "+kp))

{'entail': 0.9782942533493042, 'neutral': 0.020531466230750084, 'contradict': 0.0011742558563128114}
{'entail': 0.8369314074516296, 'neutral': 0.15482930839061737, 'contradict': 0.008239319548010826}


In [36]:
arg+" "+kp

'school uniforms cut down on bulling and keep everyone the same. School uniform reduces bullying'

In [11]:
valid_df = df

In [18]:
def match_argument_with_keypoints(result, kp_dict, arg_dict):
    for arg_id, arg in arg_dict.items():
        result[arg_id] = {}
        for kp_id, kp in kp_dict.items():
            result[arg_id][kp_id] = compute_entailment(arg, kp)
    return result

In [19]:
argument_keypoints = {}
for topic in tqdm(valid_df.topic.unique()):
    for stance in [-1, 1]:
        topic_keypoint_ids = valid_df[(valid_df.topic==topic) & (valid_df.stance==stance)]['key_point_id'].tolist()
        topic_keypoints = valid_df[(valid_df.topic==topic) & (valid_df.stance==stance)]['key_point'].tolist()
        topic_kp_dict = dict(zip(topic_keypoint_ids, topic_keypoints))
        
        topic_argument_ids = valid_df[(valid_df.topic==topic) & (valid_df.stance==stance)]['arg_id'].tolist()
        topic_arguments = valid_df[(valid_df.topic==topic) & (valid_df.stance==stance)]['argument'].tolist()
        topic_arg_dict= dict(zip(topic_argument_ids, topic_arguments))
        # match 
        argument_keypoints = match_argument_with_keypoints(argument_keypoints, topic_kp_dict, topic_arg_dict)

  0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
json.dump(argument_keypoints, open("entailment_all_valid_predictions.json", "w",encoding='utf-8'))

In [17]:
! python3 ../src-py/track_1_kp_matching.py ../data entailment_all_valid_predictions.json our_valid

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
loaded predictions for 932 arguments
mAP strict= 0.6896999511088654 ; mAP relaxed = 0.8973558780369991
