In [100]:
import copy
from typing import *
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
import torch
import sys
import pickle
from collections import defaultdict
import pandas as pd
from nltk import word_tokenize
sys.path.insert(0, "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/code/src-py")
import sbert_training
# Word-level tokenizer used to split key points into droppable words.
work_tokenizer = word_tokenize

# Torch device string: first CUDA GPU when one is available, otherwise the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Root of the shared-task checkout; the kpm_data CSV paths are built from it.
# NOTE(review): absolute, user-specific path - consider making it configurable.
repo_dir = "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/"
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Load the siamese sentence-embedding model from disk; if loading fails for an
# ordinary reason (missing directory, incompatible checkpoint, ...), train one.
# NOTE(review): the load path points at "...-sharedtask-code/model" while the
# training paths use "...-sharedtask-code-2/..." and "new_KPA/..." - confirm
# which checkout is intended.
try:
    path = "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/model"
    model = SentenceTransformer(path)
except Exception:
    # `except Exception` instead of a bare `except`, so that interrupting the
    # kernel (KeyboardInterrupt) or a SystemExit while loading does not
    # silently kick off a full training run.
    # NOTE(review): model_suffix says "10-epochs" but num_epochs is 1 - confirm.
    model = sbert_training.train_model('/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/data/siamese-data/',
                                       "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/data/kpm_data",
                                       'dev',
                                       "/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code-2/code/siamese-models",
                                       'roberta-base',
                                       model_suffix='contrastive-10-epochs',
                                       data_file_suffix='contrastive',
                                       num_epochs=1, max_seq_length=70, add_special_token=True, train_batch_size=32, loss='ContrastiveLoss')

In [None]:
def create_arg_kps_mapping(arguments_df, key_points_df):
    """Pair every argument of a topic with every key point of the same topic.

    Args:
        arguments_df: DataFrame with at least the columns "topic" and "argument".
        key_points_df: DataFrame with at least the columns "topic" and "key_point".

    Returns:
        dict mapping each topic found in ``arguments_df`` to a DataFrame with
        the columns "argument" and "key_point" holding the cross product of
        the topic's de-duplicated arguments and de-duplicated key points.
    """
    def pairs_for(topic):
        # Unique arguments / key points belonging to this topic only.
        topic_arguments = arguments_df.loc[arguments_df["topic"] == topic, ["argument"]].drop_duplicates()
        topic_key_points = key_points_df.loc[key_points_df["topic"] == topic, ["key_point"]].drop_duplicates()
        return topic_arguments.merge(topic_key_points, how="cross")

    return {topic: pairs_for(topic) for topic in arguments_df["topic"].unique()}

def load_kpm_data():
    """Return gold labels and model prediction scores for the KPM data.

    If "gold_labels_and_prediction_scores.pkl" can be read from the current
    working directory, its unpickled content is returned as-is. Otherwise the
    data is computed from the CSV files under ``repo_dir``.

    Returns:
        When computed: a ``defaultdict`` keyed by subset name (currently only
        "dev"); each value is a dict with
          * "gold_standard": DataFrame (topic, argument, key_point, score=1)
            of the positively labelled argument/key-point pairs,
          * "predictions": DataFrame (topic, argument, key_point, score) with
            one row per argument/key-point combination within a topic, scored
            by ``compute_entailment_from_arg_kp`` with the global ``model``.

    NOTE(review): this function never writes the pickle cache itself; it has
    to be produced elsewhere.
    """
    # SECURITY: pickle.load can execute arbitrary code - only keep a cache
    # file here that you created yourself.
    try:
        with open("gold_labels_and_prediction_scores.pkl", "rb") as handle:
            return pickle.load(handle)
    except Exception:
        # Best-effort cache: a missing or unreadable file means "recompute".
        # Not a bare `except`, so KeyboardInterrupt still stops the cell.
        pass

    def compute_score_from(row):
        """Score one (argument, key_point) row with the global model."""
        return compute_entailment_from_arg_kp(row["argument"], row["key_point"], model)

    data = defaultdict(dict)
    for subset in ["dev"]:#, "train"]:
        # Load files
        arguments_file = repo_dir + f"data/kpm_data/arguments_{subset}.csv"
        key_points_file = repo_dir + f"data/kpm_data/key_points_{subset}.csv"
        labels_file = repo_dir + f"data/kpm_data/labels_{subset}.csv"
        arguments_df = pd.read_csv(arguments_file)
        key_points_df = pd.read_csv(key_points_file)
        labels_df = pd.read_csv(labels_file)

        # Get gold standard: positively labelled pairs joined with their texts.
        positive_labels_df = labels_df.loc[labels_df["label"] == 1]
        gold_standard = pd.merge(positive_labels_df, key_points_df, how="inner", on="key_point_id")
        gold_standard = pd.merge(gold_standard, arguments_df, how="inner", on=["arg_id","topic"])
        # .copy() so that adding "score" below works on an independent frame
        # and does not raise SettingWithCopyWarning.
        gold_standard = gold_standard[["topic", "argument", "key_point"]].copy()
        gold_standard["score"] = 1
        data[subset]["gold_standard"] = gold_standard

        # Compute model scores for every argument/key-point pair per topic.
        mappings = []
        arg_to_kps = create_arg_kps_mapping(arguments_df, key_points_df)
        for topic, arg_kps_mapping in arg_to_kps.items():
            arg_kps_mapping['score'] = arg_kps_mapping.apply(compute_score_from, axis=1)
            arg_kps_mapping['topic'] = topic
            arg_kps_mapping = arg_kps_mapping[["topic", "argument", "key_point", "score"]]
            mappings.append(arg_kps_mapping)
        predictions = pd.concat(mappings, axis=0)
        data[subset]["predictions"] = predictions

    return data

In [75]:
class InputExample:
    """One argument / key-point pair used in the leave-one-out analysis.

    Attributes:
        arg: the argument text.
        kp: the key point text (possibly with one word removed).
        label: numeric label of the pair (defaults to 0).
        dropped_word: the word removed from the key point, if any.
    """

    def __init__(self,
                 arg: str = None,
                 kp: str = None,
                 label: Union[int, float] = 0,
                 dropped_word: str = None
                 ):

        self.arg = arg
        self.kp = kp
        # Previously the label was accepted but never stored, so __str__
        # failed with AttributeError.
        self.label = label
        self.dropped_word = dropped_word

    def __str__(self):
        # The class has no `texts` attribute; build the text list from the
        # fields that are actually set.
        texts = [str(text) for text in (self.arg, self.kp) if text is not None]
        return "<InputExample> label: {}, texts: {}".format(str(self.label), "; ".join(texts))

In [124]:
def tokenize_kp(row):
    """Tokenize the "key_point" entry of ``row`` with ``work_tokenizer``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a "key_point" entry.

    Returns:
        Whatever ``work_tokenizer`` returns for the key point text.
    """
    key_point_text = row["key_point"]
    return work_tokenizer(key_point_text)

def create_leave_one_out_sents(row):
    """Build the leave-one-out variants of a row's key point.

    Args:
        row: mapping-like object with the entries "key_point" (the original
            key point text) and "key_point_words" (its list of tokens).

    Returns:
        A list of dicts with the keys "dropped" and "new_kp". The first entry
        is the reference ("dropped" == "Reference", "new_kp" == the unmodified
        key point text); it is followed by one entry per token, where
        "dropped" is the removed token and "new_kp" the remaining tokens
        joined by single spaces.
    """
    words = row["key_point_words"]
    # The reference must carry the key point text, not the whole row.
    samples = [{"dropped": "Reference", "new_kp": row["key_point"]}]
    for i, dropped_word in enumerate(words):
        # Slicing builds a new list, so `words` itself is never mutated.
        remaining_words = list(words[:i]) + list(words[i + 1:])
        samples.append({"dropped": dropped_word, "new_kp": " ".join(remaining_words)})
    return samples


def get_data_for_leave_one_out():
    """
    Prepare the dev predictions for the leave-one-out analysis.

    Takes the "predictions" DataFrame of the "dev" subset returned by
    ``load_kpm_data`` and adds two columns to it:
      * "key_point_words": the result of ``tokenize_kp`` for the row,
      * "dropped_sentences": the result of ``create_leave_one_out_sents``
        for the row.

    Returns the extended DataFrame.
    """
    predictions = load_kpm_data()["dev"]["predictions"]
    # Row-wise: first tokenize, then derive the variants from the tokens.
    predictions["key_point_words"] = predictions.apply(tokenize_kp, axis=1)
    predictions["dropped_sentences"] = predictions.apply(create_leave_one_out_sents, axis=1)
    return predictions

# Trial run of the data preparation; the returned DataFrame is only displayed,
# not bound to a name.
# NOTE(review): without the pickle cache, load_kpm_data() calls
# compute_entailment_from_arg_kp, which is defined in a later cell - on a
# fresh kernel this cell would fail with NameError. The same preparation is
# executed again further down (`samples = get_data_for_leave_one_out()`).
get_data_for_leave_one_out()

KeyboardInterrupt: 

In [82]:
def compute_entailment(example, model):
    """Score how well the example's key point matches its argument.

    Args:
        example: object with the attributes ``arg`` and ``kp`` (texts).
        model: encoder passed through to ``compute_entailment_from_arg_kp``.

    Returns:
        float: the score returned by ``compute_entailment_from_arg_kp`` for
        ``example.arg`` and ``example.kp``.
    """
    # Delegate instead of duplicating the encode + similarity code. The
    # duplicate also carried a stray trailing comma that wrapped the argument
    # embedding in a 1-tuple.
    return compute_entailment_from_arg_kp(example.arg, example.kp, model)

def compute_entailment_from_arg_kp(arg, kp, model):
    """Score an argument / key-point pair by embedding similarity.

    Args:
        arg: argument text.
        kp: key point text.
        model: object exposing ``encode(text, show_progress_bar=...)``
            (a SentenceTransformer in this notebook).

    Returns:
        float: ``util.pytorch_cos_sim`` of the two embeddings.
    """
    # No trailing comma after encode(): the original accidentally turned the
    # argument embedding into a 1-tuple before passing it on.
    arg_embedding = model.encode(arg, show_progress_bar=False)
    kp_embedding = model.encode(kp, show_progress_bar=False)
    return float(util.pytorch_cos_sim(arg_embedding, kp_embedding))

In [27]:
def leave_one_out(model, data):
    """Score every leave-one-out variant of every argument / key-point pair.

    Args:
        model: encoder handed to ``compute_entailment``.
        data: either
            * the DataFrame produced by ``get_data_for_leave_one_out``
              (columns "argument" and "dropped_sentences", the latter holding
              lists of {"dropped", "new_kp"} dicts), or
            * a nested iterable of example objects with the attributes
              ``arg``, ``kp`` and ``dropped_word``.

    Returns:
        list of lists: one inner list per pair, containing one
        {"dropped_word": ..., "score": ...} dict per variant.
    """
    # Use the `data` argument - the original looped over the global `samples`
    # and ignored its parameter.
    if isinstance(data, pd.DataFrame):
        # Iterating a DataFrame yields column names, so build the example
        # groups explicitly from the relevant columns.
        groups = (
            [InputExample(arg=argument, kp=variant["new_kp"], dropped_word=variant["dropped"])
             for variant in variants]
            for argument, variants in zip(data["argument"], data["dropped_sentences"])
        )
    else:
        groups = data

    return [
        [{"dropped_word": example.dropped_word,
          "score": compute_entailment(example, model)}
         for example in group]
        for group in groups
    ]

# Prepare the leave-one-out data, score every variant, then show the scores
# of the first argument / key-point pair, one entry per line.
samples = get_data_for_leave_one_out()
results = leave_one_out(model=model, data=samples)
print("\n".join(str(entry) for entry in results[0]))

{'dropped_word': 'Reference', 'score': 0.635991632938385}
{'dropped_word': 'School', 'score': 0.6456453800201416}
{'dropped_word': 'uniform', 'score': 0.6269412636756897}
{'dropped_word': 'is', 'score': 0.6195348501205444}
{'dropped_word': 'harming', 'score': 0.551324725151062}
{'dropped_word': 'the', 'score': 0.6279229521751404}
{'dropped_word': 'student', 'score': 0.6066310405731201}
{'dropped_word': "'s", 'score': 0.6129819750785828}
{'dropped_word': 'self', 'score': 0.5350502729415894}
{'dropped_word': 'expression', 'score': 0.5559728145599365}


In [28]:
############################################################
############################################################
################# CODE GRAVEYARD ###########################
############################################################
############################################################

In [29]:
# modified_features = [{"input_ids": features[0]["input_ids"],
#                       "attention_mask": features[0]["attention_mask"]},
#                      {"input_ids": features[1]["input_ids"],
#                       "attention_mask": features[1]["attention_mask"]}]

# Prepare tokens
# kp_topic = modified_features[1]["input_ids"]
# kp_topic_tokens = modified_features[1]["input_ids"].tolist()[0]
# sep_token_index = kp_topic_tokens.index(50265) # <SEP> Token
# kp_tokens = kp_topic_tokens[:sep_token_index]
# topic_tokens = kp_topic_tokens[sep_token_index:]

# Prepare list of kps where a token is dropped
# new_kps = []
# for i in range(len(kp_tokens)):
#     dropped = kp_tokens[i]
#     new_kp = copy.deepcopy(kp_tokens)
#     new_kp.pop(i)
#     new_kps.append({"kp": new_kp, "dropped": dropped, "index": i})

# # Create dropped modified_features
# for new_kp in new_kps:
#     kp, dropped, index = new_kp.values()
#
#     # Modify sample itself
#     modified_features[1]["input_ids"] = torch.tensor(kp+topic_tokens)
#
#     # Modify attention mask
#     old_attention_mask = modified_features[1]["attention_mask"][0]
#     first_part = old_attention_mask[:index]
#     second_part = old_attention_mask[index+1:] # Here dropped index is excluded
#     new_attention_mask = torch.cat([first_part, second_part], dim=0)
#     new_attention_mask = new_attention_mask.view(1,len(new_attention_mask))
#     modified_features[1]["attention_mask"] = new_attention_mask
#
#     dropped_score = compute_entailment(loss_model, modified_features, labels)
#
#     dropped_scores.append({"score": dropped_score,
#                            "dropped": dropped,
#                            "index": index})
#     s = 0

In [30]:
# def get_backbone_model_and_tokenizer():
#     name = 'roberta-base'
#     backbone = models.Transformer(name)
#     backbone.max_seq_length = 70
#     backbone.tokenizer.add_tokens(['<SEP>'], special_tokens=True)
#     backbone.auto_model.resize_token_embeddings(len(backbone.tokenizer))
#     return backbone, backbone.tokenizer

In [31]:
# #Modify token embeddings
# old_token_embedding = features[1]["token_embeddings"][0]
# first_part = features[1]["token_embeddings"][0][:index]
# second_part = features[1]["token_embeddings"][0][index+1:]
# new_token_embedding = torch.cat([first_part, second_part])
# new_token_embedding = new_token_embedding.view(1, *new_token_embedding.size())
# features[1]["token_embeddings"] = new_token_embedding

# Modify sentence embeddings
# We can just drop the sentence embeddings, the model will infer this automatically

In [32]:
# def get_dataloader(examples, model):
#     dataloader = DataLoader(examples, shuffle=False, batch_size=1)
#     dataloader.collate_fn = model.smart_batching_collate
#     return iter(dataloader)

In [33]:
# def get_dataloaders_for_leave_one_out(data, model):
#     dataloaders = []
#     for sample in data:
#         # sample has no_of_words_in_kp examples in it
#         dataloader = DataLoader(sample, shuffle=False, batch_size=1)
#         #dataloader.collate_fn = model.smart_batching_collate
#         dataloaders.append(dataloader)
#     return dataloaders

In [34]:
# def prepare_model_for_inference(model):
#     model = losses.ContrastiveLoss(model)
#     model.to(device)
#     model.eval()
#     return model

In [35]:
# def compute_entailment(model, features, labels):
#     return float((1 - model(features, labels)))