In [1]:
import copy
import os
import pickle
import random
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import torch
from nltk import word_tokenize
from sentence_transformers import SentenceTransformer, models, util

import sbert_training

In [2]:
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

In [3]:
# Tokenizer used by the cells below to split key points and arguments into words
# (an alias for NLTK's word_tokenize).
word_tokenizer = word_tokenize
# First CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Root of the local checkout; load_kpm_data builds its CSV paths by appending to it.
# NOTE(review): machine-specific absolute path — must be adjusted to run elsewhere.
repo_dir = "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/"

In [4]:
def save_with_pickle(path, data):
    """Pickle ``data`` into the file at ``path``, overwriting any existing file.

    Args:
        path: Destination file path.
        data: Any picklable object.
    """
    protocol = pickle.HIGHEST_PROTOCOL
    with open(path, "wb") as out_file:
        pickle.dump(data, out_file, protocol=protocol)

def load_from_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Errors from opening or unpickling the file propagate to the caller.
    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(path, "rb") as in_file:
        return pickle.load(in_file)

In [5]:
def load_model():
    """Return the model used to score argument/key-point pairs.

    First tries to load a saved SentenceTransformer from a fixed local
    directory. If that raises an ordinary exception, the reason is reported on
    stderr and the result of ``sbert_training.train_model`` (called with fixed
    paths and hyper-parameters) is returned instead.

    Only ``Exception`` subclasses trigger the fallback, so interrupting the
    load (Ctrl-C / kernel interrupt) no longer starts a training run.
    """
    path = "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/model"
    try:
        return SentenceTransformer(path)
    except Exception as error:
        print(f"Could not load model from {path!r} ({error!r}); training a new one.",
              file=sys.stderr)

    # NOTE(review): the output directory below lives under "new_KPA/", unlike the
    # two data directories — confirm this is the intended location.
    return sbert_training.train_model(
        '/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/data/siamese-data/',
        "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/data/kpm_data",
        'dev',
        "/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code-2/code/siamese-models",
        'roberta-base',
        model_suffix='contrastive-10-epochs',
        data_file_suffix='contrastive',
        num_epochs=10,
        max_seq_length=70,
        add_special_token=True,
        train_batch_size=32,
        loss='ContrastiveLoss',
    )

# Load the model once (load_model falls back to training if loading fails);
# later cells read this module-level `model` when scoring.
model = load_model()

2022-01-24 20:25:54 - Load pretrained SentenceTransformer: /home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/model
2022-01-24 20:25:55 - Use pytorch device: cuda


In [6]:
def load_closed_class_words():
    """Read the closed-class word list from its fixed text file.

    Every line is split on whitespace, so the file may hold one or several
    words per line.

    Returns:
        List of all words in file order (duplicates are kept).
    """
    path = "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code-2/code/src-py/closed_class_words.txt"
    with open(path, "r") as word_file:
        return [word for line in word_file for word in line.rstrip().split()]

# Loaded once at module level; _argument_leave_one_out_ checks tokens against
# this list so that only words outside it are dropped at random.
closed_class_words = load_closed_class_words()

In [7]:
def compute_entailment(arg, kp, model):
    """Return the cosine similarity between the embeddings of ``arg`` and ``kp``.

    Args:
        arg: Argument text passed to ``model.encode``.
        kp: Key point text passed to ``model.encode``.
        model: Object whose ``encode(text, show_progress_bar=False)`` returns
            the embedding of ``text``.

    Returns:
        The similarity as a Python ``float``.
    """
    # The original line ended with a stray comma, which wrapped the argument
    # embedding in a 1-tuple; both embeddings are now passed in the same form.
    arg_embedding = model.encode(arg, show_progress_bar=False)
    kp_embedding = model.encode(kp, show_progress_bar=False)
    return float(util.pytorch_cos_sim(arg_embedding, kp_embedding))

In [8]:
def create_arg_kps_mapping(arguments_df, key_points_df):
    """Pair every unique argument of a topic with every unique key point of that topic.

    Args:
        arguments_df: DataFrame with at least the columns ``topic`` and ``argument``.
        key_points_df: DataFrame with at least the columns ``topic`` and ``key_point``.

    Returns:
        Dict mapping each topic found in ``arguments_df`` to a DataFrame with the
        columns ``argument`` and ``key_point`` holding the cross product.
    """
    def unique_values(frame, topic, column):
        # Single-column frame with the distinct values of `column` for `topic`.
        return frame.loc[frame["topic"] == topic][[column]].drop_duplicates()

    return {
        topic: pd.merge(
            unique_values(arguments_df, topic, "argument"),
            unique_values(key_points_df, topic, "key_point"),
            how="cross",
        )
        for topic in arguments_df["topic"].unique()
    }

def load_kpm_data(model):
    """Build gold-standard and prediction tables for the key point matching data.

    For every processed subset (currently only ``"dev"``) the arguments, key
    points and labels CSV files are read from ``repo_dir + "data/kpm_data/"``
    and two DataFrames are produced:

    * ``"gold_standard"``: the rows labelled 1, joined with their key point and
      argument; the ``label`` column is renamed to ``score``.
    * ``"predictions"``: every argument/key-point pair of a topic with the
      columns ``topic``, ``argument``, ``key_point`` and ``score``, where the
      score comes from ``compute_entailment``.

    After each subset the collected data is pickled to
    ``gold_labels_and_prediction_scores.pkl`` in the working directory.

    Args:
        model: Passed through to ``compute_entailment`` for scoring.

    Returns:
        ``defaultdict`` mapping subset name to a dict with the keys
        ``"gold_standard"`` and ``"predictions"``.
    """
    # The assignment of `path` had been commented out together with the
    # cache-loading code, so the save below raised NameError after all scores
    # were computed. Reading the cache back stays disabled, as before.
    path = "gold_labels_and_prediction_scores.pkl"

    def compute_score_from(row):
        # Score one argument/key-point pair with the supplied model.
        return compute_entailment(row["argument"], row["key_point"], model)

    data = defaultdict(dict)
    for subset in ["dev"]:  # "train" is currently not processed

        # Load files
        arguments_file = repo_dir + f"data/kpm_data/arguments_{subset}.csv"
        key_points_file = repo_dir + f"data/kpm_data/key_points_{subset}.csv"
        labels_file = repo_dir + f"data/kpm_data/labels_{subset}.csv"
        arguments_df = pd.read_csv(arguments_file)
        key_points_df = pd.read_csv(key_points_file)
        labels_df = pd.read_csv(labels_file)

        # Get gold standard
        positive_labels_df = labels_df.loc[labels_df["label"] == 1]
        gold_standard = pd.merge(positive_labels_df, key_points_df, how="inner", on="key_point_id")
        gold_standard = pd.merge(gold_standard, arguments_df, how="inner", on=["arg_id", "topic", "stance"])
        gold_standard = gold_standard.rename(columns={"label": "score"})
        data[subset]["gold_standard"] = gold_standard

        # Compute model scores for every argument/key-point pair of each topic
        mappings = []
        arg_to_kps = create_arg_kps_mapping(arguments_df, key_points_df)
        for topic, arg_kps_mapping in arg_to_kps.items():
            arg_kps_mapping['score'] = arg_kps_mapping.apply(compute_score_from, axis=1)
            arg_kps_mapping['topic'] = topic
            arg_kps_mapping = arg_kps_mapping[["topic", "argument", "key_point", "score"]]
            mappings.append(arg_kps_mapping)
        predictions = pd.concat(mappings, axis=0)
        data[subset]["predictions"] = predictions
        save_with_pickle(path, data)
    return data

In [9]:
def tokenize_kp(row):
    """Return the tokens of the row's ``key_point`` text.

    Tokenization is delegated to the module-level ``word_tokenizer``.
    """
    key_point_text = row["key_point"]
    return word_tokenizer(key_point_text)

def _leave_one_out(row):
    words = row["key_point_words"]
    samples = [{"dropped": "Reference", "new_kp": row["key_point"], "score": row["score"]}]
    for i in range(len(words)):
        new_kp = copy.deepcopy(words)
        dropped_word = new_kp.pop(i)
        new_kp = " ".join(new_kp)
        new_score = compute_entailment(row["argument"], new_kp, model)
        samples.append({"dropped": dropped_word, "new_kp": new_kp, "score": new_score})
    return samples

def leave_one_out(model):
    """Return the key point matching data extended with key-point leave-one-out samples.

    If ``leave_one_out.pkl`` in the working directory can be loaded, its content
    is returned as is. Otherwise the data from ``load_kpm_data(model)`` is
    extended in place: every DataFrame gets a ``key_point_words`` column
    (tokenized key point) and a ``leave_one_out`` column (output of
    ``_leave_one_out`` for the row); the result is pickled to that file.

    Args:
        model: Passed to ``load_kpm_data``. Note that ``_leave_one_out`` itself
            scores with the module-level ``model``.

    Returns:
        The nested dict of DataFrames (subset -> table name -> DataFrame).
    """
    path = "leave_one_out.pkl"

    try:
        return load_from_pickle(path)
    except FileNotFoundError:
        pass  # no cache yet: compute below
    except Exception as error:
        # An unreadable cache is reported and rebuilt. Interrupts are not
        # swallowed any more (the original used a bare `except`).
        print(f"Ignoring unreadable cache {path!r}: {error!r}", file=sys.stderr)

    # Iterates over the kpm data dict and compute leave one out for each entry
    dfs = load_kpm_data(model)
    for gold_or_pred in dfs.values():
        for df in gold_or_pred.values():
            df["key_point_words"] = df.apply(tokenize_kp, axis=1)
            df["leave_one_out"] = df.apply(_leave_one_out, axis=1)
    save_with_pickle(path, dfs)

    return dfs

In [21]:
def _argument_leave_one_out_(tokens, words, generate_ngrams=False):

    samples = []
    random_choices = []
    random_amount = 30
    counter = 0
    tries = 0

    # Create dropped argument realizations of ngrams
    if generate_ngrams:
        for ngram in range(1,4):
            for i in range(len(tokens)-ngram+1):
                new_arg = copy.deepcopy(tokens)
                dropped_words = [new_arg.pop(i) for _ in range(ngram)]
                new_arg = " ".join(new_arg)
                samples.append({"dropped": dropped_words,
                                "new_arg": new_arg,
                                "ngram": f"{ngram}"})

    # Drop 2 to 4 random words 10 times excluding functional words
    lexical_mask = [1 if x not in closed_class_words else 0 for x in tokens]
    lexical_amount = lexical_mask.count(1)
    lexical_indices = [i for i, x in enumerate(lexical_mask) if x]
    if lexical_amount <= 2: return samples
    while counter != random_amount:
        amount = random.randrange(2, 4)
        random_choice = {j for j in random.choices(lexical_indices, k=amount)}
        if len(random_choice) == amount:
            if random_choice not in random_choices:
                random_choices.append(random_choice)
                counter += 1
        tries += 1
        if tries == 250:
            break

    # We gather the collection first before we process it
    # to make sure our selection is unique
    s = 0
    for random_choice in random_choices:
        random_choice = list(random_choice)
        random_choice.sort(reverse=True)
        new_arg = copy.deepcopy(tokens)
        dropped_words = [new_arg.pop(index) for index in random_choice]
        sample = {"dropped": dropped_words,
                  "new_arg": new_arg,
                  "ngram": f"random_{len(random_choice)}",
                  "indices": random_choice}
        samples.append(sample)

    return samples

def load_mappings():
    """Return the mapping from each argument to its dropped-token samples.

    If ``arg_to_dropped_mapping.pkl`` in the working directory can be loaded,
    its content is returned. Otherwise the mapping is built by tokenizing every
    entry of the module-level ``arguments`` and passing the tokens to
    ``_argument_leave_one_out_``; the result is pickled to that file.

    Returns:
        Dict mapping argument text to the list of sample dicts.

    Note:
        ``arguments`` is read from module scope and must be defined before
        this function is called.
    """
    path = "arg_to_dropped_mapping.pkl"
    try:
        return load_from_pickle(path)
    except FileNotFoundError:
        pass  # no cache yet: compute below
    except Exception as error:
        # An unreadable cache is reported and rebuilt. Interrupts are not
        # swallowed any more (the original used a bare `except`).
        print(f"Ignoring unreadable cache {path!r}: {error!r}", file=sys.stderr)
    mappings = {argument: _argument_leave_one_out_(word_tokenizer(argument), argument)
                for argument in arguments}
    save_with_pickle(path, mappings)
    return mappings

# Create leave one out for the arguments: for each argument, take its n
# best-scoring key points, pair them with every dropped-token variant of the
# argument, re-score each pair and store the result in one pickle per argument.
n = 5  # number of best-scoring key points kept per argument
k = 0  # running argument index used in the output file names
leave_one_out_path = "./Leave One Out"
data_path = "./gold_labels_and_prediction_scores.pkl"
column_names = ["dropped", "new_arg", "ngram", "indices"]
data = load_from_pickle(data_path)
current = data["dev"]["predictions"]
arguments = current["argument"].unique()  # also read by load_mappings()
topics = current["topic"].unique()
mappings = load_mappings()
# Make sure the output directory exists before the first (expensive) result is saved.
os.makedirs(leave_one_out_path, exist_ok=True)
for topic in topics:
    topic_rows = current.loc[current["topic"] == topic]
    # Only the arguments of this topic: iterating over all arguments for every
    # topic recomputed and re-saved each argument once per topic.
    for argument in topic_rows["argument"].unique():
        samples = mappings[argument]
        if not samples:
            # No dropped variants exist for this argument (too few lexical
            # words); an empty frame would break the column assignments below.
            # k still advances so it keeps identifying the argument's position.
            k += 1
            continue
        top_n = topic_rows.loc[topic_rows["argument"] == argument] \
                          .sort_values(by=["score"], ascending=False) \
                          .head(n)
        new_args = pd.DataFrame.from_dict(samples)
        # Samples store the remaining tokens as a list; the model needs text.
        new_args["new_arg"] = new_args["new_arg"].map(" ".join)
        loo_curr_arg = pd.merge(top_n, new_args, how="cross")
        loo_curr_arg["dropped_score"] = loo_curr_arg.apply(
            lambda row: compute_entailment(row["new_arg"], row["key_point"], model), axis=1)
        # Normalize by the number of tokens that were dropped in the sample.
        dropped_count = loo_curr_arg["indices"].map(len)
        loo_curr_arg["dropped_score_normalized"] = loo_curr_arg["dropped_score"] / dropped_count
        loo_curr_arg["diff"] = loo_curr_arg["score"] - loo_curr_arg["dropped_score"]
        loo_curr_arg["diff_normalized"] = loo_curr_arg["diff"] / dropped_count
        save_with_pickle(f"{leave_one_out_path}/arg_{k}_leave_one_out.pkl", loo_curr_arg)
        k += 1

In [20]:
# cmap = plt.cm.get_cmap('YlOrBr')
#
# s = 0
# counter = 0
# for sample in best_arg_kps.iterrows():
#     leave_one_out_sample = sample[1]["leave_one_out"]
#
#     plt.text(0,1.5,f"Argument: {sample[1]['argument']}", fontsize=20)
#     plt.text(0,1.4,f"Key point: {sample[1]['key_point']}", fontsize=20)
#
#     _max = max([leave_one_out_sample[i]["score"]
#                for i in range(1, len(leave_one_out_sample))])
#
#     y_coord = 1.2
#     for i in range(1, len(leave_one_out_sample)):
#         word = leave_one_out_sample[i]["dropped"]
#         importance = leave_one_out_sample[i]["score"] #- _max
#         plt.text(0,y_coord,f"{word} ({importance})", fontsize=20, backgroundcolor=cmap(importance))
#         y_coord -= 0.1
#     plt.axis('off')
#     # _in = input()
#     # if _in == "exit":
#     break
#     # plt.figure().clear()
#     # plt.close()
#     # plt.cla()
#     # plt.clf()
#
#     # Drop words in argument
