In [None]:
"""
TODO:

- Import model
    Import the trained model instead of training from scratch
    - See cell 3
    - The basic idea is:
        - initialize the model exactly as it was built for training (the definition can be found in sbert_training)
        - load the weights into the model (by whichever means)
        - The trained model is here: https://drive.google.com/drive/folders/1qgGdoNMUcyQivTtu5udzGcQB8SFgxm-M?usp=sharing

- Changing loss to score
    Right now the metric is the loss, because I have not yet found a way to retrieve the score. The loss can be
    interpreted as an inverse score: the lower the loss, the better. To make results easier to compare, this has to be changed to a score.

- Leave-one-out
    Extend the method / dataloader to cover all samples instead of only the dummy sample
    - The dev data can be found in /new_KPA/argmining-21-keypoint-analysis-sharedtask-code/data/kpm_data
    - There could be two ways to approach this:
        - Interpret the gold standard:
            - Iterate over kps in key_points_dev.csv and grab the respective argument from arguments_dev.csv
        - Let the model decide what is the right arg
            - Iterate over all args, compute the score with each kp and save the argmax
    - What still needs to be added: for each InputExample, save the word that was dropped so it can be accessed during
      the entailment computation. To do this, copy InputExample from SentenceBert and modify the class so it stores the word for later access

- SHAP
    - I have prepared the model for inference (except the loss thing) so next SHAP can be implemented
"""

In [None]:
import copy
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
import torch
import sys
from nltk import word_tokenize
sys.path.insert(0, "/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code/code/src-py")
import sbert_training
# Alias for NLTK's word_tokenize; get_data_for_leave_one_out uses it to split a key point into words.
work_tokenizer = word_tokenize
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): `!` commands presumably run in a separate shell process, so this `source`
# is unlikely to change the environment of the already-running kernel — confirm and drop if so.
!source /home/marcelbraasch/PycharmProjects/new_KPA/venv/bin/activate
print(device)

In [None]:
# Train the siamese model from scratch via sbert_training.train_model and keep the result in
# `model`; the final cell passes it to run_methods.
# NOTE(review): model_suffix says 'contrastive-10-epochs' while num_epochs=1 — confirm which is intended.
# NOTE(review): the positional arguments look like (siamese data dir, kpm data dir, subset name,
# model output dir, base model name) — verify against the signature of sbert_training.train_model.
model = sbert_training.train_model('/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code/data/siamese-data/',
                           "/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code/data/kpm_data",
                           'dev',
                           "/home/marcelbraasch/PycharmProjects/new_KPA/argmining-21-keypoint-analysis-sharedtask-code/code/siamese-models",
                           'roberta-base',
                           model_suffix='contrastive-10-epochs',
                           data_file_suffix='contrastive',
                           num_epochs=1, max_seq_length=70, add_special_token=True, train_batch_size=32, loss='ContrastiveLoss')

In [3]:
# def load_model():
#     max_seq_length = 70
#     model_name = 'distilbert-base-uncased'
#     word_embedding_model = models.Transformer(model_name)
#     word_embedding_model.max_seq_length = max_seq_length
#     word_embedding_model.tokenizer.add_tokens(['<SEP>'], special_tokens=True)
#     word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
#     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
#                                    pooling_mode_mean_tokens=True,
#                                    pooling_mode_cls_token=False,
#                                    pooling_mode_max_tokens=False)
#     model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
#     return model

# config = BertConfig.from_json_file("/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/model/config.json")
# model = TFBertModel.from_pretrained('/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/model/my_pytorch_model.bin', from_pt=True, config=config)

# path = "./siamese_state_dict_2.pt"
# state_dict = torch.load(path)["state_dict"]
# model = load_model()
# model.load_state_dict(state_dict)

In [6]:
def get_data_for_leave_one_out(path):
    """
    Creates a nested list of leave-one-out data.

    For each arg-kp pair one inner list is produced. Its first element is the
    unmodified reference example ``[arg, kp + " <SEP> " + topic]``; it is followed
    by one InputExample per word of the key point (as split by ``work_tokenizer``),
    each with exactly that one word removed. Every variant keeps the same
    ``" <SEP> " + topic`` suffix as the reference, so the only difference between a
    variant and the reference is the dropped word.

    Args:
        path: Location of the data to load. Currently unused, because only a
            single hard-coded dummy sample is produced.

    Returns:
        A list with one inner list per arg-kp pair; each inner list holds
        ``1 + number_of_words_in_kp`` InputExamples.
    """
    samples = []
    # Here all the data should be loaded instead of only 1 test sample
    for _ in range(1):
        topic = "We should abandon the use of school uniform"
        arg = "A real education is about giving students the tools to learn, think, and express themselves; dictating to them what to wear sends a strong message that we don't trust them to think on their own."
        kp = "School uniform is harming the student's self expression"
        label = 1
        topic_suffix = " <SEP> " + topic
        sample = [InputExample(texts=[arg, kp + topic_suffix], label=label)]
        words = work_tokenizer(kp)
        for i in range(len(words)):
            # Key point with the i-th word left out; slicing builds a new list,
            # so `words` itself is never modified.
            remaining_words = words[:i] + words[i + 1:]
            # Keep the topic attached: without it the variant would differ from the
            # reference by more than the single dropped word.
            new_kp_topic = " ".join(remaining_words) + topic_suffix
            sample.append(InputExample(texts=[arg, new_kp_topic], label=label))
        samples.append(sample)
    return samples

In [7]:
def get_dataloader(examples, model):
    """
    Return an iterator that yields the given examples one at a time, in order.

    Each batch contains a single example and is assembled by
    ``model.smart_batching_collate``.
    """
    loader = DataLoader(
        examples,
        batch_size=1,
        shuffle=False,
        collate_fn=model.smart_batching_collate,
    )
    return iter(loader)

In [8]:
def get_dataloaders_for_leave_one_out(data, model):
    """
    Build one DataLoader per leave-one-out sample.

    Every entry of ``data`` is a list of examples (the reference followed by its
    variants). The matching loader walks that list in order with a batch size of
    one and collates through ``model.smart_batching_collate``.
    """
    return [
        DataLoader(
            group,
            batch_size=1,
            shuffle=False,
            collate_fn=model.smart_batching_collate,
        )
        for group in data
    ]

In [9]:
def prepare_model_for_inference(model):
    """
    Wrap ``model`` in a ContrastiveLoss module that is ready for evaluation.

    The wrapper is moved to the module-level ``device`` and switched to eval
    mode before it is returned.
    """
    loss_module = losses.ContrastiveLoss(model)
    loss_module.to(device)
    loss_module.eval()
    return loss_module

In [11]:
def compute_entailment(model, features, labels):
    """
    Return ``1 - loss`` for one batch as a plain Python float.

    The value is a loss-based stand-in for a score: a lower loss gives a higher
    return value.

    Args:
        model: Callable invoked as ``model(features, labels)`` that returns the loss.
        features: Batch features, passed through to ``model`` unchanged.
        labels: Batch labels, passed through to ``model`` unchanged.

    Returns:
        ``float(1 - model(features, labels))``.
    """
    # Pure inference: run the forward pass without recording an autograd graph.
    with torch.no_grad():
        loss = model(features, labels)
    return float(1 - loss)

In [23]:
def run_methods(model):
    """
    Run the leave-one-out analysis and return the score of every variant.

    For each sample produced by ``get_data_for_leave_one_out`` the reference example
    and all of its one-word-dropped variants are scored with ``compute_entailment``.

    Args:
        model: The trained model. It is wrapped by ``prepare_model_for_inference``
            for scoring and handed to ``get_dataloaders_for_leave_one_out`` for
            batching.

    Returns:
        One list per sample. Each inner list holds dicts of the form
        ``{"dropped": word, "score": score}``; the first entry has
        ``"dropped" == "Reference"`` and belongs to the unmodified key point.
    """
    path = "./test_path/"
    loss_model = prepare_model_for_inference(model)
    # methods = [leave_one_out]

    # Mask one
    data = get_data_for_leave_one_out(path)
    dataloaders = get_dataloaders_for_leave_one_out(data, model)
    results = []
    for sample, dataloader in zip(data, dataloaders):
        if not sample:
            results.append([])
            continue
        # The first example of a sample is the reference "kp <SEP> topic". The variants
        # that follow were built by dropping the words of the key point one by one, in
        # the order given by work_tokenizer, so re-tokenising the reference key point
        # yields exactly the dropped words (the model's subword tokens need not match).
        kp = sample[0].texts[1].split(" <SEP> ")[0]
        dropped = ["Reference"] + list(work_tokenizer(kp))
        scores = [compute_entailment(loss_model, features, labels)
                  for features, labels in dataloader]
        results.append([{"dropped": word, "score": score}
                        for word, score in zip(dropped, scores)])
    return results

# Run the leave-one-out analysis on the model trained in the earlier cell.
# NOTE(review): the traceback stored below this cell points at a line `s = 0` inside
# run_methods, which the current definition does not contain, so the recorded output
# appears to come from an older version of this cell — re-run to refresh it.
run_methods(model)

2022-01-18 22:06:29 - Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "/home/marcelbraasch/PycharmProjects/argmining-21-keypoint-analysis-sharedtask-code/venv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_29992/1855964181.py", line 26, in <module>
    run_methods(model)
  File "/tmp/ipykernel_29992/1855964181.py", line 23, in run_methods
    s = 0
  File "/tmp/ipykernel_29992/1855964181.py", line 23, in run_methods
    s = 0
  File "/home/marcelbraasch/.local/share/JetBrains/Toolbox/apps/PyCharm-P/ch-0/213.5744.248/plugins/python/helpers/pydev/_pydevd_bundle/pydevd_frame.py", line 747, in trace_dispatch
    self.do_wait_suspend(thread, frame, event, arg)
  File "/home/marcelbraasch/.local/share/JetBrains/Toolbox/apps/PyCharm-P/ch-0/213.5744.248/plugins/python/helpers/pydev/_pydevd_bu

TypeError: object of type 'NoneType' has no len()

In [72]:
############################################################
############################################################
################# CODE GRAVEYARD ###########################
############################################################
############################################################

# def get_backbone_model_and_tokenizer():
#     name = 'roberta-base'
#     backbone = models.Transformer(name)
#     backbone.max_seq_length = 70
#     backbone.tokenizer.add_tokens(['<SEP>'], special_tokens=True)
#     backbone.auto_model.resize_token_embeddings(len(backbone.tokenizer))
#     return backbone, backbone.tokenizer

            # modified_features = [{"input_ids": features[0]["input_ids"],
            #                       "attention_mask": features[0]["attention_mask"]},
            #                      {"input_ids": features[1]["input_ids"],
            #                       "attention_mask": features[1]["attention_mask"]}]

            # Prepare tokens
            # kp_topic = modified_features[1]["input_ids"]
            # kp_topic_tokens = modified_features[1]["input_ids"].tolist()[0]
            # sep_token_index = kp_topic_tokens.index(50265) # <SEP> Token
            # kp_tokens = kp_topic_tokens[:sep_token_index]
            # topic_tokens = kp_topic_tokens[sep_token_index:]

            # Prepare list of kps where a token is dropped
            # new_kps = []
            # for i in range(len(kp_tokens)):
            #     dropped = kp_tokens[i]
            #     new_kp = copy.deepcopy(kp_tokens)
            #     new_kp.pop(i)
            #     new_kps.append({"kp": new_kp, "dropped": dropped, "index": i})

            # # Create dropped modified_features
            # for new_kp in new_kps:
            #     kp, dropped, index = new_kp.values()
            #
            #     # Modify sample itself
            #     modified_features[1]["input_ids"] = torch.tensor(kp+topic_tokens)
            #
            #     # Modify attention mask
            #     old_attention_mask = modified_features[1]["attention_mask"][0]
            #     first_part = old_attention_mask[:index]
            #     second_part = old_attention_mask[index+1:] # Here dropped index is excluded
            #     new_attention_mask = torch.cat([first_part, second_part], dim=0)
            #     new_attention_mask = new_attention_mask.view(1,len(new_attention_mask))
            #     modified_features[1]["attention_mask"] = new_attention_mask
            #
            #     dropped_score = compute_entailment(loss_model, modified_features, labels)
            #
            #     dropped_scores.append({"score": dropped_score,
            #                            "dropped": dropped,
            #                            "index": index})
            #     s = 0

                # #Modify token embeddings
                # old_token_embedding = features[1]["token_embeddings"][0]
                # first_part = features[1]["token_embeddings"][0][:index]
                # second_part = features[1]["token_embeddings"][0][index+1:]
                # new_token_embedding = torch.cat([first_part, second_part])
                # new_token_embedding = new_token_embedding.view(1, *new_token_embedding.size())
                # features[1]["token_embeddings"] = new_token_embedding

                # Modify sentence embeddings
                # We can just drop the sentence embeddings, the model will infer this automatically