# Setup

In [1]:
! pip install datasets transformers torch
! pip install sentencepiece
! pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x8

In [2]:
# Mount Google Drive at /content/drive; the later cells read logs/models from
# and write results to paths under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch, sentencepiece

# Report whether torch can see a CUDA device, naming device 0 when it can.
print(
    f"GPU available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU not available"
)

GPU available: Tesla T4


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# MMADD

In [5]:
def get_question(text, premise_column):
    """Return the premise part of a logged attack string.

    All square brackets are stripped, everything from the first ``<`` onward
    is discarded, and every occurrence of ``"<premise_column>: "`` is removed
    from what remains.
    """
    without_brackets = text.replace("[", "").replace("]", "")
    premise_part, _, _ = without_brackets.partition("<")
    label = f"{premise_column}: "
    return premise_part.replace(label, "")

def get_answer(text, hypothesis_column):
    """Return the hypothesis part of a logged attack string.

    All square brackets are stripped, everything up to and including the
    first ``>`` is discarded, and every occurrence of
    ``"<hypothesis_column>: "`` is removed from what remains.

    Raises:
        IndexError: if ``text`` contains no ``>`` at all.
    """
    without_brackets = text.replace("[", "").replace("]", "")
    # maxsplit=1: split only at the first ">" so that a ">" appearing later
    # (inside the hypothesis itself) no longer truncates the answer.
    # NOTE(review): a ">" inside the premise would still confuse this parse.
    hypothesis_part = without_brackets.split(">", 1)[1]
    return hypothesis_part.replace(f"{hypothesis_column}: ", "")

def transform_df(file_name, dataset_type):
  """Read an attack-log CSV and add split premise/hypothesis columns.

  Args:
    file_name: path of the CSV to read; it must contain ``original_text``
      and ``perturbed_text`` columns.
    dataset_type: one of ``"mrpc"``, ``"rte"``, ``"mnli"``, ``"qnli"``;
      selects the field labels that prefix the two text parts.

  Returns:
    The DataFrame read from ``file_name`` with four added columns:
    ``original_question``, ``original_answer``, ``perturbed_question`` and
    ``perturbed_answer``.

  Raises:
    ValueError: if ``dataset_type`` is not one of the supported names
      (previously this surfaced later as an unbound-variable error).
  """
  # (premise label, hypothesis label) used in the logged text per dataset.
  column_labels = {
      "mrpc": ("Sentence1", "Sentence2"),
      "rte": ("Sentence1", "Sentence2"),
      "mnli": ("Premise", "Hypothesis"),
      "qnli": ("Question", "Sentence"),
  }
  if dataset_type not in column_labels:
    raise ValueError(
        f"Unsupported dataset_type {dataset_type!r}; "
        f"expected one of {sorted(column_labels)}"
    )
  premise_column, hypothesis_column = column_labels[dataset_type]

  df = pd.read_csv(file_name)

  # Same extraction for the original and the perturbed text columns.
  for prefix in ("original", "perturbed"):
    text = df[f"{prefix}_text"]
    df[f"{prefix}_question"] = text.apply(lambda x: get_question(x, premise_column))
    df[f"{prefix}_answer"] = text.apply(lambda x: get_answer(x, hypothesis_column))

  return df

def concat_dfs(files, dataset_types):
  """Load several attack-log CSVs and bundle them as one ``"test"`` split.

  ``files`` and ``dataset_types`` are paired positionally; each pair is passed
  to ``transform_df``. The resulting frames are stacked row-wise with a fresh
  index, converted to a ``Dataset`` and returned inside a ``DatasetDict``
  whose only key is ``"test"``.
  """
  frames = []
  for file, dataset_type in zip(files, dataset_types):
    frames.append(transform_df(file, dataset_type))
  combined = pd.concat(frames, axis=0, ignore_index=True)
  return DatasetDict({"test": Dataset.from_pandas(combined)})

def load_model_and_tokenizer(save_directory):
    """Load a DeBERTa-v2 sequence classifier and its tokenizer.

    Both are loaded with ``from_pretrained(save_directory)`` and returned as a
    ``(model, tokenizer)`` tuple.
    """
    return (
        DebertaV2ForSequenceClassification.from_pretrained(save_directory),
        DebertaV2Tokenizer.from_pretrained(save_directory),
    )

def get_model_output(model, tokenizer, validation_set, device, output_column_name):
  """Run ``model`` on every example of ``validation_set``, one at a time.

  Each example's ``'perturbed_question'`` / ``'perturbed_answer'`` pair is
  tokenized as PyTorch tensors, truncated and padded to 128 tokens, moved to
  ``device`` and passed to the model with gradients disabled.

  Args:
    model: callable model; it is switched to eval mode first.
    tokenizer: callable whose result supports ``.to(device)`` and ``**``
      unpacking.
    validation_set: iterable of mappings with the two keys named above.
    device: target device for the tokenized inputs.
    output_column_name: accepted but not used by this function.

  Returns:
    List of the raw model outputs, in iteration order.
  """
  tokenizer_options = dict(return_tensors="pt", truncation=True, padding="max_length", max_length=128)

  def _forward(example):
    # Tokenize one question/answer pair and run it through the model.
    encoded = tokenizer(example['perturbed_question'], example['perturbed_answer'], **tokenizer_options)
    return model(**encoded.to(device))

  model.eval()
  print("starting model outputs")

  with torch.no_grad():
    results = [_forward(example) for example in validation_set]

  print("model outputs done")
  return results

def get_outputs(model_wrappers, dataset, device):
  """Collect ``get_model_output`` results for every model wrapper.

  Each wrapper is indexed as ``wrapper[0]`` (model, moved to ``device``) and
  ``wrapper[1]`` (tokenizer). Every model is run on ``dataset["test"]``; the
  result is a list with one entry per wrapper, in order.
  """
  return [
      get_model_output(wrapper[0].to(device), wrapper[1], dataset["test"], device, f"model_{position}")
      for position, wrapper in enumerate(model_wrappers)
  ]


def _logit_shift(row, model_index):
    """Absolute change of the base-predicted class score for one model.

    Reads ``row['base_output']`` (a class index), subtracts ``row['model_0']``
    from ``row['model_<model_index>']`` and returns the absolute value of
    element ``[0][base_output]`` of that difference as a Python float.
    Assumes each ``model_*`` entry is a tensor with a leading dimension of
    size one -- TODO confirm against the producing cell.
    """
    base_class = row['base_output']
    delta = row[f'model_{model_index}'] - row['model_0']
    return abs(delta[0][base_class].item())


def get_value1(row):
    """Shift of ``model_1`` relative to ``model_0`` on the base-predicted class."""
    return _logit_shift(row, 1)


def get_value2(row):
    """Shift of ``model_2`` relative to ``model_0`` on the base-predicted class."""
    return _logit_shift(row, 2)


def get_value3(row):
    """Shift of ``model_3`` relative to ``model_0`` on the base-predicted class."""
    return _logit_shift(row, 3)


def get_value4(row):
    """Shift of ``model_4`` relative to ``model_0`` on the base-predicted class."""
    return _logit_shift(row, 4)


def get_value5(row):
    """Shift of ``model_5`` relative to ``model_0`` on the base-predicted class."""
    return _logit_shift(row, 5)

def get_max_val(row, threshold=0.5, num_models=5):
  """Pick the model whose extracted value is largest, if it clears a threshold.

  Args:
    row: mapping with keys ``extractedValue_1`` .. ``extractedValue_<num_models>``.
    threshold: the largest value must be strictly greater than this for a
      model to be selected (default 0.5, the previously hard-coded value).
    num_models: how many ``extractedValue_*`` entries to consider (default 5,
      the previously hard-coded count).

  Returns:
    The 1-based index of the first entry holding the largest value, or 0 when
    that value does not exceed ``threshold``.

  Raises:
    ValueError: if ``num_models`` is less than 1 (``max`` of an empty list).
  """
  vals = [row[f"extractedValue_{i}"] for i in range(1, num_models + 1)]
  largest = max(vals)  # computed once instead of twice
  return vals.index(largest) + 1 if largest > threshold else 0

def get_ensemble_output(row):
  """Return the predicted class of the model selected in ``row["maxModel"]``.

  Looks up ``row["model_<maxModel>"]``, takes its first element and returns
  the index of the largest entry as a Python int.
  """
  chosen_logits = row[f"model_{row['maxModel']}"]
  return torch.argmax(chosen_logits[0]).item()

# Deberta MRPC

In [7]:
# Task handled by this section; it is substituted into every path below.
dataset_type = "mrpc"
# CSV that the last cell of this section writes the combined results to.
output_save_file = f"/content/drive/My Drive/Ensemble_Def/{dataset_type}-deberta_og.csv"

# One validation attack log per attack (PWWS, textbugger, textfooler, DeepWordBug).
logfile_directory = "/content/drive/My Drive/CS6220_logs/Val_logs"
files = [f"{dataset_type}_vallog_PWWS.csv", f"{dataset_type}_vallog_textbugger.csv", f"{dataset_type}_vallog_textfooler.csv", f"{dataset_type}_vallog_DeepWordBug.csv"]
files = [f"{logfile_directory}/{file}" for file in files]
# concat_dfs pairs files with dataset types positionally, so repeat the type once per file.
dataset_types = [f"{dataset_type}", f"{dataset_type}", f"{dataset_type}", f"{dataset_type}"]
# All four logs stacked into a DatasetDict with a single "test" split.
dataset = concat_dfs(files, dataset_types)

In [None]:

# Base fine-tuned checkpoint plus one adversarially trained checkpoint per attack.
save_directory = f"/content/drive/My Drive/finetuned_models/{dataset_type}-deberta-xsmall"
adv_save_directories = [f"{dataset_type}_{attack_name}/" for attack_name in ["textbugger", "textfooler", "DeepWordBug", "PWWS"]]
adv_save_directories = [f"/content/drive/My Drive/Adv_trained/{adv_save_directory}" for adv_save_directory in adv_save_directories]

# Load every checkpoint exactly once. The original cell loaded the first
# adversarial checkpoint from Drive twice; the wrapper is reused instead.
base_wrapper = load_model_and_tokenizer(save_directory)
adv_wrappers = []
for adv_save_directory in adv_save_directories:
   adv_wrappers.append(load_model_and_tokenizer(adv_save_directory))

# Layout kept at six entries (model_0 .. model_5) because get_value1..get_value5
# read model_1..model_5: slot 0 is the base model, slots 1 and 2 both hold the
# first adversarial checkpoint, slots 2..5 follow adv_save_directories order.
# NOTE(review): confirm whether the duplicated slot is intended.
model_wrappers = [base_wrapper, adv_wrappers[0]] + adv_wrappers

In [None]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One list of raw model outputs per entry of model_wrappers, computed on dataset["test"].
df_outputs = get_outputs(model_wrappers, dataset, device)


starting model outputs
model outputs done
starting model outputs
model outputs done
starting model outputs
model outputs done
starting model outputs
model outputs done
starting model outputs
model outputs done
starting model outputs
model outputs done


In [None]:
# Keep only each output's logits, moved to the CPU: dfs[i][j] is model i's logits for example j.
dfs = [[x.logits.cpu() for x in df] for df in df_outputs]

In [None]:
# One column of logits tensors per model, placed side by side.
# NOTE(review): f"model_i" has no placeholder, so every frame gets the literal
# label "model_i"; ignore_index=True relabels the columns 0..n-1 anyway and the
# next line assigns the real model_<i> names.
mydfs = pd.concat([pd.DataFrame(dfs[i], columns=[f"model_i"]) for i in range(len(dfs))], axis=1, ignore_index=True )
mydfs.columns = [f"model_{i}" for i in range(len(dfs))]
# Class predicted by the base model (model_0).
mydfs["base_output"] = mydfs["model_0"].apply(lambda x: torch.argmax(x, dim=1).item())


# extractedValue_k: |model_k - model_0| on the base-predicted class (see get_value1..get_value5).
mydfs["extractedValue_1"] = mydfs.apply(get_value1, axis=1)
mydfs["extractedValue_2"] = mydfs.apply(get_value2, axis=1)
mydfs["extractedValue_3"] = mydfs.apply(get_value3, axis=1)
mydfs["extractedValue_4"] = mydfs.apply(get_value4, axis=1)
mydfs["extractedValue_5"] = mydfs.apply(get_value5, axis=1)

# maxModel: model index chosen by get_max_val (0 keeps model_0);
# ensemble_output: the class predicted by that chosen model.
mydfs["maxModel"] = mydfs.apply(get_max_val, axis=1)
mydfs["ensemble_output"] = mydfs.apply(get_ensemble_output, axis=1)
# Attack-log columns copied from the dataset; "perturbed_output" and
# "ground_truth_output" are stored under new names.
train_df = pd.DataFrame({
    "model_perturbed_output":dataset["test"]["perturbed_output"],
    "ground_truth":dataset["test"]["ground_truth_output"],
    'original_text':dataset["test"]["original_text"],
    'perturbed_text':dataset["test"]["perturbed_text"],
    'original_score':dataset["test"]["original_score"],
    'perturbed_score':dataset["test"]["perturbed_score"],
    'num_queries':dataset["test"]["num_queries"],
    'result_type':dataset["test"]["result_type"],
    'original_question':dataset["test"]["original_question"],
    'original_answer':dataset["test"]["original_answer"],
    'perturbed_question':dataset["test"]["perturbed_question"],
    'perturbed_answer':dataset["test"]["perturbed_answer"]
  })


  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])


In [None]:
# Attach the attack-log columns to the per-model results table.
for c in train_df.columns:
  mydfs[c] = train_df[c]
# Number of rows where the prediction equals the ground truth:
# first for the base model (model_0), then for the ensemble.
print((mydfs["ground_truth"] == mydfs["base_output"]).sum())
print((mydfs["ground_truth"] == mydfs["ensemble_output"]).sum())
# Write the combined table to the CSV configured at the top of this section.
mydfs.to_csv(output_save_file)

91
636


# Deberta RTE

In [None]:
# Task handled by this section; it is substituted into every path below.
dataset_type = "rte"
# CSV that the last cell of this section writes the combined results to.
output_save_file = f"/content/drive/My Drive/Ensemble_Def/{dataset_type}-deberta_og.csv"

# One validation attack log per attack (PWWS, textbugger, textfooler, DeepWordBug).
logfile_directory = "/content/drive/My Drive/CS6220_logs/Val_logs"
files = [f"{dataset_type}_vallog_PWWS.csv", f"{dataset_type}_vallog_textbugger.csv", f"{dataset_type}_vallog_textfooler.csv", f"{dataset_type}_vallog_DeepWordBug.csv"]
files = [f"{logfile_directory}/{file}" for file in files]
# concat_dfs pairs files with dataset types positionally, so repeat the type once per file.
dataset_types = [f"{dataset_type}", f"{dataset_type}", f"{dataset_type}", f"{dataset_type}"]
# All four logs stacked into a DatasetDict with a single "test" split.
dataset = concat_dfs(files, dataset_types)

In [None]:

# Base fine-tuned checkpoint plus one adversarially trained checkpoint per attack.
save_directory = f"/content/drive/My Drive/finetuned_models/{dataset_type}-deberta-xsmall"
adv_save_directories = [f"{dataset_type}_{attack_name}/" for attack_name in ["textbugger", "textfooler", "DeepWordBug", "PWWS"]]
adv_save_directories = [f"/content/drive/My Drive/Adv_trained/{adv_save_directory}" for adv_save_directory in adv_save_directories]

# Load every checkpoint exactly once. The original cell loaded the first
# adversarial checkpoint from Drive twice; the wrapper is reused instead.
base_wrapper = load_model_and_tokenizer(save_directory)
adv_wrappers = []
for adv_save_directory in adv_save_directories:
   adv_wrappers.append(load_model_and_tokenizer(adv_save_directory))

# Layout kept at six entries (model_0 .. model_5) because get_value1..get_value5
# read model_1..model_5: slot 0 is the base model, slots 1 and 2 both hold the
# first adversarial checkpoint, slots 2..5 follow adv_save_directories order.
# NOTE(review): confirm whether the duplicated slot is intended.
model_wrappers = [base_wrapper, adv_wrappers[0]] + adv_wrappers

In [None]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One list of raw model outputs per entry of model_wrappers, computed on dataset["test"].
df_outputs = get_outputs(model_wrappers, dataset, device)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done


In [None]:
# Keep only each output's logits, moved to the CPU: dfs[i][j] is model i's logits for example j.
dfs = [[x.logits.cpu() for x in df] for df in df_outputs]

In [None]:
# One column of logits tensors per model, placed side by side.
# NOTE(review): f"model_i" has no placeholder, so every frame gets the literal
# label "model_i"; ignore_index=True relabels the columns 0..n-1 anyway and the
# next line assigns the real model_<i> names.
mydfs = pd.concat([pd.DataFrame(dfs[i], columns=[f"model_i"]) for i in range(len(dfs))], axis=1, ignore_index=True )
mydfs.columns = [f"model_{i}" for i in range(len(dfs))]
# Class predicted by the base model (model_0).
mydfs["base_output"] = mydfs["model_0"].apply(lambda x: torch.argmax(x, dim=1).item())


# extractedValue_k: |model_k - model_0| on the base-predicted class (see get_value1..get_value5).
mydfs["extractedValue_1"] = mydfs.apply(get_value1, axis=1)
mydfs["extractedValue_2"] = mydfs.apply(get_value2, axis=1)
mydfs["extractedValue_3"] = mydfs.apply(get_value3, axis=1)
mydfs["extractedValue_4"] = mydfs.apply(get_value4, axis=1)
mydfs["extractedValue_5"] = mydfs.apply(get_value5, axis=1)

# maxModel: model index chosen by get_max_val (0 keeps model_0);
# ensemble_output: the class predicted by that chosen model.
mydfs["maxModel"] = mydfs.apply(get_max_val, axis=1)
mydfs["ensemble_output"] = mydfs.apply(get_ensemble_output, axis=1)
# Attack-log columns copied from the dataset; "perturbed_output" and
# "ground_truth_output" are stored under new names.
train_df = pd.DataFrame({
    "model_perturbed_output":dataset["test"]["perturbed_output"],
    "ground_truth":dataset["test"]["ground_truth_output"],
    'original_text':dataset["test"]["original_text"],
    'perturbed_text':dataset["test"]["perturbed_text"],
    'original_score':dataset["test"]["original_score"],
    'perturbed_score':dataset["test"]["perturbed_score"],
    'num_queries':dataset["test"]["num_queries"],
    'result_type':dataset["test"]["result_type"],
    'original_question':dataset["test"]["original_question"],
    'original_answer':dataset["test"]["original_answer"],
    'perturbed_question':dataset["test"]["perturbed_question"],
    'perturbed_answer':dataset["test"]["perturbed_answer"]
  })


  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])


In [None]:
# Attach the attack-log columns to the per-model results table.
for c in train_df.columns:
  mydfs[c] = train_df[c]
# Number of rows where the prediction equals the ground truth:
# first for the base model (model_0), then for the ensemble.
print((mydfs["ground_truth"] == mydfs["base_output"]).sum())
print((mydfs["ground_truth"] == mydfs["ensemble_output"]).sum())
# Write the combined table to the CSV configured at the top of this section.
mydfs.to_csv(output_save_file)

92
547


# Deberta MNLI

In [None]:
# Task handled by this section; it is substituted into every path below.
dataset_type = "mnli"
# CSV that the last cell of this section writes the combined results to.
output_save_file = f"/content/drive/My Drive/Ensemble_Def/{dataset_type}-deberta_og.csv"

# One validation attack log per attack (PWWS, textbugger, textfooler, DeepWordBug).
logfile_directory = "/content/drive/My Drive/CS6220_logs/Val_logs"
files = [f"{dataset_type}_vallog_PWWS.csv", f"{dataset_type}_vallog_textbugger.csv", f"{dataset_type}_vallog_textfooler.csv", f"{dataset_type}_vallog_DeepWordBug.csv"]
files = [f"{logfile_directory}/{file}" for file in files]
# concat_dfs pairs files with dataset types positionally, so repeat the type once per file.
dataset_types = [f"{dataset_type}", f"{dataset_type}", f"{dataset_type}", f"{dataset_type}"]
# All four logs stacked into a DatasetDict with a single "test" split.
dataset = concat_dfs(files, dataset_types)

In [None]:

# Base fine-tuned checkpoint plus one adversarially trained checkpoint per attack.
save_directory = f"/content/drive/My Drive/finetuned_models/{dataset_type}-deberta-xsmall"
adv_save_directories = [f"{dataset_type}_{attack_name}/" for attack_name in ["textbugger", "textfooler", "DeepWordBug", "PWWS"]]
adv_save_directories = [f"/content/drive/My Drive/Adv_trained/{adv_save_directory}" for adv_save_directory in adv_save_directories]

# Load every checkpoint exactly once. The original cell loaded the first
# adversarial checkpoint from Drive twice; the wrapper is reused instead.
base_wrapper = load_model_and_tokenizer(save_directory)
adv_wrappers = []
for adv_save_directory in adv_save_directories:
   adv_wrappers.append(load_model_and_tokenizer(adv_save_directory))

# Layout kept at six entries (model_0 .. model_5) because get_value1..get_value5
# read model_1..model_5: slot 0 is the base model, slots 1 and 2 both hold the
# first adversarial checkpoint, slots 2..5 follow adv_save_directories order.
# NOTE(review): confirm whether the duplicated slot is intended.
model_wrappers = [base_wrapper, adv_wrappers[0]] + adv_wrappers

In [None]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One list of raw model outputs per entry of model_wrappers, computed on dataset["test"].
df_outputs = get_outputs(model_wrappers, dataset, device)


starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


model outputs done


In [None]:
# Keep only each output's logits, moved to the CPU: dfs[i][j] is model i's logits for example j.
dfs = [[x.logits.cpu() for x in df] for df in df_outputs]

In [None]:
# One column of logits tensors per model, placed side by side.
# NOTE(review): f"model_i" has no placeholder, so every frame gets the literal
# label "model_i"; ignore_index=True relabels the columns 0..n-1 anyway and the
# next line assigns the real model_<i> names.
mydfs = pd.concat([pd.DataFrame(dfs[i], columns=[f"model_i"]) for i in range(len(dfs))], axis=1, ignore_index=True )
mydfs.columns = [f"model_{i}" for i in range(len(dfs))]
# Class predicted by the base model (model_0).
mydfs["base_output"] = mydfs["model_0"].apply(lambda x: torch.argmax(x, dim=1).item())


# extractedValue_k: |model_k - model_0| on the base-predicted class (see get_value1..get_value5).
mydfs["extractedValue_1"] = mydfs.apply(get_value1, axis=1)
mydfs["extractedValue_2"] = mydfs.apply(get_value2, axis=1)
mydfs["extractedValue_3"] = mydfs.apply(get_value3, axis=1)
mydfs["extractedValue_4"] = mydfs.apply(get_value4, axis=1)
mydfs["extractedValue_5"] = mydfs.apply(get_value5, axis=1)

# maxModel: model index chosen by get_max_val (0 keeps model_0);
# ensemble_output: the class predicted by that chosen model.
mydfs["maxModel"] = mydfs.apply(get_max_val, axis=1)
mydfs["ensemble_output"] = mydfs.apply(get_ensemble_output, axis=1)
# Attack-log columns copied from the dataset; "perturbed_output" and
# "ground_truth_output" are stored under new names.
train_df = pd.DataFrame({
    "model_perturbed_output":dataset["test"]["perturbed_output"],
    "ground_truth":dataset["test"]["ground_truth_output"],
    'original_text':dataset["test"]["original_text"],
    'perturbed_text':dataset["test"]["perturbed_text"],
    'original_score':dataset["test"]["original_score"],
    'perturbed_score':dataset["test"]["perturbed_score"],
    'num_queries':dataset["test"]["num_queries"],
    'result_type':dataset["test"]["result_type"],
    'original_question':dataset["test"]["original_question"],
    'original_answer':dataset["test"]["original_answer"],
    'perturbed_question':dataset["test"]["perturbed_question"],
    'perturbed_answer':dataset["test"]["perturbed_answer"]
  })


  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])


In [None]:
# Attach the attack-log columns to the per-model results table.
for c in train_df.columns:
  mydfs[c] = train_df[c]
# Number of rows where the prediction equals the ground truth:
# first for the base model (model_0), then for the ensemble.
print((mydfs["ground_truth"] == mydfs["base_output"]).sum())
print((mydfs["ground_truth"] == mydfs["ensemble_output"]).sum())
# Write the combined table to the CSV configured at the top of this section.
mydfs.to_csv(output_save_file)

53
410


# Deberta QNLI

In [None]:
# Task handled by this section; it is substituted into every path below.
dataset_type = "qnli"
# CSV path for this section's combined results (the cell that would write it is not shown here).
output_save_file = f"/content/drive/My Drive/Ensemble_Def/{dataset_type}-deberta_og.csv"

# One validation attack log per attack (PWWS, textbugger, textfooler, DeepWordBug).
logfile_directory = "/content/drive/My Drive/CS6220_logs/Val_logs"
files = [f"{dataset_type}_vallog_PWWS.csv", f"{dataset_type}_vallog_textbugger.csv", f"{dataset_type}_vallog_textfooler.csv", f"{dataset_type}_vallog_DeepWordBug.csv"]
files = [f"{logfile_directory}/{file}" for file in files]
# concat_dfs pairs files with dataset types positionally, so repeat the type once per file.
dataset_types = [f"{dataset_type}", f"{dataset_type}", f"{dataset_type}", f"{dataset_type}"]
# All four logs stacked into a DatasetDict with a single "test" split.
dataset = concat_dfs(files, dataset_types)

In [None]:

# Base fine-tuned checkpoint plus one adversarially trained checkpoint per attack.
save_directory = f"/content/drive/My Drive/finetuned_models/{dataset_type}-deberta-xsmall"
adv_save_directories = [f"{dataset_type}_{attack_name}/" for attack_name in ["textbugger", "textfooler", "DeepWordBug", "PWWS"]]
adv_save_directories = [f"/content/drive/My Drive/Adv_trained/{adv_save_directory}" for adv_save_directory in adv_save_directories]

# Load every checkpoint exactly once. The original cell loaded the first
# adversarial checkpoint from Drive twice; the wrapper is reused instead.
base_wrapper = load_model_and_tokenizer(save_directory)
adv_wrappers = []
for adv_save_directory in adv_save_directories:
   adv_wrappers.append(load_model_and_tokenizer(adv_save_directory))

# Layout kept at six entries (model_0 .. model_5) because get_value1..get_value5
# read model_1..model_5: slot 0 is the base model, slots 1 and 2 both hold the
# first adversarial checkpoint, slots 2..5 follow adv_save_directories order.
# NOTE(review): confirm whether the duplicated slot is intended.
model_wrappers = [base_wrapper, adv_wrappers[0]] + adv_wrappers

In [None]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One list of raw model outputs per entry of model_wrappers, computed on dataset["test"].
df_outputs = get_outputs(model_wrappers, dataset, device)


starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done
starting model outputs


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model outputs done


In [None]:
# For every collection of model outputs in `df_outputs`, pull out each
# output's logits and move them to the CPU. The result is a list of lists
# of tensors, in the same order as `df_outputs`.
dfs = [
    [single_output.logits.cpu() for single_output in outputs_for_model]
    for outputs_for_model in df_outputs
]

In [None]:
# One column of raw logit tensors per entry of `dfs`, side by side.
# Each frame is labelled with its own index up front (the previous
# placeholder-less f-string gave every frame the same name and relied on
# `ignore_index=True` plus a rename to repair it).
mydfs = pd.concat(
    [
        pd.DataFrame(model_logits, columns=[f"model_{i}"])
        for i, model_logits in enumerate(dfs)
    ],
    axis=1,
)

# Predicted class of model_0: argmax over dim 1 of its logits.
# `.item()` only works on single-element tensors -- assumes each cell holds
# the logits of exactly one example; TODO confirm.
mydfs["base_output"] = mydfs["model_0"].apply(lambda x: torch.argmax(x, dim=1).item())

# Row-wise feature extraction. The helpers are defined elsewhere in the
# notebook and are applied in order 1..5, so each one receives a row that
# already contains the columns written by the earlier ones.
for rank, extract_value in enumerate(
    (get_value1, get_value2, get_value3, get_value4, get_value5), start=1
):
    mydfs[f"extractedValue_{rank}"] = mydfs.apply(extract_value, axis=1)

mydfs["maxModel"] = mydfs.apply(get_max_val, axis=1)
mydfs["ensemble_output"] = mydfs.apply(get_ensemble_output, axis=1)

# Metadata columns taken from the *test* split (the frame keeps the name
# `train_df` because the following cell reads it under that name).
# Mapping is output column -> source column in the split; only the first
# two are renamed.
test_split = dataset["test"]
column_sources = {
    "model_perturbed_output": "perturbed_output",
    "ground_truth": "ground_truth_output",
    "original_text": "original_text",
    "perturbed_text": "perturbed_text",
    "original_score": "original_score",
    "perturbed_score": "perturbed_score",
    "num_queries": "num_queries",
    "result_type": "result_type",
    "original_question": "original_question",
    "original_answer": "original_answer",
    "perturbed_question": "perturbed_question",
    "perturbed_answer": "perturbed_answer",
}
train_df = pd.DataFrame(
    {out_name: test_split[src_name] for out_name, src_name in column_sources.items()}
)


  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])
  values = np.array([convert(v) for v in values])


In [None]:
# Attach every metadata column of `train_df` to the per-model frame.
for column_name, column_values in train_df.items():
    mydfs[column_name] = column_values

# Number of rows where the prediction equals the ground-truth label:
# first for the base model alone, then for the ensemble.
ground_truth = mydfs["ground_truth"]
base_correct = (ground_truth == mydfs["base_output"]).sum()
ensemble_correct = (ground_truth == mydfs["ensemble_output"]).sum()
print(base_correct)
print(ensemble_correct)

# Persist the combined frame to the configured output path.
mydfs.to_csv(output_save_file)

298
577


In [None]:
# Inspect the test split; as the cell's last expression, its repr
# (feature names and row count) is what the notebook displays.
dataset["test"]

Dataset({
    features: ['original_text', 'perturbed_text', 'original_score', 'perturbed_score', 'original_output', 'perturbed_output', 'ground_truth_output', 'num_queries', 'result_type', 'original_question', 'original_answer', 'perturbed_question', 'perturbed_answer'],
    num_rows: 800
})