Install Dependencies

In [None]:
# installations
! pip install transformers datasets

Import ECQA and e-SNLI data

In [None]:
import pandas as pd

# ECQA: one CSV per split; the first CSV column becomes the row index.
ecqa_train = pd.read_csv('raw/ecqa/train.csv', index_col=0, header=0)
ecqa_dev = pd.read_csv('raw/ecqa/dev.csv', index_col=0, header=0)
ecqa_test = pd.read_csv('raw/ecqa/test.csv', index_col=0, header=0)

# e-SNLI: the training split is stored in two files that are stacked row-wise.
esnli_train = pd.concat([
    pd.read_csv(part_path, index_col=0, header=0)
    for part_path in ('raw/esnli/train1.csv', 'raw/esnli/train2.csv')
])
esnli_dev = pd.read_csv('raw/esnli/dev.csv', index_col=0, header=0)
esnli_test = pd.read_csv('raw/esnli/test.csv', index_col=0, header=0)

In [None]:
def seperate_ecqa(groups):
    """Pair every answer option of one ECQA question with an explanation.

    Parameters
    ----------
    groups : pandas.DataFrame
        Rows belonging to a single question, one row per option. The columns
        ``q_ans``, ``q_op``, ``taskA_neg`` and ``taskA_pos`` are read; the
        columns ``labels``, ``explanations`` and ``drop_exp`` are written.
        ``taskA_neg`` / ``taskA_pos`` are taken from the first row and split
        into one explanation per line.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified). Rows whose option equals the
        answer get ``labels = 1``. If the number of negative explanations does
        not match the number of wrong options, ``drop_exp`` is set to True for
        the whole group and no explanations are filled in. Otherwise the wrong
        options receive the negative explanations in order, the correct option
        is duplicated so that every positive explanation gets its own row, and
        the index is renumbered from 0.
    """
    # Work on a copy: the caller's frame must stay untouched.
    groups = groups.copy()

    is_answer = groups['q_ans'] == groups['q_op']
    groups.loc[is_answer, 'labels'] = 1

    neg = str.splitlines(groups['taskA_neg'].iloc[0])
    neg_idx = groups.index[~is_answer]

    # It is impossible to automate the pairing of option and explanation if
    # their counts do not match. drop_exp should be False when evaluating the
    # original task.
    if len(neg_idx) != len(neg):
        groups['drop_exp'] = True
        return groups

    for explanation, row_label in zip(neg, neg_idx):
        groups.at[row_label, 'explanations'] = explanation

    pos = str.splitlines(groups['taskA_pos'].iloc[0])
    pos_grp = groups[is_answer]
    add = len(pos) - len(pos_grp)

    # Duplicate the (first) correct-answer row so that each positive
    # explanation gets a row of its own; the index is renumbered either way.
    if add > 0 and len(pos_grp) > 0:
        groups = pd.concat([groups] + [pos_grp.iloc[[0]]] * add, ignore_index=True)
    else:
        groups = groups.reset_index(drop=True)

    if pos:
        pos_idx = groups.index[groups['q_ans'] == groups['q_op']]
        for local_idx, row_label in enumerate(pos_idx):
            # Surplus correct rows reuse the last positive explanation.
            groups.at[row_label, 'explanations'] = pos[min(local_idx, len(pos) - 1)]

    return groups

In [None]:
from tqdm.auto import tqdm

def reformat_ecqa(df):
    """Reshape raw ECQA rows into one row per (question, option, explanation).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide ECQA frame with the columns ``q_no``, ``q_concept``, ``q_text``,
        ``q_ans``, ``taskA_neg``, ``taskA_pos`` and option columns whose names
        start with ``q_op`` followed by a number. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``groups`` (from ``q_no``), ``questions``, ``options``,
        ``explanations``, ``option_no``, ``labels`` and ``drop_exp``, with
        surrounding whitespace stripped from the three text columns.
    """
    df = df.copy()

    # Format the data such that every option gets its own row
    id_columns = ["q_no", "q_concept", "q_text", "q_ans", 'taskA_neg', 'taskA_pos']
    df = pd.wide_to_long(df, "q_op", i=id_columns, j="option").reset_index()
    df = df.assign(explanations="", drop_exp=False, labels=0)

    # Iterate over the groups explicitly instead of using groupby.apply: newer
    # pandas versions no longer hand the grouping column to the applied
    # function, and the "q_no" column is still needed for the rename below.
    per_question = [
        seperate_ecqa(question_rows.copy())
        for _, question_rows in tqdm(df.groupby('q_no'))
    ]
    if per_question:
        df = pd.concat(per_question, ignore_index=True)

    df = df.rename(columns={"q_no": "groups", "q_text": "questions", "q_op": "options", "option": "option_no"})
    # Element-wise strip per column; non-string values (e.g. NaN) pass through.
    for column in ('questions', 'options', 'explanations'):
        df[column] = df[column].map(lambda value: value.strip() if isinstance(value, str) else value)
    return df[['groups', 'questions', 'options', 'explanations', 'option_no', 'labels', 'drop_exp']]
    
# Reformat the three ECQA splits (train, dev, test) in that order.
ecqa_train_rf, ecqa_dev_rf, ecqa_test_rf = [
    reformat_ecqa(raw_split) for raw_split in (ecqa_train, ecqa_dev, ecqa_test)
]

In [None]:
from tqdm.auto import tqdm

def reformat_esnli(esnli):
    """Reshape raw e-SNLI rows into one row per (pair, explanation).

    Parameters
    ----------
    esnli : pandas.DataFrame
        Wide e-SNLI frame with the columns ``gold_label``, ``Sentence1``,
        ``Sentence2`` and one or more ``Explanation_<n>`` columns. Its index
        values are used as the group identifier and must be unique (required
        by ``pd.wide_to_long``). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``groups``, ``premise``, ``hypothesis``, ``explanations``,
        ``labels`` (contradiction=0, neutral=1, entailment=2; any other value
        becomes NaN) and ``explanation_no``. The three text columns have
        surrounding whitespace stripped.
    """
    df = esnli.copy()
    df['groups'] = df.index
    df = df.reset_index(drop=True)

    # Format the data such that every explanation gets its own row
    id_columns = ['groups']
    df = pd.wide_to_long(df, "Explanation", i=id_columns, j="explanation_no", sep="_").reset_index()

    df = df.rename(columns={"gold_label": "labels", "Sentence1": "premise", "Sentence2": "hypothesis", "Explanation": "explanations"})
    df['labels'] = df['labels'].map({'contradiction': 0, 'neutral': 1, 'entailment': 2})

    # Strip element-wise. DataFrame.apply would pass whole columns to the
    # lambda, so the isinstance(str) check would never match and nothing would
    # be stripped. Non-string values (e.g. NaN) pass through unchanged.
    for column in ('premise', 'hypothesis', 'explanations'):
        df[column] = df[column].map(lambda value: value.strip() if isinstance(value, str) else value)
    return df[["groups", "premise", "hypothesis", "explanations", "labels", "explanation_no"]]
    
# Reformat the three e-SNLI splits (train, dev, test) in that order.
esnli_train_rf, esnli_dev_rf, esnli_test_rf = [
    reformat_esnli(raw_split) for raw_split in (esnli_train, esnli_dev, esnli_test)
]

In [None]:
# Preview the first rows of the reformatted training splits.
# The ComVE frame printed here before is never created ahead of this cell,
# so referencing it raised a NameError on a fresh kernel.
print(f"ECQA: {ecqa_train_rf.head()}\ne-SNLI: {esnli_train_rf.head()}")

In [None]:
# Row counts and label distribution of every reformatted ECQA split.
print("ECQA:")
print(
    f"Training length: \t\t{len(ecqa_train_rf)} \nValue counts: \n{ecqa_train_rf['labels'].value_counts()}\n"
    f"Evaluation length: \t\t{len(ecqa_dev_rf)} \nValue counts: \n{ecqa_dev_rf['labels'].value_counts()}\n"
    f"Test length (explantions): \t{len(ecqa_test_rf)} \nValue counts: \n{ecqa_test_rf['labels'].value_counts()}"
)

# Same summary for the reformatted e-SNLI splits.
print("\ne-SNLI:")
print(
    f"Training length: \t\t{len(esnli_train_rf)} \nValue counts: \n{esnli_train_rf['labels'].value_counts()}\n"
    f"Evaluation length: \t\t{len(esnli_dev_rf)} \nValue counts: \n{esnli_dev_rf['labels'].value_counts()}\n"
    f"Test length (explantions): \t{len(esnli_test_rf)} \nValue counts: \n{esnli_test_rf['labels'].value_counts()}"
)

Tokenize the preprocessed data into BERT format

In [None]:
from tqdm import tqdm
from transformers import BertTokenizer

# Load the pretrained tokenizer for `bert-base-uncased`; the tokenize helpers below
# read it through the global name `tokenizer`.
# NOTE(review): `use_fast` is presumably an AutoTokenizer option and may have no
# effect on the concrete BertTokenizer class — confirm, or use BertTokenizerFast if
# the fast implementation was intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast = True)

# Tokenizer for a BERT input without explanations, for comparison
def tokenize_ne(entry):
    """Encode one row as a sentence pair, without the explanation.

    The row is indexed with the column names held in the globals ``sen1`` and
    ``sen2``; both values are passed to the global ``tokenizer`` as first and
    second sequence (original author's note: ``[CLS] <Question> [SEP] <Option> [SEP]``).

    Returns whatever ``tokenizer(...)`` returns. On failure the offending
    values are printed and the exception is re-raised, matching ``tokenize``,
    instead of silently returning None.
    """
    try:
        return tokenizer(entry[sen1], entry[sen2])
    except Exception:
        print(f"sen1: {entry[sen1]}, sen2: {entry[sen2]}")
        raise
# Tokenizer for a BERT input that includes the explanation
def tokenize(entry):
    """Encode one row as a sentence pair whose second part carries the explanation.

    The row is indexed with the column names held in the globals ``sen1``,
    ``sen2`` and ``exp``. The second sequence is ``<sen2> [SEP] <exp>``
    (original author's note: ``<CLS> <Question> [SEP] <Option> [SEP] <Explanation> [SEP]``).

    Returns whatever the global ``tokenizer`` returns; on failure the three
    values are printed and the exception is re-raised.
    """
    try:
        first_segment = entry[sen1]
        second_segment = entry[sen2] + " [SEP] " + entry[exp]
        return tokenizer(first_segment, second_segment)
    except Exception as failure:
        print(f"sen1: {entry[sen1]}, sen2: {entry[sen2]}, exp: {entry[exp]}")
        raise failure

In [None]:
# Tokenize the ECQA splits in a BERT format
tqdm.pandas()
sen1 = "questions"
sen2 = "options"
exp = "explanations"

# Source frames, each built once and used for both tokenization and metadata.
# "*_ne_src": one row per distinct (question, option) pair (first non-null value per column).
bert_ecqa_train_ne_src = ecqa_train_rf.groupby([sen1, sen2]).first().reset_index()
bert_ecqa_dev_ne_src = ecqa_dev_rf.groupby([sen1, sen2]).first().reset_index()
bert_ecqa_test_ne_src = ecqa_test_rf.groupby([sen1, sen2]).first().reset_index()
# "*_src": only rows whose drop_exp flag is False.
bert_ecqa_train_src = ecqa_train_rf[ecqa_train_rf['drop_exp'] == False].reset_index()
bert_ecqa_dev_src = ecqa_dev_rf[ecqa_dev_rf['drop_exp'] == False].reset_index()
bert_ecqa_test_src = ecqa_test_rf[ecqa_test_rf['drop_exp'] == False].reset_index()

# Row-wise tokenization; every row's encoding becomes one row of the new frame.
bert_ecqa_train_ne = pd.DataFrame(list(bert_ecqa_train_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_ecqa_dev_ne = pd.DataFrame(list(bert_ecqa_dev_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_ecqa_test_ne = pd.DataFrame(list(bert_ecqa_test_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_ecqa_train = pd.DataFrame(list(bert_ecqa_train_src.progress_apply(tokenize, axis=1)))
bert_ecqa_dev = pd.DataFrame(list(bert_ecqa_dev_src.progress_apply(tokenize, axis=1)))
bert_ecqa_test = pd.DataFrame(list(bert_ecqa_test_src.progress_apply(tokenize, axis=1)))

# Add the labels (and text columns) we lost during tokenization
bert_ecqa_ne_columns = ['groups', 'questions', 'options', 'labels']
bert_ecqa_columns = ['groups', 'questions', 'options', 'explanations', 'labels']
bert_ecqa_train_ne[bert_ecqa_ne_columns] = bert_ecqa_train_ne_src[bert_ecqa_ne_columns]
bert_ecqa_dev_ne[bert_ecqa_ne_columns] = bert_ecqa_dev_ne_src[bert_ecqa_ne_columns]
bert_ecqa_test_ne[bert_ecqa_ne_columns] = bert_ecqa_test_ne_src[bert_ecqa_ne_columns]
bert_ecqa_train[bert_ecqa_columns] = bert_ecqa_train_src[bert_ecqa_columns]
bert_ecqa_dev[bert_ecqa_columns] = bert_ecqa_dev_src[bert_ecqa_columns]
bert_ecqa_test[bert_ecqa_columns] = bert_ecqa_test_src[bert_ecqa_columns]

# Save the tokenized BERT data
bert_ecqa_train.to_csv('tokenized/bert/ecqa/train.csv')
bert_ecqa_dev.to_csv('tokenized/bert/ecqa/dev.csv')
bert_ecqa_test.to_csv('tokenized/bert/ecqa/test.csv')
bert_ecqa_train_ne.to_csv('tokenized/bert/ecqa/train_ne.csv')
bert_ecqa_dev_ne.to_csv('tokenized/bert/ecqa/dev_ne.csv')
bert_ecqa_test_ne.to_csv('tokenized/bert/ecqa/test_ne.csv')

In [None]:
# Tokenize the e-SNLI splits in a BERT format
sen1 = "premise"
sen2 = "hypothesis"
exp = "explanations"

# Source frames, each built once and used for both tokenization and metadata.
# Train and dev drop rows containing NaN; the test split is used as-is.
# "*_ne_src": one row per distinct (premise, hypothesis) pair.
bert_esnli_train_ne_src = esnli_train_rf.dropna().groupby([sen1, sen2]).first().reset_index()
bert_esnli_dev_ne_src = esnli_dev_rf.dropna().groupby([sen1, sen2]).first().reset_index()
bert_esnli_test_ne_src = esnli_test_rf.groupby([sen1, sen2]).first().reset_index()
# "*_src": one row per explanation.
bert_esnli_train_src = esnli_train_rf.dropna().reset_index()
bert_esnli_dev_src = esnli_dev_rf.dropna().reset_index()
bert_esnli_test_src = esnli_test_rf.reset_index()

bert_esnli_train_ne = pd.DataFrame(list(bert_esnli_train_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_esnli_dev_ne = pd.DataFrame(list(bert_esnli_dev_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_esnli_test_ne = pd.DataFrame(list(bert_esnli_test_ne_src.progress_apply(tokenize_ne, axis=1)))
bert_esnli_train = pd.DataFrame(list(bert_esnli_train_src.progress_apply(tokenize, axis=1)))
bert_esnli_dev = pd.DataFrame(list(bert_esnli_dev_src.progress_apply(tokenize, axis=1)))
bert_esnli_test = pd.DataFrame(list(bert_esnli_test_src.progress_apply(tokenize, axis=1)))

# Add the labels (and text columns) we lost during tokenization
bert_esnli_ne_columns = ['groups', 'premise', 'hypothesis', 'labels']
bert_esnli_columns = ['groups', 'premise', 'hypothesis', 'explanations', 'labels']
bert_esnli_train_ne[bert_esnli_ne_columns] = bert_esnli_train_ne_src[bert_esnli_ne_columns]
bert_esnli_dev_ne[bert_esnli_ne_columns] = bert_esnli_dev_ne_src[bert_esnli_ne_columns]
bert_esnli_test_ne[bert_esnli_ne_columns] = bert_esnli_test_ne_src[bert_esnli_ne_columns]
bert_esnli_train[bert_esnli_columns] = bert_esnli_train_src[bert_esnli_columns]
bert_esnli_dev[bert_esnli_columns] = bert_esnli_dev_src[bert_esnli_columns]
bert_esnli_test[bert_esnli_columns] = bert_esnli_test_src[bert_esnli_columns]

# Save the tokenized BERT data
bert_esnli_train.to_csv('tokenized/bert/esnli/train.csv')
bert_esnli_dev.to_csv('tokenized/bert/esnli/dev.csv')
bert_esnli_test.to_csv('tokenized/bert/esnli/test.csv')
bert_esnli_train_ne.to_csv('tokenized/bert/esnli/train_ne.csv')
bert_esnli_dev_ne.to_csv('tokenized/bert/esnli/dev_ne.csv')
bert_esnli_test_ne.to_csv('tokenized/bert/esnli/test_ne.csv')

Tokenize the preprocessed data into GPT-2 format

In [None]:
from tqdm import tqdm
from transformers import GPT2TokenizerFast

# Load the pretrained GPT-2 tokenizer. This rebinds the global `tokenizer`, so from
# here on it no longer refers to the BERT tokenizer created earlier in the notebook.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
# Disabled option: pad on the left side.
#tokenizer.padding_side = "left"

# Disabled option: Define PAD Token = EOS Token = 50256
#tokenizer.pad_token = tokenizer.eos_token

# Tokenizer for a GPT-2 prompt without the explanation text
def tokenize_ne(entry):
    """Encode the prompt ``Statement: <sen1>\\nStatement: <sen2>\\nExplanation:``.

    The row is indexed with the column names held in the globals ``sen1`` and
    ``sen2``. This definition replaces the BERT ``tokenize_ne`` defined earlier
    in the notebook.

    Returns whatever the global ``tokenizer`` returns. On failure the prompt
    fields are printed and the exception is re-raised.
    """
    try:
        return tokenizer("Statement: " + entry[sen1] + "\n" +
                         "Statement: " + entry[sen2] + "\n" +
                         "Explanation:")
    except Exception:
        # Use an f-string here: if a field is not a string (e.g. NaN), the `+`
        # concatenation above is what failed, and repeating it in the handler
        # would raise again before anything is printed.
        print(f"Statement: {entry[sen1]}\nStatement: {entry[sen2]}\nExplanation:")
        raise

# Tokenizer for a GPT-2 prompt including the explanation
def tokenize(entry):
    """Encode ``Statement: <sen1>\\nStatement: <sen2>\\nExplanation: <exp><eos>``.

    The row is indexed with the column names held in the globals ``sen1``,
    ``sen2`` and ``exp``; ``tokenizer.eos_token`` is appended to the text.
    This definition replaces the BERT ``tokenize`` defined earlier in the
    notebook.

    Returns whatever the global ``tokenizer`` returns. On failure the prompt
    fields are printed and the exception is re-raised.
    """
    try:
        return tokenizer("Statement: " + entry[sen1] + "\n" +
                         "Statement: " + entry[sen2] + "\n" +
                         "Explanation: " + entry[exp] + tokenizer.eos_token)
    except Exception:
        # Use an f-string here: if a field is not a string (e.g. NaN), the `+`
        # concatenation above is what failed, and repeating it in the handler
        # would raise again before anything is printed.
        print(f"Statement: {entry[sen1]}\nStatement: {entry[sen2]}\n"
              f"Explanation: {entry[exp]}{tokenizer.eos_token}")
        raise

def stokenize_ne(entry):
    """Encode the single-statement prompt ``Statement: <sen1>\\nExplanation:``.

    The row is indexed with the column name held in the global ``sen1``.

    Returns whatever the global ``tokenizer`` returns. On failure the prompt
    field is printed and the exception is re-raised.
    """
    try:
        return tokenizer("Statement: " + entry[sen1] + "\n" +
                         "Explanation:")
    except Exception:
        # Use an f-string here: if the field is not a string (e.g. NaN), the
        # `+` concatenation above is what failed, and repeating it in the
        # handler would raise again before anything is printed.
        print(f"Statement: {entry[sen1]}\nExplanation:")
        raise

# Tokenizer for a single-statement GPT-2 prompt including the explanation
def stokenize(entry):
    """Encode ``Statement: <sen1>\\nExplanation: <exp><eos>``.

    The row is indexed with the column names held in the globals ``sen1`` and
    ``exp``; ``tokenizer.eos_token`` is appended to the text.

    Returns whatever the global ``tokenizer`` returns. On failure the prompt
    fields are printed and the exception is re-raised.
    """
    try:
        return tokenizer("Statement: " + entry[sen1] + "\n" +
                         "Explanation: " + entry[exp] + tokenizer.eos_token)
    except Exception:
        # Use an f-string here: if a field is not a string (e.g. NaN), the `+`
        # concatenation above is what failed, and repeating it in the handler
        # would raise again before anything is printed.
        print(f"Statement: {entry[sen1]}\nExplanation: {entry[exp]}{tokenizer.eos_token}")
        raise

In [None]:
# Tokenize the ECQA data in a GPT-2 format
tqdm.pandas()
sen1 = "questions"
sen2 = "options"
exp = "explanations"

# Source frames, each built once and used for BOTH tokenization and metadata.
# The test split was previously tokenized from a frame without .dropna() while
# its metadata came from a frame with .dropna(), so rows could end up
# misaligned; both now come from the same frame.
gpt_ecqa_train_src = ecqa_train_rf[ecqa_train_rf['drop_exp'] == False].dropna().reset_index()
gpt_ecqa_dev_src = ecqa_dev_rf[ecqa_dev_rf['drop_exp'] == False].dropna().reset_index()
gpt_ecqa_test_src = ecqa_test_rf[ecqa_test_rf['drop_exp'] == False].reset_index()
# One row per distinct (question, option) pair of the full test split.
gpt_ecqa_final_test_src = ecqa_test_rf.groupby([sen1, sen2]).first().reset_index()

# Prompts without the explanation text.
gpt_ecqa_train_ne = pd.DataFrame(list(gpt_ecqa_train_src.progress_apply(tokenize_ne, axis=1)))
gpt_ecqa_dev_ne = pd.DataFrame(list(gpt_ecqa_dev_src.progress_apply(tokenize_ne, axis=1)))
gpt_ecqa_test_ne = pd.DataFrame(list(gpt_ecqa_test_src.progress_apply(tokenize_ne, axis=1)))

# Prompts including the gold explanation.
gpt_ecqa_train = pd.DataFrame(list(gpt_ecqa_train_src.progress_apply(tokenize, axis=1)))
gpt_ecqa_dev = pd.DataFrame(list(gpt_ecqa_dev_src.progress_apply(tokenize, axis=1)))
gpt_ecqa_test = pd.DataFrame(list(gpt_ecqa_test_src.progress_apply(tokenize, axis=1)))
gpt_ecqa_final_test = pd.DataFrame(list(gpt_ecqa_final_test_src.progress_apply(tokenize_ne, axis=1)))

# Store the gold encoding next to the explanation-free prompt of the same row.
gpt_ecqa_train_ne[['gold_ids', 'gold_mask']] = gpt_ecqa_train[['input_ids', 'attention_mask']]
gpt_ecqa_dev_ne[['gold_ids', 'gold_mask']] = gpt_ecqa_dev[['input_ids', 'attention_mask']]
gpt_ecqa_test_ne[['gold_ids', 'gold_mask']] = gpt_ecqa_test[['input_ids', 'attention_mask']]

# Add the labels and text columns lost during tokenization.
gpt_ecqa_columns = ['groups', 'questions', 'options', 'explanations', 'labels']
gpt_ecqa_final_columns = ['groups', 'questions', 'options', 'labels']
gpt_ecqa_train_ne[gpt_ecqa_columns] = gpt_ecqa_train_src[gpt_ecqa_columns]
gpt_ecqa_dev_ne[gpt_ecqa_columns] = gpt_ecqa_dev_src[gpt_ecqa_columns]
gpt_ecqa_test_ne[gpt_ecqa_columns] = gpt_ecqa_test_src[gpt_ecqa_columns]
gpt_ecqa_final_test[gpt_ecqa_final_columns] = gpt_ecqa_final_test_src[gpt_ecqa_final_columns]

# Save the tokenized GPT-2 data
gpt_ecqa_train_ne.to_csv('tokenized/gpt2/ecqa/train.csv')
gpt_ecqa_dev_ne.to_csv('tokenized/gpt2/ecqa/dev.csv')
gpt_ecqa_test_ne.to_csv('tokenized/gpt2/ecqa/test.csv')
gpt_ecqa_final_test.to_csv('tokenized/gpt2/ecqa/final_test.csv')

# Tokenize the e-SNLI data in a GPT-2 format
sen1 = "premise"
sen2 = "hypothesis"
exp = "explanations"

# Source frames, each built once and used for both tokenization and metadata.
# Train and dev drop rows containing NaN; the test split is used as-is.
gpt_esnli_train_src = esnli_train_rf.dropna().reset_index()
gpt_esnli_dev_src = esnli_dev_rf.dropna().reset_index()
gpt_esnli_test_src = esnli_test_rf.reset_index()

# Prompts without the explanation text.
gpt_esnli_train_ne = pd.DataFrame(list(gpt_esnli_train_src.progress_apply(tokenize_ne, axis=1)))
gpt_esnli_dev_ne = pd.DataFrame(list(gpt_esnli_dev_src.progress_apply(tokenize_ne, axis=1)))
gpt_esnli_test_ne = pd.DataFrame(list(gpt_esnli_test_src.progress_apply(tokenize_ne, axis=1)))

# Prompts including the gold explanation.
gpt_esnli_train = pd.DataFrame(list(gpt_esnli_train_src.progress_apply(tokenize, axis=1)))
gpt_esnli_dev = pd.DataFrame(list(gpt_esnli_dev_src.progress_apply(tokenize, axis=1)))
gpt_esnli_test = pd.DataFrame(list(gpt_esnli_test_src.progress_apply(tokenize, axis=1)))

# Store the gold encoding next to the explanation-free prompt of the same row.
gpt_esnli_train_ne[['gold_ids', 'gold_mask']] = gpt_esnli_train[['input_ids', 'attention_mask']]
gpt_esnli_dev_ne[['gold_ids', 'gold_mask']] = gpt_esnli_dev[['input_ids', 'attention_mask']]
gpt_esnli_test_ne[['gold_ids', 'gold_mask']] = gpt_esnli_test[['input_ids', 'attention_mask']]

# Add the labels and text columns lost during tokenization.
gpt_esnli_columns = ['groups', 'premise', 'hypothesis', 'explanations', 'labels']
gpt_esnli_train_ne[gpt_esnli_columns] = gpt_esnli_train_src[gpt_esnli_columns]
gpt_esnli_dev_ne[gpt_esnli_columns] = gpt_esnli_dev_src[gpt_esnli_columns]
gpt_esnli_test_ne[gpt_esnli_columns] = gpt_esnli_test_src[gpt_esnli_columns]

# Save the tokenized GPT-2 data
gpt_esnli_train_ne.to_csv('tokenized/gpt2/esnli/train.csv')
gpt_esnli_dev_ne.to_csv('tokenized/gpt2/esnli/dev.csv')
gpt_esnli_test_ne.to_csv('tokenized/gpt2/esnli/test.csv')

In [None]:
import pandas as pd
import json

# Load generated explanations for the training data into DataFrames.
with open('generated/multitask/separate/ecqa_train.json', 'r') as f:
    ecqa_generated = pd.DataFrame(json.load(f))
# NOTE(review): the variable is named `esnli_generated` but the file read is
# `comve_train.json` — looks like a copy-paste slip; confirm the intended path.
with open('generated/multitask/separate/comve_train.json', 'r') as f:
    esnli_generated = pd.DataFrame(json.load(f))


In [None]:
import pandas as pd
import json

# Load the explanations generated for the ECQA test split.
with open('generated/multitask/separate/ecqa_test.json', 'r') as f:
    ecqa_generated = pd.DataFrame(json.load(f))

# Tokenize the generated explanations with whichever `tokenize` was defined last
# (the GPT-2 variant in notebook order; the output file is named *_gpt.csv).
# NOTE: `ecqa_test` is rebound here and no longer holds the raw test CSV loaded
# at the top of the notebook.
tqdm.pandas()
sen1 = "questions"
sen2 = "options"
exp = "generated"
ecqa_generated_rows = ecqa_generated.reset_index()
ecqa_generated_columns = ['groups', 'data_id', 'questions', 'options', 'explanations', 'generated', 'labels']
ecqa_test = pd.DataFrame(list(ecqa_generated_rows.progress_apply(tokenize, axis=1)))
ecqa_test[ecqa_generated_columns] = ecqa_generated_rows[ecqa_generated_columns]
ecqa_test.to_csv('generated/multitask/separate/ecqa_test_gpt.csv')

In [None]:
import json
from tqdm.auto import tqdm
import pandas as pd

# Load the explanations generated for the e-SNLI test split.
with open('generated/multitask/separate/esnli_test.json', 'r') as f:
    esnli_generated = pd.DataFrame(json.load(f))

# Tokenize the generated explanations with whichever `tokenize` was defined last
# (the GPT-2 variant in notebook order; the output file is named *_gpt.csv).
# NOTE: `esnli_test` is rebound here and no longer holds the raw test CSV loaded
# at the top of the notebook.
tqdm.pandas()
sen1 = "premise"
sen2 = "hypothesis"
exp = "generated"
esnli_generated_rows = esnli_generated.reset_index()
esnli_generated_columns = ['groups', 'premise', 'hypothesis', 'generated', 'labels']
esnli_test = pd.DataFrame(list(esnli_generated_rows.progress_apply(tokenize, axis=1)))
esnli_test[esnli_generated_columns] = esnli_generated_rows[esnli_generated_columns]
esnli_test.to_csv('generated/multitask/separate/esnli_test_gpt.csv')