In [3]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import os
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import nltk
import warnings
warnings.filterwarnings("ignore")

AUGMENT_WITH_NEUTRAL_ARGS = True
data_dir = "data/argumentation"

train_iam = pd.read_csv(os.path.join(data_dir, 'train_iam.tsv'), sep='\t')[['topic', 'argument', 'label']]
dev_iam = pd.read_csv(os.path.join(data_dir, 'dev_iam.txt'), sep='\t')[['topic', 'argument', 'label']]
test_iam = pd.read_csv(os.path.join(data_dir, 'test_iam.txt'), sep='\t')[['topic', 'argument', 'label']]

train_cckg = pd.read_csv(os.path.join(data_dir, 'train_cckg.tsv'), sep='\t')[['topic', 'argument', 'label']]
dev_cckg = pd.read_csv(os.path.join(data_dir, 'dev_cckg.tsv'), sep='\t')[['topic', 'argument', 'label']]
test_cckg = pd.read_csv(os.path.join(data_dir, 'test_cckg.tsv'), sep='\t')[['topic', 'argument', 'label']]

def transform_cckg_labels(data):
    data.loc[data.label=='support', 'label']=1
    data.loc[data.label=='counter', 'label']=-1
    return data

train_cckg = transform_cckg_labels(train_cckg)
dev_cckg = transform_cckg_labels(dev_cckg)
test_cckg = transform_cckg_labels(test_cckg)

train_df = pd.concat([train_iam, train_cckg]).sample(frac=1) #shuffle these bad boys
dev_df = pd.concat([dev_iam, dev_cckg]).sample(frac=1)
test_df = pd.concat([test_iam, test_cckg]).sample(frac=1)


train_df = train_cckg #shuffle these bad boys
dev_df = dev_cckg
test_df = test_cckg

all_claims = pd.read_csv(os.path.join(data_dir, 'claims.tsv'), sep='\t')

np.random.seed(42)

if AUGMENT_WITH_NEUTRAL_ARGS:
    neutral_claims = all_claims[all_claims.type=='O'] 
    lower_bound = 0
    
    min_train_label = min(train_df['label'].value_counts())
    train_sample = neutral_claims.iloc[:min_train_label]
    train_sample = train_sample[['topic', 'argument', 'label']]
    train_df = pd.concat([train_df, train_sample]).sample(frac=1)
    lower_bound = min_train_label
    
    min_dev_label = min(dev_df['label'].value_counts())
    dev_sample = neutral_claims.iloc[lower_bound: lower_bound + min_dev_label]  
    dev_sample = dev_sample[['topic', 'argument', 'label']]  
    dev_df = pd.concat([dev_df, dev_sample]).sample(frac=1)
    lower_bound = lower_bound + min_dev_label
    
    min_test_label = min(test_df['label'].value_counts())
    test_sample = neutral_claims.iloc[lower_bound: lower_bound + min_test_label]    
    test_sample = test_sample[['topic', 'argument', 'label']]
    test_df = pd.concat([test_df, test_sample]).sample(frac=1)

label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label'] = label_encoder.transform(train_df['label'])
dev_df['label'] = label_encoder.transform(dev_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)
})

In [4]:
label_encoder.classes_

array([-1, 0, 1], dtype=object)

In [5]:
model_name = 'google/flan-t5-base'

In [6]:
from transformers import AutoTokenizer
model_name = 'google/flan-t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load('rouge')

In [7]:
"""    
need to be careful cause we cant give the same prefix to every type of 
arguments otherwise the model generalises and considers counter arguments as 
supporting arguments if we give the prefix : 'generate a supporting argument'   
"""
def preprocess_function(sample):
    def process_by_type(label_type):
        if AUGMENT_WITH_NEUTRAL_ARGS:
            if label_type==2:
                label = 'supporting'
            elif label_type == 1:
                label = 'neutral'
            elif label_type==0:
                label = 'counter'
            else:
                raise ValueError
        else:
            if label_type == 1:
                label = 'supporting'
            else:
                label = 'counter'
        
        label_indices = [i for i, label in enumerate(sample['label']) if label == label_type]        
        prefix = f"Given the following topic, generate a good {label} argument. Topic="
        labeled_samples = {key: [sample[key][i] for i in label_indices] for key in sample.keys()}
        inputs = [prefix + doc for doc in labeled_samples['topic']]

        model_inputs = tokenizer(inputs, max_length=4096, truncation=True)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(labeled_samples['argument'], max_length=4096, truncation=True)
        model_inputs['labels'] = labels['input_ids']
        return model_inputs
    
    if AUGMENT_WITH_NEUTRAL_ARGS:
        model_inputs_supporting=process_by_type(label_type=2) ## supporting
        model_inputs_neutral=process_by_type(label_type=1) ## neutral
        model_inputs_counter=process_by_type(label_type=0) ## counter
        
        combined_model_inputs = {
        'input_ids': model_inputs_supporting['input_ids'] + model_inputs_neutral['input_ids'] + model_inputs_counter['input_ids'],
        'attention_mask': model_inputs_supporting['attention_mask'] + model_inputs_neutral['attention_mask'] + model_inputs_counter['attention_mask'],
        'labels': model_inputs_supporting['labels'] + model_inputs_neutral['labels'] + model_inputs_counter['labels']
        }
        
    else:
        model_inputs_supporting=process_by_type(label_type=1) ## supporting
        model_inputs_counter=process_by_type(label_type=0) ## counter
        
        combined_model_inputs = {
        'input_ids': model_inputs_supporting['input_ids'] + model_inputs_counter['input_ids'],
        'attention_mask': model_inputs_supporting['attention_mask'] + model_inputs_counter['attention_mask'],
        'labels': model_inputs_supporting['labels'] + model_inputs_counter['labels']
        }
    
    return combined_model_inputs
        


In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['label'])

Map:   0%|          | 0/2706 [00:00<?, ? examples/s]

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

In [13]:
batch_size = 8
training_arguments = Seq2SeqTrainingArguments(
    output_dir='results/',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    load_best_model_at_end=True,
    save_strategy="epoch",
    logging_strategy="steps",
    fp16=True, # for cuda
    push_to_hub=False,
    logging_steps=50,
    eval_steps=50,
    save_steps=50,
)


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mluca-mouchel[0m ([33mlia_epfl[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoi

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.2313,2.76404,21.0208,8.5771,19.5783,19.5261,12.497487
2,3.0971,2.738573,22.9365,8.1887,20.9527,20.9023,14.050251
3,3.0651,2.726497,23.0449,7.9468,20.9181,20.8639,14.286432
4,3.009,2.720764,22.886,7.9175,20.8062,20.7677,14.316583
5,3.0437,2.717869,22.8603,7.9036,20.7757,20.7375,14.331658


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=850, training_loss=3.073619025735294, metrics={'train_runtime': 596.1691, 'train_samples_per_second': 22.695, 'train_steps_per_second': 1.426, 'total_flos': 678351126435840.0, 'train_loss': 3.073619025735294, 'epoch': 5.0})

In [16]:
saved_model = f"models/{model_name.split('/')[-1]}/w_neutral_CCKG/"
trainer.save_model(saved_model)

In [17]:
predictions = trainer.predict(tokenized_dataset["test"])

In [None]:
predictions.metrics

{'test_loss': 3.1586780548095703,
 'test_rouge1': 19.3503,
 'test_rouge2': 3.9895,
 'test_rougeL': 16.4591,
 'test_rougeLsum': 16.4658,
 'test_gen_len': 16.910714285714285,
 'test_runtime': 54.0749,
 'test_samples_per_second': 23.819,
 'test_steps_per_second': 1.498}

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained(saved_model)
tokenizer = T5Tokenizer.from_pretrained(saved_model)

NameError: name 'saved_model' is not defined

In [None]:
def generate_arg(topic, arg_type='supporting'):
    prefix = f"Given the following topic, generate a good {arg_type} argument. Topic="
    inputs = tokenizer(prefix + topic, return_tensors='pt', padding=True, truncation=True)
    outputs = model.generate(**inputs, min_length=25, max_length=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
df_iam = pd.read_csv(os.path.join(data_dir, 'train_iam.tsv'), sep='\t')
df_cckg = pd.read_csv(os.path.join(data_dir, 'train_cckg.tsv'), sep='\t')
topic = df_iam['topic'].sample(1).values[0]
print(topic)

Shouldn't alcohol be forbidden


In [None]:
print(topic.upper())

SHOULDN'T ALCOHOL BE FORBIDDEN


In [None]:
arg_types=['supporting', 'counter', 'neutral']

for arg_type in arg_types:
    print(f"""{arg_type}---{generate_arg("Should alcohol be forbidden", arg_type=arg_type)}""")


supporting---Alcohol is a dangerous drug that can cause serious health problems. [ref]. [ref]. Alcohol is a dangerous drug that can cause serious health problems.
counter---Alcohol is a dangerous drug, and it can cause serious health problems. [ref]. [ref]. Alcohol is a dangerous drug, and it can cause serious health problems.
neutral---The alcoholics are able to make a living by drinking alcohol, and they are able to make a living by consuming alcohol.


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = 'google/flan-t5-base'
model_dir = f"models/{model_name.split('/')[-1]}/w_neutral/"
data_dir = 'data/argumentation'

model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
