In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from generative_model import GenerativeModel, train, test, validate
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from task2_utils import tokenize_df_gen, decode_data, compute_metrics, concat_tag

import sys
import os
sys.path.insert(1, '../')
import data_handler
sys.path.insert(1, '../kp_match')
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Enable the IPython autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Restrict the CUDA devices visible to this process to the GPU with id 1.
# NOTE(review): this only takes effect if it runs before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# Device with ordinal 0 — presumably the single GPU left visible above,
# i.e. physical GPU 1; TODO confirm on the target machine.
device = torch.device(0)

In [4]:
# Load the train, dev (validation) and test splits from the dataset folder
df_train, df_val, df_test = data_handler.load_full_dataset(
    '../dataset/',
    get_train=True,
    get_dev=True,
    get_test=True,
)

In [5]:
# Merge each topic into the 'argument' column of every split, as stated in the paper.
# NOTE(review): input and output column are the same, so re-running this cell
# presumably concatenates the topic a second time — confirm, and restart the
# kernel rather than re-executing it.
df_train, df_val, df_test = (
    data_handler.concatenate_topics(split, input_col='argument', output_col='argument')
    for split in (df_train, df_val, df_test)
)

# Compute baseline

In [4]:
def test_baseline(model_type, device, loss, metrics):
    """ Evaluate a pre-trained model, without any training, on the
    validation set and on a small sample of the test set.
    The validation metrics and some generated phrases are printed;
    nothing is returned.
    Parameters
    ----------
    model_type: string
        name of the model (checkpoint) to load
    device: torch device
        Selected device on which to run the model
        (usually a GPU)
    loss: function
        not used by this function; kept so that
        existing calls keep working
    metrics: array-like
        array of strings containing metrics
        to compute
    """

    # Load model's tokenizer
    # ('google/pegasus-large' is tokenized with the 'google/pegasus-xsum' tokenizer)
    if model_type == 'google/pegasus-large':
        tokenizer = AutoTokenizer.from_pretrained('google/pegasus-xsum')
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_type)

    # Max length for tokenization and generation
    max_length = 100

    # Load data (reloaded here, so the notebook-level frames are not touched)
    _, df_val, df_test = data_handler.load_full_dataset('../dataset/', get_train=False, get_dev=True, get_test=True)

    # Only take few test examples
    df_test = df_test.sample(frac = 0.00027, random_state = 270898)

    # Merge the topics into the 'argument' column, as stated in the paper
    df_val = data_handler.concatenate_topics(df_val, input_col='argument', output_col='argument')
    df_test = data_handler.concatenate_topics(df_test, input_col='argument', output_col='argument')

    # Additional pre-processing step for t5 model
    if model_type in ("t5-small", "t5-base", "t5-large"):
        df_val = concat_tag(df_val, 'argument')
        df_test = concat_tag(df_test, 'argument')

    # Instantiate model and move it on the desired device
    model = GenerativeModel(model_type)
    model.to(device)
    # Evaluation only: same mode switch train_best_model performs before validating
    model.eval()

    # Tokenize data (the test set is tokenized without key points)
    tokenized_val = tokenize_df_gen(df_val, tokenizer, max_length=max_length)
    tokenized_test = tokenize_df_gen(df_test, tokenizer, max_length=max_length, key_points_on=False)

    # Organize data
    seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True, max_length=max_length)
    val_loader = DataLoader(
        tokenized_val,
        collate_fn=seq2seq_data_collator,
        pin_memory=True
    )

    # Validate model
    val_res = validate(model, device, val_loader, max_length=max_length)

    dec_pred, dec_exp = decode_data(val_res['predicted'], val_res['labels'], tokenizer)

    # Compute metrics
    val_metrics = compute_metrics(dec_pred, dec_exp, metrics)
    print(f"Validation results with model {model_type}:")
    print(val_metrics)

    # Pick some validation phrases to display.
    # dec_pred is a plain list and .iloc is positional, so row *positions* are
    # needed rather than index labels: sampling the frame with a fresh 0..n-1
    # index selects the same rows and yields their positions.
    print(f"Some validation phrases generated using {model_type}:")
    sampled_positions = df_val.reset_index(drop=True).sample(frac = 0.009, random_state = 270898).index
    for pos in sampled_positions:
        print(f"Argument: {df_val['argument'].iloc[pos]} \nGenerated key-point: {dec_pred[pos]}\n\n")

    print("----------------- TEST -----------------")

    test_loader = DataLoader(
        tokenized_test, # test dataset
        collate_fn=seq2seq_data_collator, # data collator
        pin_memory=True
    )

    # Test model
    test_res = test(model, device, test_loader, max_length=max_length)

    # Show test phrases (no expected key points are decoded for the test set)
    dec_pred = tokenizer.batch_decode(test_res['predicted'].type(torch.IntTensor).cpu().data.numpy(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(f"Some test phrases generated using {model_type}:")
    for i, generated in enumerate(dec_pred):
        print(f"Argument: {df_test['argument'].iloc[i]} \nGenerated key-point: {generated}\n\n")


# Pegasus Xsum baseline

In [21]:
# Baseline (no training) for the google/pegasus-xsum checkpoint, scored with ROUGE
model_type = 'google/pegasus-xsum'
test_baseline(model_type, device, loss=None, metrics=['rouge'])

Validation results with model google/pegasus-xsum:
{'rouge': {'rouge1': {'precision': 0.06213652920676828, 'recall': 0.10197494484179298, 'fmeasure': 0.07226329623893364}, 'rouge2': {'precision': 0.005004212838704235, 'recall': 0.009373921670117321, 'fmeasure': 0.0061882023262247714}, 'rougeL': {'precision': 0.057589484741861985, 'recall': 0.09315528473137195, 'fmeasure': 0.06649627733578528}, 'rougeLsum': {'precision': 0.05764041082741468, 'recall': 0.09305865983855138, 'fmeasure': 0.06653920905986996}}}
Some validation phrases generated using google/pegasus-xsum:
Argument: The USA is a good country to live in It is an excellent country to live in due to its economy and possibilities of having a house, a car, a good job 
Generated key-point: What is your favourite country to live in?


Argument: Routine child vaccinations should be mandatory They are against the freedom of parents to choose how to care for their children 
Generated key-point: Thousands of people have taken to the stre

# T5 large baseline

In [8]:
# Baseline (no training) for the t5-large checkpoint, scored with ROUGE
model_type = "t5-large"
test_baseline(model_type, device, loss=None, metrics=['rouge'])

Validation results with model t5-large:
{'rouge': {'rouge1': {'precision': 0.10961694906758337, 'recall': 0.295346156963005, 'fmeasure': 0.1545275636508756}, 'rouge2': {'precision': 0.034345113662356354, 'recall': 0.08198110766045547, 'fmeasure': 0.04687169373786172}, 'rougeL': {'precision': 0.10342556295835226, 'recall': 0.27779973649538947, 'fmeasure': 0.1456140366802304}, 'rougeLsum': {'precision': 0.10342266642318665, 'recall': 0.2771198004893661, 'fmeasure': 0.14555087425763585}}}
Some validation phrases generated using t5-large:
Argument: summarize: The USA is a good country to live in It is an excellent country to live in due to its economy and possibilities of having a house, a car, a good job 
Generated key-point: the usa is a good country to live in due to its economy and possibilities of having a house, a car, a good job. the


Argument: summarize: Routine child vaccinations should be mandatory They are against the freedom of parents to choose how to care for their children 

# Pegasus large baseline

In [6]:
# Baseline (no training) for the google/pegasus-large checkpoint, scored with ROUGE
model_type = 'google/pegasus-large'
test_baseline(model_type, device, loss=None, metrics=['rouge'])

Validation results with model google/pegasus-large:
{'rouge': {'rouge1': {'precision': 0.10820963088972507, 'recall': 0.3143958999937265, 'fmeasure': 0.15573026472146867}, 'rouge2': {'precision': 0.03366233124541992, 'recall': 0.08740654474350124, 'fmeasure': 0.04722381748122127}, 'rougeL': {'precision': 0.10105335298347559, 'recall': 0.2929210257335261, 'fmeasure': 0.1451897431166692}, 'rougeLsum': {'precision': 0.10055656289123927, 'recall': 0.2912791867954916, 'fmeasure': 0.1448048784387525}}}
Some validation phrases generated using google/pegasus-large:
Argument: The USA is a good country to live in It is an excellent country to live in due to its economy and possibilities of having a house, a car, a good job 
Generated key-point: The USA is a good country to live in It is an excellent country to live in due to its economy and possibilities of having a house, a car, a good


Argument: Routine child vaccinations should be mandatory They are against the freedom of parents to choose h

# Retrain

In [6]:
from datetime import datetime
def show_time():
    """ Print the current wall-clock time as HH:MM:SS
    """
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))

In [7]:
def train_best_model(config, df_train, df_val, df_test, max_length, loss, metrics, device, save_val=""):
    """ Train a model and evaluate it on the
    Validation and Test sets, computing
    the metrics on the Training and Validation data
    and printing the phrases generated for the Test data
    Parameters
    ----------
    config: dict
        contains parameters and
        hyper-parameters useful to train
        the model; the keys read are 'model_type',
        'batch_size', 'epochs', 'optimizer', 'lr', 'eps',
        'weight_decay' and 'warmup_steps'
    df_train: pd.DataFrame
        Training data
    df_val: pd.DataFrame
        Validation Data
    df_test: pd.DataFrame
        Test Data
    max_length: int
        Max number of tokens
    loss: function
        function which computes model's loss
        (forwarded to the training routine)
    metrics: array-like
        array of strings containing metrics
        to compute
    device: torch device
        Selected device on which to train and evaluate
        the model (usually a GPU)
    save_val: string default=""
        if the string is not empty
        the generated validation phrases will
        be saved to the path contained in the string,
        otherwise they won't be saved
    Returns
    -------
    train_scores: list
        Metric scores on the Training data, one entry per
        element of the training results (presumably one per epoch)
    validation_scores: dict
        Metric scores on the Validation data
    Raises
    ------
    ValueError
        if config['optimizer'] is not a supported optimizer name
    """

    # Fail fast on an unsupported optimizer: otherwise `optimizer` would be
    # unbound and the run would crash only after tokenizing the data and
    # loading the model.
    if config['optimizer'] != 'adamW':
        raise ValueError(f"Unsupported optimizer '{config['optimizer']}': only 'adamW' is available")

    # Load the best model's tokenizer
    # ('google/pegasus-large' is tokenized with the 'google/pegasus-xsum' tokenizer)
    if config['model_type'] == 'google/pegasus-large':
        tokenizer = AutoTokenizer.from_pretrained('google/pegasus-xsum')
    else:
        tokenizer = AutoTokenizer.from_pretrained(config['model_type'])

    # Tokenize data (the test set is tokenized without key points)
    tokenized_tr = tokenize_df_gen(df_train, tokenizer, max_length=max_length)
    tokenized_val = tokenize_df_gen(df_val, tokenizer, max_length=max_length)
    tokenized_test = tokenize_df_gen(df_test, tokenizer, max_length=max_length, key_points_on=False)

    # Organize data: only the training loader is batched and shuffled
    train_loader = DataLoader(tokenized_tr, batch_size = config['batch_size'], shuffle = True, pin_memory=True)
    val_loader = DataLoader(tokenized_val, pin_memory=True)
    test_loader = DataLoader(tokenized_test, pin_memory=True)

    # Load model and move it on the desired device
    model = GenerativeModel(config['model_type'])
    model.to(device)
    model.train()

    # One scheduler step per training batch, over all epochs
    total_steps = len(train_loader) * config['epochs']

    # Create optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr = config['lr'],
                                  eps = config['eps'],
                                  weight_decay = config['weight_decay'])

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps = config['warmup_steps'],
                                        num_training_steps = total_steps)

    print("Starting Training!")
    # Train best model
    show_time()
    train_res = train(model, device, train_loader, optimizer, config['epochs'], loss, scheduler, max_length, verbose=False)
    print("Training ended!")
    show_time()

    # Compute metrics for every entry of the training results
    train_scores = []
    for predicted, expected in zip(train_res['predicted'], train_res['labels']):
        dec_pred, dec_exp = decode_data(predicted, expected, tokenizer)
        train_scores.append(compute_metrics(dec_pred, dec_exp, metrics))

    print("Train performances")
    print(train_scores)
    print("\n")

    model.eval()

    # Perform evaluation on the validation set
    val_res = validate(model, device, val_loader, max_length=max_length)

    # Compute metrics
    dec_pred, dec_exp = decode_data(val_res['predicted'], val_res['labels'], tokenizer)
    validation_scores = compute_metrics(dec_pred, dec_exp, metrics)

    print("Validation performances")
    print(validation_scores)
    print("\n")

    # Save generated validation phrases next to the expected ones
    if save_val != "":
        df = pd.DataFrame()
        df['predictions'] = dec_pred
        df['expected'] = dec_exp
        df.to_csv(save_val, sep='#')

    # Generate key points for the test set
    test_res = test(model, device, test_loader, max_length=max_length)

    # Print test generated phrases
    dec_pred = tokenizer.batch_decode(test_res['predicted'].type(torch.IntTensor).cpu().data.numpy(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(f"# Some test phrases generated using {config['model_type']}:")
    for i, generated in enumerate(dec_pred):
        print(f"Argument: {df_test['argument'].iloc[i]} \nGenerated key-point: {generated}\n\n")

    return train_scores, validation_scores

In [8]:
def compute_mean_var(res, set_name):
    """ Compute and print mean and variance of the ROUGE-1 F-measure
    over a collection of runs
    Parameters
    ----------
    res: array-like
        array containing the scores, one entry per run.
        Each entry is either a metrics dict
        (``{'rouge': {'rouge1': {'fmeasure': ...}}, ...}``)
        or a sequence of such dicts, in which case
        the last one is used
    set_name: string
        name of set of data
    """

    scores = []

    for entry in res:
        # A sequence of metric dicts is reduced to its last element. Dicts are
        # recognised by type and never by length, so a metrics dict holding
        # several metrics is not mistaken for a sequence.
        if not isinstance(entry, dict):
            entry = entry[-1]
        scores.append(entry['rouge']['rouge1']['fmeasure'])

    vals = np.array(scores)
    print(f"{set_name} Mean: {vals.mean()} and Variance: {vals.var()}")

### Pegasus base (google/pegasus-xsum)

In [11]:
# Training hyper-parameters for the google/pegasus-xsum run
config = {
    'model_type': 'google/pegasus-xsum',
    'epochs': 1,
    'lr': 4e-4,
    'eps': 1e-6,
    'weight_decay': 1e-8,
    'warmup_steps': 1e2,
    'batch_size': 8,
    'optimizer': 'adamW',
}

In [9]:
# Keep a handle on the complete test split, so that re-running this cell always
# samples from the full data instead of shrinking an already-sampled frame
# (a second 0.018% sample of a handful of rows leaves nothing to test on).
if 'df_test_full' not in globals():
    df_test_full = df_test
# Only a few test examples are generated and printed after each training run
df_test = df_test_full.sample(frac = 0.00018, random_state = 270898)

In [12]:
# Train five times with the same configuration, collecting train and validation
# scores; only the last run saves its generated validation phrases to CSV.
tr_metrics, val_metrics = [], []
for i in range(5):
    save_res = 'pegasus_val.csv' if i == 4 else ""
    tr, val = train_best_model(
        config, df_train, df_val, df_test,
        max_length=100, loss=None, metrics=['rouge'],
        device=device, save_val=save_res,
    )
    tr_metrics.append(tr)
    val_metrics.append(val)

Starting Training!
Current Time = 14:35:53
Training ended!
Current Time = 14:53:07
Train performances
[{'rouge': {'rouge1': {'precision': 0.30724058780341995, 'recall': 0.3620097093949123, 'fmeasure': 0.31785761313826877}, 'rouge2': {'precision': 0.2083250475335769, 'recall': 0.2514903167492144, 'fmeasure': 0.21942628878586368}, 'rougeL': {'precision': 0.3007181961678671, 'recall': 0.35445767148573204, 'fmeasure': 0.31116840764530984}, 'rougeLsum': {'precision': 0.3004019915214863, 'recall': 0.35429599037417026, 'fmeasure': 0.31093671758658303}}}]


Validation performances
{'rouge': {'rouge1': {'precision': 0.1591784435134822, 'recall': 0.18309427086601043, 'fmeasure': 0.1642876361800807}, 'rouge2': {'precision': 0.023830675664914786, 'recall': 0.025459368530020697, 'fmeasure': 0.023746484694542977}, 'rougeL': {'precision': 0.13371991134339328, 'recall': 0.14885487901792285, 'fmeasure': 0.1357002470673943}, 'rougeLsum': {'precision': 0.13366777713205188, 'recall': 0.14852742617688303, 

In [55]:
# Mean and variance of the ROUGE-1 F-measure over the repeated runs
set_name = "Train"
compute_mean_var(tr_metrics, set_name)
set_name = "Validation"
compute_mean_var(val_metrics, set_name)

Train Mean: 0.31595160969417585 and Variance: 8.062557959681058e-05
Validation Mean: 0.1601169643915677 and Variance: 5.400468650616418e-05


### T5 Large

In [9]:
# Training hyper-parameters for the t5-large run
config = {
    'model_type': 't5-large',
    'epochs': 1,
    'lr': 2e-5,
    'eps': 1e-8,
    'weight_decay': 0,
    'warmup_steps': 1e2,
    'batch_size': 4,
    'optimizer': 'adamW',
}


# Only for T5: build the tagged version of every split with concat_tag.
# NOTE(review): the training cell of this section is called with
# df_train / df_val / df_test, not with df_tr / df_v / df_te. Unless concat_tag
# edits its argument in place, T5 is trained on untagged inputs — confirm which
# frames are intended. Also note that df_te is derived before df_test is
# down-sampled in the next cell.
df_tr = concat_tag(df_train, 'argument')
df_v = concat_tag(df_val, 'argument')
df_te = concat_tag(df_test, 'argument')

In [10]:
# Keep a handle on the complete test split, so that re-running this cell always
# samples from the full data instead of shrinking an already-sampled frame
# (a second 0.018% sample of a handful of rows leaves nothing to test on).
if 'df_test_full' not in globals():
    df_test_full = df_test
# Only a few test examples are generated and printed after each training run
df_test = df_test_full.sample(frac = 0.00018, random_state = 270898)

In [11]:
# Train five times with the same configuration, collecting train and validation
# scores; only the last run saves its generated validation phrases to CSV.
tr_metrics, val_metrics = [], []
for i in range(5):
    save_res = 't5_large_val.csv' if i == 4 else ""
    tr, val = train_best_model(
        config, df_train, df_val, df_test,
        max_length=100, loss=None, metrics=['rouge'],
        device=device, save_val=save_res,
    )
    tr_metrics.append(tr)
    val_metrics.append(val)

Starting Training!
Current Time = 17:55:30
Training ended!
Current Time = 18:40:15
Train performances
[{'rouge': {'rouge1': {'precision': 0.13039234806901873, 'recall': 0.2772629269643505, 'fmeasure': 0.16647823191251548}, 'rouge2': {'precision': 0.04069219795437884, 'recall': 0.09150309764297974, 'fmeasure': 0.052229193176196106}, 'rougeL': {'precision': 0.12093662889182433, 'recall': 0.2586256273467172, 'fmeasure': 0.1545348879978713}, 'rougeLsum': {'precision': 0.12096531020286447, 'recall': 0.2586143470270639, 'fmeasure': 0.15460781518492983}}}]


Validation performances
{'rouge': {'rouge1': {'precision': 0.1387286630127928, 'recall': 0.2662663448668885, 'fmeasure': 0.17147911701220686}, 'rouge2': {'precision': 0.044379200193840965, 'recall': 0.07188973717506326, 'fmeasure': 0.051012310368146185}, 'rougeL': {'precision': 0.13318115725233315, 'recall': 0.2531902879728971, 'fmeasure': 0.1639344075837387}, 'rougeLsum': {'precision': 0.13250231357363396, 'recall': 0.2522886408076629, '

In [7]:
# Mean and variance of the ROUGE-1 F-measure over the repeated runs
set_name = "Train"
compute_mean_var(tr_metrics, set_name)
set_name = "Validation"
compute_mean_var(val_metrics, set_name)

Train Mean: 0.16612830272796644 and Variance: 4.306871295187838e-06
Validation Mean: 0.17183199859090229 and Variance: 4.216180926595218e-06


### Pegasus Large

In [10]:
# Training hyper-parameters for the google/pegasus-large run
config = {
    'model_type': 'google/pegasus-large',
    'epochs': 1,
    'lr': 5e-4,
    'eps': 1e-8,
    'weight_decay': 1e-6,
    'warmup_steps': 1e3,
    'batch_size': 8,
    'optimizer': 'adamW',
}

In [11]:
# Train five times with the same configuration, collecting train and validation
# scores; only the last run saves its generated validation phrases to CSV.
tr_metrics, val_metrics = [], []
for i in range(5):
    save_res = 'pegasus_large_val.csv' if i == 4 else ""
    tr, val = train_best_model(
        config, df_train, df_val, df_test,
        max_length=100, loss=None, metrics=['rouge'],
        device=device, save_val=save_res,
    )
    tr_metrics.append(tr)
    val_metrics.append(val)

Starting Training!
Current Time = 20:59:19
Training ended!
Current Time = 21:16:59
Train performances
[{'rouge': {'rouge1': {'precision': 0.18183926160919697, 'recall': 0.23309780080863485, 'fmeasure': 0.18514979968414852}, 'rouge2': {'precision': 0.09583374205499902, 'recall': 0.12766603652619835, 'fmeasure': 0.10276293387582418}, 'rougeL': {'precision': 0.1755627003410492, 'recall': 0.2241461695955993, 'fmeasure': 0.1784404415138791}, 'rougeLsum': {'precision': 0.17570379543193, 'recall': 0.22407952524999306, 'fmeasure': 0.17848723751034817}}}]


Validation performances
{'rouge': {'rouge1': {'precision': 0.15293951054052973, 'recall': 0.18594011413304923, 'fmeasure': 0.15968759432984228}, 'rouge2': {'precision': 0.03507326887082321, 'recall': 0.03795505521048996, 'fmeasure': 0.03378925066987077}, 'rougeL': {'precision': 0.14019070757160124, 'recall': 0.16872666886253876, 'fmeasure': 0.14533371129964887}, 'rougeLsum': {'precision': 0.14031292677792073, 'recall': 0.16879261036869775, '

In [12]:
# Mean and variance of the ROUGE-1 F-measure over the repeated runs
set_name = "Train"
compute_mean_var(tr_metrics, set_name)
set_name = "Validation"
compute_mean_var(val_metrics, set_name)

Train Mean: 0.18497260165827967 and Variance: 2.05267845038774e-05
Validation Mean: 0.14080026448339333 and Variance: 0.0002468332494546151
