In [3]:
import torch
from challenge_metrics import load_kpm_data
from siamese_network import SiameseNetwork, train, test
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from matching_utils import compute_metrics, extract_challenge_metrics
from torch.utils.data import DataLoader
import numpy as np
import os
from datetime import datetime

import sys
sys.path.insert(1, "../")
import data_handler

import warnings
warnings.filterwarnings('ignore')

In [2]:
%load_ext autoreload
%autoreload 2

In [8]:
# Restrict this process to the first physical GPU. The variable is only
# honoured when it is set before CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use GPU 0 when CUDA is usable, otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Compute baseline

In [4]:
# Load the splits needed for the baselines: the first element returned by the
# loader is discarded, only the dev and test frames are kept.
_, df_val, df_test = data_handler.load(
    path="../dataset/",
    filename_test="test.csv",
    filename_dev="dev.csv",
    sep_char='#',
)

# Challenge-format data for the same two splits
val_kpm_data = load_kpm_data("../dataset/", subset="dev")
test_kpm_data = load_kpm_data("../dataset/", subset="test")

# As stated in the paper, topics and key points are concatenated
df_test = data_handler.concatenate_topics(df_test)
df_val = data_handler.concatenate_topics(df_val)

# Shared evaluation settings
max_length = 100
metrics = ['accuracy', 'precision', 'recall', 'f1']
loss = torch.nn.MSELoss()


ֿ** loading task data:

ֿ** loading task data:


In [5]:
def test_baseline(model_type, device, loss, df_val, df_test, val_kpm_data, test_kpm_data, metrics, max_length=100):
    """ Evaluate a pre-trained model, without any training step,
    on the validation and test sets and print the resulting scores
    Parameters
    ----------
    model_type: string
        name of the pre-trained checkpoint, used to load
        both the tokenizer and the encoder
    device: torch device
        Device the model is moved to and evaluated on
        (usually a GPU)
    loss: function
        function which computes model's loss
    df_val: pd.DataFrame
        Validation Data
    df_test: pd.DataFrame
        Test Data
    val_kpm_data: tuple
        Validation data from challenge
    test_kpm_data: tuple
        Test data from challenge
    metrics: array-like
        array of strings containing metrics
        to compute
    max_length: int default=100
        Max number of tokens, forwarded to the tokenization step.
        It used to be read from a notebook-level global; the default
        matches the value set in the data preparation cells
    """

    # Load the tokenizer that matches the requested checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_type, do_lower_case=True)

    # Tokenize data
    columns_list = ['argument', 'key_points', 'label']
    tokenized_test = data_handler.tokenize_df(df_test[columns_list], tokenizer, max_length=max_length)
    tokenized_val = data_handler.tokenize_df(df_val[columns_list], tokenizer, max_length=max_length)

    test_loader = DataLoader(tokenized_test, pin_memory=True)
    val_loader = DataLoader(tokenized_val, pin_memory=True)

    # Load model and move it on the desired device
    model = SiameseNetwork(model_type=AutoModel.from_pretrained(model_type))
    model.to(device)
    print("Model successfully loaded!\n")

    # Put model in evaluation mode
    model.eval()

    def _report(split_name, loader, kpm_data):
        # Run the model on one split, then print its classification metrics
        # followed by the challenge metrics
        res = test(model, device, loader, loss)

        split_metrics = compute_metrics(res['predicted'].T, res['labels'].T, metrics)
        print(f"{split_name} results with model {model_type}:")
        print(split_metrics)

        challenge_metrics = extract_challenge_metrics(res['predicted'].T, kpm_data[2], kpm_data[0], kpm_data[1])
        print(challenge_metrics)

    _report("Validation", val_loader, val_kpm_data)
    _report("Test", test_loader, test_kpm_data)

BERT baseline:

In [6]:
# Baseline scores for the BERT checkpoint
test_baseline(
    model_type='bert-base-uncased',
    device=device,
    loss=loss,
    df_val=df_val,
    df_test=df_test,
    val_kpm_data=val_kpm_data,
    test_kpm_data=test_kpm_data,
    metrics=metrics,
)

Model successfully loaded!

Validation results with model bert-base-uncased:
{'accuracy': 0.21341816078658185, 'precision': 0.21341816078658185, 'recall': 1.0, 'f1': 0.3517635843660629}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.2299602651825317 ; mAP relaxed = 0.2299602651825317
(0.2299602651825317, 0.2299602651825317)
Test results with model bert-base-uncased:
{'accuracy': 0.16112084063047286, 'precision': 0.16112084063047286, 'recall': 1.0, 'f1': 0.27752639517345395}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.15331714855933892 ; mAP relaxed = 0.15331714855933892
(0.15331714855933892, 0.15331714855933892)


RoBERTa-base baseline:

In [11]:
# Baseline scores for the RoBERTa-base checkpoint
test_baseline(
    model_type='roberta-base',
    device=device,
    loss=loss,
    df_val=df_val,
    df_test=df_test,
    val_kpm_data=val_kpm_data,
    test_kpm_data=test_kpm_data,
    metrics=metrics,
)

Model successfully loaded!

Validation results with model roberta-base:
{'accuracy': 0.21341816078658185, 'precision': 0.21341816078658185, 'recall': 1.0, 'f1': 0.3517635843660629}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.12368955349553747 ; mAP relaxed = 0.12368955349553747
None
Test results with model roberta-base:
{'accuracy': 0.16112084063047286, 'precision': 0.16112084063047286, 'recall': 1.0, 'f1': 0.27752639517345395}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.059267577712564024 ; mAP relaxed = 0.059267577712564024
None


RoBERTa-Large baseline:

In [7]:
# Baseline scores for the RoBERTa-large checkpoint
test_baseline(
    model_type='roberta-large',
    device=device,
    loss=loss,
    df_val=df_val,
    df_test=df_test,
    val_kpm_data=val_kpm_data,
    test_kpm_data=test_kpm_data,
    metrics=metrics,
)

Model successfully loaded!

Validation results with model roberta-large:
{'accuracy': 0.21341816078658185, 'precision': 0.21341816078658185, 'recall': 1.0, 'f1': 0.3517635843660629}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.07408259804980344 ; mAP relaxed = 0.07408259804980344
None
Test results with model roberta-large:
{'accuracy': 0.16112084063047286, 'precision': 0.16112084063047286, 'recall': 1.0, 'f1': 0.27752639517345395}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.0453987591811596 ; mAP relaxed = 0.0453987591811596
None


# Re-train Best Models

In [4]:
# Load all three splits for re-training
df_train, df_val, df_test = data_handler.load(
    path="../dataset/",
    filename_train="train.csv",
    filename_test="test.csv",
    filename_dev="dev.csv",
    sep_char='#',
)

# Challenge-format data for each split
train_kpm_data = load_kpm_data("../dataset/", subset="train")
val_kpm_data = load_kpm_data("../dataset/", subset="dev")
test_kpm_data = load_kpm_data("../dataset/", subset="test")

# As stated in the paper, topics and key points are concatenated
df_train = data_handler.concatenate_topics(df_train)
df_val = data_handler.concatenate_topics(df_val)
df_test = data_handler.concatenate_topics(df_test)

# Shared training / evaluation settings
max_length = 100
metrics = ['accuracy', 'precision', 'recall', 'f1']
loss = torch.nn.MSELoss()


ֿ** loading task data:

ֿ** loading task data:

ֿ** loading task data:


In [5]:
def train_best_model(config, df_train, df_val, df_test, max_length, loss, metrics, train_kpm_data, val_kpm_data, test_kpm_data, device, save_model=""):
    """ Train model and evaluate it on
    Validation and Test sets, computing
    the metrics for each dataset
    Parameters
    ----------
    config: dict
        contains parameters and
        hyper-parameters useful to train
        the model. Keys read here: 'model_type',
        'batch_size', 'epochs', 'optimizer', 'lr',
        'eps', 'weight_decay', 'warmup_steps'
    df_train: pd.DataFrame
        Training data
    df_val: pd.DataFrame
        Validation Data
    df_test: pd.DataFrame
        Test Data
    max_length: int
        Max number of tokens
    loss: function
        function which computes model's loss
    metrics: array-like
        array of strings containing metrics
        to compute
    train_kpm_data: tuple
        Training data from challenge
    val_kpm_data: tuple
        Validation data from challenge
    test_kpm_data: tuple
        Test data from challenge
    device: torch device
        Device on which the model is trained and evaluated
        (usually a GPU)
    save_model: string default=""
        if the string is not empty the model will
        be saved to the path contained in the string,
        otherwise it won't be saved
    Returns
    -------
    train_challenge_metrics: array-like
        Scores on the Training data of the challenge metrics,
        one entry per epoch
    val_challenge_metrics: tuple
        Scores on the Validation data of the challenge metrics
    test_challenge_metrics: tuple
        Scores on the Test data of the challenge metrics
    Raises
    ------
    ValueError
        if config['optimizer'] is not 'adamW', the only
        optimizer this function knows how to build
    """

    # Fail fast on an unsupported optimizer: previously `optimizer` was simply
    # left undefined and the function crashed when building the scheduler,
    # after the tokenizer and the model had already been loaded.
    if config['optimizer'] != 'adamW':
        raise ValueError(f"Unsupported optimizer {config['optimizer']!r}: only 'adamW' is available")

    # Load the model's tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config['model_type'], do_lower_case=True)

    # Tokenize data
    columns_list = ['argument', 'key_points', 'label']
    tokenized_tr = data_handler.tokenize_df(df_train[columns_list], tokenizer, max_length=max_length)
    tokenized_val = data_handler.tokenize_df(df_val[columns_list], tokenizer, max_length=max_length)
    tokenized_test = data_handler.tokenize_df(df_test[columns_list], tokenizer, max_length=max_length)

    # NOTE(review): no shuffling is requested for the training loader; the
    # training predictions are later scored against `train_kpm_data`, which
    # presumably relies on the original row order - confirm before changing.
    train_loader = DataLoader(tokenized_tr, batch_size = config['batch_size'], pin_memory=True)
    val_loader = DataLoader(tokenized_val, pin_memory=True)
    test_loader = DataLoader(tokenized_test, pin_memory=True)

    # Create model and move it to the desired device
    model = SiameseNetwork(model_type=AutoModel.from_pretrained(config['model_type']))
    model.to(device)
    model.train()

    # One scheduler step per batch, for every epoch
    total_steps = len(train_loader) * config['epochs']

    # Create optimizer and scheduler with the desired hyper-parameters
    optimizer = torch.optim.AdamW(model.parameters(),
                  lr = config['lr'],
                  eps = config['eps'],
                  weight_decay = config['weight_decay'])

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps = config['warmup_steps'],
                                        num_training_steps = total_steps)

    print("Starting Training!")
    # Train best model
    show_time()
    train_res = train(model, device, train_loader, loss, optimizer, config['epochs'], scheduler, verbose=False)
    print("Training ended!")
    show_time()

    # Compute metrics for each epoch
    train_metrics = [0] * config['epochs']
    train_challenge_metrics = [0] * config['epochs']

    # NOTE(review): the same `train_res` arrays are scored on every iteration
    # (they are never indexed by epoch), so with more than one epoch each
    # entry is computed from identical inputs - confirm the layout that
    # `train` returns before relying on per-epoch values.
    for i in range(config['epochs']):
        train_metrics[i] = compute_metrics(train_res['predicted'], train_res['labels'], metrics)
        print(f"Epoch {i+1}/{config['epochs']} results:\n- Train Metrics: {train_metrics[i]}\n- Train Challenge Metrics: ")
        train_challenge_metrics[i] = extract_challenge_metrics(train_res['predicted'], train_kpm_data[2], train_kpm_data[0], train_kpm_data[1])

    # Validate model
    model.eval()
    val_res = test(model, device, val_loader, loss)
    val_metrics = compute_metrics(val_res['predicted'].T, val_res['labels'].T, metrics)
    print(f"Validation results {config['model_type']}:")
    print(val_metrics)

    val_challenge_metrics = extract_challenge_metrics(val_res['predicted'].T, val_kpm_data[2], val_kpm_data[0], val_kpm_data[1])
    print(val_challenge_metrics)

    # Test model
    model.eval()
    test_res = test(model, device, test_loader, loss)

    test_metrics = compute_metrics(test_res['predicted'].T, test_res['labels'].T, metrics)
    print(f"Test results with model {config['model_type']}:")
    print(test_metrics)

    test_challenge_metrics = extract_challenge_metrics(test_res['predicted'].T, test_kpm_data[2], test_kpm_data[0], test_kpm_data[1])
    print(test_challenge_metrics)

    # Persist the trained weights only when a destination path was given
    if save_model:
        torch.save(model.state_dict(), save_model)

    return train_challenge_metrics, val_challenge_metrics, test_challenge_metrics

In [6]:
def show_time():
    """ Print the current local time formatted as HH:MM:SS
    """
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))

In [44]:
def compute_mean_var(arr, set_name):
    """ Print the mean and the variance of the leading score of each run
    Parameters
    ----------
    arr: array-like
        scores collected over the runs; when it converts to a
        3-dimensional array the element [0][0] of each entry is used,
        otherwise the element [0]
    set_name: string
        name of set of data, used as prefix of the printed line
    """
    scores = np.array(arr)

    # Choose how to reach the single value of an entry from the array rank
    if scores.ndim == 3:
        leading = [entry[0][0] for entry in scores]
    else:
        leading = [entry[0] for entry in scores]

    leading = np.array(leading)
    print(f"{set_name} Mean: {leading.mean()} and Variance: {leading.var()}")

### RoBERTa-large

In [7]:
# Best hyper-parameter configuration for roberta-large
config = {
    'model_type': 'roberta-large',
    'epochs': 1,
    'lr': 6e-6,
    'eps': 1e-8,
    'weight_decay': 1e-2,
    'warmup_steps': 0,
    'batch_size': 8,
    'optimizer': 'adamW',
}

In [8]:
# Challenge scores collected over the repeated runs, one list per split
tr_metrics, val_metrics, test_metrics = [], [], []

# Train and evaluate the same configuration five times
for i in range(5):
    tr, val, ts = train_best_model(
        config, df_train, df_val, df_test, max_length, loss, metrics,
        train_kpm_data, val_kpm_data, test_kpm_data, device,
    )
    tr_metrics += [tr]
    val_metrics += [val]
    test_metrics += [ts]

Starting Training!
Current Time = 22:17:26
Training ended!
Current Time = 22:36:04
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8995880785073903, 'precision': 0.8234772324068599, 'recall': 0.653755868544601, 'f1': 0.7288667887987438}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7927716645759911 ; mAP relaxed = 0.7927716645759911
Validation results roberta-large:
{'accuracy': 0.8525159051474841, 'precision': 0.6565934065934066, 'recall': 0.6476964769647696, 'f1': 0.6521145975443383}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.7937510676375459 ; mAP relaxed = 0.7937510676375459
(0.7937510676375459, 0.7937510676375459)
Test results with model roberta-large:
{'accuracy': 0.7189141856392294, 'precision': 0.34652725914861837, 'recall': 0.8405797101449275, 'f1': 0.4907456372289794}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6953469539396201 ; mAP relaxed = 0.69534695

Starting Training!
Current Time = 22:41:17
Training ended!
Current Time = 22:59:57
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8954204022292221, 'precision': 0.8124256837098692, 'recall': 0.6415492957746479, 'f1': 0.7169464847848899}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7723323801350089 ; mAP relaxed = 0.7723323801350089
Validation results roberta-large:
{'accuracy': 0.8721804511278195, 'precision': 0.7176470588235294, 'recall': 0.6612466124661247, 'f1': 0.6882933709449929}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.8017381088936288 ; mAP relaxed = 0.8017381088936288
(0.8017381088936288, 0.8017381088936288)
Test results with model roberta-large:
{'accuracy': 0.7965557501459428, 'precision': 0.4298160696999032, 'recall': 0.8043478260869565, 'f1': 0.5602523659305995}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7978766740951682 ; mAP relaxed = 0.79787667

Starting Training!
Current Time = 23:05:23
Training ended!
Current Time = 23:24:01
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8958565543978677, 'precision': 0.80709921443119, 'recall': 0.6511737089201878, 'f1': 0.7208003118097961}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7814010489066355 ; mAP relaxed = 0.7814010489066355
Validation results roberta-large:
{'accuracy': 0.8689994216310005, 'precision': 0.699859747545582, 'recall': 0.6761517615176151, 'f1': 0.687801516195727}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.8143255401800926 ; mAP relaxed = 0.8143255401800926
(0.8143255401800926, 0.8143255401800926)
Test results with model roberta-large:
{'accuracy': 0.8289550496205488, 'precision': 0.4819915254237288, 'recall': 0.8242753623188406, 'f1': 0.6082887700534759}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.8599426769174622 ; mAP relaxed = 0.859942676917

Starting Training!
Current Time = 23:29:16
Training ended!
Current Time = 23:47:55
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.894935788708505, 'precision': 0.8111243307555027, 'recall': 0.6401408450704226, 'f1': 0.7155602204145894}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7742477140948688 ; mAP relaxed = 0.7742477140948688
Validation results roberta-large:
{'accuracy': 0.8765182186234818, 'precision': 0.7345399698340875, 'recall': 0.6598915989159891, 'f1': 0.6952177016416845}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.8156667166049376 ; mAP relaxed = 0.8156667166049376
(0.8156667166049376, 0.8156667166049376)
Test results with model roberta-large:
{'accuracy': 0.822533566841798, 'precision': 0.46963123644251625, 'recall': 0.7844202898550725, 'f1': 0.5875169606512891}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7351702272042945 ; mAP relaxed = 0.735170227

Starting Training!
Current Time = 23:53:19
Training ended!
Current Time = 00:11:57
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8995880785073903, 'precision': 0.828134373125375, 'recall': 0.6481220657276995, 'f1': 0.727153015538583}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7921442889527963 ; mAP relaxed = 0.7921442889527963
Validation results roberta-large:
{'accuracy': 0.8658183921341817, 'precision': 0.686141304347826, 'recall': 0.6842818428184282, 'f1': 0.6852103120759837}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.8104217379018902 ; mAP relaxed = 0.8104217379018902
(0.8104217379018902, 0.8104217379018902)
Test results with model roberta-large:
{'accuracy': 0.7775831873905429, 'precision': 0.40625, 'recall': 0.8242753623188406, 'f1': 0.5442583732057417}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7951464153900786 ; mAP relaxed = 0.7951464153900786
(0.79

In [50]:
# Mean and variance of the challenge score over the runs, per split
set_name = "Train"
compute_mean_var(tr_metrics, set_name)

set_name = "Validation"
compute_mean_var(val_metrics, set_name)

set_name = "Test"
compute_mean_var(test_metrics, set_name)

Train Mean: 0.7825794193330602 and Variance: 7.423525505979528e-05
Validation Mean: 0.807180634243619 and Variance: 6.870847380400754e-05
Test Mean: 0.7647013518721679 and Variance: 0.003345143832373689


### RoBERTa-base

In [56]:
# Best hyper-parameter configuration for roberta-base
config = {
    'model_type': 'roberta-base',
    'epochs': 1,
    'lr': 3e-5,
    'eps': 1e-6,
    'weight_decay': 0,
    'warmup_steps': 0,
    'batch_size': 16,
    'optimizer': 'adamW',
}

In [57]:
# Challenge scores collected over the repeated runs, one list per split
tr_metrics_roberta, val_metrics_roberta, test_metrics_roberta = [], [], []

# Train and evaluate the same configuration five times
for i in range(5):
    tr, val, ts = train_best_model(
        config, df_train, df_val, df_test, max_length, loss, metrics,
        train_kpm_data, val_kpm_data, test_kpm_data, device,
    )
    tr_metrics_roberta += [tr]
    val_metrics_roberta += [val]
    test_metrics_roberta += [ts]

Starting Training!
Current Time = 02:50:45
Training ended!
Current Time = 02:55:27
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8925127211049189, 'precision': 0.8001175778953556, 'recall': 0.6389671361502347, 'f1': 0.7105194466196814}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7598303888231116 ; mAP relaxed = 0.7598303888231116
Validation results roberta-base:
{'accuracy': 0.8412377096587623, 'precision': 0.6412556053811659, 'recall': 0.5813008130081301, 'f1': 0.6098081023454157}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.7617557141746133 ; mAP relaxed = 0.7617557141746133
(0.7617557141746133, 0.7617557141746133)
Test results with model roberta-base:
{'accuracy': 0.8257443082311734, 'precision': 0.4721189591078067, 'recall': 0.6902173913043478, 'f1': 0.5607064017660045}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6849173469967372 ; mAP relaxed = 0.6849173469

Starting Training!
Current Time = 02:58:15
Training ended!
Current Time = 03:02:57
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8873273564332445, 'precision': 0.7868366439371479, 'recall': 0.6230046948356808, 'f1': 0.6954015459190358}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7558634668044911 ; mAP relaxed = 0.7558634668044911
Validation results roberta-base:
{'accuracy': 0.8504916136495084, 'precision': 0.6599131693198264, 'recall': 0.6178861788617886, 'f1': 0.6382085374387684}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.7532931058191165 ; mAP relaxed = 0.7532931058191165
(0.7532931058191165, 0.7532931058191165)
Test results with model roberta-base:
{'accuracy': 0.826328079392878, 'precision': 0.4736196319018405, 'recall': 0.6992753623188406, 'f1': 0.5647403072421361}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6793356337207017 ; mAP relaxed = 0.67933563372

Starting Training!
Current Time = 03:05:50
Training ended!
Current Time = 03:10:32
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8877150472498183, 'precision': 0.7818392805337975, 'recall': 0.6326291079812206, 'f1': 0.6993642143505903}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7530316342929012 ; mAP relaxed = 0.7530316342929012
Validation results roberta-base:
{'accuracy': 0.8062463851937536, 'precision': 0.5398126463700235, 'recall': 0.6246612466124661, 'f1': 0.5791457286432161}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.6925447979408853 ; mAP relaxed = 0.6925447979408853
(0.6925447979408853, 0.6925447979408853)
Test results with model roberta-base:
{'accuracy': 0.805312317571512, 'precision': 0.4385026737967914, 'recall': 0.7427536231884058, 'f1': 0.5514458641560188}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7522375168392447 ; mAP relaxed = 0.75223751683

Starting Training!
Current Time = 03:13:23
Training ended!
Current Time = 03:18:05
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8839350617882239, 'precision': 0.7782751417487317, 'recall': 0.612206572769953, 'f1': 0.6853238733412166}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7501870158940754 ; mAP relaxed = 0.7501870158940754
Validation results roberta-base:
{'accuracy': 0.7891844997108155, 'precision': 0.5049833887043189, 'recall': 0.6178861788617886, 'f1': 0.5557586837294333}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.7091879130920542 ; mAP relaxed = 0.7091879130920542
(0.7091879130920542, 0.7091879130920542)
Test results with model roberta-base:
{'accuracy': 0.7743724460011675, 'precision': 0.3968253968253968, 'recall': 0.769927536231884, 'f1': 0.5237215033887862}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7074690838788946 ; mAP relaxed = 0.707469083878

Starting Training!
Current Time = 03:21:04
Training ended!
Current Time = 03:25:46
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8908165737824085, 'precision': 0.8028976758225174, 'recall': 0.6244131455399061, 'f1': 0.7024957084378713}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7666591769542629 ; mAP relaxed = 0.7666591769542629
Validation results roberta-base:
{'accuracy': 0.8522267206477733, 'precision': 0.6765163297045101, 'recall': 0.5894308943089431, 'f1': 0.6299782766111515}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.7530979935665372 ; mAP relaxed = 0.7530979935665372
(0.7530979935665372, 0.7530979935665372)
Test results with model roberta-base:
{'accuracy': 0.808231173380035, 'precision': 0.4418604651162791, 'recall': 0.7228260869565217, 'f1': 0.5484536082474227}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6834034761849064 ; mAP relaxed = 0.68340347618

In [59]:
# Mean and variance of the challenge score over the runs, per split
set_name = "Train"
compute_mean_var(tr_metrics_roberta, set_name)

set_name = "Validation"
compute_mean_var(val_metrics_roberta, set_name)

set_name = "Test"
compute_mean_var(test_metrics_roberta, set_name)

Train Mean: 0.7571143365537685 and Variance: 3.294036452069443e-05
Validation Mean: 0.7339759049186412 and Variance: 0.0007683014982765409
Test Mean: 0.7014726115240969 and Variance: 0.000740729902820524



### BERT

In [60]:
# Best hyper-parameter configuration for bert-base-uncased
config = {
    'model_type': 'bert-base-uncased',
    'epochs': 1,
    'lr': 3e-5,
    'eps': 1e-6,
    'weight_decay': 1e-2,
    'warmup_steps': 0,
    'batch_size': 8,
    'optimizer': 'adamW',
}

In [61]:
# Challenge scores collected over the repeated runs, one list per split
tr_metrics_bert, val_metrics_bert, test_metrics_bert = [], [], []

# Train and evaluate the same configuration five times
for i in range(5):
    tr, val, ts = train_best_model(
        config, df_train, df_val, df_test, max_length, loss, metrics,
        train_kpm_data, val_kpm_data, test_kpm_data, device,
    )
    tr_metrics_bert += [tr]
    val_metrics_bert += [val]
    test_metrics_bert += [ts]

Starting Training!
Current Time = 09:07:26
Training ended!
Current Time = 09:13:17
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8896050399806155, 'precision': 0.7972405518896221, 'recall': 0.6239436619718309, 'f1': 0.7000263365815117}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7378281893305237 ; mAP relaxed = 0.7378281893305237
Validation results bert-base-uncased:
{'accuracy': 0.772700983227299, 'precision': 0.474025974025974, 'recall': 0.5934959349593496, 'f1': 0.5270758122743683}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.650537089625474 ; mAP relaxed = 0.650537089625474
(0.650537089625474, 0.650537089625474)
Test results with model bert-base-uncased:
{'accuracy': 0.7705779334500875, 'precision': 0.38392857142857145, 'recall': 0.7010869565217391, 'f1': 0.4961538461538461}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6620536815148249 ; mAP relaxed = 0.66205

Starting Training!
Current Time = 09:16:09
Training ended!
Current Time = 09:22:00
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8881027380663921, 'precision': 0.794802055001511, 'recall': 0.6173708920187794, 'f1': 0.6949398863786498}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7422235518125243 ; mAP relaxed = 0.7422235518125243
Validation results bert-base-uncased:
{'accuracy': 0.7706766917293233, 'precision': 0.47020585048754066, 'recall': 0.5880758807588076, 'f1': 0.522576760987357}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.667085088276972 ; mAP relaxed = 0.667085088276972
(0.667085088276972, 0.667085088276972)
Test results with model bert-base-uncased:
{'accuracy': 0.7848803269118505, 'precision': 0.4053224155578301, 'recall': 0.717391304347826, 'f1': 0.5179856115107915}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.7705481278833463 ; mAP relaxed = 0.770548

Starting Training!
Current Time = 09:25:04
Training ended!
Current Time = 09:31:00
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.887424279137388, 'precision': 0.7928636226186876, 'recall': 0.6154929577464788, 'f1': 0.6930091185410333}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7245120806854372 ; mAP relaxed = 0.7245120806854372
Validation results bert-base-uncased:
{'accuracy': 0.7755928282244072, 'precision': 0.4791666666666667, 'recall': 0.592140921409214, 'f1': 0.5296969696969698}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.6374135515609798 ; mAP relaxed = 0.6374135515609798
(0.6374135515609798, 0.6374135515609798)
Test results with model bert-base-uncased:
{'accuracy': 0.8245767659077642, 'precision': 0.4703030303030303, 'recall': 0.7028985507246377, 'f1': 0.5635439360929557}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.684497479862793 ; mAP relaxed = 0.684

Starting Training!
Current Time = 09:34:21
Training ended!
Current Time = 09:40:12
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8867942815604556, 'precision': 0.7856294536817102, 'recall': 0.6211267605633802, 'f1': 0.6937598321971683}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7372847892301083 ; mAP relaxed = 0.7372847892301083
Validation results bert-base-uncased:
{'accuracy': 0.757085020242915, 'precision': 0.44642857142857145, 'recall': 0.575880758807588, 'f1': 0.5029585798816568}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.6537430781764763 ; mAP relaxed = 0.6537430781764763
(0.6537430781764763, 0.6537430781764763)
Test results with model bert-base-uncased:
{'accuracy': 0.787215411558669, 'precision': 0.4045307443365696, 'recall': 0.6793478260869565, 'f1': 0.5070993914807304}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6830080396437562 ; mAP relaxed = 0.68

Starting Training!
Current Time = 09:43:27
Training ended!
Current Time = 09:49:19
Epoch 1/1 results:
- Train Metrics: {'accuracy': 0.8856796704628059, 'precision': 0.7898749618786215, 'recall': 0.607981220657277, 'f1': 0.6870937790157846}
- Train Challenge Metrics: 
	loaded predictions for 5583 arguments

** running evalution:
mAP strict= 0.7299253339745607 ; mAP relaxed = 0.7299253339745607
Validation results bert-base-uncased:
{'accuracy': 0.7729901677270098, 'precision': 0.47368421052631576, 'recall': 0.573170731707317, 'f1': 0.5187001839362354}
	loaded predictions for 932 arguments

** running evalution:
mAP strict= 0.6517818415070075 ; mAP relaxed = 0.6517818415070075
(0.6517818415070075, 0.6517818415070075)
Test results with model bert-base-uncased:
{'accuracy': 0.7603619381202569, 'precision': 0.37178265014299333, 'recall': 0.7065217391304348, 'f1': 0.4871955028107433}
	loaded predictions for 723 arguments

** running evalution:
mAP strict= 0.6741351496370575 ; mAP relaxed = 0.

In [62]:
# Mean and variance of the challenge score over the runs, per split
set_name = "Train"
compute_mean_var(tr_metrics_bert, set_name)

set_name = "Validation"
compute_mean_var(val_metrics_bert, set_name)

set_name = "Test"
compute_mean_var(test_metrics_bert, set_name)

Train Mean: 0.7343547890066308 and Variance: 3.9813163638447315e-05
Validation Mean: 0.6521121298293819 and Variance: 8.909750446326312e-05
Test Mean: 0.6948484957083556 and Variance: 0.0014964633567478485
