In [1]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange
from tensorboardX import SummaryWriter
import math

from pytorch_transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)

from pytorch_transformers import AdamW, WarmupLinearSchedule

from utils import (convert_examples_to_features,
                        output_modes, processors)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
args = {
    'do_eval': True,
    'data_dir': 'data-sarcasm/',
    'output_dir': 'outputs',
    'model_dir': 'ensembles_models/',
    'task_name': 'binary',
    'eval_batch_size': 12,
    'eval_all_checkpoints': True,
    'max_seq_length': 128,
    'output_mode': 'classification',
    'reprocess_input_data': True
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
task = args['task_name']

if task in processors.keys() and task in output_modes.keys():
    processor = processors[task]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
else:
    raise KeyError(f'{task} not found in processors or in output_modes. Please check utils.py.')

In [3]:
def load_and_cache_examples(task, tokenizer, model_type, model_name):
    processor = processors[task]()
    output_mode = args['output_mode']
    
    mode = 'dev'
    cached_features_file = os.path.join(args['data_dir'], f"cached_{mode}_{model_name}_{args['max_seq_length']}_{task}")
    
    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
               
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args['data_dir']) if evaluate else processor.get_train_examples(args['data_dir'])
        
        if __name__ == "__main__":
            features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
                cls_token_at_end=bool(model_type in ['xlnet']),            # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=bool(model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(model_type in ['xlnet']),                 # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=4 if model_type in ['xlnet'] else 0)
        
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

In [7]:
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix
from scipy.stats import pearsonr

def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples(args['data_dir'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]
    
    return wrong

def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }, get_mismatched(labels, preds)

def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def evaluate(model, tokenizer, model_type="bert", model_name="bert-base-cased", prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(EVAL_TASK, tokenizer, model_type, model_name)
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)


    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            if model_type not in ['distilbert']:
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if model_type in ['bert', 'xlnet'] else None,
                          'labels':         batch[3]}
            else:
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    result, wrong = compute_metrics(EVAL_TASK, preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results, wrong

In [11]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
    'distilbert':(DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-uncased')
}

In [6]:
results = {}
if args['do_eval']:
    checkpoints = [args['output_dir']]
    if args['eval_all_checkpoints']:
        checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['model_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
        logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)

INFO:__main__:Evaluate the following checkpoints: ['ensembles_models/bert/checkpoint-2000', 'ensembles_models/bert/checkpoint-4000', 'ensembles_models/bert/checkpoint-6000', 'ensembles_models/bert', 'ensembles_models/distilbert/checkpoint-2000', 'ensembles_models/distilbert/checkpoint-4000', 'ensembles_models/distilbert', 'ensembles_models/roberta/checkpoint-2000', 'ensembles_models/roberta/checkpoint-4000', 'ensembles_models/roberta', 'ensembles_models/xlnet/checkpoint-2000', 'ensembles_models/xlnet/checkpoint-4000', 'ensembles_models/xlnet']


In [12]:
if args['do_eval']:    
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        model_type = checkpoint.split('/')[1]
        logger.info("Evaluate the following model type: %s", model_type)
        
        config_class, model_class, tokenizer_class, model_name = MODEL_CLASSES[model_type]
        
        config = config_class.from_pretrained(model_name, num_labels=2, finetuning_task=args['task_name'])
        
        model = model_class.from_pretrained(checkpoint)
        model.to(device)
        
        tokenizer = tokenizer_class.from_pretrained(model_name)
        
        result, wrong_preds = evaluate(model, tokenizer, model_type, model_name, prefix=global_step)
        result = dict((model_type + '_' + k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

INFO:__main__:Evaluate the following model type: bert
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 10489.49it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_bert-base-cased_128_binary
INFO:__main__:***** Running evaluation 2000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 2000 *****
INFO:__main__:  fn = 785
INFO:__main__:  fp = 418
INFO:__main__:  mcc = 0.2797035315173321
INFO:__main__:  tn = 1862
INFO:__main__:  tp = 623
INFO:__main__:Evaluate the following model type: bert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 12428.02it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_bert-base-cased_128_binary
INFO:__main__:***** Running evaluation 4000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 4000 *****
INFO:__main__:  fn = 786
INFO:__main__:  fp = 418
INFO:__main__:  mcc = 0.27901831286249507
INFO:__main__:  tn = 1862
INFO:__main__:  tp = 622
INFO:__main__:Evaluate the following model type: bert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 12033.31it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_bert-base-cased_128_binary
INFO:__main__:***** Running evaluation 6000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 6000 *****
INFO:__main__:  fn = 893
INFO:__main__:  fp = 374
INFO:__main__:  mcc = 0.22913483769035498
INFO:__main__:  tn = 1906
INFO:__main__:  tp = 515
INFO:__main__:Evaluate the following model type: bert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 10395.12it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_bert-base-cased_128_binary
INFO:__main__:***** Running evaluation ensembles_models/bert *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results ensembles_models/bert *****
INFO:__main__:  fn = 892
INFO:__main__:  fp = 374
INFO:__main__:  mcc = 0.2298533874976484
INFO:__main__:  tn = 1906
INFO:__main__:  tp = 516
INFO:__main__:Evaluate the following model type: distilbert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 9086.99it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_distilbert-base-uncased_128_binary
INFO:__main__:***** Running evaluation 2000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 2000 *****
INFO:__main__:  fn = 888
INFO:__main__:  fp = 388
INFO:__main__:  mcc = 0.22457836410984341
INFO:__main__:  tn = 1892
INFO:__main__:  tp = 520
INFO:__main__:Evaluate the following model type: distilbert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 12052.12it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_distilbert-base-uncased_128_binary
INFO:__main__:***** Running evaluation 4000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 4000 *****
INFO:__main__:  fn = 804
INFO:__main__:  fp = 423
INFO:__main__:  mcc = 0.26385940441112005
INFO:__main__:  tn = 1857
INFO:__main__:  tp = 604
INFO:__main__:Evaluate the following model type: distilbert





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 12148.07it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_distilbert-base-uncased_128_binary
INFO:__main__:***** Running evaluation ensembles_models/distilbert *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results ensembles_models/distilbert *****
INFO:__main__:  fn = 833
INFO:__main__:  fp = 413
INFO:__main__:  mcc = 0.24928365841491576
INFO:__main__:  tn = 1867
INFO:__main__:  tp = 575
INFO:__main__:Evaluate the following model type: roberta





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/azureuser/.cache/torch/pytorch_transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 6298.92it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_roberta-base_128_binary
INFO:__main__:***** Running evaluation 2000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 2000 *****
INFO:__main__:  fn = 707
INFO:__main__:  fp = 484
INFO:__main__:  mcc = 0.29711189778238034
INFO:__main__:  tn = 1796
INFO:__main__:  tp = 701
INFO:__main__:Evaluate the following model type: roberta





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/azureuser/.cache/torch/pytorch_transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 5350.78it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_roberta-base_128_binary
INFO:__main__:***** Running evaluation 4000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 4000 *****
INFO:__main__:  fn = 741
INFO:__main__:  fp = 472
INFO:__main__:  mcc = 0.280447502625932
INFO:__main__:  tn = 1808
INFO:__main__:  tp = 667
INFO:__main__:Evaluate the following model type: roberta





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /home/azureuser/.cache/torch/pytorch_transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /home/azureuser/.cache/torch/pytorch_transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 6162.45it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_roberta-base_128_binary
INFO:__main__:***** Running evaluation ensembles_models/roberta *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch 

A Jupyter Widget

INFO:__main__:***** Eval results ensembles_models/roberta *****
INFO:__main__:  fn = 722
INFO:__main__:  fp = 478
INFO:__main__:  mcc = 0.2901449944460227
INFO:__main__:  tn = 1802
INFO:__main__:  tp = 686
INFO:__main__:Evaluate the following model type: xlnet





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/azureuser/.cache/torch/pytorch_transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 15489.07it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_xlnet-base-cased_128_binary
INFO:__main__:***** Running evaluation 2000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 2000 *****
INFO:__main__:  fn = 853
INFO:__main__:  fp = 388
INFO:__main__:  mcc = 0.24945436132706805
INFO:__main__:  tn = 1892
INFO:__main__:  tp = 555
INFO:__main__:Evaluate the following model type: xlnet





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/azureuser/.cache/torch/pytorch_transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 15722.13it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_xlnet-base-cased_128_binary
INFO:__main__:***** Running evaluation 4000 *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results 4000 *****
INFO:__main__:  fn = 907
INFO:__main__:  fp = 345
INFO:__main__:  mcc = 0.23630994091773264
INFO:__main__:  tn = 1935
INFO:__main__:  tp = 501
INFO:__main__:Evaluate the following model type: xlnet





INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model from cache at /home/azureuser/.cache/torch/pytorch_transformers/dad589d582573df0293448af5109cb6981ca77239ed314e15ca63b7b8a318ddd.8b10bd978b5d01c21303cc761fc9ecd464419b3bf921864a355ba807cfbfafa8
INFO:__main__:Creating features from dataset file at data-sarcasm/
100%|██████████| 3688/3688 [00:00<00:00, 15693.08it/s]
INFO:__main__:Saving features into cached file data-sarcasm/cached_dev_xlnet-base-cased_128_binary
INFO:__main__:***** Running evaluation ensembles_models/xlnet *****
INFO:__main__:  Num examples = 3688
INFO:__main__:  Batch size = 12


A Jupyter Widget

INFO:__main__:***** Eval results ensembles_models/xlnet *****
INFO:__main__:  fn = 945
INFO:__main__:  fp = 338
INFO:__main__:  mcc = 0.21277562202875464
INFO:__main__:  tn = 1942
INFO:__main__:  tp = 463





In [13]:
results

{'bert_fn_2000': 785,
 'bert_fn_4000': 786,
 'bert_fn_6000': 893,
 'bert_fn_ensembles_models/bert': 892,
 'bert_fp_2000': 418,
 'bert_fp_4000': 418,
 'bert_fp_6000': 374,
 'bert_fp_ensembles_models/bert': 374,
 'bert_mcc_2000': 0.2797035315173321,
 'bert_mcc_4000': 0.27901831286249507,
 'bert_mcc_6000': 0.22913483769035498,
 'bert_mcc_ensembles_models/bert': 0.2298533874976484,
 'bert_tn_2000': 1862,
 'bert_tn_4000': 1862,
 'bert_tn_6000': 1906,
 'bert_tn_ensembles_models/bert': 1906,
 'bert_tp_2000': 623,
 'bert_tp_4000': 622,
 'bert_tp_6000': 515,
 'bert_tp_ensembles_models/bert': 516,
 'distilbert_fn_2000': 888,
 'distilbert_fn_4000': 804,
 'distilbert_fn_ensembles_models/distilbert': 833,
 'distilbert_fp_2000': 388,
 'distilbert_fp_4000': 423,
 'distilbert_fp_ensembles_models/distilbert': 413,
 'distilbert_mcc_2000': 0.22457836410984341,
 'distilbert_mcc_4000': 0.26385940441112005,
 'distilbert_mcc_ensembles_models/distilbert': 0.24928365841491576,
 'distilbert_tn_2000': 1892,
 'di