In [1]:
import argparse
import glob
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
import sys
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMultipleChoice,
    BertTokenizer,
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMultipleChoice,
    XLNetConfig,
    XLNetForMultipleChoice,
    XLNetTokenizer,
    get_linear_schedule_with_warmup,
)


from utils_multiple_choice import convert_examples_to_features, processors
# Flat tuple of every key found in the `pretrained_config_archive_map` of the
# supported config classes, in Bert -> XLNet -> Roberta order.
ALL_MODELS = tuple(
    archive_key
    for config_cls in (BertConfig, XLNetConfig, RobertaConfig)
    for archive_key in config_cls.pretrained_config_archive_map.keys()
)

# Lookup from the `model_type` string to its (config, model, tokenizer) classes.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForMultipleChoice, BertTokenizer),
    xlnet=(XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
    roberta=(RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
)


def select_field(features, field):
    """Pull a single field out of every choice of every feature.

    Args:
        features: iterable of objects exposing a `choices_features` iterable.
        field: key looked up on each entry of `choices_features`.

    Returns:
        A list with one inner list per feature; each inner list holds
        `choice[field]` for that feature's choices, in order.
    """
    selected = []
    for feature in features:
        per_choice = []
        for choice in feature.choices_features:
            per_choice.append(choice[field])
        selected.append(per_choice)
    return selected


def simple_accuracy(preds, labels):
    """Return the mean of the elementwise comparison `preds == labels`.

    Both arguments must compare elementwise into an object with a `.mean()`
    method (e.g. NumPy arrays of matching shape).
    """
    matches = preds == labels
    return matches.mean()


def set_seed(args):
    """Seed Python's `random`, NumPy and PyTorch from `args.seed`.

    When `args.n_gpu` is positive, every CUDA device is seeded as well.
    """
    seed_value = args.seed
    # Same order as always: stdlib RNG, NumPy RNG, then the torch CPU generator.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed_value)

def evaluate(args, model, tokenizer, examples_list, prefix="", test=False):
    """Score in-memory examples with `model` and report accuracy, loss and logits.

    Args:
        args: attribute-style configuration. Reads `task_name`, `output_dir`,
            `max_seq_length`, `model_type`, `local_rank`,
            `per_gpu_eval_batch_size`, `n_gpu` and `device`; writes
            `eval_batch_size`.
        model: model whose forward pass returns `(loss, logits, ...)` when
            called with `labels`.
        tokenizer: tokenizer forwarded to `convert_examples_to_features`.
        examples_list: raw examples handed to the task processor's
            `process_these_examples`.
        prefix: unused; kept so existing callers keep working.
        test: unused; kept so existing callers keep working.

    Returns:
        dict with two keys:
            "results": {"eval_acc": ..., "eval_loss": ...}
            "output_logit_list": list with one CPU logits tensor per example.

    Raises:
        ValueError: if the processor/featurizer produced no features, since
            neither a mean loss nor an accuracy can be computed.
    """
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)
    results = {}
    output_logit_list = []

    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        processor = processors[eval_task]()
        label_list = processor.get_labels()
        examples = processor.process_these_examples(examples_list)
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if len(features) == 0:
            # Fail early with a clear message instead of a ZeroDivisionError below.
            raise ValueError("evaluate() received no examples to score")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
        all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

        eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            # exist_ok guards against another process creating it in between.
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Sequential order keeps logits aligned with `examples_list`.
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu evaluate; never wrap a model that is already data-parallel
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        model.eval()

        eval_loss = 0.0
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                # Only bert/xlnet receive segment ids; other model types get None.
                "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
                "labels": batch[3],
            }
            with torch.no_grad():
                outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # Move to CPU once so the returned per-example rows do not pin
            # accelerator memory after evaluation finishes.
            batch_logits = logits.detach().cpu()
            output_logit_list.extend(batch_logits)  # one row tensor per example
            logit_chunks.append(batch_logits.numpy())
            label_chunks.append(inputs["labels"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps
        # Concatenate once instead of re-allocating with np.append every batch.
        preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        acc = simple_accuracy(preds, out_label_ids)
        result = {"eval_acc": acc, "eval_loss": eval_loss}
        results.update(result)

    return {
        "results": results,
        "output_logit_list": output_logit_list,
    }



In [2]:
# Default run configuration. Every key appears exactly once: the previous
# literal listed "per_gpu_eval_batch_size" twice (16, then 8) and Python
# silently kept only the later value, so 8 is the effective default.
args = {
    "model_type": None,
    "task_name": "csqa",
    "model_name_or_path": None,
    "do_eval": False,
    "do_test": False,
    "do_train": False,
    "do_lower_case": False,
    "data_dir": None,
    "overwrite_output": False,
    "local_rank": -1,
    "server_ip": "",
    "server_port": "",
    "fp16_opt_level": "O1",
    "fp16": False,
    "seed": 42,
    "overwrite_cache": False,
    "overwrite_output_dir": False,
    "no_cuda": False,
    "eval_all_checkpoints": False,
    "save_steps": 500,
    "logging_steps": 500,
    "warmup_steps": 0,
    "max_steps": -1,
    "num_train_epochs": 3.0,
    "max_grad_norm": 1.0,
    "adam_epsilon": 1e-8,
    "weight_decay": 0.0,
    "learning_rate": 5e-5,
    "gradient_accumulation_steps": 1,
    "per_gpu_eval_batch_size": 8,
    "per_gpu_train_batch_size": 8,
    "evaluate_during_training": False,
    "max_seq_length": 128,
    "cache_dir": "",
    "tokenizer_name": "",
    "config_name": "",
    "output_dir": None,
}

# Settings specific to this run; they override the defaults above.
args.update(
    {
        "model_type": "roberta",
        "task_name": "csqa",
        "model_name_or_path": "./csqa_roberta_large_result/",
        "do_eval": True,
        "do_lower_case": True,
        "max_seq_length": 80,
        "data_dir": "./common_sense_data/",
        "output_dir": "./csqa_result",
        "per_gpu_eval_batch_size": 4,
        "overwrite_output": True,
        "overwrite_output_dir": True,
    }
)

class AttrDict(dict):
    """A dict whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the attribute namespace, so `d.key` and
        # `d["key"]` always refer to the same storage.
        self.__dict__ = self

# Expose the configuration through attribute access (args.seed, args.device, ...).
args = AttrDict(args)

if args.local_rank == -1 or args.no_cuda:
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    # Report zero GPUs when running on CPU so that seeding and the
    # DataParallel check downstream do not act on devices we are not using.
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
args.device = device


set_seed(args)

# Resolve the task processor and derive the number of labels from it.
args.task_name = args.task_name.lower()
if args.task_name not in processors:
    raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name]()
label_list = processor.get_labels()
num_labels = len(label_list)


# Pick the config/model/tokenizer classes registered for this model type.
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

# An empty config_name / tokenizer_name falls back to the model path, and an
# empty cache_dir is passed on as None.
config = config_class.from_pretrained(
    args.config_name or args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=args.task_name,
    cache_dir=args.cache_dir or None,
)
tokenizer = tokenizer_class.from_pretrained(
    args.tokenizer_name or args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=args.cache_dir or None,
)
model = model_class.from_pretrained(
    args.model_name_or_path,
    from_tf=".ckpt" in args.model_name_or_path,
    config=config,
    cache_dir=args.cache_dir or None,
)

model.to(args.device)

RobertaForMultipleChoice(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): Layer

In [38]:
import json
# Position of a choice in the model's logits -> its answer letter.
answer_map = {
    0: "A",
    1: "B",
    2: "C",
    3: "D",
    4: "E",
}

# One tab-separated row: question stem, five answer options, then further
# columns of which only column 18 (the gold answer letter) is read below.
# Tabs are written as "\t" escapes so an editor or formatter that expands
# literal TAB characters cannot silently corrupt the row.
data_str = (
    "What do people aim to do at work?\tcomplete job\tlearn from each other\t"
    "kill animals\twear hats\ttalk to each other\t"
    "2.7167\t-0.4648\t-2.1522\t-4.8394\t-4.4805\t"
    "0.9519488215\t0.03952808306\t0.007312688511\t0.0004977841745\t0.0007127040299\t"
    "1\t0\tA\tA\t1\t\t\t\t\t0\t0\t"
)

parts = data_str.split("\t")
data = {
    'answerKey': parts[18],
    'id': '075e483d21c29a511267ef62bedc0461',
    'question': {
        # Columns 1-5 hold the option texts for labels A-E, in that order.
        'choices': [
            {'label': answer_map[position], 'para': 'dummy', 'text': parts[position + 1]}
            for position in range(5)
        ],
        'question_concept': 'dummy',
        'stem': parts[0],
    },
}

raw_examples = [data]
# evaluate() is fed JSON strings, one per example.
examples_list = [json.dumps(que) for que in raw_examples]
# Score the prepared examples and show the aggregate metrics.
eval_output = evaluate(args, model, tokenizer, examples_list)
results = eval_output["results"]
output_logits = eval_output["output_logit_list"]
print(results)

# Print each question with its own scores. The logits must be indexed by the
# loop counter; reading element 0 every time would repeat the first example's
# scores for every question.
for counter, point in enumerate(output_logits):
    option_scores = [ele.item() for ele in point]
    example = raw_examples[counter]
    print("Question: ", example["question"]["stem"])
    print("Options: ", [choice["label"] + " : " + choice["text"] for choice in example["question"]["choices"]])
    print("Answer: ", example["answerKey"])
    print("Predicted_answer: ", answer_map[option_scores.index(max(option_scores))])
    print("option scores: ", option_scores)

    
    

read CommonSenseQA data: 100%|██████████| 1/1 [00:00<00:00, 274.46it/s]
convert examples to features: 1it [00:00, 130.94it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 23.16it/s]

{'eval_acc': 1.0, 'eval_loss': 0.054653167724609375}
Question:  What do people aim to do at work?
Options:  ['A : complete job', 'B : learn from each other', 'C : kill animals', 'D : wear hats', 'E : talk to each other']
Answer:  A
Predicted_answer:  A



