In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/csqa-logicalcombinations/test_all_hf.json
/kaggle/input/csqa-logicalcombinations/dev_all_hf.json
/kaggle/input/csqa-logicalcombinations/test_logical_combinations_output.json
/kaggle/input/csqa-logicalcombinations/dev_logical_combinations_output.json
/kaggle/input/csqa-logicalcombinations/logical_combinations_output.json
/kaggle/input/csqa-logicalcombinations/train_logical_combinations_output.json
/kaggle/input/csqa-logicalcombinations/train_all_hf.json


In [2]:
!pip install --upgrade --no-cache-dir protobuf==3.20.3
!pip install evaluate

Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
a2a-sdk 0.3.10 requires p

In [3]:
import logging
import math
import os
import re
import requests
import time
import json

# Convenience functions for running multi-angle models, either from loaded model or through API

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

SLOT_SHORTFORMS_DEFAULT = {"Q": "question", "C": "context", "A": "answer", "E": "explanation",
                   "M": "mcoptions", "R": "rationale", "P": "proof", "H": "hypothesis", "V": "valid"}

GENERATOR_OPTIONS_DEFAULT = {"min_length": 1, "max_length": 128, "num_beams": 1, "num_return_sequences": 1,
                             "do_sample": False, "top_k": 50, "top_p": 1.0, "temperature": 1.0,
                             "length_penalty": 1.0, "repetition_penalty": 1.0}

DEFAULT_SLOT_FORMAT = {"slot": "$SLOT$", "assign": " = ", "separator": " ; ", "missing_value": "N/A"}


def decompose_slots(string, fmt=None):
    fmt = fmt or DEFAULT_SLOT_FORMAT
    string = string.strip()
    no_slot = "PREFIX"
    slot_re = re.compile('(?i)'+re.escape(fmt['slot']).replace("SLOT", "(\\w*?)"))
    assign_re = re.escape(fmt['assign']).replace('\\ ','\\s*')
    separator_re = re.escape(fmt['separator']).replace('\\ ','\\s*')
    strip_re = re.compile(f"^({assign_re})?(.*?)({separator_re})?$")
    slot_pos = []
    for m in slot_re.finditer(string):
        slot_pos.append((m.span(), m.group(1)))
    if len(slot_pos) == 0:
        return {no_slot: string}
    if slot_pos[0][0][0] > 0:
        slot_pos = [((0,-1), no_slot)] + slot_pos
    res = {}
    for idx, (pos, slot_name) in enumerate(slot_pos):
        if idx == len(slot_pos) - 1:
            value = string[pos[1]+1:]
        else:
            value = string[pos[1]+1:slot_pos[idx+1][0][0]-1]
        m = strip_re.match(value)
        if m is not None:
            value = m.group(2)
        value = value.strip()
        if slot_name in res:
            value = res[slot_name] + " ~AND~ " + value
        res[slot_name] = value
    return res


def split_mcoptions(mcoptions):
    first_option = ord(mcoptions.strip()[1])
    labels = "".join([chr(x) for x in range(first_option, first_option+10)])
    choices = re.split("\\s*\\(["+labels+"]\\)\\s*", mcoptions)[1:]
    return (choices, chr(first_option))


def new_dict_update(old_dict, update_dict):
    if update_dict is None:
        return old_dict
    res = old_dict.copy()
    res.update(update_dict)
    return res


def make_input_string(fields, angle, fmt=None):
    fmt = fmt or DEFAULT_SLOT_FORMAT
    res = []
    # output angles
    for slot in angle[1]:
        res.append(fmt['slot'].replace("SLOT", slot))
    # input angles
    for slot in angle[0]:
        slot_name = fmt['slot'].replace("SLOT", slot)
        value = fields.get(slot, fmt['missing_value'])
        res.append(f"{slot_name}{fmt['assign']}{value}")
    return fmt['separator'].join(res)


def make_api_input_string(fields, angle, slot_key_from_lowercase, explicit_outputs=None, output_prefix=None):
    res = []
    for slot in angle[0]:
        slot_key = slot_key_from_lowercase.get(slot, slot[0].upper())
        value = fields[slot]
        res.append(f"{slot_key}: {value}")
    if explicit_outputs:
        res.append("X: " + make_mcoptions(explicit_outputs))
    for slot in angle[1]:
        slot_key = slot_key_from_lowercase.get(slot, slot[0].upper())
        if output_prefix is not None and slot in output_prefix:
            res.append(slot_key +"-prefix: " + output_prefix[slot])
        else:
            res.append(slot_key)
    return "\n".join(res)



# Load model and tokenizer, also return the cuda device used for input to model
def load_model(model_name_or_path, cuda_devices = None):
    from transformers import T5Tokenizer, T5ForConditionalGeneration

    cuda_devices = cuda_devices or []
    try:
        tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
    except:
        if os.path.exists("/t5-11b-tokenizer"):
            tokenizer = T5Tokenizer.from_pretrained("/t5-11b-tokenizer")
        else:
            tokenizer = T5Tokenizer.from_pretrained("t5-11b")
    model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)
    device_map = None
    if len(cuda_devices) > 1:
        # Split layers across the multiple GPUs, put extras in later devices to leave a bit extra on first one
        num_layers = model.config.num_layers
        n_gpu = len(cuda_devices)
        layers_per_gpu = num_layers // n_gpu
        has_one_extra = n_gpu - (num_layers - layers_per_gpu * n_gpu)
        device_map = {}
        current = 0
        for device in cuda_devices:
            next = current + layers_per_gpu
            if len(device_map) >= has_one_extra:
                next += 1
            device_map[device] = list(range(current, next))
            current = next
    if len(cuda_devices) > 0:
        device = f"cuda:{cuda_devices[0]}"
    else:
        device = "cpu"

    if device_map is not None:
        model.parallelize(device_map)
    else:
        model.to(device)
    return {"model": model, "tokenizer": tokenizer, "cuda_device": device}


# Run model in free generation mode, with optional output_prefix_string
def run_model(model, input_string, generator_options, output_prefix_string=None, output_scores=False):
    import torch
    with torch.no_grad():
        input_ids = model['tokenizer'].encode(input_string, return_tensors="pt").to(model['cuda_device'])
        encoder_outputs = model['model'].encoder(input_ids)
        decoder_input_ids = {}
        if output_prefix_string is not None:
            decoder_start_token_id = model['model'].config.decoder_start_token_id
            output_ids = model['tokenizer'].encode(output_prefix_string, return_tensors="pt", add_special_tokens=False)
            decoder_input_ids = torch.cat((torch.LongTensor([[decoder_start_token_id] * len(output_ids)]), output_ids),
                                          dim=1).to(model['cuda_device'])
            decoder_input_ids = {"decoder_input_ids": decoder_input_ids}

        output = model['model'].generate(encoder_outputs=encoder_outputs, **decoder_input_ids,
                                         output_scores=output_scores, return_dict_in_generate=True, **generator_options)

        output_strings = model['tokenizer'].batch_decode(output.sequences, skip_special_tokens=True)
        res = {"input_raw": input_string, "output_raw_list": output_strings}
        if output_scores:
            # Subtract pad token if output_prefix not given
            num_prefix_tokens = len(decoder_input_ids.get('decoder_input_ids', [0]))
            output_token_probs = []
            for idx in range(len(output.sequences)):
                token_probs = []
                for token, scores in zip(output.sequences[idx][num_prefix_tokens:], output.scores):
                    probs = torch.softmax(scores[idx], dim=0)
                    token_probs.append((model['tokenizer'].convert_ids_to_tokens(token.item()), probs[token].item()))
            output_token_probs.append(token_probs)
            res["output_token_probs_list"] = output_token_probs
    return res


# Run model in forced generation mode, capturing each token probability
def run_model_with_outputs(model, input_string, output_texts, output_angle):
    import torch
    with torch.no_grad():
        input_string = input_string
        input_ids = model['tokenizer'].encode(input_string, return_tensors="pt").to(model['cuda_device'])
        # Compute encoder output once and reuse for each output text
        encoder_outputs = model['model'].encoder(input_ids)
        all_res = []
        for output_text in output_texts:
            output_string = make_input_string({output_angle: output_text}, [[output_angle], []])
            output_ids = model['tokenizer'].encode(output_string, return_tensors="pt").to(model['cuda_device'])
            res = model['model'](encoder_outputs=encoder_outputs, labels=output_ids, return_dict=True)
            res_softmax = torch.softmax(res.logits[0], dim=1)
            raw_probs = [x[y.item()].item() for x,y in list(zip(res_softmax, output_ids[0]))]
            output_prob = 1
            for raw_prob in raw_probs:
                output_prob *= raw_prob
            loss = res.loss.item()
            all_res.append({
                "input_raw": input_string,
                "output_raw": output_string,
                "output_text": output_text,
                "loss": loss,
                "score": math.exp(-loss),
                "output_prob": output_prob,
                "output_token_probs": raw_probs,
                "output_tokens": model['tokenizer'].convert_ids_to_tokens(output_ids[0])
            })
    return all_res


def make_mcoptions(choices, first_label='A'):
    res = []
    for idx, choice in enumerate(choices):
        res.append(f'({chr(idx+ord(first_label))}) {choice}')
    return " ".join(res)


# Interface to a multi-angle generative model, either by loading a model or calling an API
class MultiAngleModel():
    def __init__(self,
                 model_path=None,
                 api_url=None,
                 generator_options=None,
                 slot_ordering_override=None,
                 slot_shortforms=None,
                 cuda_devices=None):
        assert model_path is not None or api_url is not None
        assert not (model_path is not None and api_url is not None)
        self.slot_shortforms = new_dict_update(SLOT_SHORTFORMS_DEFAULT, slot_shortforms)
        self.slot_key_from_lowercase = {v.lower(): k for k, v in self.slot_shortforms.items()}
        self.generator_options = new_dict_update(GENERATOR_OPTIONS_DEFAULT, generator_options)
        self.api_url = api_url
        self.model = None
        if model_path is not None:
            self.model = load_model(model_path, cuda_devices)

    def __call__(self, fields, inputs, outputs, options=None):
        options = options or {}
        generator_options = new_dict_update(self.generator_options, options.get('generator_options'))
        explicit_outputs = options.get('explicit_outputs')
        if explicit_outputs is True:
            # Automatically extract from mcoptions field
            explicit_outputs = split_mcoptions(fields['mcoptions'])[0]
        angle = [inputs, outputs]
        if isinstance(angle[0], str): angle[0] = [angle[0]]
        if isinstance(angle[1], str): angle[1] = [angle[1]]
        output_prefix = options.get('output_prefix')
        output_prefix_string = None
        if output_prefix is not None:
            if explicit_outputs is not None:
                raise ValueError("Cannot specify both 'explicit_outputs' and 'output_prefix'")
            slots_prefix = []
            for slot in angle[1]:
                if slot in output_prefix:
                    slots_prefix.append(slot)
                else:
                    break
            if len(slots_prefix) != len(output_prefix):
                raise ValueError(f"Slots in output_prefix ({output_prefix}) do not match initial slots in output slots ({angle[1]})")
            output_prefix_string = make_input_string(output_prefix, [slots_prefix, []])
        full_res = {}
        if options.get("debug"):
            full_res['debug'] = {}
        if self.model:
            input_string = make_input_string(fields, angle)
            res = run_model(self.model, input_string, generator_options, output_prefix_string=output_prefix_string)
            res_slots = decompose_slots(res['output_raw_list'][0])
            full_res.update(res_slots)
            if explicit_outputs:
                output_slot = angle[1][0]
                res_explicit = run_model_with_outputs(self.model, input_string, explicit_outputs, output_slot)
                res_explicit.sort(key=lambda x:-x['output_prob'])
                if options.get("debug"):
                    full_res['debug'].update({"generated_output": res_slots, "explicit_outputs": res_explicit})
                full_res[output_slot] = res_explicit[0]['output_text']
                full_res['output_prob'] = res_explicit[0]['output_prob']
        else:
            api_generator_options = {}
            if generator_options:
                for k, v in generator_options.items():
                    v_new = v
                    if isinstance(v, bool):
                        v_new = 1 if v else 0
                    api_generator_options[k] = v_new

            input_string = make_api_input_string(fields, angle, self.slot_key_from_lowercase, explicit_outputs, output_prefix)
            try:
                res_raw = requests.get(self.api_url, params={"input": input_string, **api_generator_options})
                res = res_raw.json()
            except:
                logger.warning(f"Failed API call to {self.api_url}")
                return {"error": f"Failed API call to {self.api_url}"}
            res_slots = res['output_slots_list'][0]
            full_res.update(res_slots)
            if explicit_outputs:
                output_slot = angle[1][0]
                res_explicit = res['explicit_outputs']
                if options.get("debug"):
                    full_res['debug'].update({"generated_output": res_slots, "explicit_outputs": res_explicit})
                full_res[output_slot] = res_explicit[0]['output_text']
                full_res['output_prob'] = res_explicit[0]['output_prob']
        if options.get("debug"):
            full_res["debug"].update({"raw_input": input_string, "generator_options": generator_options})
        return full_res


class InformationRetriever():
    def __init__(self, api_url, max_retries=3):
        self.api_url = api_url
        self.max_retries = max_retries

    def __call__(self, fields, inputs=None, outputs=None, options=None):
        if options is not None:
            fields = fields.copy()
            fields.update(options)
        retry = 0
        res = None
        while retry <= self.max_retries and res is None:
            retry += 1
            try:
                res_raw = requests.post(self.api_url, json=fields)
                res = {"retrievals": res_raw.json()}
            except:
                logger.warning(f"Failed retriever API call to {self.api_url}, retry = {retry}")
                time.sleep(2^retry)
        if res is None:
            return {"error": f"Failed retriever API call to {self.api_url}"}
        return res


class NlpAgent():
    def __init__(self,
                 model,
                 default_fields=None,     # These will be used as starting point for any input fields
                 default_outputs=None,
                 default_options=None):
        self.model = model
        self.default_fields = default_fields or {}
        self.default_outputs = default_outputs
        self.default_options = default_options

    def __call__(self, fields, inputs=None, outputs=None, options=None):
        fields_full = self.default_fields.copy()
        fields_full.update(fields)
        if inputs is None:
            inputs = [k for k, v in fields.items() if v]
        outputs = outputs or self.default_outputs
        options_full = options
        if self.default_options is not None:
            options_full = self.default_options.copy()
            if options is not None:
                options_full.update(options)
        res = self.model(fields_full, inputs, outputs, options_full)
        if "error" in res:
            return res
        if isinstance(outputs, str):
            res = res.get(outputs)
        return res

In [4]:
# Load model (adjust GPU settings if needed)
ew_model = MultiAngleModel(model_path="allenai/entailer-large", cuda_devices=[0])

# Agents for reasoning & verification
prover = NlpAgent(model=ew_model, default_outputs="proof")
hyp_verifier = NlpAgent(model=ew_model, default_outputs=["valid"], default_options={"explicit_outputs": ['true', 'false']})

2025-11-17 00:03:02.089264: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763337782.291106      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763337782.345670      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:

hyp_verifier = NlpAgent(
    model=ew_model,
    default_outputs=["valid"],
    default_options={"explicit_outputs": ["true", "false"]}
)

def make_hypothesis(question, choice):
    """
    General-purpose hypothesis template for Entailer.
    """
    q = question.strip()
    c = choice.strip()

    if q.endswith("?"):
        q = q[:-1]

    return f"\"{c}\" is the correct answer to the question: \"{q}\"."

def score_choice(choice, hypothesis):
    """
    Sends a single hypothesis to the Entailer verifier.
    """
    res = hyp_verifier({"hypothesis": hypothesis})
    return res.get("output_prob", 0.0)


def entailer_predict(example):
    """
    Predicts answer index using Entailer verification.
    """
    q = example["question"]
    choices = example["choices"]

    # Generate hypothesis for each choice
    probs = []
    for c in choices:
        hyp = make_hypothesis(q, c)
        prob = score_choice(c, hyp)
        probs.append(prob)

    pred = int(np.argmax(probs))
    return pred, probs


def run_entailer_baseline(path, max_rows=None):
    """
    Runs Entailer baseline and collects predictions, labels, and QA types.
    If max_rows is set, only evaluates that many examples.
    """
    predictions = []
    labels = []
    types = []

    with open(path) as f:
        for i, line in enumerate(f):
            if max_rows is not None and i >= max_rows:
                break

            ex = json.loads(line)

            pred_label, _ = entailer_predict(ex)

            predictions.append(pred_label)
            labels.append(ex["label"])
            types.append(ex.get("qa_type", "unknown"))

    return predictions, labels, types


import numpy as np
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
from collections import defaultdict

def compute_metrics_by_type(predictions, labels, qa_types):
    predictions = np.array(predictions)
    labels = np.array(labels)
    qa_types = np.array(qa_types)

    unique_types = sorted(set(qa_types))

    results = {
        'per_type': {},
        'macro_across_types': {},
        'micro_overall': {},
        'confusion_matrices': {}
    }

    for qa_type in unique_types:
        mask = (qa_types == qa_type)
        type_preds = predictions[mask]
        type_labels = labels[mask]

        if len(type_preds) == 0:
            continue

        macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
            type_labels, type_preds, average='macro', zero_division=0, labels=[0,1,2,3]
        )

        micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
            type_labels, type_preds, average='micro', zero_division=0, labels=[0,1,2,3]
        )

        per_class_precision, per_class_recall, per_class_f1, per_class_support = precision_recall_fscore_support(
            type_labels, type_preds, average=None, zero_division=0, labels=[0,1,2,3]
        )

        accuracy = (type_preds == type_labels).mean()

        results['per_type'][qa_type] = {
            'macro': {
                'precision': macro_precision,
                'recall': macro_recall,
                'f1': macro_f1,
            },
            'micro': {
                'precision': micro_precision,
                'recall': micro_recall,
                'f1': micro_f1,
                'accuracy': accuracy,
            },
            'per_choice': {
                f'choice_{i}': {
                    'precision': per_class_precision[i],
                    'recall': per_class_recall[i],
                    'f1': per_class_f1[i],
                    'support': per_class_support[i],
                } for i in range(4)
            },
            'support': len(type_preds),
            'correct': (type_preds == type_labels).sum()
        }

        results['confusion_matrices'][qa_type] = confusion_matrix(
            type_labels, type_preds, labels=[0,1,2,3]
        )

    # Macro across types
    macro_acc = np.mean([results['per_type'][t]['micro']['accuracy'] for t in unique_types])
    macro_prec = np.mean([results['per_type'][t]['macro']['precision'] for t in unique_types])
    macro_rec = np.mean([results['per_type'][t]['macro']['recall'] for t in unique_types])
    macro_f1 = np.mean([results['per_type'][t]['macro']['f1'] for t in unique_types])

    results['macro_across_types'] = {
        'accuracy': macro_acc,
        'precision': macro_prec,
        'recall': macro_rec,
        'f1': macro_f1,
    }

    # Micro overall
    micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
        labels, predictions, average='micro', zero_division=0
    )

    results['micro_overall'] = {
        'accuracy': (predictions == labels).mean(),
        'precision': micro_precision,
        'recall': micro_recall,
        'f1': micro_f1,
        'support': len(labels),
        'correct': int((predictions == labels).sum()),
    }

    return results

def print_results_table(metrics, model_name="Entailer"):
    print(f"\n{'='*100}")
    print(f"Model: {model_name}")
    print(f"{'='*100}")

    print(f"\n{'Type':<12} {'Accuracy':<12} {'Macro-P':<12} {'Macro-R':<12} {'Macro-F1':<12} {'Support':<10}")
    print("-" * 100)

    for qa_type in sorted(metrics['per_type'].keys()):
        m_macro = metrics['per_type'][qa_type]['macro']
        m_micro = metrics['per_type'][qa_type]['micro']
        support = metrics['per_type'][qa_type]['support']

        print(f"{qa_type:<12} {m_micro['accuracy']:>10.4f} {m_macro['precision']:>10.4f} "
              f"{m_macro['recall']:>10.4f} {m_macro['f1']:>10.4f} {support:>8}")

    print("-" * 100)

    m = metrics['macro_across_types']
    print(f"{'Macro Avg':<12} {m['accuracy']:>10.4f} {m['precision']:>10.4f} "
          f"{m['recall']:>10.4f} {m['f1']:>10.4f} {'-':>8}")

    m = metrics['micro_overall']
    print(f"{'Micro Avg':<12} {m['accuracy']:>10.4f} {m['precision']:>10.4f} "
          f"{m['recall']:>10.4f} {m['f1']:>10.4f} {m['support']:>8}")

    print("=" * 100)



def run_entailer_with_metrics(path, max_rows=None):
    # 1. Run predictions
    preds, labels, types = run_entailer_baseline(path, max_rows=max_rows)

    # 2. Compute full metrics
    metrics = compute_metrics_by_type(preds, labels, types)

    # 3. Print table
    print_results_table(metrics, model_name="Entailer-11b")

    return metrics



In [6]:
dev_metrics = run_entailer_with_metrics('/kaggle/input/csqa-logicalcombinations/dev_all_hf.json')

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.



Model: Entailer-11b

Type         Accuracy     Macro-P      Macro-R      Macro-F1     Support   
----------------------------------------------------------------------------------------------------
AND              0.0760     0.0762     0.0760     0.0761     1500
Mixed            0.2240     0.2241     0.2240     0.2240     1500
NEITHER          0.2387     0.2386     0.2387     0.2386     1500
OR               0.1987     0.1986     0.1987     0.1986     1500
----------------------------------------------------------------------------------------------------
Macro Avg        0.1843     0.1844     0.1843     0.1843        -
Micro Avg        0.1843     0.1843     0.1843     0.1843     6000


In [7]:
test_metrics = run_entailer_with_metrics('/kaggle/input/csqa-logicalcombinations/test_all_hf.json')


Model: Entailer-11b

Type         Accuracy     Macro-P      Macro-R      Macro-F1     Support   
----------------------------------------------------------------------------------------------------
AND              0.0500     0.0493     0.0500     0.0496      500
Mixed            0.2240     0.2255     0.2240     0.2245      500
NEITHER          0.2480     0.2482     0.2480     0.2477      500
OR               0.1820     0.1814     0.1820     0.1816      500
----------------------------------------------------------------------------------------------------
Macro Avg        0.1760     0.1761     0.1760     0.1758        -
Micro Avg        0.1760     0.1760     0.1760     0.1760     2000
