In [2]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Checkpoint used for both the tokenizer and the classifier, and the CSV
# holding the evaluation rows.
model_name = "roberta-large-mnli"
test_filename = "dataset/legal_text_classifcation_test.csv"

tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # evaluation mode: this script only runs inference

df = pd.read_csv(test_filename)

# Empty CSV cells are read as float NaN, which the tokenizer rejects as a
# non-string input. Replace them with empty strings and coerce every premise
# to str so the classification loop cannot fail on a missing case text.
df['input_text'] = df['case_text'].fillna("").astype(str)

# Outcome labels the classifier chooses between; each one is turned into an
# NLI hypothesis and scored against the case text.
candidate_labels = [
    "affirmed",
    "applied",
    "approved",
    "cited",
    "considered",
    "discussed",
    "distinguished",
    "followed",
    "referred to",
    "related",
]

# Build hypothesis templates
def construct_hypothesis(label):
    """Return the NLI hypothesis sentence for a candidate label.

    The label is inserted verbatim into the fixed template
    ``"This case was <label>."``.
    """
    template = "This case was {}."
    return template.format(label)

# Perform zero-shot classification
#
# Each case text (premise) is paired with one hypothesis per candidate label,
# and the label whose hypothesis gets the highest entailment probability wins.

# The hypotheses depend only on the candidate labels, so build them once
# instead of once per row.
hypotheses = [construct_hypothesis(label) for label in candidate_labels]

# Read the entailment class index from the checkpoint's own label map rather
# than hard-coding it. Fall back to 2 (the index the original code assumed)
# when no label name starting with "entail" is present.
entailment_index = next(
    (
        index
        for name, index in model.config.label2id.items()
        if str(name).lower().startswith("entail")
    ),
    2,
)

predictions = []
for premise in tqdm(df['input_text'], desc="Zero-Shot Evaluation"):
    # Score all (premise, hypothesis) pairs in a single batched forward pass.
    # Padding to the longest pair in the batch (not to 512 tokens) avoids
    # spending compute on padding for short case texts; padded positions are
    # masked out by the attention mask.
    inputs = tokenizer(
        [premise] * len(hypotheses),
        hypotheses,
        return_tensors="pt",
        truncation='only_first',  # shorten the premise, never the hypothesis
        padding=True,
        max_length=512,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # One entailment probability per candidate label.
        label_scores = torch.softmax(logits, dim=1)[:, entailment_index].tolist()

    # Pick label with highest entailment score (first one wins on ties)
    predicted_label = candidate_labels[label_scores.index(max(label_scores))]
    predictions.append(predicted_label)

# Attach the predicted labels to the frame, score them against the gold
# labels, and export the per-row results.
df['zero_shot_prediction'] = predictions

gold_labels = df['case_outcome']
predicted_labels = df['zero_shot_prediction']
acc = accuracy_score(gold_labels, predicted_labels)
print(f"\nZero-Shot Accuracy: {acc:.4f}")

# Keep only the text, the gold label and the prediction in the exported file.
result_columns = ['input_text', 'case_outcome', 'zero_shot_prediction']
results = df[result_columns]
results.to_csv("roberta_zero_shot_results.csv", index=False)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Zero-Shot Evaluation: 100%|██████████| 2481/2481 [15:03<00:00,  2.75it/s]


Zero-Shot Accuracy: 0.2701



