## **RHETORICAL ROLE CLASSIFICATION (USING PRETRAINED MODEL)**

***Installing Required Libraries: Transformers and PyTorch***

In [None]:
!pip install transformers torch


***Manual label mapping for rhetorical roles***

In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Readable rhetorical-role names, listed in label-id order (position == id).
# NOTE(review): presumably this order matches the label ids emitted by the
# checkpoint loaded below — confirm against model.config.id2label.
_ROLE_NAMES_BY_ID = [
    "Facts",
    "Ruling by Lower Court",
    "Argument",
    "Statute",
    "Precedent",
    "Ratio of the decision",
    "Ruling by Present Court",
    "Petitioner Argument",
    "Respondent Argument",
    "None",
    "Analysis",
    "Evidence",
    "Other",
]

# Integer label id -> readable role name.
custom_id2label = dict(enumerate(_ROLE_NAMES_BY_ID))


***Load model and tokenizer***

In [None]:
# Identifier of the pretrained rhetorical-role checkpoint passed to from_pretrained.
model_name = "engineersaloni159/LegalRo-BERt_for_rhetorical_role_labeling"
# Load the tokenizer and the sequence-classification model from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

 ***Create classification pipeline***

In [None]:

# Bundle the loaded model and tokenizer into a "text-classification" pipeline
# that is called directly on raw sentences in the classification cell.
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

***Example list of legal sentences***

In [None]:

# Hand-written sample sentences that the classification cell feeds to the pipeline.
examples = [
    "The petitioner was arrested without any warrant.",
    "The main issue before the court is whether the arrest was legal.",
    "The respondent argued that the arrest was justified under Section 41 of CrPC.",
    "The court held that the arrest was unconstitutional under Article 21.",
    "This case relies on the precedent set in D.K. Basu vs State of West Bengal."
]


***Classify each sentence and print the result***

In [None]:

# Classify every example sentence and print "[role | score] → sentence".
for sentence in examples:
    # Called without extra flags, the pipeline returns a one-element list holding
    # the highest-scoring class, so the deprecated return_all_scores=True plus a
    # manual max() over all scores is not needed.
    best = nlp(sentence)[0]

    raw_label = str(best['label'])
    label_suffix = raw_label.split("_")[-1]
    if label_suffix.isdigit():
        # Generic "LABEL_<id>" style name: translate the id via the manual mapping.
        readable_label = custom_id2label.get(int(label_suffix), "Unknown")
    else:
        # The label is not of the form "<prefix>_<id>" (e.g. the checkpoint
        # already ships readable names): show it as-is instead of crashing on int().
        readable_label = raw_label

    print(f"[{readable_label} | {best['score']}] → {sentence}")


# **RHETORICAL ROLE CLASSIFICATION (FINETUNED MODEL)**

***Installing Required Libraries: Transformers, Datasets, Torch, Scikit-learn***


In [None]:
!pip install transformers datasets torch scikit-learn -q


***Imports***

In [None]:

import pandas as pd
import torch
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

***Load CSV data***

In [None]:

# Read the annotated sentences; later cells use its 'sentence' and 'label' columns.
# NOTE(review): the path is under /content/drive and no drive.mount(...) call is
# visible in this notebook — presumably Google Drive is mounted beforehand; confirm.
df = pd.read_csv("/content/drive/MyDrive/WORKSHOP_2025/rhetorical_data.csv")


***Preprocess: Clean labels and sentences***

In [None]:

# Keep only the two columns used for training, drop rows missing either one,
# and normalise both to lowercase, whitespace-trimmed strings.
# Both columns go through astype(str) so a label column that pandas parsed as a
# non-string dtype does not break the .str accessor; assign() builds a new frame
# instead of writing into the sliced one.
df = (
    df[['sentence', 'label']]
    .dropna()
    .assign(
        sentence=lambda frame: frame['sentence'].astype(str).str.lower().str.strip(),
        label=lambda frame: frame['label'].astype(str).str.lower().str.strip(),
    )
)

***Define valid labels***

In [None]:

# Rhetorical roles kept for fine-tuning; rows carrying any other label are dropped.
valid_labels = ['facts', 'argument', 'precedent', 'ratio of the decision',
                'ruling by lower court', 'ruling by present court', 'statute']

# .copy() makes the filtered frame independent of the original, so the later
# `df['label_id'] = ...` assignment does not trigger SettingWithCopyWarning.
df = df[df['label'].isin(valid_labels)].copy()

# Fail here with a clear message rather than later inside the encoder or the split.
if df.empty:
    raise ValueError("No rows left after filtering to valid_labels; check the values in the 'label' column.")


***Encode labels and save the label encoder***

In [None]:

import os

# Map each text label to an integer id and store it next to the text label.
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])

# Persist the fitted encoder so the inference section can turn ids back into labels.
label_encoder_path = "/content/drive/MyDrive/WORKSHOP_2025/Legal_BERT/label_encoder.pkl"
# Nothing earlier in this notebook creates the Legal_BERT folder, so create it
# here; otherwise open(..., "wb") raises FileNotFoundError on a first run.
os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)
print("Label encoder saved to:", label_encoder_path)

***Train-test split***

In [None]:

# Hold out 30% of the rows for validation. The split is stratified on label_id
# and uses a fixed random_state so it is the same on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['sentence'], df['label_id'], test_size=0.3, stratify=df['label_id'], random_state=42
)

***Load tokenizer and tokenize the train/validation texts***

In [None]:

tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Both splits are encoded with identical settings: truncate to at most 128
# tokens, pad, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors='pt')
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
val_encodings = tokenizer(val_texts.tolist(), **encode_options)


***Custom Dataset class and Prepare datasets***

In [None]:


class LegalDataset(torch.utils.data.Dataset):
    """Dataset pairing pre-computed tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to an indexable collection with one
            entry per example.
        labels: Sequence of integer labels, indexed by position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding fields of example `idx` plus a long-typed 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


# .tolist() turns the label Series into plain lists, so LegalDataset's
# labels[idx] lookup is positional rather than based on the DataFrame index.
train_dataset = LegalDataset(train_encodings, train_labels.tolist())
val_dataset = LegalDataset(val_encodings, val_labels.tolist())

***Load model***

In [None]:

# Load the Legal-BERT checkpoint for sequence classification, sized to one
# output per class known to the fitted label encoder.
model = BertForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=len(label_encoder.classes_)
)


***Define training arguments***

In [None]:

# Training configuration: evaluate and checkpoint once per epoch for 5 epochs,
# with a batch size of 16 per device for both training and evaluation.
training_args = TrainingArguments(
    output_dir="/content/legalbert_results",
    # eval_strategy and save_strategy are both set to "epoch".
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="/content/legalbert_logs",
    # Reload the best checkpoint after training, ranked by the "macro_f1" value
    # that compute_metrics returns.
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
)

***Metric function***

In [None]:

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; argmax over the last axis gives the predicted id).

    Returns:
        dict: {"accuracy": ..., "macro_f1": ...}
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average='macro'),
    }


***Create Trainer***

In [None]:

# Wire the model, training configuration, both datasets and the metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

***Train model, evaluate, and save model and tokenizer***

In [None]:

# Fine-tune the model.
trainer.train()


# Evaluate on the validation dataset given to the Trainer and print the metrics.
results = trainer.evaluate()
print("Evaluation Results:", results)


# Save the model and tokenizer to the same folder that holds label_encoder.pkl,
# which is where the inference section loads them from.
model_path = "/content/drive/MyDrive/WORKSHOP_2025/Legal_BERT"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print("Model and tokenizer saved to:", model_path)


# **Using the Trained Model for Rhetorical Role Classification**

***Imports***

In [None]:

import pandas as pd
import re
import torch
import pickle
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from google.colab import files

***Load the trained model and tokenizer***

In [None]:

# Reload the fine-tuned model and tokenizer from the folder they were saved to.
model_path = "/content/drive/MyDrive/WORKSHOP_2025/Legal_BERT"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch the model to evaluation mode before running inference.
model.eval()

***Load label encoder***

In [None]:

# Restore the fitted label encoder; its classes_ map predicted ids back to labels.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# a label_encoder.pkl that this notebook (or another trusted source) produced.
with open(f"{model_path}/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

***Preprocessing: cleaning the text***

In [None]:
def clean_text(text):
    """Normalise a sentence into the lowercase, punctuation-free form used for matching.

    Steps, in order: lowercase, drop non-ASCII characters, drop bracketed
    citations such as "[2019] 3 scc 45", drop punctuation, collapse whitespace.

    Args:
        text: Any value; it is converted with str() first.

    Returns:
        str: The cleaned text, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    text = str(text).lower()
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII
    text = re.sub(r'\[\d+\]\s*\d+\s*[a-zA-Z]+\s*\d*', '', text)  # Remove citations
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Whitespace is collapsed last: removing a free-standing punctuation mark
    # ("a - b") or a trailing one ("done !") leaves double or trailing spaces
    # that would survive if the normalisation ran before this point.
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    return text

***Upload raw sentences file***


In [None]:

print("Upload plain raw text file (one sentence per line):")
uploaded = files.upload()

# Pick the first uploaded .txt file; raise a readable error instead of a bare
# StopIteration when none of the uploaded files has that extension.
raw_file = next((name for name in uploaded if name.endswith(".txt")), None)
if raw_file is None:
    raise ValueError("No .txt file was uploaded; expected a plain-text file with one sentence per line.")

# One sentence per non-blank line, with surrounding whitespace removed.
with open(raw_file, 'r', encoding='utf-8') as f:
    raw_sentences = [line.strip() for line in f if line.strip()]
print(f"Loaded {len(raw_sentences)} raw sentences")

***Clean raw sentences***

In [None]:


# Clean every raw sentence once and keep only results with at least 3 characters.
# Because short results are dropped, this list can be shorter than raw_sentences.
cleaned_raw_sentences = [
    cleaned for cleaned in map(clean_text, raw_sentences) if len(cleaned) >= 3
]

***Predict rhetorical roles***


In [None]:

# Pair every original sentence with its cleaned form *before* filtering.
# Zipping the unfiltered raw_sentences against the already-filtered cleaned list
# attaches the wrong original text to every sentence that follows a dropped one.
sentence_pairs = [(raw, clean_text(raw)) for raw in raw_sentences]
# Same rule as the cleaning cell: skip sentences whose cleaned form is under 3 characters.
sentence_pairs = [(raw, cleaned) for raw, cleaned in sentence_pairs if len(cleaned) >= 3]

# Each entry is (cleaned sentence, predicted label, original sentence).
predictions = []
for original_sent, cleaned_sent in sentence_pairs:
    inputs = tokenizer(cleaned_sent, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    # The highest-logit class id is mapped back to its text label via the encoder.
    pred_id = torch.argmax(outputs.logits, dim=1).item()
    pred_label = label_encoder.classes_[pred_id]
    predictions.append((cleaned_sent, pred_label, original_sent))

In [None]:
# Tabulate the predictions; "cleaned_sentence" is the key the merge cell joins on.
pred_df = pd.DataFrame(predictions, columns=["cleaned_sentence", "predicted_label", "original_sentence"])

***Upload annotated file***

In [None]:

print("Upload annotated file (sentence<TAB>label format):")
uploaded = files.upload()

# Pick the first uploaded .txt file; raise a readable error instead of a bare
# StopIteration when none of the uploaded files has that extension.
anno_file = next((name for name in uploaded if name.endswith(".txt")), None)
if anno_file is None:
    raise ValueError("No .txt file was uploaded; expected an annotated file in sentence<TAB>label format.")

# Each kept entry is (cleaned sentence, lowercase true label, original sentence).
annotated_data = []
skipped_lines = 0
with open(anno_file, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            continue  # blank line, nothing to parse
        parts = stripped.split('\t')
        if len(parts) != 2:
            # Not exactly "sentence<TAB>label": count it so the drop is visible.
            skipped_lines += 1
            continue
        sentence, label = parts
        cleaned = clean_text(sentence)
        # Same minimum-length rule as applied to the raw sentences.
        if len(cleaned) >= 3:
            annotated_data.append((cleaned, label.strip().lower(), sentence.strip()))

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that were not in sentence<TAB>label format")

anno_df = pd.DataFrame(annotated_data, columns=['cleaned_sentence', 'true_label', 'original_sentence_anno'])


***Merge on cleaned sentences***

In [None]:

# Join predictions to annotations on the cleaned sentence text.
# Repeated cleaned sentences in pred_df would make this a many-to-many join and
# multiply rows (2 x 2 duplicates -> 4 evaluation rows). Predictions for
# identical cleaned text are expected to be identical (the model runs in eval
# mode), so one prediction row per cleaned sentence is enough; every matching
# annotated row is then counted exactly once.
pred_unique = pred_df.drop_duplicates(subset="cleaned_sentence")
merged_df = pd.merge(
    pred_unique,
    anno_df,
    on="cleaned_sentence",
    how="inner",
    validate="one_to_many",  # fails loudly if the left keys are not unique
)
if merged_df.empty:
    raise ValueError("No matching cleaned sentences found between prediction and annotated data.")

***Evaluation***

In [None]:

# True and predicted labels of the merged sentences.
y_true = merged_df['true_label']
y_pred = merged_df['predicted_label']

accuracy = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average='macro')

# Report every label that occurs as either a true or a predicted value, sorted.
labels = sorted(set(y_true).union(y_pred))
report = classification_report(
    y_true,
    y_pred,
    labels=labels,
    target_names=labels,
    zero_division=0,
)

***Display all sentences with true and predicted labels***

In [None]:

# Overall metrics followed by the per-class report.
summary_lines = [
    "\nEvaluation Results:",
    f"Accuracy: {accuracy:.4f}",
    f"Macro F1 Score: {macro_f1:.4f}",
    "\nClassification Report:\n",
    report,
]
print("\n".join(str(entry) for entry in summary_lines))


# One entry per merged sentence: original text, true label, predicted label.
print("\nSentence-wise Predictions:\n")
divider = "-" * 60
sentence_rows = zip(
    merged_df['original_sentence'],
    merged_df['true_label'],
    merged_df['predicted_label'],
)
for original_text, true_label, predicted_label in sentence_rows:
    print(f"Sentence: {original_text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print(divider)
