# Indic-BERT Error Type Classifier (Hindi)

This notebook fine-tunes `ai4bharat/indic-bert` to classify the type of introduced error between a corrupted input sentence and its clean output target.

Labels:
- identity: input == output
- word: word-level changes (insertion/deletion/reordering)
- character: character-level changes (insert/delete/swap chars)
- both: both word-level and character-level changes

Data sources (searched in this order):
- generated_hindi_dataset.csv (from the generation notebook)
- combined_test_dataset.csv (if you previously merged)
- test.csv (if present)

In [1]:
# Install dependencies (run once)
import sys, subprocess
def pip_install(pkg):
    try:
        __import__(pkg.split('==')[0].split('[')[0])
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])

for pkg in [
    'pandas',
    'scikit-learn',
    'datasets',
    'transformers',
    'torch',
    'tqdm'
]:
    pip_install(pkg)

print('All packages are ready.')

  from .autonotebook import tqdm as notebook_tqdm


All packages are ready.


In [2]:
# Imports
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
import numpy as np
from tqdm import tqdm
import itertools
from collections import Counter

MODEL_CHECKPOINT = 'ai4bharat/indic-bert'
OUTPUT_DIR = 'models/indic-bert-error-classifier'
MAX_LENGTH = 192
RANDOM_SEED = 42
LABELS = ['identity', 'word', 'character', 'both']
label2id = {l:i for i,l in enumerate(LABELS)}
id2label = {i:l for l,i in label2id.items()}

print('Config ready.')

Config ready.


In [5]:
# Load dataset from available CSV
candidate_paths = [
    
    'combined_test_dataset.csv'
]
data_path = None
for p in candidate_paths:
    if os.path.exists(p):
        data_path = p
        break

if data_path is None:
    raise FileNotFoundError('No dataset CSV found. Expected one of: ' + ', '.join(candidate_paths))

print(f'Using dataset: {data_path}')
df = pd.read_csv(data_path, encoding='utf-8')
# Expect columns: input, output
expected_cols = ['input', 'output']
if list(df.columns) != expected_cols:
    # Try to coerce if two columns exist
    if len(df.columns) >= 2:
        df = df.iloc[:, :2]
        df.columns = expected_cols
    else:
        raise ValueError(f'Unexpected columns in {data_path}: {list(df.columns)}')

print(df.head(2))
print('Rows:', len(df))

Using dataset: combined_test_dataset.csv
                                               input  \
0                                    शिक्षा क्या है?   
1  किसी भी कार्य को सीख लेने की क्रिया को शिक्षा ...   

                                              output  
0                                    शिक्षा क्या है?  
1  किसी भी कार्य को सीख लेने की क्रिया को शिक्षा ...  
Rows: 10599


In [6]:
# Heuristics to derive error labels (identity, word, character, both)
import difflib

def derive_error_label(inp: str, out: str) -> str:
    if inp == out:
        return 'identity'
    in_tok = inp.split()
    out_tok = out.split()

    # Word-level difference detection
    word_diff = False
    if len(in_tok) != len(out_tok):
        word_diff = True
    else:
        # Detect token reordering or insert/delete via sequence matcher
        sm = difflib.SequenceMatcher(a=in_tok, b=out_tok)
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            if tag in ('delete', 'insert'):
                word_diff = True
                break
        # If tokens are same bag but different order, count as word-level
        if not word_diff and Counter(in_tok) == Counter(out_tok) and in_tok != out_tok:
            word_diff = True

    # Character-level difference detection (aligned by position)
    char_diff = False
    for i in range(min(len(in_tok), len(out_tok))):
        if in_tok[i] != out_tok[i]:
            char_diff = True
            break

    if word_diff and char_diff:
        return 'both'
    if word_diff:
        return 'word'
    if char_diff:
        return 'character'
    # Fallback
    return 'identity'

tqdm.pandas(desc='Deriving labels')
df['label_name'] = df.progress_apply(lambda r: derive_error_label(str(r['input']), str(r['output'])), axis=1)
df['label'] = df['label_name'].map(label2id)
print(df['label_name'].value_counts())
df.head()

Deriving labels: 100%|██████████| 10599/10599 [00:00<00:00, 31839.96it/s]

label_name
identity     5063
both         3787
character    1723
word           26
Name: count, dtype: int64





Unnamed: 0,input,output,label_name,label
0,शिक्षा क्या है?,शिक्षा क्या है?,identity,0
1,किसी भी कार्य को सीख लेने की क्रिया को शिक्षा ...,किसी भी कार्य को सीख लेने की क्रिया को शिक्षा ...,identity,0
2,ये केवल किताबी ज्ञान अर्जन तक ही सिमित नहीं है।,ये केवल किताबी ज्ञान अर्जन तक ही सीमित नहीं है।,character,2
3,यह कई विभागों में बांटा जा सकता है।,यह कई विभागों में बांटा जा सकता है।,identity,0
4,"जैसे - व्यावहारिक शिक्षा, किताबी शिक्षा अथवा अ...","जैसे - व्यावहारिक शिक्षा, किताबी शिक्षा अथवा आ...",both,3


In [7]:
# Train/validation split (stratified)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED, stratify=df['label'])
print('Train:', train_df.shape, 'Val:', val_df.shape)

train_ds = Dataset.from_pandas(train_df[['input','output','label']].reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df[['input','output','label']].reset_index(drop=True))

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess(batch):
    return tokenizer(batch['input'], batch['output'], truncation=True, max_length=MAX_LENGTH)

train_tokenized = train_ds.map(preprocess, batched=True)
val_tokenized = val_ds.map(preprocess, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

num_labels = len(LABELS)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
print('Model loaded.')

Train: (9539, 4) Val: (1060, 4)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 9539/9539 [00:07<00:00, 1272.79 examples/s]
Map: 100%|██████████| 1060/1060 [00:00<00:00, 2465.73 examples/s]
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.


In [9]:
# Training setup
metric_name = 'f1'
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1w = f1_score(labels, preds, average='weighted')
    f1m = f1_score(labels, preds, average='macro')
    return {'accuracy': acc, 'f1_weighted': f1w, 'f1_macro': f1m}

args = TrainingArguments(
    OUTPUT_DIR,
    eval_strategy='epoch',  # Changed from evaluation_strategy
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

metrics = trainer.evaluate()
metrics

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.3367,0.467444,0.717925,0.703259,0.478222
2,0.2265,0.262089,0.910377,0.904538,0.643863
3,0.1429,0.266564,0.915094,0.914407,0.658724


{'eval_loss': 0.26656442880630493,
 'eval_accuracy': 0.9150943396226415,
 'eval_f1_weighted': 0.9144068041833044,
 'eval_f1_macro': 0.6587239035530444,
 'eval_runtime': 6.8344,
 'eval_samples_per_second': 155.098,
 'eval_steps_per_second': 4.975,
 'epoch': 3.0}

In [10]:
# Detailed evaluation: classification report and confusion matrix
preds = trainer.predict(val_tokenized)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)
print(classification_report(y_true, y_pred, target_names=LABELS, digits=4))
print('Confusion Matrix:')
print(confusion_matrix(y_true, y_pred))

# Save the best model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f'Model saved to: {OUTPUT_DIR}')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

    identity     0.9863    0.9941    0.9902       506
        word     0.0000    0.0000    0.0000         3
   character     0.7268    0.7733    0.7493       172
        both     0.9101    0.8813    0.8954       379

    accuracy                         0.9151      1060
   macro avg     0.6558    0.6621    0.6587      1060
weighted avg     0.9141    0.9151    0.9144      1060

Confusion Matrix:
[[503   0   3   0]
 [  0   0   2   1]
 [  7   0 133  32]
 [  0   0  45 334]]
Model saved to: models/indic-bert-error-classifier
Model saved to: models/indic-bert-error-classifier


In [None]:
# Inference helper
import torch
from transformers import pipeline

def predict_error_type(inp: str, out: str):
    # Get the device the model is on
    device = next(model.parameters()).device
    
    # Encode as sentence pair using the tokenizer directly
    enc = tokenizer(inp, out, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    
    # Move inputs to the same device as the model
    enc = {k: v.to(device) for k, v in enc.items()}
    
    with torch.no_grad():
        logits = model(**enc).logits
    probs = logits.softmax(dim=-1).cpu().numpy()[0]
    pred_id = int(probs.argmax())
    return { 'label': id2label[pred_id], 'scores': {LABELS[i]: float(p) for i,p in enumerate(probs)} }

# Quick demo
sample = df.sample(3, random_state=RANDOM_SEED)
for _, r in sample.iterrows():
    res = predict_error_type(str(r['input']), str(r['output']))
    print('Input :', r['input'][:120])
    print('Output:', r['output'][:120])
    print('True label:', r['label_name'], ' Pred:', res['label'])
    print('Scores:', res['scores'])
    print()

Device set to use cuda:0


RuntimeError: Expected all tensors to be on the same device, but got index is on cpu, different from other tensors on cuda:0 (when checking argument in method wrapper_CUDA__index_select)