# Hindi Chunking with Transformers

## Credits & Acknowledgements
- This code is adapted from the [Hugging Face Token Classification Tutorial](https://huggingface.co/docs/transformers/tasks/token_classification).
- The dataset used is the Hindi HDTB from [Universal Dependencies](https://universaldependencies.org/treebanks/hi_hdtb/index.html).


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import re
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence, Value
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification,
    pipeline
)
import evaluate

In [None]:
def parse_conllu(file_path):
    sentences = []
    current_sent = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                if current_sent:
                    sentences.append(current_sent)
                    current_sent = []
                continue
                
            parts = line.split('\t')
            if len(parts) < 10:
                continue
                
            token = parts[1]
            misc = parts[9]
            
          
            chunk_id = None
            chunk_type = None
            
            if misc != '_':
                misc_parts = misc.split('|')
                for mp in misc_parts:
                    if mp.startswith('ChunkId='):
                        chunk_id = mp.split('=')[1]
                    elif mp.startswith('ChunkType='):
                        chunk_type = mp.split('=')[1]
            
            current_sent.append({
                'token': token,
                'chunk_id': chunk_id,
                'chunk_type': chunk_type
            })
            
        if current_sent:
            sentences.append(current_sent)
            
    return sentences

def get_bio_labels(sentence, include_chunk_type=False):
    labels = []
    prev_chunk_id = None
    
    for token_data in sentence:
        chunk_id = token_data['chunk_id']
        chunk_type = token_data['chunk_type']
        
        if not chunk_id or chunk_id == 'BLK':
            labels.append('O')
            prev_chunk_id = chunk_id
            continue
            
        chunk_class = re.sub(r'\d+$', '', chunk_id)
        
        prefix = 'B'
        if prev_chunk_id == chunk_id:
            prefix = 'I'
            
        label = f"{prefix}-{chunk_class}"
        
        if include_chunk_type and chunk_type:
            label += f"-{chunk_type}"
            
        labels.append(label)
        prev_chunk_id = chunk_id
        
    return labels

def create_hf_dataset(file_paths, include_chunk_type=False):
    dataset_dict = {}
    
 
    all_labels = set()
    
   
    temp_data = {}
    
    for split, path in file_paths.items():
        sentences = parse_conllu(path)
        split_data = {'tokens': [], 'ner_tags': []}
        
        for sent in sentences:
            tokens = [t['token'] for t in sent]
            labels = get_bio_labels(sent, include_chunk_type)
            
            split_data['tokens'].append(tokens)
            split_data['ner_tags'].append(labels)
            all_labels.update(labels)
            
        temp_data[split] = split_data
        
   
    label_list = sorted(list(all_labels))
    label_encoding = {l: i for i, l in enumerate(label_list)}
    
   
    final_datasets = {}
    for split, data in temp_data.items():
        tag_ids = [[label_encoding[l] for l in tags] for tags in data['ner_tags']]
        
        final_datasets[split] = Dataset.from_dict({
            'id': [str(i) for i in range(len(data['tokens']))],
            'tokens': data['tokens'],
            'ner_tags': tag_ids
        })
        
    return DatasetDict(final_datasets), label_list


In [None]:
data_files = {
    'train': 'hi_hdtb-ud-train.conllu',
    'validation': 'hi_hdtb-ud-dev.conllu',
    'test': 'hi_hdtb-ud-test.conllu'
}


dataset, label_list = create_hf_dataset(data_files, include_chunk_type=False)
print(f"Labels: {label_list}")


Labels: ['B-BLK', 'B-CCP', 'B-FRAGP', 'B-JJP', 'B-NEGP', 'B-NP', 'B-RBP', 'B-VGF', 'B-VGNF', 'B-VGNN', 'I-CCP', 'I-FRAGP', 'I-JJP', 'I-NP', 'I-RBP', 'I-VGF', 'I-VGNF', 'I-VGNN', 'O']


In [4]:
def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [5]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p, label_list):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


## Part 1: English DistilBERT
Using `distilbert-base-uncased`, which is a model pretrained on English and not Hindi, so it expected it won't properly recognise the Hindi text.


In [13]:
model_checkpoint_en = "distilbert-base-uncased"
tokenizer_en = AutoTokenizer.from_pretrained(model_checkpoint_en)

tokenized_datasets_en = dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer_en), 
    batched=True
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_en)

id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

model_en = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint_en, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

args_en = TrainingArguments(
    "distilbert-en-hindi-chunking",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer_en = Trainer(
    model=model_en,
    args=args_en,
    train_dataset=tokenized_datasets_en["train"],
    eval_dataset=tokenized_datasets_en["validation"],
    tokenizer=tokenizer_en,
    data_collator=data_collator,
    compute_metrics=lambda p: compute_metrics(p, label_list),
)

trainer_en.train() 
trainer_en.evaluate()




Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.7764,0.364113,0.73998,0.73911,0.739545,0.878638
2,0.3608,0.299497,0.786356,0.781965,0.784154,0.900275
3,0.321,0.281231,0.800568,0.795485,0.798019,0.906636


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.28123095631599426,
 'eval_precision': 0.8005679465183695,
 'eval_recall': 0.7954852742343189,
 'eval_f1': 0.7980185174264315,
 'eval_accuracy': 0.9066359996592555,
 'eval_runtime': 3.6985,
 'eval_samples_per_second': 448.561,
 'eval_steps_per_second': 28.12,
 'epoch': 3.0}

## Part 2: Multilingual Model
Now we use `bert-base-multilingual-cased` (mBERT) which supports Hindi.


In [8]:
model_checkpoint_multi = "bert-base-multilingual-cased"
tokenizer_multi = AutoTokenizer.from_pretrained(model_checkpoint_multi)

tokenized_datasets_multi = dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer_multi), 
    batched=True
)

data_collator_multi = DataCollatorForTokenClassification(tokenizer=tokenizer_multi)

model_multi = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint_multi, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

args_multi = TrainingArguments(
    "mbert-hindi-chunking",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer_multi = Trainer(
    model=model_multi,
    args=args_multi,
    train_dataset=tokenized_datasets_multi["train"],
    eval_dataset=tokenized_datasets_multi["validation"],
    tokenizer=tokenizer_multi,
    data_collator=data_collator_multi,
    compute_metrics=lambda p: compute_metrics(p, label_list),
)

trainer_multi.train()
trainer_multi.evaluate()


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2638,0.07341,0.955926,0.956264,0.956095,0.979953
2,0.0659,0.062204,0.965436,0.963847,0.964641,0.9839
3,0.0527,0.060244,0.965967,0.966081,0.966024,0.984439


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.06024380028247833,
 'eval_precision': 0.9659672015517545,
 'eval_recall': 0.9660807712656516,
 'eval_f1': 0.9660239830707736,
 'eval_accuracy': 0.9844393332765425,
 'eval_runtime': 3.9542,
 'eval_samples_per_second': 419.554,
 'eval_steps_per_second': 26.301,
 'epoch': 3.0}

## Part 3: Performance Analysis

### Results
- **English DistilBERT**: F1 Score ~ **0.80**
- **Multilingual BERT**: F1 Score ~ **0.966**

The **Multilingual BERT** model significantly outperforms the **English DistilBERT** model (0.966 vs 0.80 F1). 

The English model's vocabulary is not optimized for Hindi, it most likely treats many Hindi characters as unknown or splits them into many small, meaningless subwords. Making it difficult for the model to learn Hindi-specific features.

Multilingual BERT was pretrained on a large corpus including Hindi text, allowing it to learn the syntax and semantics of the language. The English model has never seen Hindi structure during pretraining.

Despite these disadvantages, the English model achieving ~80% F1 is surprisingly high. This might be due to the model learning positional patterns or memorizing specific frequent tokens that happen to be consistent, which could be explained by the Indo-European link between the 2 languages and the many English loanwords in Hindi.


## Part Bonus: Heads and Children


I implemented a joint classification approach where the labels were expanded to include the chunk type: B-NP became B-NP-head or B-NP-child. The joint model achieved a slightly higher F1 score (0.980) compared to the standard model 0.966. As we know, chunk types head vs child are not random. They follow strict syntactic rules within a chunk. My aproach was to force the model to distinguish betweeen them to prevent overfitting to simple boundery cues.

The main reason i chose this approach over a multi-head architecture was for simplicity because it captures deeper dependencies. It allows to use the standard `AutoModelForTokenClassification` without needing to implement a custom model.


In [None]:

dataset_bonus, label_list_bonus = create_hf_dataset(data_files, include_chunk_type=True)
print(f"Bonus Labels: {label_list_bonus}")


tokenizer_bonus = AutoTokenizer.from_pretrained(model_checkpoint_multi)

tokenized_datasets_bonus = dataset_bonus.map(
    lambda x: tokenize_and_align_labels(x, tokenizer_bonus), 
    batched=True
)

id2label_bonus = {i: label for i, label in enumerate(label_list_bonus)}
label2id_bonus = {label: i for i, label in enumerate(label_list_bonus)}

model_bonus = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint_multi, num_labels=len(label_list_bonus), id2label=id2label_bonus, label2id=label2id_bonus
)

args_bonus = TrainingArguments(
    "mbert-hindi-chunking-bonus",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer_bonus = Trainer(
    model=model_bonus,
    args=args_bonus,
    train_dataset=tokenized_datasets_bonus["train"],
    eval_dataset=tokenized_datasets_bonus["validation"],
    tokenizer=tokenizer_bonus,
    data_collator=data_collator_multi,
    compute_metrics=lambda p: compute_metrics(p, label_list_bonus),
)

trainer_bonus.train()
trainer_bonus.evaluate()


Bonus Labels: ['B-BLK-head', 'B-CCP-child', 'B-CCP-head', 'B-FRAGP-child', 'B-FRAGP-head', 'B-JJP-child', 'B-JJP-head', 'B-NEGP-head', 'B-NP-child', 'B-NP-head', 'B-RBP-child', 'B-RBP-head', 'B-VGF-child', 'B-VGF-head', 'B-VGNF-child', 'B-VGNF-head', 'B-VGNN-child', 'B-VGNN-head', 'I-CCP-child', 'I-CCP-head', 'I-FRAGP-child', 'I-FRAGP-head', 'I-JJP-child', 'I-JJP-head', 'I-NP-child', 'I-NP-head', 'I-RBP-child', 'I-RBP-head', 'I-VGF-child', 'I-VGF-head', 'I-VGNF-child', 'I-VGNF-head', 'I-VGNN-child', 'I-VGNN-head', 'O']




Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3966,0.10478,0.970995,0.970929,0.970962,0.971406
2,0.0973,0.091013,0.978516,0.975858,0.977185,0.977482
3,0.0757,0.08262,0.98019,0.978843,0.979516,0.979442


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.08262000232934952,
 'eval_precision': 0.9801900412987274,
 'eval_recall': 0.978842542918455,
 'eval_f1': 0.9795158286778399,
 'eval_accuracy': 0.9794417468836073,
 'eval_runtime': 4.3977,
 'eval_samples_per_second': 377.241,
 'eval_steps_per_second': 23.649,
 'epoch': 3.0}