# Fine-Tuning Preparation
Based on the analysis, the confidence score is correlated with the number of labels predicted. This means that as the number of predicted labels increases, the confidence score increases as well. However, this only applies after the prediction.
The good news is that the analysis shows only a few label types with a poor ratio of high-confidence to low-confidence scores, while some low-frequency labels have a good ratio. So, to increase the confidence score even when label frequency is low, the training data needs to be augmented. There are two things that can be done for this augmentation:
- get contextual texts that correspond to poor-ratio labels.
- synthesize training data for rare texts.

------------
-----------
## Data Preparation

### Import Libraries

In [11]:
import pandas as pd
from gliner import GLiNER
from torch.utils.data import DataLoader
import torch
import os
import json
from sklearn.model_selection import train_test_split

### Import Model

In [12]:
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Load the pretrained GLiNER checkpoint, then move its wrapped torch module
# onto the selected device. The final line is left as a bare expression so the
# notebook echoes the module structure.
model = GLiNER.from_pretrained("urchade/gliner_multi")
model.model.to(device)

Using device: cpu


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 30448.67it/s]


SpanModel(
  (token_rep_layer): Encoder(
    (bert_layer): Transformer(
      (model): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(250105, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): Linear(in_features=768, out_features=768, bias=True)
                  (key_proj): Linear(in_features=768, out_features=768, bias=True)
                  (value_proj): Linear(in_features=768, out_features=768, bias=True)
                  (pos_dropout): Dropout(p=0.1, inplace=False)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
                (output): DebertaV2SelfOutput(
  

### Data Preparation

In [None]:
# config
# --- paths ---
data_path = "your_data.json"            # JSON file opened by the data-loading cell
output_dir = "../gliner_fine_tuned"     # directory that receives the saved checkpoints

# --- training hyperparameters ---
num_epoch = 10          # number of passes over the training loader
batch_size = 2          # samples per DataLoader batch
learning_rate = 2e-5    # learning rate handed to the optimizer

In [13]:
# load data
# Read the file at `data_path` as UTF-8 text and decode its content as JSON.
with open(data_path, mode="r", encoding="utf-8") as json_file:
    raw_data = json.loads(json_file.read())

NameError: name 'data_path' is not defined

In [None]:
# data validation
# Keep only samples that carry a string 'text' and at least one well-formed
# entity. An entity is well-formed when it is a dict with 'start', 'end' and
# 'label' keys whose integer offsets satisfy 0 <= start < end <= len(text).
valid_data = []
for sample in raw_data:
    # Malformed records (non-dict sample, missing keys, non-string text,
    # non-list entities) are skipped instead of raising, so a single bad
    # record cannot abort the whole validation pass.
    if not isinstance(sample, dict):
        continue
    text = sample.get('text')
    entities = sample.get('entities')
    if not isinstance(text, str) or not isinstance(entities, list):
        continue

    text_len = len(text)
    # Offsets are type-checked before being compared: JSON may deliver null or
    # strings, and comparing those with integers raises TypeError.
    valid_entities = [
        entity for entity in entities
        if isinstance(entity, dict)
        and 'label' in entity
        and isinstance(entity.get('start'), int)
        and isinstance(entity.get('end'), int)
        and 0 <= entity['start'] < entity['end'] <= text_len
    ]

    # Samples left without any valid entity are dropped.
    if valid_entities:
        valid_data.append({
            'text': text,
            'entities': valid_entities
        })

In [None]:
# data splitting (train/val/test)
# Target shares of the full dataset: 20% test, 15% validation, the rest train.
test_fraction = 0.2
val_fraction = 0.15

train_val_data, test_data = train_test_split(
    valid_data,
    test_size=test_fraction,    # 20% for test
    random_state=42
)

# The second split only sees the (1 - test_fraction) remainder, so the
# validation share has to be rescaled against that remainder:
# 0.15 / 0.80 = 0.1875 of the remainder is about 15% of the full dataset.
train_data, val_data = train_test_split(
    train_val_data,
    test_size=val_fraction / (1 - test_fraction),    # 15% of the total for validation
    random_state=42
)

# display
print(f"Total samples: {len(valid_data)}")
print(f"Train: {len(train_data)} | Val: {len(val_data)} | Test: {len(test_data)}")

In [14]:
# data loader
def _keep_samples(batch):
    """Return the batch unchanged, as the list of sample dicts it was built from."""
    return batch


# The samples are dicts holding a text and a variable-length list of entity
# dicts. PyTorch's default collation would merge them field by field (and can
# fail on the unequal entity lists), whereas the training cell iterates each
# batch as a list of sample dicts. An identity collate_fn keeps that structure.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_keep_samples,
)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size,
    collate_fn=_keep_samples,
)

NameError: name 'batch_size' is not defined

In [None]:
# optimizer
# AdamW over the parameters of the wrapped torch module, using the configured
# learning rate.
trainable_params = model.model.parameters()
optimizer = torch.optim.AdamW(params=trainable_params, lr=learning_rate)

### Training

In [None]:
# Fine-tuning loop: for each of `num_epoch` epochs, run one pass over
# `train_loader` with gradient updates, one pass over `val_loader` without
# gradients, write a per-epoch checkpoint, and overwrite "best_model" whenever
# the average validation loss improves.
#
# NOTE(review): each batch is iterated as a sequence of sample dicts with
# 'text' and 'entities' keys - confirm the loaders actually yield that shape.
# NOTE(review): `model.tokenize(texts, spans_list)` and `outputs.loss` are
# assumed to exist in the installed GLiNER version - TODO confirm.
os.makedirs(output_dir, exist_ok=True)  # create the checkpoint directory if it is missing
print(f"Starting training for {num_epoch} epochs...")

# Starts at +inf so the first finite validation loss is recorded as the best.
best_val_loss = float('inf')
for epoch in range(num_epoch):
    # Training
    model.model.train()
    train_loss = 0  # running sum of per-batch losses for this epoch
    
    for batch in train_loader:
        texts = [item['text'] for item in batch]
        spans_list = [item['entities'] for item in batch]
        
        inputs = model.tokenize(texts, spans_list)
        # Move tensor values to the selected device; other values pass through unchanged.
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
                 for k, v in inputs.items()}
        
        outputs = model.model(**inputs)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the step so the next batch starts from zero.
        optimizer.zero_grad()
        
        train_loss += loss.item()
    
    # Validation
    model.model.eval()
    val_loss = 0  # running sum of per-batch validation losses
    with torch.no_grad():  # no gradient tracking during evaluation
        for batch in val_loader:
            texts = [item['text'] for item in batch]
            spans_list = [item['entities'] for item in batch]
            
            inputs = model.tokenize(texts, spans_list)
            inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
                     for k, v in inputs.items()}
            
            outputs = model.model(**inputs)
            val_loss += outputs.loss.item()
    
    # Calculate average losses
    # Mean loss per batch; raises ZeroDivisionError if a loader has no batches.
    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    
    # Save checkpoint
    # One directory per epoch, numbered from 1.
    epoch_dir = os.path.join(output_dir, f"epoch_{epoch+1}")
    model.save_pretrained(epoch_dir)
    
    # Save best model
    # "best_model" is overwritten each time the average validation loss drops.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(os.path.join(output_dir, "best_model"))
    
    print(f"Epoch {epoch+1}/{num_epoch} | "
          f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

### Save Model

In [None]:
# Write the model in its current state to "<output_dir>/final_model", then
# report the output directory.
final_model_dir = os.path.join(output_dir, "final_model")
model.save_pretrained(final_model_dir)
print("Training complete! Saved to:", output_dir)