### Notebook for fine-tuning the H2O-Danube-1.8b-base model using PyTorch Lightning

You can find more details about the model: 

**Research paper:** https://arxiv.org/abs/2401.16818

**Model Huggingface card:** https://huggingface.co/h2oai/h2o-danube-1.8b-base

#### Inference Notebook: https://www.kaggle.com/code/nischaydnk/h2o-danube-1-8b-llm-submission

#### Settings to get 0.962+ CV:
- Training sequence length: 1400 (note that the config cell below ships with `max_length = 1024`)
- Downsample the competition data: keep only a 0.75 fraction of the documents whose labels are all 'O'
- Use the MPware dataset shared here: https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/477989


# 🚚 Imports

In [1]:
import os
import gc
from tqdm.auto import tqdm
import json

import numpy as np 
import pandas as pd 
from itertools import chain

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
from datasets import concatenate_datasets,load_dataset,load_from_disk

from sklearn.metrics import log_loss

from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification


2024-03-02 20:46:26.223840: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-02 20:46:26.223978: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-02 20:46:26.347078: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
!pip install peft


Collecting peft
  Downloading peft-0.9.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.9.0-py3-none-any.whl (190 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m190.9/190.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.9.0


In [3]:
from datasets import Dataset, load_from_disk
import pickle
import re
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification


# ⚙️ Config

This notebook was run on my local instance; you will need to change the paths accordingly when running it on Kaggle.

In [4]:
class config:
    """Central experiment settings shared by every cell of the notebook.

    The class is used as a plain namespace: attributes are read (and a few,
    such as ``valid_stride`` and ``data_length``, are assigned) directly on
    the class rather than on an instance.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 69
    # dataset path 
    train_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    test_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    sample_submission_path = "/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv"
    save_dir="/kaggle/working/exp1"

    #tokenizer params
    downsample = 0.75  # fraction of all-'O' documents kept in the training folds (0 disables downsampling)
    truncation = True 
    padding = False #'max_length'
    max_length = 1024  # truncation length used when tokenizing training folds
    # Read by tokenize_row to pick the truncation length; the preprocessing
    # loop flips it per fold. Declared here so the attribute always exists.
    valid_stride = False
    freeze_layers = 0
    # model params
    model_name = "h2oai/h2o-danube-1.8b-base"
    
    # Note that 'O' is part of this list (it is the last entry).
    target_cols = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL','O']

    load_from_disk = None  # when None, the preprocessing cell re-tokenizes every fold
    #training params
    learning_rate = 1e-4
    batch_size = 1
    epochs = 3
    NFOLDS = 4
    trn_fold = 0  # fold held out for validation


# Fix the random seed through Lightning's helper for reproducibility; the
# call echoes the seed it applied, which is the value displayed below.
seed_everything(config.seed)

69

In [5]:
# Create the output directory; exist_ok makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(config.save_dir, exist_ok=True)

# 📊 Preprocessing

In [6]:
# Read the competition train/test documents. Context managers close the
# file handles as soon as the JSON has been parsed.
with open(config.train_dataset_path) as train_file:
    data = json.load(train_file)
with open(config.test_dataset_path) as test_file:
    test_data = json.load(test_file)

print('num_samples:', len(data))
print(data[0].keys())


num_samples: 6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])


In [7]:
# Label vocabulary: every distinct tag seen in the training documents,
# sorted so that the integer ids are stable across runs.
all_labels = sorted({tag for doc in data for tag in doc["labels"]})
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [8]:
# Load the tokenizer that matches the backbone checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Reuse the EOS token for padding so batches can be padded by the collator.
tokenizer.pad_token = tokenizer.eos_token
# Persist the tokenizer (with the pad-token setting) into the experiment directory.
tokenizer.save_pretrained(f'{config.save_dir}')


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

('/kaggle/working/exp1/tokenizer_config.json',
 '/kaggle/working/exp1/special_tokens_map.json',
 '/kaggle/working/exp1/tokenizer.model',
 '/kaggle/working/exp1/added_tokens.json',
 '/kaggle/working/exp1/tokenizer.json')

In [9]:
# One row per document; list-valued columns hold the per-token data.
df_train = pd.DataFrame(data)
df_train.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment:¬† Visualization¬†Reflection¬† Submitt...,"[Assignment, :, ¬† , Visualization, ¬†, Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [10]:
# Load the externally generated essays; the context manager closes the file
# handle once the JSON has been parsed.
with open('/kaggle/input/pii-mixtral8x7b-generated-essays/mpware_mixtral8x7b_v1.1-no-i-username.json') as mpware_file:
    df_mpware = pd.DataFrame(json.load(mpware_file))
# Give the generated essays document ids starting at 30000.
df_mpware['document'] = list(range(30000, 30000 + len(df_mpware)))
# Positional rename: requires the same number of columns as df_train at the
# time this cell runs, i.e. before the `fold` column is added to df_train.
df_mpware.columns = df_train.columns
# Fold -1 marks rows that do not belong to any of the 0..NFOLDS-1 folds.
df_mpware['fold'] = -1
df_mpware.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,fold
0,30000,Storytelling Challenge: Crafting Compelling Na...,"[Storytelling, Challenge, :, Crafting, Compell...","[True, False, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",-1
1,30001,Storytelling Challenge: The Power of Narrative...,"[Storytelling, Challenge, :, The, Power, of, N...","[True, False, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",-1
2,30002,Storytelling Challenge: The Power of Narrative...,"[Storytelling, Challenge, :, The, Power, of, N...","[True, False, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",-1


#### Creating Split to compare results with @conjuring92 validation folds as shared here https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/473139#2635230 

In [11]:
# Deterministic fold assignment from the document id. The modulus 4 is
# hard-coded here and equals config.NFOLDS; keep the two in sync.
df_train['fold'] = df_train['document'] % 4
df_train.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,fold
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",3
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",2
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",0


In [12]:
# Sanity check: same number of documents as loaded, plus the new `fold` column.
df_train.shape

(6807, 6)

In [13]:
def tokenize_row(example):
    """Rebuild a document's text, tokenize it and align labels to model tokens.

    The text is reconstructed by concatenating ``example["tokens"]`` and
    inserting a single space after every token whose ``trailing_whitespace``
    flag is set. Alongside it, two per-character lists are built: the label
    of the word each character belongs to (inserted spaces are ``"O"``) and
    the index of that word (inserted spaces are ``-1``).

    Relies on the notebook-level ``tokenizer``, ``config`` and ``label2id``.

    Args:
        example: mapping with ``tokens``, ``labels`` and
            ``trailing_whitespace`` sequences of equal length.

    Returns:
        dict with ``input_ids``, ``attention_mask``, ``offset_mapping``,
        ``labels`` (one id per model token), ``length`` (number of model
        tokens), ``target_num`` (number of words with a non-``"O"`` label),
        ``group`` (1 if ``target_num > 0`` else 0) and ``token_map`` (the
        per-character word index described above).
    """
    # Label id that the loss ignores; used where no label can be resolved so
    # that ``labels`` always stays aligned one-to-one with ``input_ids``.
    ignore_label_id = -100

    pieces = []
    token_map = []
    char_labels = []
    target_num = 0
    for idx, (t, l, ws) in enumerate(
        zip(example["tokens"], example["labels"], example["trailing_whitespace"])
    ):
        pieces.append(t)
        char_labels.extend([l] * len(t))
        token_map.extend([idx] * len(t))

        # config.target_cols contains "O" as well, so "O" has to be excluded
        # explicitly; otherwise every word would count as a target.
        if l != "O" and l in config.target_cols:
            target_num += 1

        if ws:
            pieces.append(" ")
            char_labels.append("O")
            token_map.append(-1)

    text = "".join(pieces)

    # The validation fold is truncated at 2048 tokens, training folds at
    # config.max_length. getattr keeps the function usable when the flag has
    # not been set on config yet.
    max_length = 2048 if getattr(config, "valid_stride", False) else config.max_length
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        padding='longest',
        truncation=True,
        max_length=max_length,
    )

    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Tokens with an empty (0, 0) span are labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # When the token span starts on whitespace, take the label of the
        # following character instead.
        if text[start_idx].isspace():
            start_idx += 1

        if start_idx < len(char_labels) and char_labels[start_idx] in label2id:
            token_labels.append(label2id[char_labels[start_idx]])
        else:
            # Past the end of the text, or a label missing from label2id:
            # emit the ignore id instead of silently dropping the position,
            # which would shift every later label against input_ids.
            token_labels.append(ignore_label_id)

    return {
        "input_ids": tokenized.input_ids,
        "attention_mask": tokenized.attention_mask,
        "offset_mapping": tokenized.offset_mapping,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0,
        "token_map": token_map,
    }

In [14]:
import pandas as pd

def downsample_df(train_df, percent):
    """Keep all documents with at least one non-'O' label and a sample of the rest.

    Args:
        train_df: frame with a ``labels`` column holding one list of label
            strings per document. It is not modified.
        percent: fraction (0..1) of the all-'O' documents to keep.

    Returns:
        A new frame: every document containing a non-'O' label followed by
        ``int(n_all_O * percent)`` randomly chosen all-'O' documents (fixed
        ``random_state=42``). The frame carries an extra boolean
        ``is_labels`` column and keeps the original index values.
    """
    # Work on a copy so the caller's frame does not silently gain a column
    # (and pandas does not warn about writing to a slice).
    flagged = train_df.copy()
    flagged['is_labels'] = flagged['labels'].apply(
        lambda labels: any(label != 'O' for label in labels)
    )

    # astype(bool) keeps the mask well-typed even for an empty frame.
    has_pii = flagged['is_labels'].astype(bool)
    true_samples = flagged[has_pii]
    false_samples = flagged[~has_pii]

    n_false_samples = int(len(false_samples) * percent)
    downsampled_false_samples = false_samples.sample(n=n_false_samples, random_state=42)

    return pd.concat([true_samples, downsampled_false_samples])


In [15]:
def add_token_indices(doc_tokens):
    """Return the positions ``0 .. len(doc_tokens) - 1`` as a list."""
    return list(range(len(doc_tokens)))

# Store, for every document, the list of its word positions.
df_train['token_indices'] = df_train['tokens'].apply(add_token_indices)

In [16]:
# Summary statistics of the numeric columns (document id and fold).
df_train.describe()

Unnamed: 0,document,fold
count,6807.0,6807.0
mean,14739.782283,1.499927
std,4920.715769,1.118182
min,7.0,0.0
25%,11061.5,1.0
50%,14818.0,1.0
75%,18974.0,3.0
max,22687.0,3.0


In [17]:
%%time
# Tokenize every fold (including fold -1) and persist it to disk.
if config.load_from_disk is None:
    for i in range(-1, config.NFOLDS):
        train_df = df_train[df_train['fold'] == i].reset_index(drop=True)

        # tokenize_row reads config.valid_stride to choose the truncation
        # length, so set it for every fold regardless of the downsample
        # setting (previously it was left unset or stale when downsample == 0).
        is_valid_fold = (i == config.trn_fold)
        config.valid_stride = is_valid_fold

        # Only training folds are downsampled; the validation fold is kept whole.
        if not is_valid_fold and config.downsample > 0:
            train_df = downsample_df(train_df, config.downsample)

        print(len(train_df))
        ds = Dataset.from_pandas(train_df)

        ds = ds.map(
            tokenize_row,
            batched=False,
            num_proc=2,
            desc="Tokenizing",
        )

        # NOTE: the path is save_dir immediately followed by "fold_<i>.dataset"
        # (no path separator). Any cell that loads the folds has to build the
        # path the same way, so it is kept unchanged.
        ds.save_to_disk(f"{config.save_dir}fold_{i}.dataset")
        # NOTE: the same pickle file is overwritten on every iteration, so it
        # ends up holding only the last fold's frame.
        with open(f"{config.save_dir}_pkl", "wb") as fp:
            pickle.dump(train_df, fp)
        print("Saving dataset to disk:", config.save_dir)

      
        

0
Saving dataset to disk: /kaggle/working/exp1
1698
   

Tokenizing #0:   0%|          | 0/849 [00:00<?, ?ex/s]

 

Tokenizing #1:   0%|          | 0/849 [00:00<?, ?ex/s]

Saving dataset to disk: /kaggle/working/exp1
1346
   

Tokenizing #0:   0%|          | 0/673 [00:00<?, ?ex/s]

 

Tokenizing #1:   0%|          | 0/673 [00:00<?, ?ex/s]

Saving dataset to disk: /kaggle/working/exp1
1324
   

Tokenizing #0:   0%|          | 0/662 [00:00<?, ?ex/s]

 

Tokenizing #1:   0%|          | 0/662 [00:00<?, ?ex/s]

Saving dataset to disk: /kaggle/working/exp1
1335
   

Tokenizing #0:   0%|          | 0/668 [00:00<?, ?ex/s]

 

Tokenizing #1:   0%|          | 0/667 [00:00<?, ?ex/s]

Saving dataset to disk: /kaggle/working/exp1
CPU times: user 5.18 s, sys: 3.4 s, total: 8.58 s
Wall time: 1min 50s


In [18]:
# `ds` is the dataset left behind by the last iteration of the
# preprocessing loop above; list the columns it ended up with.
ds[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'fold', 'token_indices', 'is_labels', '__index_level_0__', 'input_ids', 'attention_mask', 'offset_mapping', 'length', 'target_num', 'group', 'token_map'])

# 🔝 Competition Metrics

In [19]:
def freeze(module):
    """
    Disable gradient tracking for every parameter of ``module``.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

In [20]:
import sys
sys.path.append('/kaggle/input/piimetric')
from comp_metric import compute_metrics

In [21]:
import pandas as pd

def backwards_map_preds(sub_predictions, max_len, seq_idx=None, stride=None):
    """Trim special-token and overlap positions from one window's predictions.

    Args:
        sub_predictions: 3-d array indexed as ``[:, position, :]``.
        max_len: number of windows the document was split into; when it is
            1 nothing is trimmed.
        seq_idx: position of this window among the ``max_len`` windows.
            Defaults to the notebook-level variable ``i`` so existing call
            sites keep working.
        stride: number of overlapping positions shared with the previous
            window. Defaults to the notebook-level ``STRIDE``.

    Returns:
        The trimmed view of ``sub_predictions``.
    """
    if max_len == 1:
        # nothing to map backwards if sequence is too short to be split in the first place
        return sub_predictions

    if seq_idx is None:
        seq_idx = i  # legacy behaviour: window index read from a global

    if seq_idx == 0:
        # First sequence needs no SEP token (used to end a sequence)
        return sub_predictions[:, :-1, :]

    if stride is None:
        stride = STRIDE  # legacy behaviour: overlap size read from a global

    if seq_idx == max_len - 1:
        # End sequence drops the CLS token + stride tokens
        return sub_predictions[:, 1 + stride:, :]

    # Middle sequence drops the CLS token + stride tokens + SEP token
    return sub_predictions[:, 1 + stride:-1, :]

def backwards_map_(row_attribute, max_len, seq_idx=None, stride=None):
    """List counterpart of ``backwards_map_preds``: trim one window's attribute.

    Args:
        row_attribute: sliceable sequence (e.g. a list) for one window.
        max_len: number of windows the document was split into; when it is
            1 nothing is trimmed.
        seq_idx: position of this window among the ``max_len`` windows.
            Defaults to the notebook-level variable ``i`` so existing call
            sites keep working.
        stride: number of overlapping positions shared with the previous
            window. Defaults to the notebook-level ``STRIDE``.

    Returns:
        The trimmed slice of ``row_attribute``.
    """
    if max_len == 1:
        return row_attribute

    if seq_idx is None:
        seq_idx = i  # legacy behaviour: window index read from a global

    if seq_idx == 0:
        # First window: drop only the trailing position.
        return row_attribute[:-1]

    if stride is None:
        stride = STRIDE  # legacy behaviour: overlap size read from a global

    if seq_idx == max_len - 1:
        # Last window: drop the leading position and the overlap.
        return row_attribute[1 + stride:]

    # Middle window: drop the leading position, the overlap and the trailing position.
    return row_attribute[1 + stride:-1]

def predictions_to_df(preds, ds, id2label=id2label):
    """Convert per-token class ids into a word-level prediction table.

    Args:
        preds: iterable of 1-d tensors of predicted class ids, one per
            document, in the same order as the rows of ``ds``.
        ds: dataset exposing ``token_map``, ``offset_mapping``, ``tokens``
            and ``document`` columns.
        id2label: mapping from class id to label string.

    Returns:
        DataFrame with columns ``document``, ``token`` (word index),
        ``label``, ``token_str`` and ``row_id``. It holds one row per
        (document, word) pair whose first matching model token was predicted
        as something other than ``"O"``.
    """
    seen_pairs = set()
    docs, word_ids, word_labels, word_texts = [], [], [], []

    per_document = zip(preds, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"])
    for doc_pred, char_to_word, offsets, words, doc_id in per_document:
        # doc_pred = doc_pred.argmax(-1).cpu().detach().numpy()
        doc_pred = doc_pred.cpu().detach().numpy()
        n_chars = len(char_to_word)

        for class_id, (start_idx, end_idx) in zip(doc_pred, offsets):
            label_pred = id2label[class_id]

            # Tokens with an empty span carry no text.
            if start_idx + end_idx == 0:
                continue

            # Step over a leading inserted space.
            if char_to_word[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < n_chars and words[char_to_word[start_idx]].isspace():
                start_idx += 1

            if start_idx >= n_chars:
                break

            word_id = char_to_word[start_idx]
            if label_pred == "O" or word_id == -1:
                continue

            # Only the first non-"O" prediction for a word is kept.
            key = (doc_id, word_id)
            if key in seen_pairs:
                continue
            seen_pairs.add(key)

            docs.append(doc_id)
            word_ids.append(word_id)
            word_labels.append(label_pred)
            word_texts.append(words[word_id])

    df = pd.DataFrame({
        "document": docs,
        "token": word_ids,
        "label": word_labels,
        "token_str": word_texts
    })
    df["row_id"] = list(range(len(df)))

    return df


# 🧠 Model

In [22]:


def process_predictions(flattened_preds, threshold=0.9):
    """Turn per-document logits into class ids with a thresholded 'O' class.

    For every position, class 12 is only kept as a candidate when its softmax
    probability reaches ``threshold``; otherwise the best of classes 0..11 is
    chosen.

    Args:
        flattened_preds: iterable of 2-d logit tensors indexed as
            ``[position, class]``.
        threshold: minimum probability of class 12 for the plain argmax to
            be used.

    Returns:
        List of 1-d tensors of class ids, one per input tensor.
    """
    def _decode(logits):
        probs = torch.softmax(logits, dim=-1)
        best_overall = logits.argmax(-1)
        best_non_o = probs[:, :12].argmax(-1)
        o_prob = probs[:, 12]
        return torch.where(o_prob < threshold, best_non_o, best_overall)

    return [_decode(logits) for logits in flattened_preds]



In [23]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers.models.llama.modeling_llama import *
from transformers.modeling_outputs import TokenClassifierOutput

class LlamaForTokenClassification(LlamaPreTrainedModel):
    """Thin wrapper around ``LlamaModel`` that returns per-token hidden states.

    Despite its name, the class defines no classification layer:
    ``num_labels`` is only stored, and ``forward`` returns the first element
    of the backbone output. The classifier itself lives in the module that
    wraps this one.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token-embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token-embedding module with ``value``."""
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.Tensor:
        r"""
        Run the backbone and return the first element of its output.

        All arguments except ``labels`` are forwarded unchanged to
        ``self.model``. ``labels`` is accepted for signature compatibility
        but is not used here; no loss is computed.

        Returns:
            ``outputs[0]`` of the backbone — presumably the last hidden state
            of shape ``(batch, seq_len, hidden_size)``; TODO confirm against
            the ``LlamaModel`` documentation.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Indexing with 0 works for both tuple and dict-style outputs.
        sequence_output = outputs[0]

        return sequence_output


In [24]:
import random

class LSTMHead(nn.Module):
    """Bidirectional LSTM applied on top of the per-token hidden states.

    Args:
        in_features: size of each input feature vector.
        hidden_dim: hidden size of each LSTM direction.
        n_layers: number of stacked LSTM layers.

    Attributes:
        out_features: set to ``hidden_dim`` (the per-direction size). Because
            the LSTM is bidirectional, the tensor returned by ``forward`` has
            ``2 * hidden_dim`` features in its last dimension.
    """

    def __init__(self, in_features, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(in_features,
                            hidden_dim,
                            n_layers,
                            batch_first=True,
                            bidirectional=True,
                            # nn.LSTM applies dropout only between stacked
                            # layers; with a single layer a non-zero value
                            # has no effect and only triggers a warning.
                            dropout=0.1 if n_layers > 1 else 0.0)
        self.out_features = hidden_dim

    def forward(self, x):
        """Return the LSTM output sequence for ``x`` (batch-first layout)."""
        self.lstm.flatten_parameters()
        hidden, _ = self.lstm(x)
        return hidden

    
class PIIModel(pl.LightningModule):
    """Lightning module for token-level PII tagging.

    Architecture: LoRA-wrapped ``LlamaForTokenClassification`` backbone ->
    bidirectional ``LSTMHead`` -> linear layer with one output per entry of
    ``config.target_cols``.

    Args:
        config: settings namespace; reads ``model_name``, ``target_cols``,
            ``learning_rate``, ``data_length``, ``batch_size`` and ``epochs``.
        val_ds: validation dataset with the columns ``predictions_to_df``
            needs; its row order must match the validation dataloader's.
        true_val_df: reference labels passed to ``compute_metrics``.

    Also relies on the notebook-level ``id2label`` / ``label2id`` mappings
    and on ``process_predictions``, ``predictions_to_df`` and
    ``compute_metrics``.
    """

    def __init__(self,config, val_ds,true_val_df):
        super().__init__()
        self.cfg = config
        self.val_ds = val_ds
        self.true_val_df = true_val_df
        # Only used below to look up hidden_size for the head/classifier.
        self.model_config = AutoConfig.from_pretrained(
            config.model_name,
        )

        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
            }
        )

        self.transformers_model = LlamaForTokenClassification.from_pretrained(
        config.model_name, num_labels=len(self.cfg.target_cols), id2label=id2label, label2id=label2id, 
        )
        peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.0)
        self.transformers_model = get_peft_model(self.transformers_model, peft_config)
        self.transformers_model.gradient_checkpointing_enable()  
        self.transformers_model.print_trainable_parameters()
        # hidden_dim is half the backbone width so the bidirectional output
        # matches the input width of the linear classifier below.
        self.head = LSTMHead(in_features=self.model_config.hidden_size, hidden_dim=self.model_config.hidden_size//2, n_layers=1)
        self.output = nn.Linear(self.model_config.hidden_size, len(self.cfg.target_cols))

        self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
        # Filled by validation_step, consumed and reset in on_validation_epoch_end.
        self.validation_step_outputs = []


    def forward(self, input_ids, attention_mask,train):
        """Return ``(logits, None)``; ``train`` is accepted but unused.

        The second tuple element is a placeholder kept so that callers can
        continue to index the result with ``[0]``.
        """
        transformer_out = self.transformers_model(input_ids,attention_mask = attention_mask)#[0]
        sequence_output = self.head(transformer_out)
        logits = self.output(sequence_output)

        # The original returned the bare name `_`, which only resolves inside
        # an interactive session; return an explicit placeholder instead.
        return (logits, None)
    

    def training_step(self,batch,batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target = batch['labels'] 

        outputs = self(input_ids,attention_mask,train=True)
        output = outputs[0]
        loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))
        
        self.log('train_loss', loss , prog_bar=True)
        return {'loss': loss}
    
    def train_epoch_end(self,outputs):
        """Average the per-step training losses and print them.

        NOTE(review): this method name does not look like one of Lightning's
        hook names (`training_epoch_end` / `on_train_epoch_end`), so it is
        presumably never called by the trainer — confirm before relying on it.
        """
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        print(f'epoch {self.current_epoch} training loss {avg_loss}')
        return {'train_loss': avg_loss} 
    
    def validation_step(self,batch,batch_idx):
        """Compute the validation loss and stash logits for epoch-end scoring."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target = batch['labels'] 

        outputs = self(input_ids,attention_mask,train=False)
        output = outputs[0]

        loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))
        
        self.log('val_loss', loss , prog_bar=True)
        self.validation_step_outputs.append({"val_loss": loss, "logits": output, "targets": target})
        return {'val_loss': loss, 'logits': output,'targets':target}        

    
    def on_validation_epoch_end(self):
        """Score the collected validation predictions with the competition metric."""
        outputs = self.validation_step_outputs
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        # One logit tensor per document, in dataloader order.
        flattened_preds = [logit for batch in outputs for logit in batch['logits']]

        flattened_preds = process_predictions(flattened_preds)
        # print(flattened_preds.shape)
        pred_df = predictions_to_df(flattened_preds, self.val_ds)
        
        print(pred_df.shape)
        print(pred_df)
        
        # Reset the buffer so the next validation run starts empty.
        self.validation_step_outputs = []

        # print(output_val.shape)
        avg_score = compute_metrics(pred_df,self.true_val_df)
        f5_score = avg_score['ents_f5']
        # self.current_epoch replaces the lookup of a notebook-level `trainer`.
        print(f'epoch {self.current_epoch} validation loss {avg_loss}')
        print(f'epoch {self.current_epoch} validation scores {avg_score}')
        
        return {'val_loss': avg_loss,'val_f5':f5_score}
    
        
    def train_dataloader(self):
        """Return the dataloader stored on ``self._train_dataloader``."""
        return self._train_dataloader 
    
    def validation_dataloader(self):
        """Return the dataloader stored on ``self._validation_dataloader``.

        NOTE(review): Lightning's hook is named `val_dataloader`; this method
        is presumably only useful when called explicitly — confirm.
        """
        return self._validation_dataloader

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build optimizer parameter groups with separate learning rates.

        Args:
            encoder_lr: learning rate for the backbone parameters.
            decoder_lr: learning rate for every parameter outside
                ``transformers_model`` (head and classifier).
            weight_decay: decay applied to backbone parameters whose names
                match none of the ``no_decay`` patterns.

        Returns:
            List of three parameter-group dicts. ``configure_optimizers``
            does not use it.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.transformers_model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in self.transformers_model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.named_parameters() if "transformers_model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        """Create AdamW plus a per-step cosine schedule without warmup."""
        # Use the config passed to __init__ rather than the notebook global.
        optimizer = AdamW(self.parameters(), lr = self.cfg.learning_rate)

        epoch_steps = self.cfg.data_length
        batch_size = self.cfg.batch_size

        # The 0.0 factor disables warmup; raise it to enable a warmup phase.
        warmup_steps = 0.0 * epoch_steps // batch_size
        training_steps = self.cfg.epochs * epoch_steps // batch_size
        # scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,training_steps,-1)
        # scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=1e-6, power=3.0)
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps, num_cycles=1)
        
        lr_scheduler_config = {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            }

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}
    
    

In [25]:

# Pads each batch (inputs and labels) so its length is a multiple of 512.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=512)

In [26]:

def create_val_df(df, fold):
    """Build the token-level ground-truth table for one validation fold.

    Args:
        df: DataFrame with at least the columns 'fold', 'document', 'tokens'
            and 'labels', where 'tokens' and 'labels' hold equal-length lists
            per row.
        fold: value of the 'fold' column selecting the validation documents.

    Returns:
        DataFrame with columns ['row_id', 'document', 'token', 'label'] that
        contains only the tokens whose label is not 'O'.
        'token' is the running index of the token within its document and
        'row_id' is the row position of the token in the fold's full
        (all-labels) token table. The input frame is not modified.
    """
    fold_docs = df.loc[df['fold'] == fold, ['document', 'tokens', 'labels']]

    # One row per (document, token); a fresh 0..n-1 index provides row ids.
    token_level = (
        fold_docs
        .explode(['tokens', 'labels'])
        .reset_index(drop=True)
        .rename(columns={'tokens': 'token', 'labels': 'label'})
    )
    # Replace the token text by its position inside the document.
    token_level['token'] = token_level.groupby('document').cumcount()

    reference_df = (
        token_level[token_level['label'] != 'O']
        .reset_index()
        .rename(columns={'index': 'row_id'})
    )
    return reference_df[['row_id', 'document', 'token', 'label']].copy()
    

In [27]:
# Train the single fold selected by `config.trn_fold`; all other folds are skipped.
for fold in range(-1, config.NFOLDS):
    if fold != config.trn_fold:
        continue

    print(f"====== FOLD RUNNING {fold}======")

    # Training data = every saved fold except the validation one
    # (fold ids run from -1 to NFOLDS-1).
    train_ds_list = []
    for i in range(-1, config.NFOLDS):
        if i == fold:
            continue
        print(len(train_ds_list))
        train_ds_list.append(load_from_disk(f'{config.save_dir}fold_{i}.dataset'))

    keep_cols = {"input_ids", "attention_mask", "labels"}
    train_ds = concatenate_datasets(train_ds_list).sort("length")  # .select([i for i in range(30)]) for a quick debug run
    train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in keep_cols])

    # Load the validation fold once: `val_ds` keeps every column and is handed to
    # the model, `valid_ds` is the model-input-only view fed to the DataLoader.
    # Deriving one from the other keeps their row order identical.
    val_ds = load_from_disk(f'{config.save_dir}fold_{fold}.dataset').sort("length")
    valid_ds = val_ds.remove_columns([c for c in val_ds.column_names if c not in keep_cols])

    true_val_df = create_val_df(df_train, fold)

    # Read by the model (via its config) when it sizes the LR schedule.
    config.data_length = len(train_ds)
    config.len_token = len(tokenizer)
    # swa_callback = pl.callbacks.StochasticWeightAveraging(swa_epoch_start=0.8, swa_lrs=None,
    #                                                       annealing_epochs=1, annealing_strategy='cos',
    #                                                       avg_fn=None, device="cuda")
    print('Dataset Loaded....')
    print((train_ds[0].keys()))
    print((valid_ds)[0].keys())

    print("Generating Train DataLoader")
    train_dataloader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,
                                  num_workers=4, pin_memory=False, collate_fn=collator)

    print("Generating Validation DataLoader")
    validation_dataloader = DataLoader(valid_ds, batch_size=config.batch_size, shuffle=False,
                                       num_workers=4, pin_memory=False, collate_fn=collator)

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=8,
                                        verbose=True, mode="min")
    checkpoint_callback = ModelCheckpoint(monitor='val_loss',
                                          dirpath=config.save_dir,
                                          save_top_k=1,
                                          save_last=True,
                                          save_weights_only=True,
                                          filename=f'ckeckpoint_{fold}',
                                          verbose=True,
                                          mode='min')

    print("Model Creation")

    model = PIIModel(config, val_ds, true_val_df)
    # model.load_state_dict(torch.load('/home/nischay/PID/nbs/outputs2/exp12_baseline_debv3base_1024_extv1/ckeckpoint_0-v2.ckpt','cpu')['state_dict'])
    trainer = Trainer(max_epochs=config.epochs,
                      deterministic=True,
                      val_check_interval=0.5,
                      accumulate_grad_batches=2,
                      devices=[0],
                      # "16-mixed" is the spelling Lightning asks for in place of
                      # the legacy `precision=16`.
                      precision="16-mixed",
                      accelerator="gpu",
                      callbacks=[checkpoint_callback, early_stop_callback])
    trainer.fit(model, train_dataloader, validation_dataloader)

    print("prediction on validation data")

    # NOTE(review): `trainer` and `val_ds` are left alive on purpose so later
    # cells can still use them; `trainer` presumably still references the model,
    # so GPU memory may not be fully released here.
    del model, train_dataloader, validation_dataloader, train_ds, valid_ds
    gc.collect()
    torch.cuda.empty_cache()


0
1
2
3
Dataset Loaded....
dict_keys(['labels', 'input_ids', 'attention_mask'])
dict_keys(['labels', 'input_ids', 'attention_mask'])
Generating Train DataLoader
Generating Validation DataLoader
Model Creation


config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/3.66G [00:00<?, ?B/s]

trainable params: 1,597,440 || all params: 1,750,878,720 || trainable%: 0.09123647353484313


/opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:558: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/exp1 exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

(183, 5)
     document  token           label   token_str  row_id
0       10472      0        I-ID_NUM       Cardo       0
1       10472      1        I-ID_NUM     Dalisay       1
2       10472      3      B-USERNAME  Reflection       2
3       10472      4        B-ID_NUM           :       3
4       10472      6      B-USERNAME       After       4
..        ...    ...             ...         ...     ...
178     16612    100  I-URL_PERSONAL          to     178
179     16612    101      B-USERNAME  understand     179
180     16612    102  I-URL_PERSONAL          my     180
181     16612    104         B-EMAIL   challenge     181
182     16612    105      B-USERNAME           .     182

[183 rows x 5 columns]
epoch 0 validation loss 2.7717409133911133
epoch 0 validation scores {'ents_p': 0.0, 'ents_r': 0.0, 'ents_f5': 0.0, 'ents_per_type': {'ID_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'URL_PERSONAL': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'NAME_STUDE

Training: |          | 0/? [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Validation: |          | 0/? [00:00<?, ?it/s]

(1022, 5)
      document  token           label  token_str  row_id
0        10472      0  B-NAME_STUDENT      Cardo       0
1        10472      1  I-NAME_STUDENT    Dalisay       1
2         8236    133  B-NAME_STUDENT   Geovanny       2
3         8236    134  I-NAME_STUDENT      Lopez       3
4         5944      0  B-NAME_STUDENT  Margarita       4
...        ...    ...             ...        ...     ...
1017      6296      0  B-NAME_STUDENT     Carlos    1017
1018      6296      1  I-NAME_STUDENT  Hernandez    1018
1019      6296   1213  B-NAME_STUDENT       Mary    1019
1020      6296   1214  B-NAME_STUDENT    Murillo    1020
1021      6296   1760  B-NAME_STUDENT    Murillo    1021

[1022 rows x 5 columns]
epoch 0 validation loss 0.0025332930963486433
epoch 0 validation scores {'ents_p': 0.5724070450097848, 'ents_r': 0.8890577507598785, 'ents_f5': 0.8705357142857143, 'ents_per_type': {'ID_NUM': {'p': 0.41818181818181815, 'r': 0.8846153846153846, 'f5': 0.8482269503546099}, 'NAME_STUD



Validation: |          | 0/? [00:00<?, ?it/s]

(1019, 5)
      document  token           label                          token_str  \
0        10472      0  B-NAME_STUDENT                              Cardo   
1        10472      1  I-NAME_STUDENT                            Dalisay   
2         8236    133  B-NAME_STUDENT                           Geovanny   
3         8236    134  I-NAME_STUDENT                              Lopez   
4         5944      0  B-NAME_STUDENT                          Margarita   
...        ...    ...             ...                                ...   
1014     21720   1479  I-NAME_STUDENT                             Bocken   
1015      6784    733  B-URL_PERSONAL  https://soto.com/listregister.asp   
1016      6784   1705  B-NAME_STUDENT                           Cristian   
1017      6296      0  B-NAME_STUDENT                             Carlos   
1018      6296      1  I-NAME_STUDENT                          Hernandez   

      row_id  
0          0  
1          1  
2          2  
3          3  
4 



Validation: |          | 0/? [00:00<?, ?it/s]

(969, 5)
     document  token           label                          token_str  \
0       10472      0  B-NAME_STUDENT                              Cardo   
1       10472      1  I-NAME_STUDENT                            Dalisay   
2        8236    133  B-NAME_STUDENT                           Geovanny   
3        8236    134  I-NAME_STUDENT                              Lopez   
4        5944      0  B-NAME_STUDENT                          Margarita   
..        ...    ...             ...                                ...   
964     21720   1354  B-NAME_STUDENT                              Lammi   
965      6784    733  B-URL_PERSONAL  https://soto.com/listregister.asp   
966      6784   1705  B-NAME_STUDENT                           Cristian   
967      6296      0  B-NAME_STUDENT                             Carlos   
968      6296      1  I-NAME_STUDENT                          Hernandez   

     row_id  
0         0  
1         1  
2         2  
3         3  
4         4  
..    



Validation: |          | 0/? [00:00<?, ?it/s]

(843, 5)
     document  token           label                          token_str  \
0       10472      0  B-NAME_STUDENT                              Cardo   
1       10472      1  I-NAME_STUDENT                            Dalisay   
2        8236    133  B-NAME_STUDENT                           Geovanny   
3        8236    134  I-NAME_STUDENT                              Lopez   
4        5944      0  B-NAME_STUDENT                          Margarita   
..        ...    ...             ...                                ...   
838      7804   1297  B-NAME_STUDENT                            Giorgia   
839      7804   1298  I-NAME_STUDENT                            Piccolo   
840      6784    733  B-URL_PERSONAL  https://soto.com/listregister.asp   
841      6296      0  B-NAME_STUDENT                             Carlos   
842      6296      1  I-NAME_STUDENT                          Hernandez   

     row_id  
0         0  
1         1  
2         2  
3         3  
4         4  
..    



Validation: |          | 0/? [00:00<?, ?it/s]

(814, 5)
     document  token           label                          token_str  \
0       10472      0  B-NAME_STUDENT                              Cardo   
1       10472      1  I-NAME_STUDENT                            Dalisay   
2        8236    133  B-NAME_STUDENT                           Geovanny   
3        8236    134  I-NAME_STUDENT                              Lopez   
4        5944      0  B-NAME_STUDENT                          Margarita   
..        ...    ...             ...                                ...   
809      7804   1297  B-NAME_STUDENT                            Giorgia   
810      7804   1298  I-NAME_STUDENT                            Piccolo   
811      6784    733  B-URL_PERSONAL  https://soto.com/listregister.asp   
812      6296      0  B-NAME_STUDENT                             Carlos   
813      6296      1  I-NAME_STUDENT                          Hernandez   

     row_id  
0         0  
1         1  
2         2  
3         3  
4         4  
..    



Validation: |          | 0/? [00:00<?, ?it/s]

(838, 5)
     document  token           label                          token_str  \
0       10472      0  B-NAME_STUDENT                              Cardo   
1       10472      1  I-NAME_STUDENT                            Dalisay   
2        8236    133  B-NAME_STUDENT                           Geovanny   
3        8236    134  I-NAME_STUDENT                              Lopez   
4        5944      0  B-NAME_STUDENT                          Margarita   
..        ...    ...             ...                                ...   
833      7804   1297  B-NAME_STUDENT                            Giorgia   
834      7804   1298  I-NAME_STUDENT                            Piccolo   
835      6784    733  B-URL_PERSONAL  https://soto.com/listregister.asp   
836      6296      0  B-NAME_STUDENT                             Carlos   
837      6296      1  I-NAME_STUDENT                          Hernandez   

     row_id  
0         0  
1         1  
2         2  
3         3  
4         4  
..    



prediction on validation data
