In [1]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
INFERENCE_MAX_LENGTH = 3500


def tokenize(example, tokenizer):
    """Rebuild a document's text from its word tokens and tokenize it.

    Args:
        example: Mapping with two parallel sequences: ``"tokens"`` (the word
            strings) and ``"trailing_whitespace"`` (truthy when the word is
            followed by a space).
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, truncation=True,
            max_length=INFERENCE_MAX_LENGTH)``; its result must be a mapping.

    Returns:
        A dict holding every entry of the tokenizer result plus
        ``"token_map"``: a list with one entry per character of the rebuilt
        text, giving the index of the word that character belongs to, or -1
        for a space inserted between words.
    """
    pieces = []
    token_map = []

    for word_index, (word, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(word)
        # Every character of this word points back at the word's index.
        token_map += [word_index] * len(word)

        if has_space:
            # The separator belongs to no word, so it is marked with -1.
            pieces.append(" ")
            token_map.append(-1)

    # Only the tokenizer output is truncated to INFERENCE_MAX_LENGTH;
    # token_map always covers the complete rebuilt text.
    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {**encoded, "token_map": token_map}

2024-03-29 18:14:57.639972: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 18:14:57.640109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 18:14:57.780580: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the test documents. The context manager closes the file handle as soon
# as parsing is done (the previous bare open() left it to the garbage
# collector), and the encoding is pinned so the read does not depend on the
# platform's locale.
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    data = json.load(test_file)

# Create a column-oriented dataset from the loaded records
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

# Ensemble members mapped to their blending weight. The weights do not have to
# sum to 1: the combined prediction is divided by their total later on.
model_paths = {'/kaggle/input/pii-deberta-models/cuerpo-de-piiranha': 2/10,
    '/kaggle/input/pii-deberta-models/cabeza-del-piinguuino': 5/10,
    '/kaggle/input/pii-models/piidd-org-sakura': 2/10 }

first_model_path = list(model_paths.keys())[0]

# The dataset is tokenized once, with the first model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(first_model_path)

# Tokenize the dataset using the 'tokenize' function in two worker processes
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 2)

import gc
import torch
import numpy as np

from scipy.special import softmax


# One weighted probability array per ensemble member
all_preds = []

# Calculate the total weight (used to normalise the blend at the end)
total_weight = sum(model_paths.values())

# NOTE(review): `ds` was tokenized with the first model's tokenizer only, while
# each model below is paired with its own tokenizer for padding. This is only
# sound if all members share the same vocabulary - confirm.
for model_path, weight in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )
    predictions = trainer.predict(ds).predictions

    # Turn the raw scores into per-label probabilities, then scale them by
    # this model's weight.
    weighted_predictions = softmax(predictions, axis = -1) * weight
    all_preds.append(weighted_predictions)

    # Drop the references, collect cyclic garbage, and only then ask PyTorch to
    # return cached GPU blocks. Emptying the cache before collecting cannot
    # release memory that is still held by not-yet-collected objects.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

# Every model must produce the same (documents, positions, labels) shape for
# the element-wise blend below to be meaningful; fail loudly otherwise.
prediction_shapes = {member_preds.shape for member_preds in all_preds}
assert len(prediction_shapes) == 1, f"ensemble members disagree on prediction shape: {prediction_shapes}"

# Calculate the weighted average of predictions
weighted_average_predictions = np.sum(all_preds, axis=0) / total_weight

   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

In [3]:
# Label names are read from the config of the last model in `model_paths`.
# This is the same file the previous code reached through the loop variable
# leaked from the inference cell, but it no longer depends on that hidden state.
label_config_path = Path(list(model_paths)[-1]) / "config.json"
with open(label_config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Locate the "O" (non-PII) class from the label map instead of assuming it
# sits at a fixed column.
o_label_ids = [int(label_id) for label_id, name in id2label.items() if name == "O"]
if len(o_label_ids) != 1:
    raise ValueError(f"expected exactly one 'O' label in {label_config_path}, found {len(o_label_ids)}")
o_label_id = o_label_ids[0]

preds = weighted_average_predictions.argmax(-1)

# Best label when "O" is not allowed: mask the "O" column and take the argmax.
non_o_scores = weighted_average_predictions.copy()
non_o_scores[:, :, o_label_id] = -np.inf
preds_without_O = non_o_scores.argmax(-1)
del non_o_scores

O_preds = weighted_average_predictions[:, :, o_label_id]

# Keep "O" only when the ensemble is at least this confident in it; otherwise
# fall back to the best non-"O" label.
threshold = 0.99
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

triplets = []        # (document, token index) pairs in emission order
seen_pairs = set()   # same pairs, for constant-time duplicate checks
document, token, label, token_str = [], [], [], []

# For each prediction, token mapping, offsets, tokens, and document in the dataset
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    # Pair each sub-token prediction with its character offsets. zip stops at
    # the shorter sequence, so predictions beyond the last offset are ignored.
    for token_pred, (start_idx, end_idx) in zip(p, offsets):

        # A (0, 0) offset carries no text (presumably a special token - confirm); skip it
        if start_idx + end_idx == 0:
            continue

        label_pred = id2label[str(token_pred)]  # Predicted label name

        # If the sub-token starts on an inserted space, step onto the next character
        if token_map[start_idx] == -1:
            start_idx += 1

        # Skip over characters that belong to whitespace-only tokens ("\n\n")
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran off the end of the document: nothing left to label here.
        # (This line used to be the truncated name `brea`, which raised
        # NameError as soon as it was reached.)
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]  # Index of the word this sub-token starts in

        # Ignore "O" predictions and positions that map to an inserted space
        if label_pred != "O" and token_id != -1:
            triplet = (doc, token_id)

            # Only the first label predicted for a given word is kept
            if triplet not in seen_pairs:
                seen_pairs.add(triplet)
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)


In [4]:
# Columns written to the submission file, in this order
submission_columns = ["row_id", "document", "token", "label"]

# One row per labelled word; 'row_id' is a running counter starting at 0
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)
df = df.assign(row_id=list(range(len(df))))

# Show at most the first 100 rows
display(df.head(100))

# Write the submission file ('token_str' is kept for inspection only)
df.loc[:, submission_columns].to_csv("submission.csv", index=False)

# May the winds of fortune guide ye to untold discoveries!

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
