<a href="https://colab.research.google.com/github/nawabro/ML/blob/master/TrainPII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'pii-detection-removal-from-educational-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F66653%2F7500999%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240422%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240422T103212Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9fd817028d385aafb3ef6283bdff0f4823b9cb389e917c3095f9a411cb5da38e6a016b1ea3c8685d759a286502a7bdcff921dac71cf895c3e59bd9aa13a6deee9b503ce8a5f21291f3c40a383d58ed34440dec7d48127b5b30eeccccbabd437d51d42f0be3ddef9d7d10d7c482ee81e09829df67fb267ec785b4e49cedb338c6b7bdead0a1aa69ba894c62758459c5719a1f065c0c1a7866ad2f2b45e692b723e17cdc1e54143e43c01109d887b63bd3cf7c6f1d6001284d5ec3ed5faae9199604aa7770a4dd83e6b7a44d5782edfbd623fd2d4a0f54d63e36380a437ef80ca5266acb0b06dc09025337e7e7fbd1536a60771f55ebe5be7476a7edddab1056b1'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading pii-detection-removal-from-educational-data, 22403094 bytes compressed
Downloaded and uncompressed: pii-detection-removal-from-educational-data
Data source import complete.


In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
data_test = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')

In [4]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2TokenizerFast, GPT2ForTokenClassification, Trainer, TrainingArguments

In [5]:
####def add_tokens_to_vocab(tokens, tokenizer):
####    # Initialize a list to store the tokens that are not in the vocabulary
####    tokens_not_in_vocab = []
####
####    # Check if each token is in the vocabulary
####    for token in tokens:
####        # Convert the token to lowercase, since the GPT-2 tokenizer is case-sensitive
####        token_lower = token.lower()
####
####        # Try to convert the token to its corresponding ID
####        token_id = tokenizer.convert_tokens_to_ids(token_lower)
####
####        # If the token is not in the vocabulary, add it to the list of tokens to be added
####        if token_id == tokenizer.unk_token_id:
####            tokens_not_in_vocab.append(token)
####
####    # If there are any tokens not in the vocabulary, add them as special tokens
####    if tokens_not_in_vocab:
####        special_tokens_dict = {'additional_special_tokens': tokens_not_in_vocab}
####        tokenizer.add_special_tokens(special_tokens_dict)
####
####    return tokenizer

In [6]:
features = data['tokens'].tolist()[:10]
targets = data['labels'].tolist()[:10]

In [7]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW

# Prepare your data
###features = [
###    ['I', 'am', 'Nawaraj'],
###    ['He', 'is', 'John'],
###    ['She', 'is', 'Emily'],
###    ['We', 'are', 'students'],
###    ['They', 'are', 'teachers']
###]
###targets = [
###    ['O', 'O', 'B-NAME_STUDENT'],
###    ['O', 'O', 'B-NAME_STUDENT'],
###    ['O', 'O', 'B-NAME_STUDENT'],
###    ['O', 'O', 'O'],
###    ['O', 'O', 'O']]


In [8]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

new_features = []
new_targets = []

for feature, target in zip(features, targets):
    new_feature = []
    new_target = []

    for word, label in zip(feature, target):
        # Tokenize the word
        subwords = tokenizer.tokenize(word)
        for subword in subwords:
            new_feature.append(subword)
            new_target.append(label)

    new_features.append(new_feature)
    new_targets.append(new_target)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
len(new_features),len(new_targets)

(10, 10)

In [10]:
def create_windows_and_pad(features, window_size):
    # Flatten the list
    flat_features = [item for sublist in features for item in sublist]

    # Create sliding window of size window_size without overlap
    new_features = [flat_features[i:i+window_size] for i in range(0, len(flat_features), window_size)]

    # Pad 'O' if length of sublist is less than window_size
    new_features_padded = [feature + ['O'] * (window_size - len(feature)) for feature in new_features]

    return new_features_padded

In [11]:
new_features,new_targets = create_windows_and_pad(new_features,512)[:-1],create_windows_and_pad(new_targets,512)[:-1]

In [12]:
len(new_features),len(new_targets)

(15, 15)

In [13]:
# Split data into training and validation sets
train_features, val_features, train_targets, val_targets = train_test_split(new_features, new_targets, test_size=0.2, random_state=42)

In [14]:
label_map = {'B-EMAIL': 0,
                'B-ID_NUM': 1,
                'B-NAME_STUDENT': 2,
                'B-PHONE_NUM': 3,
                'B-STREET_ADDRESS': 4,
                'B-URL_PERSONAL': 5,
                'B-USERNAME': 6,
                'I-ID_NUM': 7,
                'I-NAME_STUDENT': 8,
                'I-PHONE_NUM': 9,
                'I-STREET_ADDRESS': 10,
                'I-URL_PERSONAL': 11,
                'O': 12}
train_targets = [[label_map[label] for label in target] for target in train_targets]
val_targets = [[label_map[label] for label in target] for target in val_targets]

In [15]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_features, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_features, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

In [16]:
# Create your datasets
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_targets)
val_dataset = CustomDataset(val_encodings, val_targets)



In [17]:
# Load the model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

# Freeze all layers except the last one
##for param in model.parameters():
##    param.requires_grad = False
##for param in model.classifier.parameters():
##    param.requires_grad = True

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Prepare for training
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)



In [19]:
# Training loop
for epoch in range(3):
    for batch in DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        total_loss = 0
        num_batches = 0
        for batch in DataLoader(val_dataset, batch_size=16, shuffle=False, drop_last=True):  # Set drop_last to True
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:  # Add a check to prevent division by zero
            print('No batches were created during validation. Consider reducing your batch size.')
        else:
            avg_loss = total_loss / num_batches
            print(f'Validation loss: {avg_loss}')

    model.train()
    model.save_pretrained('./model')

No batches were created during validation. Consider reducing your batch size.
No batches were created during validation. Consider reducing your batch size.
No batches were created during validation. Consider reducing your batch size.


In [20]:
# Load the trained model
model = BertForTokenClassification.from_pretrained('./model')
model.to(device)

# Prepare a sentence for prediction
sentence = ['Hello', 'my', 'name', 'is', 'Nawaraj','Anusueeya']
encoding = tokenizer(sentence, is_split_into_words=True, return_tensors='pt')
print(encoding)

# Move the encoding to the same device as the model
encoding = {k: v.to(device) for k, v in encoding.items()}

# Predict the labels
with torch.no_grad():
    output = model(**encoding)

# Get the predicted labels
predictions = torch.argmax(output.logits, dim=-1)
# Print out the label map and predictions
print("Label map:", label_map)
print("Predictions:", predictions)

# Remove the predictions for [CLS] and [SEP] tokens
# Truncate the predictions to match the length of the input sentence
# Get the predicted labels
predictions = torch.argmax(output.logits, dim=-1)

# Convert the label IDs to labels
label_map_inverse = {v: k for k, v in label_map.items()}
predicted_labels = [label_map_inverse[id] for id in predictions[0].tolist()]

# Merge the labels of subwords
merged_labels = []
for word, label in zip(sentence, predicted_labels):
    if not word.startswith('##'):
        merged_labels.append(label)

print(merged_labels)

{'input_ids': tensor([[  101,  7592,  2026,  2171,  2003,  6583, 11872,  3501,  2019,  2271,
          5657,  3240,  2050,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Label map: {'B-EMAIL': 0, 'B-ID_NUM': 1, 'B-NAME_STUDENT': 2, 'B-PHONE_NUM': 3, 'B-STREET_ADDRESS': 4, 'B-URL_PERSONAL': 5, 'B-USERNAME': 6, 'I-ID_NUM': 7, 'I-NAME_STUDENT': 8, 'I-PHONE_NUM': 9, 'I-STREET_ADDRESS': 10, 'I-URL_PERSONAL': 11, 'O': 12}
Predictions: tensor([[ 3,  4,  5,  3,  3,  1,  3,  1,  1, 10,  1,  1,  3,  5]])
['B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-PHONE_NUM', 'B-PHONE_NUM', 'B-ID_NUM']
