#### First, import everything.
Using:
```
LabelEncoder to encode the target NER tags as integer ids.
Hugging Face models.
```



In [86]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertModel, AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from torch.optim import AdamW
from seqeval.metrics import classification_report
import json
import joblib
# !pip install transformers torch
# !pip install seqeval==0.0.10
# !pip install huggingface_hub[hf_xet]

In [338]:
def _to_df(path):
    """Parse a four-column, whitespace-separated token file into a DataFrame.

    Each non-blank line must hold ``token pos chunk ner``. A blank line ends
    the current sentence (the running ``sentence_id`` is incremented for every
    blank line), and lines starting with ``-DOCSTART-`` are skipped.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        DataFrame with columns ``sentence_id``, ``token``, ``pos``, ``chunk``
        and ``ner`` -- one row per token. The columns are present even when
        the file contains no tokens.

    Raises:
        ValueError: If a data line does not contain exactly four fields; the
            message names the file and the 1-based line number.
    """
    columns = ["sentence_id", "token", "pos", "chunk", "ner"]
    rows = []
    sentence_id = 0

    with open(path, encoding="utf-8") as file_text:
        for line_no, raw_line in enumerate(file_text, start=1):
            line = raw_line.strip()

            if not line:
                sentence_id += 1
                continue

            if line.startswith("-DOCSTART-"):
                continue

            fields = line.split()
            if len(fields) != 4:
                # Fail with context instead of a bare tuple-unpacking error.
                raise ValueError(
                    f"{path}:{line_no}: expected 4 whitespace-separated fields "
                    f"(token pos chunk ner), got {len(fields)}: {line!r}"
                )

            token, pos, chunk, ner = fields
            rows.append({"sentence_id": sentence_id, "token": token, "pos": pos, "chunk": chunk, "ner": ner})

    # Passing `columns` keeps the schema stable for an empty file, so later
    # groupby("sentence_id") calls do not fail on a missing column.
    return pd.DataFrame(rows, columns=columns)

In [339]:
# Parse the three raw split files from the _resources directory, in the
# order train, test, valid.
train_df, test_df, valid_df = (
    _to_df(os.path.join("_resources", file_name))
    for file_name in ("train.txt", "test.txt", "valid.txt")
)

In [340]:
def check_duplicates(df: pd.DataFrame):
    """Count sentences that exactly repeat an earlier sentence.

    A sentence is the ordered sequence of its (token, pos, chunk, ner) rows;
    the first occurrence of each distinct sequence is not counted.
    """
    def as_signature(sentence):
        # Hashable fingerprint of the whole sentence: one tuple per row.
        return tuple(tuple(row) for row in sentence.values)

    per_sentence = df.groupby("sentence_id")[['token', 'pos', 'chunk', 'ner']]
    signatures = per_sentence.apply(as_signature)
    repeated = signatures.duplicated()
    return repeated.sum()
					

In [341]:
def drop_sentence_duplicates(df: pd.DataFrame):
    """Return ``df`` without sentences that repeat an earlier sentence.

    Sentences are compared on their full (token, pos, chunk, ner) row
    sequence. The first occurrence is kept and the result gets a fresh
    0..n-1 index.
    """
    def as_signature(sentence):
        # Hashable fingerprint of the whole sentence: one tuple per row.
        return tuple(tuple(row) for row in sentence.values)

    signatures = df.groupby("sentence_id")[['token', 'pos', 'chunk', 'ner']].apply(as_signature)

    # Sentence ids whose signature has not been seen before.
    first_seen_ids = signatures.drop_duplicates().index
    keep_mask = df["sentence_id"].isin(first_seen_ids)

    return df.loc[keep_mask].reset_index(drop=True)


In [342]:
# Report, per split, how many sentences repeat an earlier sentence of that split.
print(f"""Duplicate (Sentences) in Training: {check_duplicates(train_df)}
Duplicate (Sentences) in Validation: {check_duplicates(valid_df)}
Duplicate (Sentences) in Testing: {check_duplicates(test_df)}""")

Duplicate (Sentences) in Training: 1348
Duplicate (Sentences) in Validation: 179
Duplicate (Sentences) in Testing: 266


In [343]:
# Replace each split with its de-duplicated version (first occurrence kept).
train_df, valid_df, test_df = map(
    drop_sentence_duplicates, (train_df, valid_df, test_df)
)

In [344]:
# Show the column names of the training frame after de-duplication.
train_df.columns.tolist()

['sentence_id', 'token', 'pos', 'chunk', 'ner']

In [346]:
def add_features(df):
    """Add orthographic feature columns derived from ``token`` to ``df`` in place.

    Columns written (all integers): ``is_capitalized``, ``is_all_caps``,
    ``is_all_lower``, ``token_len``, ``contains_digit``, ``contains_dash``.
    Nothing is returned; the caller's frame is modified.
    """
    tokens = df['token'].str
    derived = {
        'is_capitalized': tokens[0].str.isupper(),   # first character is upper-case
        'is_all_caps': tokens.isupper(),
        'is_all_lower': tokens.islower(),
        'token_len': tokens.len(),
        'contains_digit': tokens.contains(r'\d', regex=True),
        'contains_dash': tokens.contains('-'),
    }
    for column, values in derived.items():
        df.loc[:, column] = values.astype(int)

In [347]:
# add_features writes its columns into the frame it is given, so each split
# is updated in place.
for split_df in (train_df, valid_df, test_df):
    add_features(split_df)

In [None]:
def encode_labels(df_list):
    """Fit one LabelEncoder on the ``ner`` tags of all frames and encode them.

    Every frame in ``df_list`` gets a new ``ner_id`` column (written in
    place). The fitted encoder is returned so ids can be decoded later.
    """
    encoder = LabelEncoder()

    # Fit on the union of tags so every frame shares the same id mapping.
    tag_pool = pd.concat([frame["ner"] for frame in df_list]).unique()
    encoder.fit(tag_pool)

    for frame in df_list:
        frame["ner_id"] = encoder.transform(frame["ner"])

    return encoder

def add_sentence_position(df):
    """Return a copy of ``df`` with a ``pos_in_sentence`` column.

    For the i-th row (1-based) of a sentence with n rows the value is
    ``i / n``, so the last token of every sentence gets 1.0.

    The computation is vectorized rather than using ``groupby.apply`` with a
    per-sentence lambda: recent pandas versions deprecate ``apply`` operating
    on the grouping column (which would eventually drop ``sentence_id`` from
    the result), and the per-group Python call is slow on many sentences.

    Args:
        df: Token-level frame with a ``sentence_id`` column.

    Returns:
        New frame, rows grouped by ascending ``sentence_id`` (original order
        kept within a sentence), with a fresh 0..n-1 index. The input frame
        is not modified.
    """
    # Stable sort: groups come out in ascending id order while the token
    # order inside each sentence is untouched.
    ordered = df.sort_values("sentence_id", kind="mergesort")
    by_sentence = ordered.groupby("sentence_id")

    rank_in_sentence = by_sentence.cumcount() + 1
    sentence_length = by_sentence["sentence_id"].transform("size")

    return ordered.assign(
        pos_in_sentence=rank_in_sentence / sentence_length
    ).reset_index(drop=True)

def encode_for_transformers(df, tokenizer, max_len=50, debug_samples=3, label_all_subwords=True):
    """Tokenize every sentence of ``df`` and align word-level columns to sub-word tokens.

    Args:
        df: Token-level frame with ``sentence_id``, ``token``, ``ner_id``,
            ``pos_in_sentence`` and the six feature columns listed in
            ``feature_pads`` below.
        tokenizer: Callable tokenizer whose result supports
            ``word_ids(batch_index=...)`` and item assignment, and which
            offers ``convert_ids_to_tokens`` (used for debug output only).
        max_len: Length every sequence is padded / truncated to. Words cut
            off by truncation get no label.
        debug_samples: Number of leading sentences for which the alignment is
            printed. ``0`` disables the output; the default keeps the
            previous behaviour of printing three.
        label_all_subwords: When True (default, previous behaviour) every
            sub-word piece carries its word's label, so continuation pieces
            of a ``B-`` word are also tagged ``B-``. When False only the
            first piece of each word is labelled and the rest get -100.

    Returns:
        The tokenizer's encoding object with extra entries: ``labels``
        (long, -100 where no word applies), ``pos_in_sentence`` (float32,
        0.0 where no word applies) and one float32 tensor per feature.
    """
    # Value written at positions that map to no word (special tokens, padding).
    feature_pads = {
        "is_capitalized": -1,
        "is_all_caps": -1,
        "is_all_lower": -1,
        "token_len": 0,
        "contains_digit": -1,
        "contains_dash": -1,
    }

    # Plain iteration over the groups (ascending sentence_id) instead of
    # groupby.apply: same ordering, no reliance on apply's handling of the
    # grouping column, and no Series-of-dicts detour.
    grouped = []
    for _, sentence in df.groupby("sentence_id"):
        record = {
            "tokens": sentence["token"].tolist(),
            "labels": sentence["ner_id"].tolist(),
            "pos_in_sentence": sentence["pos_in_sentence"].tolist(),
        }
        for feat in feature_pads:
            record[feat] = sentence[feat].tolist()
        grouped.append(record)

    tokens_list = [[str(tok) for tok in g["tokens"]] for g in grouped]
    encodings = tokenizer(tokens_list, is_split_into_words=True, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")

    aligned_labels, aligned_positions = [], []
    aligned_extra = {feat: [] for feat in feature_pads}

    for i, g in enumerate(grouped):
        word_ids = encodings.word_ids(batch_index=i)

        labels, positions = [], []
        extras = {feat: [] for feat in feature_pads}
        previous_word = None
        for w_id in word_ids:
            if w_id is None:
                # Special token or padding: nothing to predict here.
                labels.append(-100)
                positions.append(0.0)
                for feat, pad in feature_pads.items():
                    extras[feat].append(pad)
            else:
                is_first_piece = w_id != previous_word
                keep_label = label_all_subwords or is_first_piece
                labels.append(g["labels"][w_id] if keep_label else -100)
                positions.append(g["pos_in_sentence"][w_id])
                for feat in feature_pads:
                    extras[feat].append(g[feat][w_id])
            previous_word = w_id

        aligned_labels.append(labels)
        aligned_positions.append(positions)
        for feat in feature_pads:
            aligned_extra[feat].append(extras[feat])

        if i < debug_samples:
            print("=" * 50)
            print(f"[DEBUG] Sentence {i + 1}")
            print(f"Tokens: {g['tokens']}")
            print(f"Original Labels: {g['labels']}")
            print(f"Positions: {g['pos_in_sentence']}")

            tokens_subwords = tokenizer.convert_ids_to_tokens(encodings["input_ids"][i])
            print(f"Tokenized (with subwords): {tokens_subwords}")
            print(f"Word IDs: {word_ids}")
            print(f"Aligned Labels: {labels}")
            print(f"Aligned Positions: {positions}")
            for feat in feature_pads:
                print(f"Aligned {feat}: {extras[feat]}")

    encodings["labels"] = torch.tensor(aligned_labels, dtype=torch.long)
    encodings["pos_in_sentence"] = torch.tensor(aligned_positions, dtype=torch.float32)

    for feat in feature_pads:
        encodings[feat] = torch.tensor(aligned_extra[feat], dtype=torch.float32)

    return encodings

def save_preprocessed_data(train_df, test_df, valid_df, path="./_resources/preprocessed.csv"):
    """Write the three splits to a single CSV, tagged by a ``split`` column.

    Args:
        train_df, test_df, valid_df: Frames to store; they are not modified.
            Rows are written in the order train, test, valid with ``split``
            set to ``"train"``, ``"test"`` or ``"valid"``.
        path: Destination CSV file. Defaults to the location used so far; its
            parent directory is created when missing.
    """
    combined = pd.concat([
        train_df.assign(split="train"),
        test_df.assign(split="test"),
        valid_df.assign(split="valid")
    ], ignore_index=True)
    # to_csv does not create directories, so make sure the target exists.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    combined.to_csv(path, index=False)
    print(f"Preprocessed data saved to {path}")

In [358]:
# Make sure the directory that save_model writes to by default exists.
os.makedirs("./_resources/models", exist_ok=True)

In [63]:
# Fit one encoder on the tags of all three splits; each frame gains a ner_id column.
label_encoder = encode_labels([train_df, test_df, valid_df])
# Display the fitted tag classes.
label_encoder.classes_

array(['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG',
       'I-PER', 'O'], dtype=object)

In [360]:
# Add the relative in-sentence position column to every split.
train_df, test_df, valid_df = map(
    add_sentence_position, (train_df, test_df, valid_df)
)

In [361]:
# Persist all three splits to one CSV so later cells can start from disk.
save_preprocessed_data(train_df, test_df, valid_df)

[INFO] Preprocessed data saved to ./_resources/preprocessed.csv


In [48]:
def load_preprocessed_data(path="./_resources/preprocessed.csv"):
    """Load the combined preprocessed CSV and split it back into three frames.

    Args:
        path: CSV file containing all splits, distinguished by a ``split``
            column with the values ``train``, ``valid`` and ``test``.
            Defaults to the location used so far.

    Returns:
        Tuple ``(train, valid, test)`` of frames filtered by ``split``.
        Note the order differs from the train/test/valid order used when
        saving.
    """
    dtypes = {
        "sentence_id": "int64",
        "token": "string",
        "pos": "string",
        "chunk": "string",
        "ner": "string",
        "is_capitalized": "int64",
        "is_all_caps": "int64",
        "is_all_lower": "int64",
        "token_len": "int64",
        "contains_digit": "int64",
        "contains_dash": "int64",
        "ner_id": "int64",
        "pos_in_sentence": "float64",
        "split": "string"
    }
    # keep_default_na=False: by default read_csv turns strings such as "NA",
    # "null" or "nan" into missing values, which silently destroys tokens
    # that are literally those words.
    df: pd.DataFrame = pd.read_csv(path, keep_default_na=False).astype(dtypes)

    _train_df = df[df['split'] == 'train']
    _valid_df = df[df['split'] == 'valid']
    _test_df = df[df['split'] == 'test']

    return _train_df, _valid_df, _test_df

In [49]:
# Replace the in-memory splits with the versions read back from the preprocessed CSV.
train_df, valid_df, test_df = load_preprocessed_data()

In [50]:
# Tokenizer for the ELECTRA-small checkpoint (the same checkpoint name is passed
# to the model further down). The commented-out lines are alternative checkpoints.
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
# ("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [51]:
# Tokenize and align every split with a fixed sequence length of 60,
# in the order train, test, valid.
train_enc, test_enc, valid_enc = (
    encode_for_transformers(split_df, tokenizer, 60)
    for split_df in (train_df, test_df, valid_df)
)

[DEBUG] Sentence 1
Tokens: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
Original Labels: [2, 8, 1, 8, 8, 8, 1, 8, 8]
Positions: [0.1111111111111111, 0.2222222222222222, 0.3333333333333333, 0.4444444444444444, 0.5555555555555556, 0.6666666666666666, 0.7777777777777778, 0.8888888888888888, 1.0]
Tokenized (with subwords): ['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Word IDs: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None, None, None, None, None, None, None, None, None, None, None, N

In [52]:

class NERWithFeatures(nn.Module):
    """Per-token classifier on top of a pretrained transformer encoder.

    Despite the name, only ``input_ids`` and ``attention_mask`` are consumed;
    the hand-crafted feature tensors are not used by this module.
    """

    def __init__(self, num_labels, transformer_model="google-bert/bert-base-uncased"):
        """Load ``transformer_model`` and attach dropout plus a linear head.

        Args:
            num_labels: Size of the output dimension of the classifier.
            transformer_model: Checkpoint name or path given to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        encoder = AutoModel.from_pretrained(transformer_model)
        self.transformer = encoder
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return per-token logits; ``labels`` is accepted but not used."""
        hidden_states = self.transformer(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return self.classifier(self.dropout(hidden_states))


In [None]:
class NERDataset(Dataset):
    """Dataset view over a mapping of equally indexed, per-example containers."""

    def __init__(self, encodings):
        # Mapping from field name to an indexable container; must contain
        # 'input_ids', which defines the dataset length.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = values[idx]
        return sample

# DataLoaders
# batch_size is reused further down when the test loader is built.
batch_size = 64
# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(NERDataset(train_enc), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(NERDataset(valid_enc), batch_size=batch_size)

In [59]:
# Size the classification head from the label encoder: it was fitted on the tags
# of every split, so its class count covers every id that can appear in ner_id.
# Counting the unique ids of train_df alone would undersize the head (and push
# label ids out of range) if a tag were missing from the training split.
_labels_len = len(label_encoder.classes_)
model = NERWithFeatures(_labels_len, transformer_model="google/electra-small-discriminator")

In [None]:
# AdamW over all model parameters with the optimizer's default hyper-parameters
# (no learning rate is set explicitly). The commented-out line is the variant
# that would skip parameters with requires_grad=False.
# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = AdamW(model.parameters())
num_epochs = 3
# One scheduler step is taken per batch, so the schedule spans batches * epochs.
total_steps = len(train_loader) * num_epochs
# Linear schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

100%|██████████| 199/199 [10:31<00:00,  3.17s/it]


Epoch 1: Train Loss = 0.3451


100%|██████████| 199/199 [10:32<00:00,  3.18s/it]

Epoch 2: Train Loss = 0.1081





In [70]:
def train_epoch(model, loader, optimizer, scheduler):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        model: Called as ``model(input_ids, attention_mask, labels)`` and
            expected to return logits whose last dimension is the number of
            classes.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        logits = model(input_ids, attention_mask, labels)

        # Positions labelled -100 (no word behind them) are left out of the loss.
        active_loss = labels.view(-1) != -100
        # The class count comes from the logits themselves, so the function
        # no longer depends on a notebook-level global being defined.
        active_logits = logits.view(-1, logits.size(-1))[active_loss]
        active_labels = labels.view(-1)[active_loss]
        loss = nn.functional.cross_entropy(active_logits, active_labels)

        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [None]:
def evaluate(model, loader, label_encoder, tokenizer, save_predictions=False):
    """Evaluate ``model`` on ``loader`` and print a seqeval classification report.

    Tags are handed to seqeval as one sequence per example rather than one
    flat list: newer seqeval releases reject flat input, and with flat input
    an entity at the end of one example can merge with the start of the next.

    Scoring happens on every position whose label is not -100, i.e. at the
    granularity chosen when the labels were aligned.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; must return
            per-token logits.
        loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels``.
        label_encoder: Fitted encoder used to map ids back to tag strings.
        tokenizer: Used to turn input ids back into token strings.
        save_predictions: When True, the token-level predictions are written
            to ``./_resources/ner_predictions.json``.

    Returns:
        ``(report, token_predictions)`` where ``token_predictions`` is a flat
        list of ``(token, actual_tag, predicted_tag)`` tuples.
    """
    model.eval()
    token_predictions = []
    true_sequences, pred_sequences = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            # numpy copy of the labels so the mask and the predictions are
            # indexed with the same array type.
            label_array = labels.cpu().numpy()

            for i in range(len(input_ids)):
                active = label_array[i] != -100
                if not active.any():
                    # Nothing to score in this example.
                    continue

                tokens = tokenizer.convert_ids_to_tokens(input_ids[i].tolist(), skip_special_tokens=False)
                active_tokens = [t for t, a in zip(tokens, active) if a]

                decoded_labels = label_encoder.inverse_transform(label_array[i][active]).tolist()
                decoded_preds = label_encoder.inverse_transform(preds[i][active]).tolist()

                true_sequences.append(decoded_labels)
                pred_sequences.append(decoded_preds)
                token_predictions.extend(zip(active_tokens, decoded_labels, decoded_preds))

    validation_report = classification_report(true_sequences, pred_sequences)

    print("\nEvaluating on a data set:")
    print("\nClassification report:")
    print(validation_report)
    print("\nSample Token Predictions (first 20):")
    for t in token_predictions[:20]:
        print(t)

    if save_predictions:
        os.makedirs("./_resources", exist_ok=True)
        with open("./_resources/ner_predictions.json", "w") as f:
            json.dump(token_predictions, f, indent=2)

    return validation_report, token_predictions

In [None]:
# Train for num_epochs; after every epoch the model is evaluated on the
# validation loader, and the epoch's mean training loss is printed afterwards.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    evaluate(model, valid_loader, label_encoder, tokenizer)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}")

100%|██████████| 199/199 [10:21<00:00,  3.12s/it]


Epoch 1: Train Loss = 0.0635


100%|██████████| 199/199 [10:24<00:00,  3.14s/it]

Epoch 2: Train Loss = 0.0628





In [84]:
def save_model(model, tokenizer, label_encoder, save_dir="./_resources/models"):
    """Write model weights, tokenizer files and the label encoder to ``save_dir``.

    The directory is created when missing. Files written: ``model.pth``
    (the model's state dict), whatever ``tokenizer.save_pretrained`` emits,
    and ``label_encoder.joblib``.
    """
    os.makedirs(save_dir, exist_ok=True)

    weights_path = os.path.join(save_dir, "model.pth")
    encoder_path = os.path.join(save_dir, "label_encoder.joblib")

    torch.save(model.state_dict(), weights_path)
    tokenizer.save_pretrained(save_dir)
    joblib.dump(label_encoder, encoder_path)

    print(f"Model, tokenizer, and label encoder saved to {save_dir}")

def load_model(_train_df, save_dir="./_resources/models", transformer_model="google/electra-small-discriminator"):
    """Rebuild the model, tokenizer and label encoder saved in ``save_dir``.

    Args:
        _train_df: Frame with a ``ner_id`` column. Only used as a fallback
            for the number of labels when the checkpoint has no
            ``classifier.weight`` entry.
        save_dir: Directory containing ``model.pth``, the tokenizer files and
            ``label_encoder.joblib``.
        transformer_model: Checkpoint name used to build the encoder before
            the saved weights are loaded into it.

    Returns:
        ``(model, tokenizer, label_encoder)``.
    """
    # map_location="cpu" lets a checkpoint written on a GPU machine load on a
    # CPU-only one; load_state_dict copies the tensors into the model's own
    # parameters afterwards.
    state_dict = torch.load(os.path.join(save_dir, "model.pth"), map_location="cpu")

    # Take the head size from the saved weights so the rebuilt model always
    # matches the checkpoint, however the label count was chosen at training time.
    classifier_weight = state_dict.get("classifier.weight")
    if classifier_weight is not None:
        _labels_len = classifier_weight.shape[0]
    else:
        _labels_len = len(_train_df['ner_id'].unique())

    model = NERWithFeatures(_labels_len, transformer_model=transformer_model)
    model.load_state_dict(state_dict)
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code -- only load label_encoder.joblib from a trusted save_dir.
    label_encoder = joblib.load(os.path.join(save_dir, "label_encoder.joblib"))
    print(f"Model, tokenizer, and label encoder loaded from {save_dir}")
    return model, tokenizer, label_encoder

In [87]:
# Persist the trained artifacts, then replace the in-memory objects with the
# versions read back from disk.
save_model(model, tokenizer, label_encoder)
model, tokenizer, label_encoder = load_model(train_df)

Model, tokenizer, and label encoder saved to ./_resources/models
Model, tokenizer, and label encoder loaded from ./_resources/models


In [88]:
# Evaluate on the test split; save_predictions=True also writes the
# token-level predictions to disk.
test_loader = DataLoader(NERDataset(test_enc), batch_size=batch_size)
validation_report, predictions = evaluate(model, test_loader, label_encoder, tokenizer, save_predictions=True)

Evaluating: 100%|██████████| 50/50 [00:39<00:00,  1.27it/s]



Evaluating on a data set:

Classification report:
           precision    recall  f1-score   support

      LOC       0.85      0.89      0.87      1940
      PER       0.87      0.92      0.89      2644
     MISC       0.71      0.66      0.68       991
      ORG       0.79      0.78      0.78      2498

micro avg       0.82      0.83      0.83      8073
macro avg       0.82      0.83      0.83      8073


Sample Token Predictions (first 20):
('soccer', 'O', 'O')
('-', 'O', 'O')
('japan', 'B-LOC', 'B-LOC')
('get', 'O', 'O')
('lucky', 'O', 'O')
('win', 'O', 'O')
(',', 'O', 'O')
('china', 'B-PER', 'B-LOC')
('in', 'O', 'O')
('surprise', 'O', 'O')
('defeat', 'O', 'O')
('.', 'O', 'O')
('nad', 'B-PER', 'B-PER')
('##im', 'B-PER', 'B-PER')
('lad', 'I-PER', 'I-PER')
('##ki', 'I-PER', 'I-PER')
('al', 'B-LOC', 'B-LOC')
('-', 'B-LOC', 'B-LOC')
('ain', 'B-LOC', 'B-LOC')
(',', 'O', 'O')
