In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = np.load('data/ontonotes_embeddings_full.npz')
ontonotes = load_dataset("conll2012_ontonotesv5", "english_v12", cache_dir="./dataset/ontonotes")

In [3]:
print(f"Data keys: {data.keys()}")

Data keys: KeysView(NpzFile 'data/ontonotes_embeddings_full.npz' with keys: X, Y)


In [4]:
X = data['X']
y = data['Y']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


X shape: (2200865, 768)
y shape: (2200865,)


In [5]:
num_positives = np.sum(y == 1)
num_negatives = np.sum(y == 0)
print(f"Positives: {num_positives}, Negatives: {num_negatives}")


Positives: 125904, Negatives: 2074961


In [6]:
# this part fries my computer, so I limit the dataset to 200k samples, but still does

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


# Create Dataset
dataset = TensorDataset(X_tensor, y_tensor)

# Split into training and validation sets, later on we use the real validation set, but for now...
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512)


In [7]:
print(len(X_tensor))
print(len(y_tensor))

2200865
2200865


In [None]:
import sys
sys.path.append("./src/")
from confidence_model import confidence_model

In [9]:
model = confidence_model()
optimizer = optim.Adam(model.parameters(), lr=0.001) 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# because this dataset is unbalanced, we use a weighted loss function
pos_weight = torch.tensor([num_negatives / num_positives], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

epochs = 30
for epoch in range(epochs):  
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device).unsqueeze(1)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader):.4f}")
    
torch.save(model.state_dict(), "confidence_model.pth")


Epoch 1, Training Loss: 0.7105
Epoch 2, Training Loss: 0.6100
Epoch 3, Training Loss: 0.5768
Epoch 4, Training Loss: 0.5594
Epoch 5, Training Loss: 0.5460
Epoch 6, Training Loss: 0.5366
Epoch 7, Training Loss: 0.5294
Epoch 8, Training Loss: 0.5246
Epoch 9, Training Loss: 0.5173
Epoch 10, Training Loss: 0.5125
Epoch 11, Training Loss: 0.5095
Epoch 12, Training Loss: 0.5063
Epoch 13, Training Loss: 0.5045
Epoch 14, Training Loss: 0.4976
Epoch 15, Training Loss: 0.4958
Epoch 16, Training Loss: 0.4958
Epoch 17, Training Loss: 0.4873
Epoch 18, Training Loss: 0.4855
Epoch 19, Training Loss: 0.4841
Epoch 20, Training Loss: 0.4817
Epoch 21, Training Loss: 0.4811
Epoch 22, Training Loss: 0.4806
Epoch 23, Training Loss: 0.4772
Epoch 24, Training Loss: 0.4785
Epoch 25, Training Loss: 0.4739
Epoch 26, Training Loss: 0.4727
Epoch 27, Training Loss: 0.4676
Epoch 28, Training Loss: 0.4710
Epoch 29, Training Loss: 0.4656
Epoch 30, Training Loss: 0.4658


# Testset

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification


In [5]:
test_data = ontonotes["test"]
print(f"test_data: {test_data}")
print((test_data["sentences"][0][0]["words"]))
print(test_data["sentences"][0][0])

test_data: Dataset({
    features: ['document_id', 'sentences'],
    num_rows: 1200
})
['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various', 'relevant', 'parties', '.']
{'part_id': 0, 'words': ['--', 'basically', ',', 'it', 'was', 'unanimously', 'agreed', 'upon', 'by', 'the', 'various', 'relevant', 'parties', '.'], 'pos_tags': [9, 33, 5, 31, 41, 33, 43, 18, 18, 14, 19, 19, 28, 8], 'parse_tree': '(TOP(S (: --) (ADVP (RB basically) ) (, ,) (NP (PRP it) )(VP (VBD was) (ADVP (RB unanimously) )(VP (VBN agreed) (PP (IN upon) )(PP (IN by) (NP (DT the)  (JJ various)  (JJ relevant)  (NNS parties) )))) (. .) ))', 'predicate_lemmas': [None, None, None, None, 'be', None, 'agree', None, None, None, None, None, None, None], 'predicate_framenet_ids': [None, None, None, None, '03', None, '01', None, None, None, None, None, None, None], 'word_senses': [None, None, None, None, None, None, 1.0, None, None, None, None, None, None, None], 'speaker': 'speaker#1', 'na

In [8]:
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
embedding_model.eval()
ner_model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_model.eval()

model = confidence_model()
model.load_state_dict(torch.load("confidence_model.pth"))
model.eval()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


confidence_model(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [20]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.mps.is_available():
    device = 'mps'
print(f"Using device: {device}")
embedding_model.to(device)
ner_model.to(device)
model.to(device)

Using device: mps


confidence_model(
  (model): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [21]:
def align_predictions_to_words(predicted_token_ids, word_ids):
    """
    Align token-level predictions back to word-level by choosing
    the first subtoken's prediction for each word.
    """
    aligned_preds = []
    prev_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if word_idx != prev_word_idx:
            aligned_preds.append(predicted_token_ids[idx])
            prev_word_idx = word_idx
    return aligned_preds

def process_sentence_with_confidence_model(words):
    x_list, y_list = [], []
    ner_tags_all = []

    for i in range(1, len(words) + 1):
        partial = words[:i]

        # Tokenize and get embeddings
        inputs = tokenizer(" ".join(partial), return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = embedding_model(**inputs)
            cls = outputs.last_hidden_state[0][0].unsqueeze(0)  # [1, 768]

        # Get NER tags for the partial using BERT-NER model
        with torch.no_grad():
            ner_outputs = ner_model(**inputs)
            logits = ner_outputs.logits
            predicted_token_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

        word_ids = inputs.word_ids()
        word_level_pred_ids = align_predictions_to_words(predicted_token_ids, word_ids)
        label_list = [ner_model.config.id2label[i] for i in range(len(ner_model.config.id2label))]
        ner_tags = [label_list[i] for i in word_level_pred_ids]

        ner_tags_all.extend(ner_tags)

        # Confidence model prediction
        with torch.no_grad():
            y_pred = model(cls)
            y_pred = torch.sigmoid(y_pred).item()

        x_list.append(cls.cpu().numpy())
        y_list.append(1 if y_pred > 0.8 else 0)

    # Unique tags in sentence (no duplicates)
    unique_tags = list(dict.fromkeys(ner_tags_all))

    # Only keep partials where entity ends
    entity_partials = [words[:i] for i in range(1, len(words)+1) if y_list[i-1]==1]

    return (unique_tags, entity_partials)

In [22]:
dummy_sentence = ["Barack", "Obama", "was", "born", "in", "Hawaii"]

result = process_sentence_with_confidence_model(dummy_sentence)

print("Input sentence:", " ".join(dummy_sentence))
print("Result tuple:")
print("Unique tags:", result[0])
print("Entity-ending partials:")
for partial in result[1]:
    print(partial)

Input sentence: Barack Obama was born in Hawaii
Result tuple:
Unique tags: ['B-PER', 'I-PER', 'O', 'B-LOC']
Entity-ending partials:
['Barack']
['Barack', 'Obama']
['Barack', 'Obama', 'was', 'born', 'in', 'Hawaii']


In [24]:
# Loop over test data
results = []

for row in tqdm(test_data, desc="Documents"):
    for sentence_dict in row["sentences"]:
        words = sentence_dict["words"]
        result = process_sentence_with_confidence_model(words)
        results.append(result)

# Save as npz
results_array = np.array(results, dtype=object)
np.savez_compressed("confidence_model_evaluation_results.npz", data=results_array)

print(f"Processed {len(results)} sentences with confidence model and saved results to confidence_model_evaluation_results.npz")

Documents:   0%|          | 1/1200 [02:25<48:23:36, 145.30s/it]


KeyboardInterrupt: 