In [1]:
# Install staticvectors, which provides the StaticVectors class imported further down.
# NOTE(review): the version is not pinned, so a fresh environment may resolve a different release.
%pip install staticvectors

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

In [3]:
# Open the validation .npz archive; individual arrays are read from it by key in later cells.
data = np.load('data/ner_trigger_dataset_validation.npz')

In [4]:
# Show which arrays the archive contains before indexing into it.
print(f"Data keys: {data.keys()}")

Data keys: KeysView(NpzFile 'data/ner_trigger_dataset_validation.npz' with keys: X, y)


In [5]:
# Pull the token windows and their labels out of the archive in one step.
X, y = data['X'], data['y']

# Report the array dimensions.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Show the first window and its label, one per line.
print(X[0], y[0], sep="\n")

X shape: (297852, 6)
y shape: (297852,)
['In' 'the' 'summer' 'of' '2005' ',']
1


In [13]:
# Word2Vec (w2v) average pooling
# Word2Vec (w2v) max pooling
# BERT [CLS] token

# Embeddings test

In [6]:
from staticvectors import StaticVectors

# Load the static word-vector model; the pooling cell below queries it per token
# via `model.embeddings(...)`.
# NOTE(review): the global name `model` is reassigned to the trained classifier in
# later cells, so re-running the pooling cell after those cells would use the wrong object.
model = StaticVectors("neuml/word2vec/model.sqlite")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Inspect the first window and its first token, together with their Python types.
print(f"X[0]: {X[0]}")
print(f"Type of X[0]: {type(X[0])}")
print(f"X[0][0]: {X[0][0]}")
print(f"Type of X[0][0]: {type(X[0][0])}")


X[0]: ['In' 'the' 'summer' 'of' '2005' ',']
Type of X[0]: <class 'numpy.ndarray'>
X[0][0]: In
Type of X[0][0]: <class 'numpy.str_'>


In [8]:
# Build one fixed-size feature vector per token window from the static word
# vectors, using two pooling strategies: element-wise mean and element-wise max.
X_avgpool = []
X_maxpool = []

for window in tqdm(X):
    token_list = window.tolist()

    # Query the embedding model exactly once per token. The previous version
    # called model.embeddings([word]) twice per token (once for the None check,
    # once to read the value), doubling the lookup cost of this loop.
    vectors = []
    for word in token_list:
        embedding = model.embeddings([word])
        if embedding is not None:
            vectors.append(embedding[0])

    if vectors:
        avg_vector = np.mean(vectors, axis=0)
        max_vector = np.max(vectors, axis=0)
    else:
        # No token produced a vector: fall back to all-zero features.
        avg_vector = np.zeros(model.dim)
        max_vector = np.zeros(model.dim)

    X_avgpool.append(avg_vector)
    X_maxpool.append(max_vector)

# Stack the per-window vectors into 2-D feature matrices.
X_avgpool = np.array(X_avgpool)   # shape: (297852, 300)
X_maxpool = np.array(X_maxpool)   # shape: (297852, 300)

100%|██████████| 297852/297852 [03:04<00:00, 1613.22it/s]


In [9]:
# Sanity check: print the shape of each pooled feature matrix on its own line.
print(X_avgpool.shape, X_maxpool.shape, sep="\n")

(297852, 300)
(297852, 300)


In [9]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and encoder for the "dslim/bert-base-NER" checkpoint.
# AutoModel is used here, and the module printed further down is a plain BertModel,
# so the outputs consumed later are hidden states rather than tag predictions.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModel.from_pretrained("dslim/bert-base-NER")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Choose the compute device in order of preference: CUDA GPU, Apple MPS, then CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    # torch.backends.mps.is_available() is the documented availability check;
    # torch.mps.is_available() is not present on every PyTorch release and can
    # raise AttributeError there.
    device = 'mps'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: mps


In [11]:
# Switch BERT to inference mode and move it to the selected device in one chained
# call; both methods return the module, so the cell still displays it.
bert_model.eval().to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [12]:
@torch.no_grad()
def get_cls_embeddings_batch(sentence_tokens_list, batch_size=256):
    """
    Compute the CLS-position embedding for each token sequence.

    Each sequence is joined with single spaces, tokenized with the notebook-level
    `tokenizer` (padded per batch, truncated to 512 tokens), and passed through
    the notebook-level `bert_model` on `device` with gradients disabled.

    Args:
        sentence_tokens_list: List of token sequences (each a list of strings).
        batch_size: Number of sequences to process per forward pass.

    Returns:
        numpy array with one row per input sequence, taken from position 0 of
        `last_hidden_state`. For empty input, an array of shape
        (0, hidden_size) is returned instead of raising.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        # Previously this failed indirectly (range() step error or empty vstack).
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(sentence_tokens_list) == 0:
        # np.vstack([]) raises on an empty list; return a well-shaped empty
        # result so callers can still inspect .shape or concatenate.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for i in tqdm(range(0, len(sentence_tokens_list), batch_size), desc="Processing BERT embeddings"):
        batch = sentence_tokens_list[i:i + batch_size]
        # The tokenizer is given raw text, so rebuild each window as a string.
        texts = [" ".join(tokens) for tokens in batch]

        inputs = tokenizer(texts, return_tensors="pt", truncation=True,
                           max_length=512, padding=True).to(device)
        outputs = bert_model(**inputs)
        # Position 0 of every sequence is the CLS token; move it to host memory.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        all_embeddings.append(cls_embeddings)

    return np.vstack(all_embeddings)

In [19]:
# Turn the 2-D token array into nested Python lists (one inner list per window).
token_lists = X.tolist()

# Embed every window with BERT, 32 windows per forward pass.
bert_embeddings = get_cls_embeddings_batch(token_lists, batch_size=32)
print("BERT Embedding shape:", bert_embeddings.shape)


Processing BERT embeddings: 100%|██████████| 9308/9308 [02:40<00:00, 58.12it/s]

BERT Embedding shape: (297852, 768)





In [13]:
import sys
# Make the local ./src/ directory importable so window_slide_model can be found.
# The path is relative, so this depends on the notebook's working directory.
sys.path.append("./src/")
from window_slide_model import WindowSlideModel

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

def train_and_eval_pytorch(X, y, name, epochs=3, batch_size=512, seed=None):
    """
    Train a WindowSlideModel binary classifier on (X, y) and report test metrics.

    The data is split 80/20 at random into train and test subsets. The model is
    optimised with Adam (lr=1e-3) on BCEWithLogitsLoss, then precision, recall
    and F1 are computed on the held-out subset with a 0.5 probability threshold.

    Args:
        X: 2-D array of features, one row per example (X.shape[1] is the input size).
        y: Array of binary labels aligned with the rows of X.
        name: Label printed in the training banner.
        epochs: Number of passes over the training subset.
        batch_size: Mini-batch size used by both data loaders.
        seed: Optional integer. When given, torch's global RNG is seeded before
            any random operation so the random split and loader shuffling (and
            presumably the model's weight initialisation) are repeatable across
            runs. With the default None the behaviour is unchanged (unseeded).

    Returns:
        Tuple (model, precision, recall, f1).
    """
    print(f"\n Training {name}")

    # Without a seed, every call draws a different train/test split, so the
    # reported metrics are not comparable between runs.
    if seed is not None:
        torch.manual_seed(seed)

    # Convert to PyTorch tensors; the whole dataset is placed on `device` up front.
    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

    # Create dataset and split 80% train / 20% test
    dataset = TensorDataset(X_tensor, y_tensor)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_ds, test_ds = random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    # Define model
    # assumes WindowSlideModel(input_dim) yields one logit per row, shaped like the
    # label batch — TODO confirm against src/window_slide_model.py
    input_dim = X.shape[1]
    model = WindowSlideModel(input_dim).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    model.train()
    for epoch in range(epochs):
        # NOTE: this is the SUM of per-batch losses, so its magnitude scales
        # with the number of batches (dataset size / batch_size).
        total_loss = 0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            # .item() already detaches and copies the scalar to the host.
            total_loss += loss.item()
        print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            logits = model(xb)
            probs = torch.sigmoid(logits)
            # Threshold probabilities at 0.5 to obtain hard 0/1 predictions.
            preds = (probs > 0.5).int()
            all_preds.extend(preds.tolist())
            all_labels.extend(yb.int().tolist())

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f" Precision: {precision:.4f}")
    print(f" Recall:    {recall:.4f}")
    print(f" F1-score:  {f1:.4f}")
    return model, precision, recall, f1


In [24]:
# Train and evaluate one classifier per feature set against the same labels `y`:
# Word2Vec average pooling, Word2Vec max pooling, and BERT CLS embeddings.
# Each call uses the function's default epochs and batch size.
train_and_eval_pytorch(X_avgpool, y, name="Word2Vec - Avg Pooling")
train_and_eval_pytorch(X_maxpool, y, name="Word2Vec - Max Pooling")
train_and_eval_pytorch(bert_embeddings, y, name="BERT - CLS Pooling")



 Training Word2Vec - Avg Pooling
Epoch 1: Loss = 200.9151
Epoch 1: Loss = 200.9151
Epoch 2: Loss = 159.3520
Epoch 2: Loss = 159.3520
Epoch 3: Loss = 150.9359
Epoch 3: Loss = 150.9359
 Precision: 0.7341
 Recall:    0.7619
 F1-score:  0.7477

 Training Word2Vec - Max Pooling
 Precision: 0.7341
 Recall:    0.7619
 F1-score:  0.7477

 Training Word2Vec - Max Pooling
Epoch 1: Loss = 231.6797
Epoch 1: Loss = 231.6797
Epoch 2: Loss = 193.5017
Epoch 2: Loss = 193.5017
Epoch 3: Loss = 188.7371
Epoch 3: Loss = 188.7371
 Precision: 0.6790
 Recall:    0.5753
 F1-score:  0.6229

 Training BERT - CLS Pooling
 Precision: 0.6790
 Recall:    0.5753
 F1-score:  0.6229

 Training BERT - CLS Pooling
Epoch 1: Loss = 182.8686
Epoch 1: Loss = 182.8686
Epoch 2: Loss = 154.1647
Epoch 2: Loss = 154.1647
Epoch 3: Loss = 147.3581
Epoch 3: Loss = 147.3581
 Precision: 0.7175
 Recall:    0.7881
 F1-score:  0.7511
 Precision: 0.7175
 Recall:    0.7881
 F1-score:  0.7511


## Training model for approach 2

In [15]:
# Approach 2: load the embeddings dataset. This rebinds the notebook-level
# `data`, `X` and `y` names used by the earlier cells.
data = np.load('data/ner_trigger_dataset_embeddings.npz')
X = data['X']
y = data['y']

# Report the array dimensions.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Show the first feature vector and its label, one per line.
print(X[0], y[0], sep="\n")

X shape: (2148223, 768)
y shape: (2148223,)
[-2.14220770e-03 -1.46557152e-01  8.20886314e-01 -5.50802112e-01
 -1.86557919e-01 -1.59491926e-01 -5.53287566e-01 -6.91483259e-01
 -4.10862356e-01 -4.71345186e-01  1.10230744e-01  7.69049168e-01
  6.23638749e-01 -3.25382173e-01 -8.74595881e-01 -2.00361326e-01
  6.48040652e-01  6.46536052e-02  4.49972570e-01 -2.00347498e-01
  9.68802094e-01 -2.54475147e-01  1.10917699e+00  2.05043226e-01
  3.03130656e-01 -1.02155015e-01  6.52716339e-01  7.25103557e-01
  1.46968469e-01 -4.07497168e-01  3.07938963e-01 -3.11432093e-01
  2.23321721e-01  1.51446715e-01 -1.07112598e+00 -3.66511852e-01
 -4.27757710e-01 -8.20988566e-02  8.21811914e-01 -6.05800927e-01
 -4.19731081e-01  2.65417576e-01  1.59787774e+00 -1.00400102e+00
  5.19288704e-02  7.48898908e-02 -2.02435911e-01 -1.31769925e-01
 -8.93584043e-02 -1.60512000e-01 -7.22794950e-01  1.58097640e-01
  6.56224370e-01 -6.17231965e-01  1.19555938e+00  4.97628003e-01
  4.32722658e-01  6.50399745e-01 -2.55437493e-

In [35]:
# Train for 20 epochs with batch size 256 on the approach-2 features and keep only
# the fitted model; the returned precision/recall/F1 are discarded here (the
# function prints them itself). This rebinds the notebook-level name `model`.
model, _, _, _ = train_and_eval_pytorch(X, y, name="BERT - CLS Pooling", epochs=20, batch_size=256)


 Training BERT - CLS Pooling
Epoch 1: Loss = 2007.2094
Epoch 2: Loss = 1810.6204
Epoch 3: Loss = 1751.5439
Epoch 4: Loss = 1712.2786
Epoch 5: Loss = 1688.4551
Epoch 6: Loss = 1667.4398
Epoch 7: Loss = 1648.9284
Epoch 8: Loss = 1634.2220
Epoch 9: Loss = 1623.6756
Epoch 10: Loss = 1616.3725
Epoch 11: Loss = 1605.7175
Epoch 12: Loss = 1596.6743
Epoch 13: Loss = 1590.2830
Epoch 14: Loss = 1583.8722
Epoch 15: Loss = 1578.0360
Epoch 16: Loss = 1574.1797
Epoch 17: Loss = 1568.8902
Epoch 18: Loss = 1562.2486
Epoch 19: Loss = 1559.7728
Epoch 20: Loss = 1554.4390
 Precision: 0.7831
 Recall:    0.7957
 F1-score:  0.7893


In [36]:
# Persist only the learned parameters (state_dict), not the module object itself.
# NOTE(review): the file uses a .pkl extension although it is written by torch.save.
torch.save(model.state_dict(), "models/bert_cls_pooling_model.pkl")

In [41]:
# Rebuild the classifier and restore the saved weights. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on one backend
# (e.g. MPS) still loads on a machine that only has CPU or CUDA.
model = WindowSlideModel(X.shape[1])
model.load_state_dict(
    torch.load("models/bert_cls_pooling_model.pkl", map_location=device, weights_only=True)
)
model.eval()
model.to(device)

# Take the second example whose label is 1 (index [1] of the positive indices).
ix = np.where(y == 1)[0][1]

# Inference only: disable autograd so no graph is built and the printed tensors
# carry no grad_fn.
with torch.no_grad():
    y_hat = model(torch.tensor(X[ix], dtype=torch.float32).to(device).unsqueeze(0))
    # `probs` holds the thresholded 0/1 prediction (name kept for later cells).
    probs = (torch.sigmoid(y_hat) > 0.5).int()

# The previous label said "y_hat shape" but the value printed is the raw logit.
print(f"y_hat (logit): {y_hat}, sigmoid: {torch.sigmoid(y_hat)}, pred: {probs}")

y_hat shape: tensor([0.0953], device='mps:0', grad_fn=<SqueezeBackward1>), tensor([0.5238], device='mps:0', grad_fn=<SigmoidBackward0>), probs: tensor([1], device='mps:0', dtype=torch.int32)
