In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

In [10]:
data = np.load('data/ner_trigger_dataset_validation.npz')

In [11]:
print(f"Data keys: {data.keys()}")

Data keys: KeysView(NpzFile 'data/ner_trigger_dataset_validation.npz' with keys: X, y)


In [12]:
X = data['X']
y = data['y']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

print(X[0])
print(y[0])

X shape: (297852, 6)
y shape: (297852,)
['In' 'the' 'summer' 'of' '2005' ',']
1


In [13]:
# w2v average pooling
# w2v max pooling
# bert cls token

# Embeddings test

In [None]:
from staticvectors import StaticVectors

model = StaticVectors("neuml/word2vec/model.sqlite")

In [21]:
print("X[0]:", X[0])
print("Type of X[0]:", type(X[0]))
print("X[0][0]:", X[0][0])
print("Type of X[0][0]:", type(X[0][0]))


X[0]: ['In' 'the' 'summer' 'of' '2005' ',']
Type of X[0]: <class 'numpy.ndarray'>
X[0][0]: In
Type of X[0][0]: <class 'numpy.str_'>


In [24]:
X_avgpool = []
X_maxpool = []

for window in tqdm(X):
    token_list = window.tolist()

    
    vectors = [model.embeddings([word])[0] for word in token_list if model.embeddings([word]) is not None]
    
    if vectors:
        avg_vector = np.mean(vectors, axis=0)
        max_vector = np.max(vectors, axis=0)
    else:
        avg_vector = np.zeros(model.dim)
        max_vector = np.zeros(model.dim)
    
    X_avgpool.append(avg_vector)
    X_maxpool.append(max_vector)

X_avgpool = np.array(X_avgpool)   # shape: (297852, 300)
X_maxpool = np.array(X_maxpool)   # shape: (297852, 300)

100%|██████████| 297852/297852 [17:58<00:00, 276.19it/s]


In [32]:
print(X_avgpool.shape)
# print(X_avgpool[0])
print(X_maxpool.shape)
# print(X_maxpool[0])

(297852, 300)
(297852, 300)


In [29]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModel.from_pretrained("dslim/bert-base-NER")

In [28]:
bert_model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
@torch.no_grad()
def get_cls_embedding(sentence_tokens):
    text = " ".join(sentence_tokens)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = bert_model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    
    return cls_embedding  


In [None]:
bert_embeddings = []

for window in tqdm(X):
    token_list = window.tolist()
    emb = get_cls_embedding(token_list)
    bert_embeddings.append(emb)

bert_embeddings = np.array(bert_embeddings)  
print("BERT Embedding shape:", bert_embeddings.shape)


  0%|          | 1028/297852 [00:47<3:47:20, 21.76it/s]


KeyboardInterrupt: 

In [38]:
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1) 
        )

    def forward(self, x):
        return self.net(x).squeeze(1)  

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def train_and_eval_pytorch(X, y, name, epochs=3, batch_size=512):
    print(f"\n Training {name}")
    
    # Convert to PyTorch tensors
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.float32)

    # Create dataset and split
    dataset = TensorDataset(X_tensor, y_tensor)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_ds, test_ds = random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    # Define model
    input_dim = X.shape[1]
    model = SimpleClassifier(input_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in test_loader:
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).int()
            all_preds.extend(preds.tolist())
            all_labels.extend(yb.int().tolist())

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    
    print(f" Precision: {precision:.4f}")
    print(f" Recall:    {recall:.4f}")
    print(f" F1-score:  {f1:.4f}")


In [None]:
train_and_eval_pytorch(X_avgpool, y, name="Word2Vec - Avg Pooling")
train_and_eval_pytorch(X_maxpool, y, name="Word2Vec - Max Pooling")
train_and_eval_pytorch(bert_embeddings, y, name="BERT - CLS Pooling")



📊 Training Word2Vec - Avg Pooling
Epoch 1: Loss = 199.4137
Epoch 2: Loss = 157.8514
Epoch 3: Loss = 149.7757
 Precision: 0.7296
 Recall:    0.7560
 F1-score:  0.7425

📊 Training Word2Vec - Max Pooling
Epoch 1: Loss = 230.7837
Epoch 2: Loss = 192.7383
Epoch 3: Loss = 188.6838
 Precision: 0.6735
 Recall:    0.5917
 F1-score:  0.6300
