In [None]:
!pip install llm2vec datasets seqeval bitsandbytes

Collecting llm2vec
  Downloading llm2vec-0.2.2-py2.py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft (from llm2vec)
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<=4.40.2,>=4.39.1 (from llm2vec)
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate (from llm2vec)
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from dat

In [2]:
# this code works on Mila cluster
import torch
from torch import nn
from llm2vec import LLM2Vec
from transformers import AutoTokenizer, BitsAndBytesConfig
from torch.utils.data import DataLoader, Dataset, Subset
from datasets import load_from_disk, load_dataset
from seqeval.metrics import classification_report
import os
import random

print("Starting script...")

# Quantization configuration: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Set model name
model_name = "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp"

print("Loading and quantizing model...")
l2v = LLM2Vec.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
print("Model loaded and quantized successfully.")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer is later called with padding enabled, which raises if no pad
# token is defined (common for Llama-family tokenizers). Fall back to EOS in
# that case; padded positions are masked out via attention_mask and get the
# ignored label -100, so the concrete pad id does not influence training.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded successfully.")

# Token-level classification head used on top of the frozen encoder
class NERModel(nn.Module):
    """Single linear layer mapping each token embedding to per-label logits."""

    def __init__(self, input_dim: int, num_labels: int) -> None:
        """Create the projection from `input_dim` features to `num_labels` logits."""
        super().__init__()
        self.classifier = nn.Linear(in_features=input_dim, out_features=num_labels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear classifier to the last dimension of `x`."""
        logits = self.classifier(x)
        return logits

print("Loading dataset from disk...")
dataset_path = "./conll2003_dataset"  # Adjust this path as needed
if not os.path.exists(dataset_path):
    # No local copy yet: fetch the dataset once and cache it at dataset_path
    # so later runs take the load_from_disk branch.
    print("Dataset not found on disk. Downloading and saving...")
    dataset = load_dataset("conll2003", trust_remote_code=True)
    dataset.save_to_disk(dataset_path)
    print("Dataset downloaded and saved to disk.")
else:
    dataset = load_from_disk(dataset_path)
    print("Dataset loaded from disk successfully.")

print("Processing dataset...")
# Tag names come from the training split's "ner_tags" feature; build the
# name -> id map from their positions and invert it for decoding predictions.
label_list = dataset["train"].features["ner_tags"].feature.names
label_to_id = {name: idx for idx, name in enumerate(label_list)}
id_to_label = {idx: name for name, idx in label_to_id.items()}
print("Dataset processed.")

# Custom dataset for NER
class NERDataset(Dataset):
    """Tokenizes pre-split sentences and aligns word-level tag ids to sub-tokens.

    Each underlying item must provide ``"tokens"`` (a list of words) and
    ``"ner_tags"`` (a list of integer tag ids, one per word).

    Args:
        dataset: Indexable collection of items as described above.
        tokenizer: Callable accepting ``is_split_into_words``, ``truncation``,
            ``padding`` and ``max_length``; the encoding it returns must
            provide ``word_ids()`` plus ``'input_ids'`` and ``'attention_mask'``.
        max_length: Every example is truncated/padded to this many sub-tokens.
        label_all_tokens: If True (default, the original behaviour) every
            sub-token of a word receives that word's tag id. If False only the
            first sub-token of each word is labelled and the remaining ones get
            -100, so a word split into several pieces counts once.
    """

    def __init__(self, dataset, tokenizer, max_length=128, label_all_tokens=True):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_all_tokens = label_all_tokens

    def __len__(self):
        """Return the number of sentences in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and aligned ``labels`` tensors.

        Positions that do not belong to any word (``word_ids()`` yields None
        there) are labelled -100.
        """
        item = self.dataset[idx]
        words = item["tokens"]
        # The dataset already stores integer tag ids, so they are used directly
        # instead of being mapped to names and back through module-level dicts.
        tag_ids = item["ner_tags"]

        encoding = self.tokenizer(words, is_split_into_words=True, truncation=True, padding='max_length', max_length=self.max_length)

        # Align labels with tokens
        aligned_labels = []
        previous_word_id = None
        for word_id in encoding.word_ids():
            if word_id is None:
                aligned_labels.append(-100)
            elif self.label_all_tokens or word_id != previous_word_id:
                aligned_labels.append(tag_ids[word_id])
            else:
                # Continuation piece of a word that was already labelled.
                aligned_labels.append(-100)
            previous_word_id = word_id

        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(aligned_labels)
        }

print("Creating dataset and dataloader...")
# Create dataset and dataloader
full_train_dataset = NERDataset(dataset["train"], tokenizer)
full_eval_dataset = NERDataset(dataset["validation"], tokenizer)

# Seed the RNGs so the sampled subsets and the shuffling order are identical on
# every fresh run of the notebook.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Select a subset of examples for training. The requested size is capped at the
# split size because random.sample raises ValueError when asked for more items
# than the population contains.
total_train_examples = len(full_train_dataset)
num_train_examples = min(10000, total_train_examples)
train_subset_indices = random.sample(range(total_train_examples), num_train_examples)
train_dataset = Subset(full_train_dataset, train_subset_indices)

# Select a subset of examples for evaluation (same capping as above)
total_eval_examples = len(full_eval_dataset)
num_eval_examples = min(1000, total_eval_examples)  # You can adjust this number
eval_subset_indices = random.sample(range(total_eval_examples), num_eval_examples)
eval_dataset = Subset(full_eval_dataset, eval_subset_indices)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=4)
print(f"Dataset and dataloader created successfully. Training on {num_train_examples} examples, evaluating on {num_eval_examples} examples.")

print("Initializing NER model...")
# The head takes one encoder hidden state per token and emits one logit per tag.
input_dim = l2v.model.config.hidden_size
num_labels = len(label_list)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
ner_model = NERModel(input_dim, num_labels)
ner_model = ner_model.to(device)
print(f"NER model initialized. Using device: {device}")

# Optimisation setup: only the classifier head's parameters are handed to AdamW.
print("Setting up optimizer and loss function...")
# CrossEntropyLoss skips targets equal to its default ignore_index (-100),
# which is the value the dataset assigns to positions without a word.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(ner_model.parameters(), lr=1e-4)
print("Optimizer and loss function set up.")

print("Starting training loop...")
num_epochs = 2
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # ---- Training pass: the encoder runs without gradients, only the head learns ----
    ner_model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(train_dataloader):
        print(f"  Processing batch {batch_idx+1}/{len(train_dataloader)}")
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Encoder forward pass under no_grad; the hidden states are cast to
        # float32 before being fed to the classifier head.
        with torch.no_grad():
            embeddings = l2v.model(
                input_ids=input_ids, attention_mask=attention_mask
            ).last_hidden_state.to(torch.float32)

        logits = ner_model(embeddings)

        # Flatten to (tokens, num_labels) vs (tokens,) for the token-level loss.
        flat_logits = logits.view(-1, num_labels)
        flat_labels = labels.view(-1)
        loss = criterion(flat_logits, flat_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}")

    # ---- Evaluation pass on the held-out subset ----
    print(f"Starting evaluation on {num_eval_examples} examples...")
    ner_model.eval()
    all_preds, all_labels = [], []
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()

        with torch.no_grad():
            embeddings = l2v.model(
                input_ids=input_ids, attention_mask=attention_mask
            ).last_hidden_state.to(torch.float32)
            logits = ner_model(embeddings)

        preds = logits.argmax(dim=-1).cpu().numpy()

        # Keep only positions with a real label (not -100) and decode ids to tag names.
        for pred, label in zip(preds, labels):
            keep = label != -100
            all_preds.append([id_to_label[p] for p in pred[keep]])
            all_labels.append([id_to_label[l] for l in label[keep]])

    print("Evaluation results:")
    print(classification_report(all_labels, all_preds))

print("Training completed.")

# Inference function
def predict_ner(text):
    """Predict one NER tag per word of a single sentence.

    The sentence is tokenized the same way as the training data (pre-split
    words with ``is_split_into_words=True``), and the prediction made at the
    first sub-token of each word is reported as that word's tag. Positions
    that belong to no word (e.g. special tokens) are skipped: they were always
    labelled -100 during training, so their predictions carry no meaning.

    Args:
        text: A sentence as a string (split on whitespace) or an already
            split sequence of words.

    Returns:
        A list containing a single list of tag names, one per word, so that
        ``predict_ner(text)[0]`` is the tag sequence for the sentence. Words
        removed by tokenizer truncation receive no tag. Empty input yields
        ``[[]]``.
    """
    words = text.split() if isinstance(text, str) else list(text)
    if not words:
        # Nothing to tag; avoid calling the tokenizer/model on an empty sequence.
        return [[]]

    # A single sequence needs no padding, so none is requested.
    encoding = tokenizer(words, is_split_into_words=True, return_tensors='pt', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        embeddings = l2v.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        embeddings = embeddings.to(torch.float32)
        logits = ner_model(embeddings)

    token_predictions = torch.argmax(logits, dim=-1)[0].tolist()

    # Collapse sub-token predictions to word level: keep the first piece of each word.
    word_labels = []
    previous_word_id = None
    for pred_id, word_id in zip(token_predictions, encoding.word_ids(0)):
        if word_id is not None and word_id != previous_word_id:
            word_labels.append(id_to_label[pred_id])
        previous_word_id = word_id

    predicted_labels = [word_labels]
    return predicted_labels

# Example usage: tag one hand-written sentence with the trained classifier
print("Running inference on test text...")
test_text = "Microsoft Corporation is headquartered in Redmond, Washington"
predictions = predict_ner(test_text)
# predict_ner returns a list of tag sequences; the first entry is shown.
print(f"Text: {test_text}", f"Predicted NER tags: {predictions[0]}", sep="\n")

print("Script completed.")

ModuleNotFoundError: No module named 'llm2vec'