In [37]:
import gc
import os
import sys

import datasets
import torch
import tqdm
import transformers
from datasets import DatasetDict, Features, Sequence, Value
from transformers import LlamaModel, LlamaTokenizer

# Extend the module search path (two levels above the working directory)
# before importing the project-local `src` package.
sys.path.append(os.getcwd()+"/../..")
from src import paths

In [None]:
# # Download model
# checkpoint = "meta-llama/Llama-2-7b-hf"
# model = AutoModelForCausalLM.from_pretrained(checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # Save model
# model.save_pretrained(paths.MODEL_PATH/'llama2')

# # Save tokenizer
# tokenizer.save_pretrained(paths.MODEL_PATH/'llama2')

In [3]:
# Load model and tokenizer
# The tokenizer is configured to pad on the left-hand side.
tokenizer = LlamaTokenizer.from_pretrained(paths.MODEL_PATH/'llama2', padding_side='left')
# The tokenization cell pads every example to a fixed length, which requires a
# pad token; fall back to EOS when the saved tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Base Llama model, loaded in 4 bit with automatic device placement.
model = LlamaModel.from_pretrained(paths.MODEL_PATH/'llama2', device_map="auto", load_in_4bit=True)

In [7]:
# Check device allocation
# Report, for every named parameter of the model, the device it lives on.
for param_name, tensor in model.named_parameters():
    print(f"Device of {param_name}: ", tensor.device)

Device of model.embed_tokens.weight:  cuda:0
Device of model.layers.0.self_attn.q_proj.weight:  cuda:0
Device of model.layers.0.self_attn.k_proj.weight:  cuda:0
Device of model.layers.0.self_attn.v_proj.weight:  cuda:0
Device of model.layers.0.self_attn.o_proj.weight:  cuda:0
Device of model.layers.0.mlp.gate_proj.weight:  cuda:0
Device of model.layers.0.mlp.up_proj.weight:  cuda:0
Device of model.layers.0.mlp.down_proj.weight:  cuda:0
Device of model.layers.0.input_layernorm.weight:  cuda:0
Device of model.layers.0.post_attention_layernorm.weight:  cuda:0
Device of model.layers.1.self_attn.q_proj.weight:  cuda:0
Device of model.layers.1.self_attn.k_proj.weight:  cuda:0
Device of model.layers.1.self_attn.v_proj.weight:  cuda:0
Device of model.layers.1.self_attn.o_proj.weight:  cuda:0
Device of model.layers.1.mlp.gate_proj.weight:  cuda:0
Device of model.layers.1.mlp.up_proj.weight:  cuda:0
Device of model.layers.1.mlp.down_proj.weight:  cuda:0
Device of model.layers.1.input_layernorm.w

In [38]:
# Dataset
# Load the preprocessed line-labelling DatasetDict from disk.
dataset = DatasetDict.load_from_disk(
    paths.DATA_PATH_PREPROCESSED/'line_labelling/line_labelling_clean_dataset'
)

# Tokenize
def tokenize(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every text is padded or truncated to exactly 256 tokens and the encoding
    is returned as PyTorch tensors. Uses the notebook-level `tokenizer`.
    """
    encoded = tokenizer(
        examples['text'],
        return_tensors='pt',
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Schema of the tokenized dataset: labels are stored as float32 sequences,
# the tokenizer outputs as int32 sequences and the remaining columns as strings.
# NOTE(review): 'token_type_ids' is declared here - confirm the tokenizer in use actually emits it.
_sequence_columns = {
    'labels': 'float32',
    'input_ids': 'int32',
    'attention_mask': 'int32',
    'token_type_ids': 'int32',
}
_string_columns = ('class_agg', 'rid', 'text', 'class')
features = Features({
    **{column: Sequence(Value(dtype=dtype)) for column, dtype in _sequence_columns.items()},
    **{column: Value(dtype='string') for column in _string_columns},
})

# Tokenize dataset (batched) and cast the result to the schema above
dataset = dataset.map(tokenize, features=features, batched=True)

# Train/Val/Test
train_dataset, val_dataset, test_dataset = (
    dataset[split] for split in ('train', 'val', 'test')
)

In [None]:
# Generate embeddings: attention-masked mean over the sequence of the last hidden layer
def embed_dataset(
        dataset: "datasets.Dataset" = None,
        model: "transformers.PreTrainedModel" = None,
        batch_size: int = 4
        ) -> dict:
    """
    Embeds a dataset using a model.

    Each example is represented by the mean of the model's last hidden state
    over the sequence dimension. Positions whose attention mask is 0 (padding)
    are excluded from the mean so padding does not dilute the embedding.

    Args:

        dataset (datasets.Dataset): Dataset to embed. Must provide the columns
            'input_ids', 'attention_mask' and 'labels'; all other columns are ignored.
        model (transformers.PreTrainedModel): Model to use for embedding. Its output
            must expose `last_hidden_state`.
        batch_size (int): Batch size to use for embedding.

    Returns:

        dict: {'embeddings': float32 CPU tensor with one row per example
               (second dimension is the model's hidden size),
               'labels': float32 CPU tensor built from dataset['labels']}.

    Raises:

        ValueError: If `dataset` or `model` is missing, or `batch_size` < 1.
    """
    if dataset is None or model is None:
        raise ValueError("embed_dataset requires both a dataset and a model.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    # Inputs have to live on the device of the model's first parameters.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    pooled_batches = []
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start:start + batch_size]
        # Forward only the model inputs; the batch also holds text/label columns.
        input_ids = torch.as_tensor(batch['input_ids'], dtype=torch.long, device=device)
        attention_mask = torch.as_tensor(batch['attention_mask'], dtype=torch.long, device=device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Pool in float32 in case the model runs in reduced precision.
        hidden = outputs.last_hidden_state.float()
        mask = attention_mask.to(hidden.device).unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a fully masked row
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = (hidden * mask).sum(dim=1) / token_counts
        # Move to CPU right away so accelerator memory does not grow with the dataset.
        pooled_batches.append(pooled.cpu())

    if pooled_batches:
        embeddings = torch.cat(pooled_batches, dim=0)
    else:
        # Empty dataset: return an empty matrix instead of failing in torch.cat
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        embeddings = torch.empty((0, hidden_size), dtype=torch.float32)

    # Downstream cells read `.shape` on the labels, so return them as a tensor.
    labels = dataset['labels']
    if not torch.is_tensor(labels):
        labels = torch.tensor(list(labels), dtype=torch.float32)
    labels = labels.to(dtype=torch.float32, device='cpu')

    return {'embeddings': embeddings, 'labels': labels}

In [None]:
# Embed train/val/test datasets
BATCH_SIZE = 4

# Make sure the output directory exists before the expensive embedding starts,
# so a missing folder cannot make the run fail at save time.
(paths.DATA_PATH_PREPROCESSED/'line_label_pred').mkdir(parents=True, exist_ok=True)

# Embed each split and save it right away, so finished work survives a later failure.
train_embeddings = embed_dataset(train_dataset, model, BATCH_SIZE)
torch.save(train_embeddings, paths.DATA_PATH_PREPROCESSED/'line_label_pred/llama2-train_embeddings.pt')

val_embeddings = embed_dataset(val_dataset, model, BATCH_SIZE)
torch.save(val_embeddings, paths.DATA_PATH_PREPROCESSED/'line_label_pred/llama2-val_embeddings.pt')

test_embeddings = embed_dataset(test_dataset, model, BATCH_SIZE)
torch.save(test_embeddings, paths.DATA_PATH_PREPROCESSED/'line_label_pred/llama2-test_embeddings.pt')

In [None]:
# Free up memory
try:
    del model
except NameError:
    # The cell was already run: the model reference is gone, nothing to drop.
    pass
# Collect unreachable objects first so tensors they still hold are released,
# then return the cached, now unused, CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Classification Head (Linear)
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

class ClassificationDataset(Dataset):
    """Pairs pre-computed embeddings with their labels for classification."""

    def __init__(self, embeddings: torch.Tensor, labels: torch.Tensor):
        # Both containers are stored as given and indexed in parallel.
        self.labels = labels
        self.embeddings = embeddings

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Return the (embedding, label) pair stored at position `idx`.
        sample, target = self.embeddings[idx], self.labels[idx]
        return sample, target

class ClassificationHead(nn.Module):
    """Single linear layer mapping an input vector to one logit per class."""

    def __init__(self, input_dim: int = None, output_dim: int = 3):
        super().__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        # The attribute name ends up in the keys of the saved state_dict.
        self.linear1 = nn.Linear(in_features=self.input_dim, out_features=self.output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Raw logits; no activation is applied here.
        logits = self.linear1(x)
        return logits

In [None]:
# Hyperparameters
LEARNING_RATE = 0.001
EPOCHS = 10
# Layer widths are read off the embedded training split.
HIDDEN_DIM = train_embeddings['embeddings'].shape[1]
OUTPUT_DIM = train_embeddings['labels'].shape[1]

# Wrap the pre-computed embeddings and labels of every split in a torch Dataset
train_dataset = ClassificationDataset(
    embeddings=train_embeddings['embeddings'],
    labels=train_embeddings['labels'],
)
val_dataset = ClassificationDataset(
    embeddings=val_embeddings['embeddings'],
    labels=val_embeddings['labels'],
)
test_dataset = ClassificationDataset(
    embeddings=test_embeddings['embeddings'],
    labels=test_embeddings['labels'],
)

# Dataloaders: only the training split is shuffled
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=4)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=4)

# Linear classification head sized from the embeddings and labels
model = ClassificationHead(HIDDEN_DIM, OUTPUT_DIM)

# Optimizer over the head's parameters
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Cross-entropy loss
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Train Loop
# Start from +inf so the first epoch is checkpointed too; otherwise no
# checkpoint would exist whenever the first epoch is the best one.
best_val_loss = float('inf')
checkpoint_path = paths.MODEL_PATH/'line_label_pred/llama2-classification-head.pt'
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
# Batches are moved to wherever the classification head lives.
device = next(model.parameters()).device

for epoch in range(EPOCHS):
    # Training
    model.train()
    # `tqdm` is imported as a module, so the progress bar class is `tqdm.tqdm`.
    bar = tqdm.tqdm(train_dataloader)

    for batch in bar:
        optimizer.zero_grad()
        embeddings, labels = (tensor.to(device) for tensor in batch)
        logits = model(embeddings)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        bar.set_description(f"Epoch {epoch} loss: {loss.item():.5f}")

    # Validation
    model.eval()
    bar = tqdm.tqdm(val_dataloader)
    val_loss = 0
    with torch.no_grad():
        for batch in bar:
            embeddings, labels = (tensor.to(device) for tensor in batch)
            logits = model(embeddings)
            loss = loss_fn(logits, labels)
            val_loss += loss.item()
    # Mean loss per validation batch; max() guards against an empty loader.
    val_loss /= max(len(val_dataloader), 1)
    print(f"Validation loss: {val_loss:.5f}")

    # Save model whenever the validation loss improves on the best one so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Test
# Restore the saved classification head and switch to inference mode.
model.load_state_dict(torch.load(paths.MODEL_PATH/'line_label_pred/llama2-classification-head.pt'))
model.eval()
device = next(model.parameters()).device
# `tqdm` is imported as a module, so the progress bar class is `tqdm.tqdm`.
bar = tqdm.tqdm(test_dataloader)

# Accumulators use their own names so the per-batch tensors cannot overwrite them.
all_logits = []
all_labels = []
all_embeddings = []

with torch.no_grad():
    for batch_embeddings, batch_labels in bar:
        batch_logits = model(batch_embeddings.to(device))
        all_logits.append(batch_logits.cpu())
        all_labels.append(batch_labels.cpu())
        all_embeddings.append(batch_embeddings.cpu())

# Concatenate the per-batch tensors along the sample dimension and persist them.
results = {
    'logits': torch.cat(all_logits, dim=0),
    'labels': torch.cat(all_labels, dim=0),
    'embeddings': torch.cat(all_embeddings, dim=0),
}
torch.save(results, paths.RESULTS_PATH/'line_labelling-LLAMA2-classification-test_output.pt')

In [20]:
# Tokenize a single prompt on the GPU, then cast every tensor of the encoding to int32.
model_inputs = tokenizer(['A list of colors: red, blue'], return_tensors="pt").to("cuda")
model_inputs = dict(
    (name, tensor.to(torch.int32).to("cuda"))
    for name, tensor in model_inputs.items()
)

In [21]:
# Inspect the dtype of the token ids after the int32 cast in the previous cell.
model_inputs['input_ids'].dtype

torch.int32

In [26]:
# Generate a continuation of the prompt (max_new_tokens=20).
# NOTE(review): `model` must be a generation-capable model here; the cell that
# loaded it is not visible in this notebook - confirm which model was in scope.
generated_ids = model.generate(**model_inputs, max_new_tokens=20)

In [27]:
# Decode the generated ids back to text (special tokens skipped) and show the first sequence.
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'A list of colors: red, blue, and yellow. профн.\n1. The list of colors: red, blue, and'

In [31]:
# Build a text-generation pipeline around the model and tokenizer already in scope.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float32,
)

In [35]:
# Sample one continuation for a fixed prompt (top_k=10 sampling, max_length=200).
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    max_length=200,
    eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    top_k=10,
    do_sample=True,
)
# Print the generated text of every returned sequence.
for generated in sequences:
    print(f"Result: {generated['generated_text']}")

Result: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
 Unterscheidung: Die Schlacht im Hürtgenwald. München: List, 2003.
# 3
# 1945
Ich warf den Kopf auf die Seite und starrte auf den Boden. Die Erde war von einer schmutzigen, dicken Asche bedeckt. Das Blut von den Wunden, die ich auf der Stelle bekommen hatte, war von der Erde verblasst.
Ich schlug die Augen wieder auf. Auf dem Boden vor mir lag ein Mann mit einem weißen Hemd, das mit Blut und Schmutz übersät war. Seine Hände lagen auf der Brust, die Augen waren geschlossen, und die Mundwinkel waren gesenkt. Ich scha


In [34]:
import subprocess as sp
import os

def get_gpu_memory():
    """Return the free memory reported by `nvidia-smi`, one value per GPU row.

    Runs `nvidia-smi --query-gpu=memory.free --format=csv`, skips the CSV
    header line and parses the leading integer of every remaining row.

    Returns:
        list[int]: Free-memory values in the unit printed by nvidia-smi
            (presumably MiB - confirm against the tool's output).

    Raises:
        FileNotFoundError: If `nvidia-smi` is not available.
        subprocess.CalledProcessError: If `nvidia-smi` exits with an error.
    """
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    output = sp.check_output(command.split()).decode('ascii')
    # splitlines() plus the blank filter copes with output that has or lacks a
    # trailing newline, so the last GPU row is never dropped.
    rows = [line for line in output.splitlines() if line.strip()]
    # rows[0] is the CSV header; the first token of each data row is the number.
    return [int(row.split()[0]) for row in rows[1:]]

# Show the free GPU memory values currently reported by nvidia-smi.
get_gpu_memory()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[535]