In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from config import data_path
import torch
import numpy as np
import random

from utilities.dataset import prepare_dataset

[2024-09-06 10:36:46,437] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
model = AutoModelForCausalLM.from_pretrained("allenai/OLMo-1B-hf")
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")

print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
model.eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [6]:
dataset = load_from_disk(data_path)
dataset

Dataset({
    features: ['dataset', 'id', 'messages'],
    num_rows: 988
})

In [7]:
dataset["messages"][0][0]["content"]

'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).'

In [8]:
dataset["messages"][0][1]["content"]

'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migrate over long distances in response to injury (Imitola et al., 2004) and they migrate from specific stem-cell locations (e.g.

In [9]:
train_dataloader = prepare_dataset(dataset=dataset, model=model, tokenizer=tokenizer)

Tokenizing and reformatting instruction data:   0%|          | 0/988 [00:00<?, ? examples/s]

Filter:   0%|          | 0/988 [00:00<?, ? examples/s]

In [10]:
print(len(train_dataloader))

988


In [11]:
list(train_dataloader)[0]

{'input_ids': tensor([[   29,    93,  4537, 49651,   187,  5804,  3998,  1341,  2118,    32,
          2896,  4866,   309,  1599,  1048,  4181, 10346,   313, 11499, 31821,
          1561,   253,  3998,   760,   481,   187,    29,    93,   515,  5567,
         49651,   187,   510,  1953,   310,  4942,  3862,   285,   581,   943,
          1379,   715,  2395,   326,   253,  3998,   417,   760,  8414,   273,
          8512,    13,   533,   671, 42782,  1341,   313, 13821,   422,  1341,
            10,   285,   638,    14,  2225,  3875, 16069,  8424,  1341,    15,
          5488,    13,   347,  4619,  7715,    14, 30202,  1346,   452,  4860,
            13, 16743,  3924,   310,  1077,  1774,    13,   347,   253,  6684,
         24022,  3998,   310,  1077,  1027,   432,   253,  6782,  3998,    15,
           187,  6436,    13,   846,   256, 12545,   949,  2710, 16516,    13,
           253,  3662,   281,   253,  1953,   310,  2686, 24678,  2969,    27,
          6279,    13,  3998,  1341, 3

In [11]:
def get_gradients(batch):
    gradients = {}
    
    model.zero_grad()

    output = model(**batch, use_cache=False)
    loss = output.loss
    
    loss.backward()
    
    for name, param in model.named_parameters():
        if param.grad is not None:
            gradients[name] = param.grad.clone().detach()
            
    return gradients

In [12]:
decoded_input = tokenizer.decode(token_ids=list(train_dataloader)[0]["input_ids"][0].tolist())
decoded_input

'<|user|>\nCan brain cells move? By movement I mean long distance migration (preferably within the brain only).\n<|assistant|>\nThe question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells mi

In [12]:
decoded_output = tokenizer.decode(token_ids=[(tokenizer.pad_token_id if token == -100 else token) for token in list(train_dataloader)[0]["labels"][0].tolist()])
decoded_output

'<|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|><|padding|>The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of mig

---
# gradient checking - start

In [14]:
training_sample_0 = list(train_dataloader)[0]
training_sample_1 = list(train_dataloader)[1]

In [15]:
# first and foremost, check if two samples are different after tokenizing
if training_sample_0["input_ids"].equal(training_sample_1["input_ids"]): 
    print("Tokenized inputs are the same. Check tokenizing functionality!")
else:
    print("As expected, tokenized inputs are not the same. ")

As expected, tokenized inputs are not the same. 


In [16]:
# check idempotency of some input samples
gradients_sample_0 = get_gradients(training_sample_0)
gradients_sample_1 = get_gradients(training_sample_1)
gradients_sample_0_later = get_gradients(training_sample_0)

In [17]:
# gradient dictionary keys of sample_0, sample_0_later and sample_1 should be the same
assert gradients_sample_0.keys() == gradients_sample_0_later.keys() == gradients_sample_1.keys()

In [18]:
# compare gradient of the same sample
assert gradients_sample_0.keys() == gradients_sample_0_later.keys(), "Gradient dictionaries must have same keys."

for key in gradients_sample_0.keys():
    assert gradients_sample_0[key].equal(gradients_sample_0_later[key]), f"Gradient '{key}' not equal!"
    
print("Gradients are equal when using the same sample.")

Gradients are equal when using the same sample.


In [19]:
# compare gradients of two different samples
assert gradients_sample_0.keys() == gradients_sample_1.keys(), "Gradient dictionaries must have same keys."

for key in gradients_sample_0.keys():
    assert not gradients_sample_0[key].equal(gradients_sample_1[key]), f"Gradient '{key}' equal!"
    
print("Gradients are different when using two different samples.")

Gradients are different when using two different samples.


In [20]:
def get_flattened_weight_vector(weight_dict: dict) -> torch.Tensor:
    flattened_weights = []
    for weights in weight_dict.values():
        flattened_weights.append(weights.flatten())
        
    return torch.cat(flattened_weights)

In [21]:
get_flattened_weight_vector(gradients_sample_0)

tensor([-6.6105e-09,  2.3362e-09, -6.1912e-09,  ..., -2.1295e-04,
         1.4201e-03, -1.9168e-03])

In [22]:
cosine_similarity(get_flattened_weight_vector(gradients_sample_0), get_flattened_weight_vector(gradients_sample_0), dim=0)

NameError: name 'cosine_similarity' is not defined

In [21]:
#del training_sample_0
#del training_sample_1

#del gradients_sample_0
#del gradients_sample_1
#del gradients_sample_0_later

# gradient checking - end

---

In [14]:
training_sample_0 = list(train_dataloader)[0]

gradients_sample_0 = get_gradients(training_sample_0)

In [19]:
for key, value in gradients_sample_0.items():
    
    print(f"{key}: {value.shape}")

model.embed_tokens.weight: torch.Size([50304, 2048])
model.layers.0.self_attn.q_proj.weight: torch.Size([2048, 2048])
model.layers.0.self_attn.k_proj.weight: torch.Size([2048, 2048])
model.layers.0.self_attn.v_proj.weight: torch.Size([2048, 2048])
model.layers.0.self_attn.o_proj.weight: torch.Size([2048, 2048])
model.layers.0.mlp.gate_proj.weight: torch.Size([8192, 2048])
model.layers.0.mlp.up_proj.weight: torch.Size([8192, 2048])
model.layers.0.mlp.down_proj.weight: torch.Size([2048, 8192])
model.layers.1.self_attn.q_proj.weight: torch.Size([2048, 2048])
model.layers.1.self_attn.k_proj.weight: torch.Size([2048, 2048])
model.layers.1.self_attn.v_proj.weight: torch.Size([2048, 2048])
model.layers.1.self_attn.o_proj.weight: torch.Size([2048, 2048])
model.layers.1.mlp.gate_proj.weight: torch.Size([8192, 2048])
model.layers.1.mlp.up_proj.weight: torch.Size([8192, 2048])
model.layers.1.mlp.down_proj.weight: torch.Size([2048, 8192])
model.layers.2.self_attn.q_proj.weight: torch.Size([2048, 2

In [None]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity