In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk

from config import hf_model_id, lima_filtered_paraphrased_dataset_path
import torch
import numpy as np
import random
import gc

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

  from .autonotebook import tqdm as notebook_tqdm


[2024-09-18 23:08:22,416] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) with the same value so repeated runs are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic behaviour and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Run the garbage collector first: objects that are only kept alive by
# reference cycles are freed here, and only afterwards can empty_cache()
# release the CUDA memory they were holding in PyTorch's caching allocator.
unreachable_count = gc.collect()
torch.cuda.empty_cache()

# Keep the collector's return value as the cell output.
unreachable_count

109

In [4]:
# Load the tokenizer and the causal language model from the same checkpoint
# id (hf_model_id is imported from the project config).
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
model = AutoModelForCausalLM.from_pretrained(hf_model_id)

# Show the end-of-sequence and padding token ids, one per line.
print(tokenizer.eos_token_id, tokenizer.pad_token_id, sep="\n")

50279
1


In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)
device

device(type='cuda')

In [6]:
# Put the model into evaluation mode. The call is the last expression of the
# cell, so its return value (the model) is echoed as the cell output.
# NOTE(review): `device` is computed in an earlier cell but the model is never
# moved to it in the visible notebook — confirm whether get_gradients handles
# device placement.
model.eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [7]:
# Read the previously saved dataset from the path configured in config.py.
dataset = load_from_disk(dataset_path=lima_filtered_paraphrased_dataset_path)

In [8]:
# Build the iterable of batches via the project helper. Judging by the progress
# output it tokenizes/reformats and filters the examples; presumably the result
# is a DataLoader — confirm in utilities.preprocessing.
train_dataloader = prepare_dataset(
    dataset=dataset,
    model=model,
    tokenizer=tokenizer,
)

Tokenizing and reformatting instruction data: 100%|██████████| 988/988 [00:03<00:00, 304.85 examples/s]
Filter: 100%|██████████| 988/988 [00:00<00:00, 11492.90 examples/s]


In [13]:
# Only the first batch is needed, so take one item from the iterator instead of
# building a list of every batch in the dataloader and discarding all but one.
sample_0 = next(iter(train_dataloader))

# Compute the gradients for that single batch with the project helper
# (see utilities.gradient_operations.get_gradients for the exact definition).
gradients_sample_0 = get_gradients(model, sample_0)

In [14]:
# Inspect the first batch; the echoed output shows a dict that contains an
# 'input_ids' tensor of token ids.
sample_0

{'input_ids': tensor([[   29,    93,  4537, 49651,   187,  5804,  3998,  1341,  2118,    32,
          2896,  4866,   309,  1599,  1048,  4181, 10346,   313, 11499, 31821,
          1561,   253,  3998,   760,   481,   187,    29,    93,   515,  5567,
         49651,   187,   510,  1953,   310,  4942,  3862,   285,   581,   943,
          1379,   715,  2395,   326,   253,  3998,   417,   760,  8414,   273,
          8512,    13,   533,   671, 42782,  1341,   313, 13821,   422,  1341,
            10,   285,   638,    14,  2225,  3875, 16069,  8424,  1341,    15,
          5488,    13,   347,  4619,  7715,    14, 30202,  1346,   452,  4860,
            13, 16743,  3924,   310,  1077,  1774,    13,   347,   253,  6684,
         24022,  3998,   310,  1077,  1027,   432,   253,  6782,  3998,    15,
           187,  6436,    13,   846,   256, 12545,   949,  2710, 16516,    13,
           253,  3662,   281,   253,  1953,   310,  2686, 24678,  2969,    27,
          6279,    13,  3998,  1341, 3

In [10]:
# TODO: investigate how OLMo performs a single training iteration; check masking
# TODO: add filtering with regard to Open-Instruct (threshold for similarity)
# TODO: ranking between sampling
# TODO: TF-IDF -> term frequency–inverse document frequency
# TODO: think about explainability vs. similarity

In [None]:
# TODO: use random projections to reduce the size of the weight vector
# TODO: compare the ranking to other algorithms: BM25, TF-IDF (optionally ROUGE)