### !!!
### Keep in mind that the Google Colab session will crash after executing the first cell. This is intentional: the kernel has to restart so that open-instruct can be loaded as an editable package. Just continue with the next cell once the session has crashed.
### !!!

In [1]:
# google colab related stuff
def is_running_on_colab():
    """Tell whether the notebook is executing inside Google Colab.

    The check relies solely on whether the ``google.colab`` package can be
    imported in the current environment.

    Returns:
        bool: ``True`` if ``google.colab`` is importable, ``False`` otherwise.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

if is_running_on_colab():
    import os
    os.chdir("/content")
    !git clone https://github.com/lukas-hinterleitner/master-thesis.git

    os.chdir("/content/master-thesis")
    !git submodule init
    !git submodule update

    !pip install -r google_colab_requirements.txt
    os.kill(os.getpid(), 9)

In [2]:
# google colab related stuff
def is_running_on_colab():
    """Return ``True`` when ``google.colab`` can be imported, else ``False``.

    This helper is defined a second time on purpose: the first cell kills the
    kernel process, so nothing defined there survives into this cell.
    """
    on_colab = True
    try:
        import google.colab  # noqa: F401 -- availability probe only
    except ImportError:
        on_colab = False
    return on_colab

if is_running_on_colab():
    import os
    from google.colab import output

    # The project-local imports further down (config, utilities) are resolved
    # relative to the repository's code directory.
    os.chdir("/content/master-thesis/code")

    # Turn on Colab's custom widget manager
    # (presumably needed for the tqdm.notebook progress bars -- TODO confirm).
    output.enable_custom_widget_manager()

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from tqdm.notebook import tqdm

import torch
import random
import gc
import time

import numpy as np
import pandas as pd

from config import hf_model_id, lima_paraphrased_dataset_path, get_dataset_config, get_gradient_similarity_file_path

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

In [4]:
# Set seeds for reproducibility
# Single source of truth for the seed so all generators stay in sync.
SEED = 42

# Seed every random number generator this notebook relies on.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Ask cuDNN for deterministic behaviour and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
# Collect unreachable Python objects first so that any tensors they hold are
# released, then hand the now-unused cached CUDA memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

20

In [6]:
# Load the causal language model and its tokenizer for the configured model id.
model = AutoModelForCausalLM.from_pretrained(hf_model_id)
# `return_tensors` is an argument of the tokenization call, not of
# `from_pretrained`, so it is intentionally not passed here.
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)

# Show the special token ids for a quick sanity check.
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [9]:
# Opt-in switch: CUDA is used only when it is available AND explicitly enabled.
use_gpu = False

if torch.cuda.is_available() and use_gpu:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

device

device(type='cpu')

In [10]:
# Move the model to the selected device and switch it to evaluation mode;
# the weights are never updated in this notebook.
model.to(device).eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [11]:
# Display the parameter count reported by the loaded model.
model.num_parameters()

1176764416

In [12]:
# Load the dataset stored on disk at the path configured in `config`.
dataset = load_from_disk(lima_paraphrased_dataset_path)

In [13]:
# Inspect which columns the loaded dataset provides.
dataset.column_names

['id', 'messages', 'paraphrased_messages']

In [14]:
# Number of examples passed to `prepare_dataset`; replace the literal with
# `dataset.num_rows` to process the whole dataset.
sample_size = 5 # dataset.num_rows
sample_size

5

In [15]:
# Dataset configuration whose SFT messages key points at the original
# (non-paraphrased) `messages` column.
original_dataset_config = get_dataset_config(model, sft_messages_key="messages")
original_dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='messages', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=True, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [16]:
# Same configuration as for the originals, but with the SFT messages key
# pointing at the `paraphrased_messages` column.
paraphrased_dataset_config = get_dataset_config(model, sft_messages_key="paraphrased_messages")
paraphrased_dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='paraphrased_messages', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=True, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [17]:
# Prepare `sample_size` examples using the configuration for the original messages.
original_dataset_tokenized = prepare_dataset(
    dataset=dataset,
    tokenizer=tokenizer,
    dataset_config=original_dataset_config,
    sample_size=sample_size,
)

Tokenizing and reformatting SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

Filtering SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

In [18]:
# Prepare `sample_size` examples using the configuration for the paraphrased messages.
paraphrased_dataset_tokenized = prepare_dataset(
    dataset=dataset,
    tokenizer=tokenizer,
    dataset_config=paraphrased_dataset_config,
    sample_size=sample_size,
)

Tokenizing and reformatting SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

Filtering SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

In [21]:
# Compute, for every (original, paraphrased) pair, the dot product of the two
# flattened gradient vectors and collect the results for the pivot table below.

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments.
start_time = time.perf_counter()

# One (original_id, paraphrased_id, dot product) tuple per processed pair.
data = []

# Ids seen on each side; used later to order the rows/columns of the pivot.
original_ids = set()
paraphrased_ids = set()

progress_wrapper = tqdm(original_dataset_tokenized, desc="Calculating gradients and corresponding similarities")

for original in progress_wrapper:
    original_id = original["id"][0][0]
    original_ids.add(original_id)

    # Only the flattened vector is needed afterwards; not binding the
    # intermediate gradient container to a name lets it be freed right away
    # instead of being kept alive for the whole inner loop.
    original_flattened_gradients = get_flattened_weight_vector(get_gradients(model, original, device))

    # NOTE(review): the paraphrased gradients are recomputed for every original.
    # Caching them would trade that compute for holding one full-size gradient
    # vector per sample in memory.
    for paraphrased in paraphrased_dataset_tokenized:
        paraphrased_id = paraphrased["id"][0][0]
        paraphrased_ids.add(paraphrased_id)

        progress_wrapper.set_description(desc=f"Processing original ({original_id}) and paraphrased ({paraphrased_id})")

        paraphrased_flattened_gradients = get_flattened_weight_vector(get_gradients(model, paraphrased, device))

        # Un-normalised dot product of the two gradient vectors.
        similarity = original_flattened_gradients.dot(paraphrased_flattened_gradients).item()
        data.append((original_id, paraphrased_id, similarity))

# Restore the generic description once all pairs have been processed.
progress_wrapper.set_description("Calculating gradients and corresponding similarities")

end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Calculating gradients and corresponding similarities:   0%|          | 0/20 [00:00<?, ?it/s]

[{'id': 'lima_0', 'input_ids': tensor([151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
           553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
            13, 151645,    198, 151644,    872,    198,   6713,   8109,   7761,
          3271,     30,   3216,   7203,    358,   3076,   1293,   6010,  11906,
           320,  80060,   2845,   2878,    279,   8109,   1172,    568, 151645,
           198, 151644,  77091,    198,    785,   3405,    374,  12040,   7205,
           323,    825,   1265,   1896,   1119,   2692,    429,    279,   8109,
           537,   1172,  17167,    315,  33213,     11,    714,   1083,   2770,
           530,   7761,    320,  23362,    533,   7761,      8,    323,    855,
          1448,    275,  14212,  78302,  19101,   7761,     13,  23405,     11,
           438,   9023,  12357,   1331,  12295,   1671,    614,  16317,     11,
         46906,   6430,    374,   1602,   2989,     11,    438,    279,  11220,
         

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


OutOfMemoryError: CUDA out of memory. Tried to allocate 238.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 224.56 MiB is free. Including non-PyTorch memory, this process has 3.58 GiB memory in use. Of the allocated memory 3.39 GiB is allocated by PyTorch, and 130.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [30]:
# Long format: one row per (original, paraphrased) pair.
df = pd.DataFrame(data, columns=['original_id', 'paraphrased_id', 'value'])

# Wide format: originals as rows, paraphrases as columns. The ids are strings,
# so sorted() orders them lexicographically (e.g. lima_1 before lima_10).
df_pivot = (
    df
    .pivot(index='original_id', columns='paraphrased_id', values='value')
    .reindex(index=sorted(original_ids), columns=sorted(paraphrased_ids))
)

In [31]:
# Display the similarity matrix (rows: original ids, columns: paraphrased ids).
df_pivot

paraphrased_id,lima_0,lima_1,lima_10,lima_11,lima_12,lima_13,lima_14,lima_15,lima_16,lima_17,lima_18,lima_19,lima_2,lima_3,lima_4,lima_5,lima_6,lima_7,lima_8,lima_9
original_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
lima_0,704.572021,7.378688,11.461467,12.120822,18.128651,22.099636,6.780462,44.812889,27.117548,13.710809,22.579773,21.729382,26.238413,5.326207,-2.626504,35.678165,30.139153,7.959303,6.076345,8.485947
lima_1,-3.731161,197.466629,12.230677,13.620668,10.966014,0.252617,5.198039,26.114071,15.929282,9.757661,8.133455,6.573665,-1.583301,-5.860563,-2.444476,5.129814,15.379052,-12.913933,5.85069,5.515174
lima_10,24.279398,7.066396,690.942078,21.41239,9.884799,26.556271,9.687924,33.067459,28.441954,29.32243,9.918931,17.085218,19.193981,6.010208,-1.21536,21.6073,13.291119,15.29687,44.949654,0.306385
lima_11,11.765073,8.633123,11.026361,275.722687,4.08941,-1.336506,10.571337,32.076054,29.813339,9.991892,1.225374,-0.875437,16.728966,4.716888,15.499039,20.712608,16.65794,13.541716,11.697985,9.997189
lima_12,22.049469,5.885386,6.101056,1.379657,282.684845,-4.566419,17.275784,33.573536,23.106266,1.961985,4.333669,31.534882,19.186174,-4.832311,7.06675,7.814498,11.545047,4.826401,-3.1691,0.732262
lima_13,37.223312,11.876122,17.526442,7.855901,57.621433,380.768677,-4.262122,68.318329,35.388733,11.544577,14.263382,16.682079,14.65345,22.357937,-2.479635,5.0073,35.098022,6.769525,34.623363,20.330606
lima_14,4.918539,4.930912,10.846545,9.903999,16.727095,12.838992,458.224579,28.426016,50.470036,11.161835,11.464369,17.788528,0.854584,-2.163798,0.188994,24.338095,0.405535,-5.581327,26.88941,-10.534447
lima_15,40.746933,18.665174,29.513676,24.577553,50.159138,32.227482,39.014606,547.299866,71.611191,19.945551,27.825581,25.614035,53.939739,0.416932,15.009769,31.623775,40.51981,10.406205,47.468437,2.686373
lima_16,40.143883,14.668742,23.587147,26.588387,6.933215,31.908892,35.974327,68.808434,783.27832,32.690968,9.085128,35.049385,13.432009,9.141353,11.337858,67.491585,23.455008,7.400437,34.523525,9.427513
lima_17,8.32119,1.392809,9.488236,3.398846,12.877096,0.269927,1.323111,9.783303,5.843002,569.872925,10.957796,15.750617,23.347006,7.131196,-2.547755,-7.429474,11.035873,20.937025,-4.053282,2.733387


In [34]:
# The /content checkout exists only on Colab, and `os` is imported only inside
# Colab-specific branches, so the directory change must be guarded; everywhere
# else the file is written relative to the current working directory.
if is_running_on_colab():
    import os
    os.chdir("/content/master-thesis/code")

# Persist the similarity matrix, keeping both the row index and the header.
df_pivot.to_csv(get_gradient_similarity_file_path(sample_size), index=True, header=True)

In [35]:
# google colab related stuff to publish data automatically to github repository
if is_running_on_colab():
    from google.colab import userdata
    os.chdir("/content/master-thesis")

    !git config user.email "{userdata.get('GIT_EMAIL')}"
    !git config user.name "{userdata.get('GIT_NAME')}"

    !git add .

    commit_message = "\"executing gradient similarity for sample size " + str(sample_size) + "\""
    !git commit -m '"{commit_message}"'
    !git remote set-url origin "https://{userdata.get('GIT_TOKEN')}@github.com/lukas-hinterleitner/master-thesis.git"
    !git push

[main c3d2faa] ""executing gradient similarity for sample size 20""
 1 file changed, 21 insertions(+)
 create mode 100644 data/gradient_similarity/sample_size_20.csv
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 12 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 4.28 KiB | 4.28 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/lukas-hinterleitner/master-thesis.git
   d40ac03..c3d2faa  main -> main


In [None]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity

In [None]:
# random projections to reduce weight vector size
# compare ranking to other algorithms: bm25, tf-idf, (rouge optionally)