### !!!
### Keep in mind, that the google colab session will crash after executing the first cell. This is necessary to load open-instruct as an editable package. Just continue by executing the next cell after the session has crashed.
### !!!

In [None]:
# google colab related stuff
def is_running_on_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_running_on_colab():
    import os
    os.chdir("/content")
    !git clone https://github.com/lukas-hinterleitner/master-thesis.git

    os.chdir("/content/master-thesis")
    !git submodule init
    !git submodule update

    !pip install -r google_colab_requirements.txt
    os.kill(os.getpid(), 9)

Cloning into 'master-thesis'...
remote: Enumerating objects: 313, done.[K
remote: Counting objects: 100% (197/197), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 313 (delta 109), reused 128 (delta 47), pack-reused 116 (from 1)[K
Receiving objects: 100% (313/313), 46.10 MiB | 37.56 MiB/s, done.
Resolving deltas: 100% (153/153), done.
Submodule 'submodules/open-instruct' (https://github.com/allenai/open-instruct) registered for path 'submodules/open-instruct'
Cloning into '/content/master-thesis/submodules/open-instruct'...
Submodule path 'submodules/open-instruct': checked out 'bdc3fa6e79c455b1b9ef6c4170b688ed4aa0c5e0'
Obtaining file:///content/master-thesis/submodules/open-instruct (from -r google_colab_requirements.txt (line 12))
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproje

In [1]:
# google colab related stuff
def is_running_on_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_running_on_colab():
    import os
    os.chdir("/content/master-thesis/code")

    from google.colab import output
    output.enable_custom_widget_manager()

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from tqdm.notebook import tqdm

import torch
import random
import gc
import time

import numpy as np
import pandas as pd

from config.model import hf_model_id
from config.dataset import get_dataset_config
from config.storage import lima_paraphrased_dataset_path, get_gradient_similarity_file_path

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
torch.cuda.empty_cache()
gc.collect()

25

In [4]:
model = AutoModelForCausalLM.from_pretrained(hf_model_id)
tokenizer = AutoTokenizer.from_pretrained(hf_model_id, return_tensors="pt")

print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [6]:
use_gpu = True

device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
device

device(type='cpu')

In [7]:
model.to(device)
model.eval() # set to evaluation because we don't need to update weights

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [8]:
model.num_parameters()

1176764416

In [9]:
dataset = load_from_disk(lima_paraphrased_dataset_path)

In [10]:
dataset.column_names

['id', 'messages', 'paraphrased_messages']

In [11]:
sample_size = 5 # original_dataset.num_rows
sample_size

5

In [12]:
original_dataset_config = get_dataset_config(model, sft_messages_key="messages")
original_dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='messages', ground_truths_key='ground_truth', dataset_source_key='dataset', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=True, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [13]:
paraphrased_dataset_config = get_dataset_config(model, sft_messages_key="paraphrased_messages")
paraphrased_dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='paraphrased_messages', ground_truths_key='ground_truth', dataset_source_key='dataset', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=True, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [14]:
original_dataset_tokenized = prepare_dataset(dataset=dataset, tokenizer=tokenizer, dataset_config=original_dataset_config, sample_size=sample_size)

Tokenizing and reformatting SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

Filtering SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
paraphrased_dataset_tokenized = prepare_dataset(dataset=dataset, tokenizer=tokenizer, dataset_config=paraphrased_dataset_config, sample_size=sample_size)

Tokenizing and reformatting SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

Filtering SFT data:   0%|          | 0/5 [00:00<?, ? examples/s]

In [19]:
start_time = time.time()

data = []

gradients = dict()

original_ids = set()
paraphrased_ids = set()

progress_wrapper = tqdm(original_dataset_tokenized, desc="Calculating gradients and corresponding similarities")

for original in progress_wrapper:
    original_id = original["id"]
    original_ids.add(original_id)

    original_gradients = get_gradients(model, original, device)
    original_flattened_gradients = get_flattened_weight_vector(original_gradients)

    for paraphrased in paraphrased_dataset_tokenized:
        paraphrased_id = paraphrased["id"]
        paraphrased_ids.add(paraphrased_id)

        progress_wrapper.set_description(desc=f"Processing original ({original_id}) and paraphrased ({paraphrased_id})")

        paraphrased_gradients = get_gradients(model, paraphrased, device)
        paraphrased_flattened_gradients = get_flattened_weight_vector(paraphrased_gradients)

        similarity = original_flattened_gradients.dot(paraphrased_flattened_gradients).item()
        data.append((original_id, paraphrased_id, similarity))

progress_wrapper.set_description("Calculating gradients and corresponding similarities")

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Calculating gradients and corresponding similarities:   0%|          | 0/5 [00:00<?, ?it/s]

Execution time: 97.0526909828186 seconds


In [20]:
df = pd.DataFrame(data, columns=['original_id', 'paraphrased_id', 'value'])
df_pivot = df.pivot(index='original_id', columns='paraphrased_id', values='value')
df_pivot = df_pivot.reindex(index=sorted(original_ids), columns=sorted(paraphrased_ids))

In [21]:
df_pivot

paraphrased_id,lima_0,lima_1,lima_2,lima_3,lima_4
original_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lima_0,704.572021,7.378706,26.238413,5.326203,-2.626506
lima_1,-3.731173,197.466614,-1.583303,-5.860563,-2.444476
lima_2,11.600275,3.028663,969.945374,7.783442,11.206348
lima_3,11.6197,5.798289,9.424985,302.027283,0.150802
lima_4,-4.735126,9.952369,11.523482,8.553665,391.019653


In [22]:
os.chdir("/content/master-thesis/code")
df_pivot.to_csv(get_gradient_similarity_file_path(sample_size), index=True, header=True)

In [23]:
# google colab related stuff to publish data automatically to github repository
if is_running_on_colab():
    from google.colab import userdata
    os.chdir("/content/master-thesis")

    !git config user.email "{userdata.get('GIT_EMAIL')}"
    !git config user.name "{userdata.get('GIT_NAME')}"

    !git add .

    commit_message = "\"executing gradient similarity for sample size " + str(sample_size) + "\""
    !git commit -m '"{commit_message}"'
    !git remote set-url origin "https://{userdata.get('GIT_TOKEN')}@github.com/lukas-hinterleitner/master-thesis.git"
    !git push

[main 4ebd9df] ""executing gradient similarity for sample size 5""
 1 file changed, 6 insertions(+), 6 deletions(-)
 rewrite data/gradient_similarity/sample_size_5.csv (91%)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 12 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 748 bytes | 748.00 KiB/s, done.
Total 5 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/lukas-hinterleitner/master-thesis.git
   ba4cd4e..4ebd9df  main -> main


In [None]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity

In [None]:
# random projections to reduce weight vector size
# compare ranking to other algorithms: bm25, tf-idf, (rouge optionally)