### !!!
### Keep in mind, that the google colab session will crash after executing the first cell. This is necessary to load open-instruct as an editable package. Just continue by executing the next cell after the session has crashed.
### !!!

In [21]:
import time


# google colab related stuff
def is_running_on_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_running_on_colab():
    import os
    os.chdir("/content")
    !git clone https://github.com/lukas-hinterleitner/master-thesis.git

    os.chdir("/content/master-thesis")
    !git submodule init
    !git submodule update

    !pip uninstall torchvision torchaudio -y
    !pip install -r requirements.txt
    os.kill(os.getpid(), 9)

In [22]:
# google colab related stuff
def is_running_on_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_running_on_colab():
    import os
    os.chdir("/content/master-thesis/code")

    from google.colab import output
    output.enable_custom_widget_manager()

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from tqdm.notebook import tqdm

import torch
import random
import gc
import time

import numpy as np
import pandas as pd

from config import hf_model_id, lima_paraphrased_dataset_path, get_dataset_config, get_gradient_similarity_file_path

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

In [2]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
torch.cuda.empty_cache()
gc.collect()

25

In [4]:
model = AutoModelForCausalLM.from_pretrained(hf_model_id)
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)

print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [5]:
dataset_config = get_dataset_config(model)
dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='messages', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=False, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [6]:
use_gpu = True

device = torch.device('cuda' if torch.cuda.is_available() and use_gpu else 'cpu')
device

device(type='cuda')

In [7]:
model.to(device)
model.eval() # set to evaluation because we don't need to update weights

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [8]:
model.num_parameters()

1176764416

In [9]:
dataset = load_from_disk(lima_paraphrased_dataset_path)

In [10]:
dataset.column_names

['id', 'messages', 'paraphrased_messages']

In [11]:
# create two datasets for original messages and paraphrased messages

original_dataset_columns = ["id", "messages"]
paraphrased_dataset_columns = ["id", "paraphrased_messages"]

original_dataset = dataset.select_columns(original_dataset_columns)
paraphrased_dataset = dataset.select_columns(paraphrased_dataset_columns)

In [12]:
print(original_dataset.column_names)
print(paraphrased_dataset_columns)

['id', 'messages']
['id', 'paraphrased_messages']


In [13]:
# rename paraphrased messages to messages since open-instruct encode_sft_example only works with 'messages'key
paraphrased_dataset = paraphrased_dataset.rename_column("paraphrased_messages", "messages")

In [14]:
original_dataset

Dataset({
    features: ['id', 'messages'],
    num_rows: 988
})

In [15]:
paraphrased_dataset

Dataset({
    features: ['id', 'messages'],
    num_rows: 988
})

In [16]:
sample_size = 10 # original_dataset.num_rows
sample_size

10

In [17]:
original_dataset_tokenized = prepare_dataset(dataset=dataset, tokenizer=tokenizer, model=model, sample_size=sample_size)

Tokenizing and reformatting instruction data:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

In [18]:
paraphrased_dataset_tokenized = prepare_dataset(dataset=paraphrased_dataset, tokenizer=tokenizer, model=model, sample_size=sample_size)

Tokenizing and reformatting instruction data:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

In [29]:
start_time = time.time()

data = []

gradients = dict()

original_ids = set()
paraphrased_ids = set()

progress_wrapper = tqdm(original_dataset_tokenized, desc="Calculating gradients and corresponding similarities")

for original in progress_wrapper:
    original_id = original["id"][0][0]
    original_ids.add(original_id)

    original_gradients = get_gradients(model, original, device)
    original_flattened_gradients = get_flattened_weight_vector(original_gradients)

    for paraphrased in paraphrased_dataset_tokenized:
        paraphrased_id = paraphrased["id"][0][0]
        paraphrased_ids.add(paraphrased_id)
        
        progress_wrapper.set_description(desc=f"Processing original ({original_id}) and paraphrased ({paraphrased_id})")

        paraphrased_gradients = get_gradients(model, paraphrased, device)
        paraphrased_flattened_gradients = get_flattened_weight_vector(paraphrased_gradients)

        similarity = original_flattened_gradients.dot(paraphrased_flattened_gradients).item()
        data.append((original_id, paraphrased_id, similarity))
        
progress_wrapper.set_description("Calculating gradients and corresponding similarities")

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

  0%|          | 0/10 [00:00<?, ?it/s]

In [38]:
df = pd.DataFrame(data, columns=['original_id', 'paraphrased_id', 'value'])
df_pivot = df.pivot(index='original_id', columns='paraphrased_id', values='value')
df_pivot = df_pivot.reindex(index=sorted(original_ids), columns=sorted(paraphrased_ids))

In [39]:
df_pivot

paraphrased_id,lima_0,lima_1,lima_2,lima_3,lima_4,lima_5,lima_6,lima_7,lima_8,lima_9
original_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lima_0,700.738403,7.380151,26.236687,5.326636,-2.626546,35.676949,30.140007,7.95838,6.076571,8.484148
lima_1,-3.730602,196.84938,-1.58321,-5.861059,-2.444615,5.129015,15.378582,-12.913674,5.850517,5.514441
lima_2,11.600126,3.029165,960.965332,7.783887,11.206194,10.060546,34.408218,24.065556,-6.019435,7.923914
lima_3,11.620741,5.798695,9.425385,300.290955,0.150631,-10.277615,10.038214,-2.954804,1.152241,7.397229
lima_4,-4.73551,9.950197,11.52303,8.552574,388.990479,-4.557472,19.577757,5.981244,-8.240814,12.040644
lima_5,35.705952,6.106727,18.841635,-22.901018,-19.440754,846.697205,11.4716,14.80472,43.659912,7.644453
lima_6,47.727016,11.200463,23.724585,6.237095,13.382336,9.398446,399.436981,-1.615424,16.108091,1.934655
lima_7,1.834507,-0.49121,18.389734,1.1523,-4.126739,11.712915,11.202678,711.893921,29.263861,12.942389
lima_8,20.194534,5.915745,17.892975,0.000722,2.154916,4.304615,20.152128,16.248529,743.848145,-17.747938
lima_9,7.368604,2.516392,-11.044443,5.662416,6.940208,8.011875,22.922361,9.811154,3.914279,350.690247


In [40]:
df_pivot.to_csv(get_gradient_similarity_file_path(sample_size), index=True, header=True)

In [43]:
# google colab related stuff to publish data automatically to github repository
if is_running_on_colab():
    from google.colab import userdata
    os.chdir("/content/master-thesis")

    !git config user.email "{userdata.get('GIT_EMAIL')}"
    !git config user.name "{userdata.get('GIT_NAME')}"

    !git add .

    commit_message = "\"executing gradient similarity for sample size " + str(sample_size) + "\""
    !git commit -m '"{commit_message}"'
    !git remote set-url origin "https://{userdata.get('GIT_TOKEN')}@github.com/lukas-hinterleitner/master-thesis.git"
    !git push

[main bfb0ef2] ""executing gradient similarity for sample size 10""
 1 file changed, 11 insertions(+)
 create mode 100644 data/gradient_similarity/sample_size_10.csv
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 1.47 KiB | 1.47 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/lukas-hinterleitner/master-thesis.git
   5ad09d0..bfb0ef2  main -> main


In [None]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity

In [None]:
# random projections to reduce weight vector size
# compare ranking to other algorithms: bm25, tf-idf, (rouge optionally)