In [1]:
# google colab related stuff
def is_running_on_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

if is_running_on_colab():
    import os
    os.chdir("/content")
    !git clone https://github.com/lukas-hinterleitner/master-thesis.git
    
    os.chdir("/content/master-thesis")
    !git submodule init
    !git submodule update
    
    !pip uninstall ibis-framework torchvision torchaudio -y
    !pip install -r requirements.txt
    os.kill(os.getpid(), 9)

In [2]:
# google colab related stuff
if is_running_on_colab():
    import os
    os.chdir("/content/master-thesis/code")

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk

from config import hf_model_id, lima_filtered_paraphrased_dataset_path, get_dataset_config
from torch.utils.data import Subset
import torch
import numpy as np
import random
import gc

from utilities.preprocessing import prepare_dataset
from utilities.gradient_operations import get_gradients, get_flattened_weight_vector

[2024-11-07 22:03:37,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -c /tmp/tmp52v2k_a5/test.c -o /tmp/tmp52v2k_a5/test.o
INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat /tmp/tmp52v2k_a5/test.o -laio -o /tmp/tmp52v2k_a5/a.out
/home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
INFO:root:gcc -pthread -B /home/lukashinterleitner/anaconda3/envs/master-thesis/compiler_compat -fno-strict-overflow -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include -fPIC -O2 -isystem /home/lukashinterleitner/anaconda3/envs/master-thesis/include 

In [4]:
# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
torch.cuda.empty_cache()
gc.collect()

88

In [6]:
model = AutoModelForCausalLM.from_pretrained(hf_model_id)
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)

print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)

50279
1


In [7]:
dataset_config = get_dataset_config(model)
dataset_config

DatasetConfig(chat_template='tulu', preference_chosen_key='chosen', preference_rejected_key='rejected', sft_messages_key='messages', binary_messages_key='messages', label='binary_labels', convert_preference_to_binary_dataset=False, max_token_length=2048, max_prompt_token_length=None, sanity_check=False, sanity_check_max_samples=100, batched=False, load_from_cache_file=True, num_proc=12, train_only_on_prompt=True, ncols=2)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [9]:
# remove the comment from the following line, if the model should be processed on a GPU
# model.to(device)

In [10]:
model.eval()

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): OlmoRotaryEmbedding()
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
  )
  (

In [11]:
dataset = load_from_disk(lima_filtered_paraphrased_dataset_path)

In [12]:
dataset.column_names

['id', 'messages', 'paraphrased_messages']

In [13]:
# create two datasets for original messages and paraphrased messages

original_dataset_columns = ["id", "messages"]
paraphrased_dataset_columns = ["id", "paraphrased_messages"]

original_dataset = dataset.select_columns(original_dataset_columns)
paraphrased_dataset = dataset.select_columns(paraphrased_dataset_columns)

In [14]:
print(original_dataset.column_names)
print(paraphrased_dataset_columns)

['id', 'messages']
['id', 'paraphrased_messages']


In [15]:
# rename paraphrased messages to messages since open-instruct encode_sft_example only works with 'messages'key
paraphrased_dataset = paraphrased_dataset.rename_column("paraphrased_messages", "messages")

In [16]:
original_dataset

Dataset({
    features: ['id', 'messages'],
    num_rows: 988
})

In [17]:
paraphrased_dataset

Dataset({
    features: ['id', 'messages'],
    num_rows: 988
})

In [18]:
sample_size = 5

In [19]:
original_dataset_tokenized = prepare_dataset(dataset=dataset, tokenizer=tokenizer, model=model, sample_size=sample_size)

Tokenizing and reformatting instruction data:   0%|          | 0/5 [00:00<?, ? examples/s]

In [20]:
paraphrased_dataset_tokenized = prepare_dataset(dataset=paraphrased_dataset, tokenizer=tokenizer, model=model, sample_size=sample_size)

Tokenizing and reformatting instruction data:   0%|          | 0/5 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

In [22]:
gradients = dict()

for original in original_dataset_tokenized:
    original_id = original["id"][0][0]
    
    original_gradients = get_gradients(model, original)
    original_flattened_gradients = get_flattened_weight_vector(gradients)#.to(device)
    
    for paraphrased in paraphrased_dataset_tokenized:
        paraphrased_id = paraphrased["id"][0][0]
        
        paraphrased_gradients = get_gradients(model, paraphrased)
        paraphrased_flattened_gradients = get_flattened_weight_vector(paraphrased_gradients)#.to(device)
        
        gradients[(original_id, paraphrased_id)] = original_flattened_gradients.dot(paraphrased_flattened_gradients)

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


In [23]:
gradients

{('lima_0', 'lima_0'): 0,
 ('lima_0', 'lima_1'): 0,
 ('lima_0', 'lima_2'): 0,
 ('lima_0', 'lima_3'): 0,
 ('lima_0', 'lima_4'): 0,
 ('lima_1', 'lima_0'): 0,
 ('lima_1', 'lima_1'): 0,
 ('lima_1', 'lima_2'): 0,
 ('lima_1', 'lima_3'): 0,
 ('lima_1', 'lima_4'): 0,
 ('lima_2', 'lima_0'): 0,
 ('lima_2', 'lima_1'): 0,
 ('lima_2', 'lima_2'): 0,
 ('lima_2', 'lima_3'): 0,
 ('lima_2', 'lima_4'): 0,
 ('lima_3', 'lima_0'): 0,
 ('lima_3', 'lima_1'): 0,
 ('lima_3', 'lima_2'): 0,
 ('lima_3', 'lima_3'): 0,
 ('lima_3', 'lima_4'): 0,
 ('lima_4', 'lima_0'): 0,
 ('lima_4', 'lima_1'): 0,
 ('lima_4', 'lima_2'): 0,
 ('lima_4', 'lima_3'): 0,
 ('lima_4', 'lima_4'): 0}

In [15]:
#flattened_gradients.size()

torch.Size([1176764416])

In [16]:
# todo: investigate how olmo uses a single training iteration, check masking
# todo: add filtering with regard to open instruct (threshold for similarity)
# todo: ranking between sampling
# todo: tf-idf -> term-frequency inverse-document-frequency
# todo: think about explainability vs. similarity

In [17]:
# random projections to reduce weight vector size
# compare ranking to other algorithms: bm25, tf-idf, (rouge optionally)