In [2]:
from trak.projectors import CudaProjector
from trak.projectors import ProjectionType

In [7]:
import torch
import time

n_train, n_test, grad_size = 100, 200, 4000
train_grads = torch.randn(n_train, grad_size)
test_grads = torch.randn(n_test, grad_size)

start = time.time()
slow_res = []
for i in range(len(train_grads)):
    sims_for_train_i = []
    for j in range(len(test_grads)):
        sims_for_train_i.append(torch.nn.functional.cosine_similarity(train_grads[i], test_grads[j], dim=0))
    slow_res.append(torch.Tensor(sims_for_train_i).mean())
slow_res = torch.Tensor(slow_res)
slow_time = time.time() - start
print(f"Slow method runtime: {slow_time:.4f} seconds")


start = time.time()
fast_res = []
precomp_test_avg = torch.nn.functional.normalize(torch.nn.functional.normalize(test_grads, dim=1).mean(axis=0),dim=0)
for i in range(len(train_grads)):
    fast_res.append(torch.dot(torch.nn.functional.normalize(train_grads[i], dim=0), precomp_test_avg))
fast_res = torch.Tensor(fast_res)
fast_time = time.time() - start
print(f"Fast method runtime: {fast_time:.4f} seconds")



Slow method runtime: 0.7527 seconds
Fast method runtime: 0.0049 seconds


In [3]:
torch.cosine_similarity(train_grads, test_grads)

RuntimeError: The size of tensor a (100) must match the size of tensor b (200) at non-singleton dimension 0

In [29]:

def batched(a, precomputed_b):
    results = []
    for aa in torch.split(a, 100):
        results.append(torch.mv(torch.nn.functional.normalize(aa), precomputed_b))
  
    result = torch.cat(results)
    return result

start = time.time()
precomputed_b = torch.nn.functional.normalize(test_grads,dim=1).mean(axis=0)
print("precomputed_b", precomputed_b.shape)
batched_res = batched(train_grads, precomputed_b)
batched_time = time.time() - start
print(f"Batched method runtime: {batched_time:.4f} seconds")



assert torch.allclose(slow_res, fast_res), "Results fast wrong"
assert torch.allclose(slow_res, batched_res), "Results batched wrong"



precomputed_b torch.Size([4000])
Batched method runtime: 0.0102 seconds


In [47]:
torch.split(test_grads, 123)

(tensor([[-1.3856,  1.1161, -0.1036,  ...,  0.4223,  0.0343,  1.5029],
         [-0.6620,  0.8604,  1.0990,  ...,  0.6454, -2.5239,  0.3107],
         [ 0.1714, -0.3584, -0.1210,  ..., -0.2899, -1.2966, -1.7349],
         ...,
         [-0.4732, -0.6423, -1.1674,  ...,  0.4317,  0.6189, -1.6795],
         [ 0.5421,  0.1864,  0.3507,  ...,  0.7997, -0.1297, -0.9185],
         [ 0.5019,  1.6623, -0.5723,  ...,  0.9868,  0.8770, -1.0427]]),
 tensor([[ 3.4273e-01, -2.7250e-01, -4.6453e-01,  ..., -1.1107e+00,
          -1.2405e+00,  7.8210e-01],
         [-1.2946e+00, -3.9343e-01, -2.4400e+00,  ..., -3.2485e-01,
          -1.4577e+00,  9.9926e-01],
         [ 3.1344e-01,  2.3541e+00, -6.2582e-04,  ..., -1.5518e+00,
          -5.5519e-01, -7.3342e-01],
         ...,
         [-8.8507e-03, -3.2516e-01, -1.5228e+00,  ..., -1.1632e+00,
           1.0874e+00,  5.7149e-01],
         [-6.3087e-01,  4.1980e-01,  1.1003e+00,  ...,  1.3482e+00,
          -6.8238e-01,  3.9552e-01],
         [ 2.2978e-

In [46]:

def batched_chunked_mean(a, precomputed_b):
    results = []
    for aa in torch.split(a, 100):
        results.append(torch.mv(torch.nn.functional.normalize(aa), precomputed_b))
  
    result = torch.cat(results)
    return result

start = time.time()
precomputed_b = []
for bb in torch.split(test_grads, 123):
  precomputed_b.append(torch.nn.functional.normalize(bb,dim=1).sum(axis=0))
precomputed_b = torch.stack(precomputed_b).sum(axis=0) / test_grads.shape[0]



batched_res_chunked_mean = batched_chunked_mean(train_grads, precomputed_b)
batched_time = time.time() - start
print(f"Batched chunked mean method runtime: {batched_time:.4f} seconds")




assert torch.allclose(slow_res, batched_res_chunked_mean), "Results batched mean wrong"



Batched chunked mean method runtime: 0.0045 seconds


In [25]:
def batched_precomp_avg(a, precomputed_b_avg):
    results = []
    for aa in torch.split(a, 100):
        results.append(torch.mv(torch.nn.functional.normalize(aa, dim = 1), precomputed_b_avg))
    result = torch.cat(results)
    return result

start = time.time()
precomputed_b_avg = torch.nn.functional.normalize(test_grads, dim = 1).mean(axis = 0)
batched_precomp_avg_res = batched_precomp_avg(train_grads, precomputed_b_avg)
batched_precomp_avg_time = time.time() - start
print(f"Batched precomp avg method runtime: {batched_precomp_avg_time:.4f} seconds")
assert torch.allclose(slow_res, batched_precomp_avg_res), "Results batched wrong"

Batched precomp avg method runtime: 0.0020 seconds


In [31]:
train_grads.shape

torch.Size([1000, 4000])

In [24]:
train_grads.shape

torch.Size([1000, 4000])

In [23]:
slow_res.shape

torch.Size([1000])

In [16]:
torch.cosine_similarity(train_grads,test_grads)

RuntimeError: The size of tensor a (1000) must match the size of tensor b (200) at non-singleton dimension 0

In [None]:
test_grads

torch.Size([1000])

In [3]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim
johnson_lindenstrauss_min_dim(n_samples=939344, eps=0.1)

np.int64(11788)

In [4]:
import torch
import os

In [5]:
import torch

print(torch.version.cuda)

12.4


In [6]:
a = torch.load("gradients/OLMo-2-1124-7B-SFT/tulu_3_formatting_errors/train[0%:100%]/main/mean",map_location="cuda").half().flatten().unsqueeze(0)
b = torch.load("gradients/OLMo-2-1124-7B-SFT/tulu_3_no_errors/train[0%:100%]/main/mean",map_location="cuda").half().flatten().unsqueeze(0)

a.shape

torch.Size([1, 16777216])

In [7]:

os.path.getsize("gradients/OLMo-2-1124-7B-SFT/tulu_3_formatting_errors/train[0%:100%]/main/mean") >> 20

32

In [8]:
projector = CudaProjector(grad_dim=a.shape[-1], proj_dim=2**18,seed=42, proj_type=ProjectionType.normal,device="cuda", max_batch_size=8)



In [9]:
projector

<trak.projectors.CudaProjector at 0x7f0dc77654f0>

In [10]:
projector.project(a,model_id=0)

tensor([[ 0.2388,  0.3356, -0.2365,  ...,  0.0204, -0.0943, -0.1738]],
       device='cuda:0')

In [None]:
torch.save( projector.project(a,model_id=0),"test.grad",)

In [None]:
projector.project(a,model_id=0).shape[-1] / a.shape[-1]

In [None]:
a.shape[-1]

In [None]:
os.path.getsize("test.grad") >> 20

In [None]:
[8192*k for k in range(1,5)]

In [None]:
2**20


In [None]:
r = []
import pandas as pd
for proj_dim in [2**k for k in range(9,18)]:
    print(proj_dim)
    projector = CudaProjector(grad_dim=a.shape[-1], proj_dim=proj_dim,seed=42, proj_type=ProjectionType.normal,device="cuda", max_batch_size=8)
    r.append((proj_dim, float(torch.abs(torch.cosine_similarity(projector.project(a,model_id=0), projector.project(b,model_id=0)) - torch.cosine_similarity(a,b))[0])))

df = pd.DataFrame(r)


In [None]:
import seaborn as sns
sns.lineplot(df, x=0, y=1)

In [None]:
df

In [None]:
df

In [None]:
projector.project(a,model_id=0).shape

In [None]:


# torch.allclose(torch.cosine_similarity(a,b), )

In [None]:
a.shape

In [None]:
b.shape

In [None]:
torch.allclose(((a * b).sum(dim=1) / (torch.norm(a, p=2, dim=1) * torch.norm(b, p=2, dim=1))),(torch.nn.functional.normalize(a) *torch.nn.functional.normalize(b)).sum(dim=1))

In [None]:

assert torch.allclose(torch.cosine_similarity(a,b),(torch.nn.functional.normalize(a) *torch.nn.functional.normalize(b)).sum(dim=1))

In [None]:
b.mean(axis=0).shape

In [None]:
b.mean(axis=0).shape

In [None]:
torch.cosine_similarity(a,b)

In [None]:
torch.cosine_similarity(torch.nn.functional.normalize(a, p=2, dim=1),torch.nn.functional.normalize(b, p=2, dim=1))

In [None]:
(torch.nn.functional.normalize(a, p=2, dim=1) * torch.nn.functional.normalize(b, p=2, dim=1)).sum(axis=1)

In [None]:
((torch.nn.functional.normalize(a, p=2, dim=1) * torch.nn.functional.normalize(b, p=2, dim=1)).sum(axis=1))

In [None]:
torch.nn.functional.normalize(b, p=2, dim=1)

In [None]:
(torch.nn.functional.normalize(a, p=2, dim=1) * torch.nn.functional.normalize(b, p=2, dim=1)).sum(axis=1)

In [None]:
torch.allclose(torch.nn.functional.normalize(b, p=2, dim=1).mean() * torch.norm(b,dim=1).unsqueeze(1), b.mean())

In [None]:
b.mean(axis=1)

In [None]:
(torch.nn.functional.normalize(b, p=2, dim=1).mean() * torch.norm(b,dim=1))

In [None]:
torch.norm(b,dim=1).shape

In [None]:
torch.nn.functional.normalize(b, p=2, dim=1) * torch.norm(b,dim=1) 

In [None]:
import torch

def batched(a, b):
    # Normalize tensor b once
    mean_b = torch.nn.functional.normalize(b, p=2, dim=1).mean(axis=0)
    
    results = []
    
    # Split tensor a into batches
    gradients_a = torch.split(a, 2)
    
    for aa in gradients_a:
        # Normalize each batch of tensor a
        normalized_aa = torch.nn.functional.normalize(aa, p=2, dim=1)
       # print(normalized_aa.shape, mean_b.shape)
        # Compute the cosine similarity: dot product between each normalized row of a and b
        similarity = (normalized_aa*mean_b).sum(dim=1)  # Use matrix multiplication to calculate dot product
        
        # Append the results
        results.append(similarity)
    
    # Concatenate all batch results
    result = torch.cat(results)
    
    return result

# Example usage

similarity = batched(a, b)
similarity.shape

In [None]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim

In [27]:
import torch


def batched(a,precomputed_b):
    
    results = []
    for aa in torch.split(a, a.shape[0]):
        results.append((torch.nn.functional.normalize(aa) * precomputed_b).sum(dim=1))
    result = torch.cat(results)
    return result

torch.manual_seed(0)
a = torch.rand((113,1111))
b = torch.rand((1113,1111))
precomputed_b = torch.nn.functional.normalize(b)

assert torch.allclose(slow_res,batched(train_gradients,precomputed_b))

NameError: name 'train_gradients' is not defined

In [None]:
torch.cosine_similarity(a,b).shape

In [None]:
import torch


def batched(a,precomputed_b):
    
    results = []
    for aa in torch.split(a, a.shape[0]):
        results.append((torch.nn.functional.normalize(aa) * precomputed_b).sum(dim=1))
    result = torch.cat(results)
    return result

torch.manual_seed(0)
a = torch.rand((11,1111))
b = torch.rand((11,1111))
precomputed_b = torch.nn.functional.normalize(b).mean(axis=0)
print(batched(a,precomputed_b))
print(torch.cosine_similarity(a,b))
assert torch.allclose(torch.cosine_similarity(a,b).mean(axis = 1), batched(a,precomputed_b))


assert torch.allclose(torch.cosine_similarity(a,b),batched(a,precomputed_b))

In [None]:
torch.cosine_similarity(a,b).shape

In [None]:
torch.cosine_similarity(a,b)

In [None]:
batched(a,b)

In [None]:
gradients_a

In [None]:
batched(a,b)

In [None]:
torch.cosine_similarity(a,b).mean()

In [None]:
batched(a,b).mean()

In [None]:
torch.cosine_similarity(a,b)

In [None]:
sum(results)/len(results)

In [None]:
(torch.nn.functional.normalize(a) *(torch.nn.functional.normalize(b).mean())).sum(dim=1).mean()############

In [None]:
import numpy as np
from scipy.spatial.distance import cdist


Y = bb.unsqueeze(dim=0)
x = a
similarities = 1 - cdist(x, Y, metric='cosine')
print(similarities)

In [None]:
den

In [None]:
(torch.nn.functional.normalize(a) * torch.nn.functional.normalize(b).mean(axis=0)).sum(dim=1)

In [None]:
((a * b).sum(dim=1) / (torch.norm(a, p=2, dim=1) * torch.norm(b, p=2, dim=1)))

In [None]:
(torch.nn.functional.normalize(a) *torch.nn.functional.normalize(b)).sum(dim=1)

In [None]:
torch.norm(a, p=2, dim=1).shape

In [None]:
((a * b.mean(axis=0)).sum(dim=1) / (torch.norm(a, p=2, dim=1) * torch.norm(b, p=2, dim=1)))

In [None]:
gradients_a = torch.split(a, a.shape[0] // 2)
gradients_b = torch.split(b, b.shape[0] // 2)


results = []
for a in gradients_a
((a * b).sum(dim=1) / (torch.norm(a, p=2, dim=1) * torch.norm(b, p=2, dim=1)))

In [None]:
gradients_a[0].shape

In [None]:
model_name = "loris3/stratified_equitoken_10m_curriculum_random"


dataset_train_name ="loris3/stratified_equitoken_10m_curriculum"
dataset_train_split_name = "validation"

dataset_test_name = "loris3/stratified_equitoken_10m_curriculum"
dataset_test_split_name = "validation"

In [None]:
import torch
import os


import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset

from dotenv import load_dotenv
load_dotenv()

dataset_test = load_dataset(dataset_test_name)[dataset_test_split_name] 



len_ds = len(dataset_test)


Tests on wether implementaiton of

$
        \phi(f;z,z') = \frac{\sum_{\forall z' \in D_{test}} \nabla \ell(z) \cdot \nabla \ell(z')}{ |D_{test}|}
$
works out with sufficent accuracy via taking the mean of the test gradients first
$
      = \frac{1}{ |D_{test}|}\cdot(\nabla \ell(z) \cdot\sum_{\forall z' \in D_{test}} \nabla \ell(z'))
$


In [None]:
%run extract_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_split=validation[0%:10%] --paradigm=mlm --gradients_per_file=1000 --mode=store
%run extract_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_split=validation[0%:10%] --paradigm=mlm --gradients_per_file=1000 --mode=store_mean

In [None]:
def validate_mean_extraction():
    cp = "/data/loriss21dm/babylm/gradients/stratified_equitoken_10m_curriculum_random/stratified_equitoken_10m_curriculum/validation[0%:10%]/checkpoint-6174/"
    paths = os.listdir(cp)
    mean_pw = torch.cat([torch.load(os.path.join(cp,p),weights_only=True) for p in paths if p != "mean"],axis=0).mean(axis=0, dtype=torch.float64)
    mean_script = torch.load("/data/loriss21dm/babylm/gradients/stratified_equitoken_10m_curriculum_random/stratified_equitoken_10m_curriculum/validation[0%:10%]/checkpoint-6174/mean", weights_only=True)
    assert torch.cosine_similarity(mean_pw, mean_script).mean().float() == torch.tensor(1.0).float()
    assert torch.allclose(mean_pw, mean_script, atol=0.000001)
validate_mean_extraction()

In [None]:
%run extract_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_split=validation[10%:15%] --paradigm=mlm --gradients_per_file=1000 --mode=store
%run extract_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_split=validation[10%:15%] --paradigm=mlm --gradients_per_file=1000 --mode=store_mean

In [None]:
%run process_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_train_split=validation[0%:10%] --dataset_test="loris3/stratified_equitoken_10m_curriculum" --dataset_test_split=validation[10%:15%] --mode=single --gradients_per_file=1000 --batch_size=20

In [None]:
influence_single = torch.load("/data/loriss21dm/babylm/influence/stratified_equitoken_10m_curriculum_random/stratified_equitoken_10m_curriculum_validation[0%:10%]_stratified_equitoken_10m_curriculum_validation[10%:15%]/checkpoint-6174", weights_only=True)


In [None]:
%run process_gradients.py loris3/stratified_equitoken_10m_curriculum_random loris3/stratified_equitoken_10m_curriculum 0 --dataset_train_split=validation[0%:10%] --dataset_test="loris3/stratified_equitoken_10m_curriculum" --dataset_test_split=validation[10%:15%] --mode=mean --gradients_per_file=1000 --batch_size=20

In [None]:
influence_mean = torch.load("/data/loriss21dm/babylm/mean_influence/stratified_equitoken_10m_curriculum_random/stratified_equitoken_10m_curriculum_validation[0%:10%]_stratified_equitoken_10m_curriculum_validation[10%:15%]/checkpoint-6174", weights_only=True)


In [None]:
influence_mean.shape

In [None]:
torch.allclose(influence_single.mean(-1),influence_mean.squeeze())

In [None]:
tensor = (torch.cosine_similarity(influence_single.mean(-1), influence_mean.squeeze())).unsqueeze(1)


array = tensor.numpy()

plt.figure(figsize=(15, 5))
plt.imshow(array, cmap='viridis', aspect='auto', interpolation=None)
plt.colorbar()

plt.show()


Tests of the two modes "single" and "mean" against eachother for getting the mean influence

In [None]:


def load_debug(start, stop):
    return torch.arange(start*393216, stop*393216, 1, dtype=torch.float64).reshape(-1,393216) /  (len_ds*393216)

gradient_dir = "./gradients/test/test/test/test"
if not os.path.exists(gradient_dir):
    os.makedirs(gradient_dir)
    chunks_test = [ (i, min(i+10000, len_ds), os.path.join(gradient_dir, str(i) + "_" + str(i + 10000))) for i in range(0, len(dataset_test),10000)]
    for start, stop, chunk in chunks_test:
        torch.save(load_debug(start, stop), chunk)



In [None]:

def validate_train_train():
    s = None
    if not os.path.exists("test"):
        data = load_debug(0, len_ds).squeeze()
        
        with torch.no_grad():
            s = torch.matmul(data, data.T).sum(dim=1)
        s = s / len_ds
        s = s.unsqueeze(0)
        torch.save(s, "test")
    else:
        s = torch.load("test")

    slurm = torch.load("/data/loriss21dm/babylm/mean_influence/test/test_test_test_test/test")
    assert torch.allclose(slurm, s.float())
 



    tensor = (s / slurm).float()


    array = tensor.numpy()

    plt.figure(figsize=(15, 5))
    plt.imshow(array, cmap='viridis', aspect='auto', interpolation=None)
    plt.colorbar()

    plt.show()




In [None]:
%run process_gradients.py test test 0 --mode=mean --dataset_test_split=test --dataset_train_split=test --test=True --test_dataset_size=53457 --gradients_per_file=10000 --batch_size=2

In [None]:
validate_train_train()

In [None]:
%run process_gradients.py test test 0 --mode=mean --dataset_test_split=test --dataset_train_split=test --test=True --test_dataset_size=53457 --gradients_per_file=10000 --batch_size=1

In [None]:
validate_train_train()

In [None]:
%run process_gradients.py test test 0 --mode=mean --dataset_test_split=test --dataset_train_split=test --test=True --test_dataset_size=53457 --gradients_per_file=10000 --batch_size=20

In [None]:
validate_train_train()