In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader, TensorDataset
import torch

In [3]:
def costest(l1,l2):
    cos=nn.CosineSimilarity(dim=0, eps=1e-6)
    c=0.0
    a=0.0
    for j in range(len(l1[:,0])):
        a+=cos(l1[j,:],l2[j,:])
        c+=1.0
    return a/c

In [4]:
class ImprovedTwoLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ImprovedTwoLayerNN, self).__init__()
        # Increase depth and capacity
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.LeakyReLU()
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(0.5)  # Adjust dropout rate as needed
        
        # Additional layer
        self.layer2 = nn.Linear(hidden_size, hidden_size * 2)
        self.relu2 = nn.LeakyReLU()
        self.batch_norm2 = nn.BatchNorm1d(hidden_size * 2)
        self.dropout2 = nn.Dropout(0.5)  # Adjust dropout rate as needed
        
        # Output layer
        self.layer3 = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        
        x = self.layer3(x)
        return x
    
class TwoLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size,  output_size):
        super(TwoLayerNN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x



In [5]:
def train(data_in, data_out, model, criterion, num_epochs, save=False, learning_rate=0.001):
    
    
    # Create a complete dataset
    full_dataset = TensorDataset(data_in.to("cuda"), data_out.to("cuda"))

    # Define the sizes for your training and validation sets
    total_size = len(full_dataset)
    train_size = int(0.8 * total_size)
    val_size = total_size - train_size

    # Split the dataset
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # Create DataLoaders for both training and validation sets
    train_dataloader = DataLoader(train_dataset, batch_size=400, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=500)  # No need to shuffle the validation data


    model.to("cuda")
     
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)



    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_dataloader:
            inputs, targets = inputs.to("cuda").float(), targets.to("cuda").float()
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            #loss.backward()
            loss.backward(retain_graph=True)
            optimizer.step()
            
         
            running_loss += loss.item() * inputs.size(0)
    
        epoch_loss = running_loss / len(train_dataloader.dataset)
        # Validation phase
        model.eval()  # Set the model to evaluation mode
        running_loss = 0.0
        with torch.no_grad():  # No gradients need to be calculated
            for inputs, targets in val_dataloader:
                inputs, targets = inputs.to("cuda").float(), targets.to("cuda").float()
                outputs = model(inputs)
                
                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)
        val_loss = running_loss / len(val_dataloader.dataset)
    print(costest(outputs,targets))
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.8f}, Validation Loss: {val_loss:.8f}')
    
    if(save):
        # Ensure the model is in evaluation mode
        model.eval()

        predictions = []
        with torch.no_grad():  # No gradients needed for inference
            for inputs, _ in full_dataset:  # Assuming your dataset returns inputs and targets
                inputs = inputs.to('cuda').float().unsqueeze(0)
                
                # Get the model output
                outputs = model(inputs)
                

                predictions.append(outputs.cpu())

        # Concatenate all batches of predictions
        all_predictions = torch.stack(predictions, dim=0)

        # Save the tensor to a file
        torch.save(all_predictions, 'model_predictions.pth')



In [6]:
id = "1"

loaded_activations  = torch.load(f'activations-{id}.pth')
loaded_embeddings = torch.load(f"embeddings-{id}.pth")
loaded_residual_stream = torch.load(f"residual_stream-{id}.pth")
#loaded_rebased_embeddings = torch.load(f"rebased_embeddings-{id}.pth")
loaded_one_hot = torch.load(f"one-hot-{id}.pth")

In [7]:
data_in.shape

NameError: name 'data_in' is not defined

In [28]:
loaded_embeddings=loaded_embeddings.to(dtype=torch.float32)

bn = torch.nn.BatchNorm1d(4096)
loaded_embeddings = bn(loaded_embeddings)
def cossimtest(l):
    cos=nn.CosineSimilarity(dim=0, eps=1e-6)
    c=0.0
    a=0.0
    for j in range(len(l[:,0])):
        for i in range(len(l[:,0])):
            if(i<j):
                a+=cos(l[i,:],l[j,:])
                c+=1.0
    print(a/c)

cossimtest(loaded_embeddings)
cossimtest(loaded_one_hot.T)

tensor(-0.0051, grad_fn=<DivBackward0>)
tensor(0.0831, device='cuda:0', dtype=torch.float16)


In [None]:
print(loaded_embeddings.size())
print(loaded_one_hot.size())

torch.Size([195, 4096])
torch.Size([21, 195])


In [30]:
print(len(loaded_residual_stream[0]))

# consts
attention = 2
ff = 3


for layer in range(30):


    residual_stream =  [i[layer, -1, :] for i in loaded_residual_stream]

 

    residual_stream = torch.stack(residual_stream)

    

    data_in = residual_stream
    #data_in=loaded_activations
    #data_out = loaded_one_hot.T
    data_out = loaded_embeddings

    input_size = data_in.shape[1]
    output_size = data_out.shape[1]
    hidden_size = 8000

    criterion = nn.MSELoss() 

    print(layer)
    model = TwoLayerNN(input_size, hidden_size, output_size)
    train(data_in, data_out, model, criterion, num_epochs=100)
    

25
0
tensor(-0.1431, device='cuda:0')
Epoch [100/100], Training Loss: 0.99456662, Validation Loss: 1.02705252
1
tensor(0.1003, device='cuda:0')
Epoch [100/100], Training Loss: 0.47711614, Validation Loss: 1.17776179
2
tensor(0.0796, device='cuda:0')
Epoch [100/100], Training Loss: 0.41598642, Validation Loss: 1.24047458
3
tensor(0.0969, device='cuda:0')
Epoch [100/100], Training Loss: 0.04650988, Validation Loss: 1.37019670
4
tensor(0.0755, device='cuda:0')
Epoch [100/100], Training Loss: 0.06309481, Validation Loss: 1.33092928
5
tensor(0.0742, device='cuda:0')
Epoch [100/100], Training Loss: 0.05800305, Validation Loss: 1.35214448
6
tensor(0.0909, device='cuda:0')
Epoch [100/100], Training Loss: 0.12691428, Validation Loss: 1.25419056
7
tensor(0.1261, device='cuda:0')
Epoch [100/100], Training Loss: 0.09117375, Validation Loss: 1.30883729
8
tensor(0.1251, device='cuda:0')
Epoch [100/100], Training Loss: 0.16916449, Validation Loss: 1.24366927
9
tensor(0.1650, device='cuda:0')
Epoch [1

IndexError: index 25 is out of bounds for dimension 0 with size 25