In [1]:
# The two environment variables below are assigned before torch and
# sentence_transformers are imported — presumably so those libraries pick the
# values up when they load. Keep the assignments above the imports.
import os   
# NOTE(review): machine-specific absolute cache path; will not exist on other hosts.
os.environ['HF_HOME'] = "/home/max4c/.cache/huggingface/"
# NOTE(review): host-specific GPU indices; adjust for the machine running this notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = "13,14,15"  
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import numpy as np
from sentence_transformers import SentenceTransformer

[1717720076.968500] [e89995c8a091:1425555:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device


  from tqdm.autonotebook import tqdm, trange


In [2]:
class AbsDataset(Dataset):
    """Flat dataset of sentences collected from the ``'good'`` field of a JSON file.

    The parsed JSON is iterated record by record, and each record's ``'good'``
    entry is appended (via ``list.extend``) to a single flat list,
    ``self.sentences``, preserving file order. The entries are presumably
    lists of sentence strings — confirm against the data file.

    Attributes:
        data: The parsed JSON content, kept as loaded.
        sentences: Concatenation of every record's ``'good'`` entries.
    """

    def __init__(self, train_fn="./train.json"):
        """Load ``train_fn`` and flatten every record's ``'good'`` entries.

        Args:
            train_fn: Path to the JSON training file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError: If a record has no ``'good'`` key.
        """
        # Context manager guarantees the handle is closed even if parsing
        # fails; the bare open() used previously left that to the GC.
        with open(train_fn, "r") as fh:
            self.data = json.load(fh)
        self.sentences = []
        for item in self.data:
            self.sentences.extend(item['good'])

    def __len__(self):
        """Return the total number of flattened sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the sentence(s) at ``idx``.

        The index is forwarded unchanged to the underlying list, so an int
        yields a single sentence and a slice yields a plain ``list`` of
        sentences (not an ``AbsDataset``).
        """
        return self.sentences[idx]

class SparseAutoencoder(nn.Module):
    """Autoencoder with one hidden layer.

    The encoder is ``Linear(input_size, hidden_size)`` followed by ReLU; the
    decoder is a single ``Linear(hidden_size, input_size)``. The module does
    not enforce sparsity itself; ``forward`` returns the hidden activations
    alongside the reconstruction so a caller can penalise them.
    """

    def __init__(self, input_size, hidden_size):
        """Build encoder and decoder, then initialise all linear layers.

        Args:
            input_size: Width of the input (and of the reconstruction).
            hidden_size: Width of the hidden representation.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
        )
        self.decoder = nn.Linear(hidden_size, input_size)

        # Visit every submodule; only nn.Linear layers are touched.
        self.apply(self.init_weights)

    def init_weights(self, module):
        """Xavier-uniform the weight and zero the bias of an ``nn.Linear``.

        Any other module type is left untouched.
        """
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return ``(reconstruction, hidden_activations)`` for input ``x``."""
        hidden = self.encoder(x)
        return self.decoder(hidden), hidden

In [3]:
# Setup cell: builds the embedding model, data loader, autoencoder, loss and
# optimizer that the training cell reads.
print("Loading embedding model...")
emb_model = SentenceTransformer("sentence-t5-xxl").to("cuda")

print("Loading data...")
# AbsDataset.__getitem__ forwards its index straight to the underlying list,
# so this slice is a plain Python list of the first 1000 sentences, not an
# AbsDataset instance.
training_data = AbsDataset()[:1000]
# Batches are groups of up to 64 raw sentences; they are embedded later,
# inside the training loop.
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True, num_workers=10)

# The autoencoder's input/output width is taken from the embedding model.
input_size = emb_model.get_sentence_embedding_dimension()
hidden_size = 10000  # Adjust this value as needed
autoencoder = SparseAutoencoder(input_size, hidden_size).to("cuda")

# Reconstruction objective.
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.0001)  # Reduced learning rate

# Multiplier on the mean-absolute-activation penalty added to the loss in the
# training loop.
sparsity_weight = 0.5

Loading embedding model...




Loading data...


In [4]:
# Training cell: embeds each batch of sentences, fits the autoencoder to
# reconstruct the embeddings with an L1 penalty on the hidden activations,
# then saves the trained weights.
print("Training...")
num_epochs = 1  # number of passes over train_dataloader
log_every = 1   # print the loss every `log_every` batches
for epoch in range(num_epochs):
    for batch, sentences in enumerate(train_dataloader):
        # Embed the raw sentences on the GPU; the embedding is both the
        # autoencoder's input and its reconstruction target.
        X = emb_model.encode(sentences, convert_to_numpy=False, convert_to_tensor=True, device="cuda")

        optimizer.zero_grad()
        reconstructed, encoded = autoencoder(X)
        reconstruction_loss = criterion(reconstructed, X)

        # Sparsity penalty: mean absolute value of the hidden activations
        sparsity_loss = torch.mean(torch.abs(encoded))

        # Combine reconstruction loss and sparsity loss
        loss = reconstruction_loss + sparsity_weight * sparsity_loss
        loss.backward()

        # Gradient clipping must come after backward(): before it, the
        # gradients have just been cleared by zero_grad() and there is
        # nothing to clip.
        torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)

        optimizer.step()

        if batch % log_every == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")

# Save the trained model's state dictionary
torch.save(autoencoder.state_dict(), "trained_autoencoder.pth")

Training...
Epoch [1/1], Batch [1/16], Loss: 0.0042
Epoch [1/1], Batch [2/16], Loss: 0.0036
Epoch [1/1], Batch [3/16], Loss: 0.0030
Epoch [1/1], Batch [4/16], Loss: 0.0026
Epoch [1/1], Batch [5/16], Loss: 0.0023
Epoch [1/1], Batch [6/16], Loss: 0.0020
Epoch [1/1], Batch [7/16], Loss: 0.0018
Epoch [1/1], Batch [8/16], Loss: 0.0016
Epoch [1/1], Batch [9/16], Loss: 0.0015
Epoch [1/1], Batch [10/16], Loss: 0.0014
Epoch [1/1], Batch [11/16], Loss: 0.0013
Epoch [1/1], Batch [12/16], Loss: 0.0012
Epoch [1/1], Batch [13/16], Loss: 0.0012
Epoch [1/1], Batch [14/16], Loss: 0.0012
Epoch [1/1], Batch [15/16], Loss: 0.0011
Epoch [1/1], Batch [16/16], Loss: 0.0011
