In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


# Define the dataset
class SortingDataset(Dataset):
    def __init__(self, sequence_length, dataset_size, number_range):
        self.sequence_length = sequence_length
        self.dataset_size = dataset_size
        self.number_range = number_range
        self.data = self._generate_data()

    def _generate_data(self):
        data = np.random.randint(self.number_range[0], self.number_range[1], (self.dataset_size, self.sequence_length))
        return data

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        unsorted_sequence = self.data[idx]
        sorted_sequence = np.sort(unsorted_sequence)
        return torch.tensor(unsorted_sequence, dtype=torch.long), torch.tensor(sorted_sequence, dtype=torch.long)


In [61]:
sequence_length = 10
dataset_size = 30000
number_range = (1, 99)
batch_size = 32

dataset = SortingDataset(sequence_length, dataset_size, number_range)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [63]:
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy


class TransformerEncoderLayer(nn.Module):
    def __init__(self, model_dim, num_heads, feedforward_dim, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(model_dim, num_heads, dropout=dropout)
        self.linear1 = nn.Linear(model_dim, feedforward_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(feedforward_dim, model_dim)

        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.norm1(src)
        src = src + self.dropout1(self.self_attn(src2, src2, src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0])
        src2 = self.norm2(src)
        src = src + self.dropout2(self.linear2(self.dropout(self.activation(self.linear1(src2)))))
        return src

class TransformerEncoder(nn.Module):
    def __init__(self, layer, num_layers):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, src, mask=None, src_key_padding_mask=None):
        output = src

        for layer in self.layers:
            output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)

        return output


In [65]:



# Define the simplified GPT model
class SimpleGPT(nn.Module):
    def __init__(self, input_dim, model_dim, output_dim, num_heads, num_layers):
        super(SimpleGPT, self).__init__()
        self.embedding = nn.Embedding(input_dim, model_dim)
        transformer_layer = TransformerEncoderLayer(model_dim=model_dim, num_heads=num_heads, feedforward_dim=model_dim*4)
        self.transformer = TransformerEncoder(layer=transformer_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(model_dim, output_dim)


    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = self.output_layer(x)
        return x.mean(axis=1)


# Define the custom loss function
def custom_sorting_loss_function(outputs, targets):
    #print(outputs.shape, targets.shape)
    loss = torch.mean((outputs - targets) ** 2)
    return loss


In [66]:
def train(model, data_loader, loss_fn, optimizer, epochs=1, device=None, threshold=1.1):
    model.train()
    exp_mean = None
    exp_var = None
    exp_std = None
    for epoch in range(epochs):
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to the device
            optimizer.zero_grad()
            outputs = model(inputs)
            # Directly calculate the loss without reshaping
            loss = loss_fn(outputs, targets.float())
            loss.backward()
            optimizer.step()
        if exp_mean is None:
            exp_mean = loss.item()
        else:
            exp_mean = (exp_mean + loss.item()) / 2
            if exp_var is None:
                exp_var = (loss.item() - exp_mean) ** 2
            else:
                exp_var = (exp_var + (loss.item() - exp_mean) ** 2) / 2
            exp_std = exp_var ** 0.5

        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Exponential mean: {exp_mean}, Expoential std: {exp_std}')
        if exp_mean < threshold:
            break


In [67]:

# Model and DataLoader setup
input_dim = 101  # Plus one to accommodate the range
model_dim = 128
output_dim = 10  # This matches the sequence length
num_heads = 4
num_layers = 4

model = SimpleGPT(input_dim=input_dim, model_dim=model_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers)
model = model.to(device)  # Move model to the chosen device
optimizer = optim.Adam(model.parameters())
loss_fn = custom_sorting_loss_function




In [68]:


# Train the model
train(model, data_loader, loss_fn, optimizer, epochs=50, device=device, threshold=0.1)


Epoch 1, Loss: 34.17226791381836, Exponential mean: 34.17226791381836, Expoential std: None
Epoch 2, Loss: 44.27728271484375, Exponential mean: 39.224775314331055, Expoential std: 5.052507400512695
Epoch 3, Loss: 37.39603042602539, Exponential mean: 38.31040287017822, Expoential std: 3.630696076433662
Epoch 4, Loss: 41.59498977661133, Exponential mean: 39.952696323394775, Expoential std: 2.817719094046525
Epoch 5, Loss: 46.94095993041992, Exponential mean: 43.44682812690735, Expoential std: 3.173995743008519
Epoch 6, Loss: 34.64754867553711, Exponential mean: 39.04718840122223, Expoential std: 3.8360838554744103
Epoch 7, Loss: 32.84743118286133, Exponential mean: 35.94730979204178, Expoential std: 3.487462310755913
Epoch 8, Loss: 22.764333724975586, Exponential mean: 29.355821758508682, Expoential std: 5.273049775279624
Epoch 9, Loss: 26.11663246154785, Exponential mean: 27.736227110028267, Expoential std: 3.9005218085509332
Epoch 10, Loss: 46.28580856323242, Exponential mean: 37.01101

In [23]:
# Demonstrate sorting with the trained model
import pandas as pd


def demonstrate_sorting(model, number_range, sequence_length):
    model.eval()
    with torch.no_grad():
        input_sequence = torch.randint(number_range[0], number_range[1], (sequence_length,))
        input_sequence = input_sequence.to(device)
        sorted_sequence = torch.sort(input_sequence).values
        output = model(input_sequence.unsqueeze(0))
        predicted_sort = output.squeeze(0).cpu()  # Move data back to CPU for printing

        demo_df = pd.DataFrame({
            'Input Sequence': input_sequence.cpu().numpy(),
            'True Sorted Sequence': sorted_sequence.cpu().numpy(),
            'Predicted Sort': predicted_sort.cpu().numpy()
        })
        print(demo_df)



demonstrate_sorting(model, number_range, sequence_length)

   Input Sequence  True Sorted Sequence  Predicted Sort
0              61                     5        4.982429
1              74                    41       42.578876
2              53                    53       53.088623
3               5                    53       53.528297
4              41                    60       60.225136
5              81                    61       60.768818
6              53                    70       71.054192
7              70                    74       74.110855
8              60                    81       81.078911
9              92                    92       92.710854
