In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


# Define the dataset
class SortingDataset(Dataset):
    def __init__(self, sequence_length, dataset_size, number_range):
        self.sequence_length = sequence_length
        self.dataset_size = dataset_size
        self.number_range = number_range
        self.data = self._generate_data()

    def _generate_data(self):
        data = np.random.randint(self.number_range[0], self.number_range[1], (self.dataset_size, self.sequence_length))
        return data

    def __len__(self):
        return self.dataset_size

    def __getitem__(self, idx):
        unsorted_sequence = self.data[idx]
        sorted_sequence = np.sort(unsorted_sequence)
        return torch.tensor(unsorted_sequence, dtype=torch.long), torch.tensor(sorted_sequence, dtype=torch.long)


In [2]:
sequence_length = 10
dataset_size = 30000
number_range = (1, 99)
batch_size = 32

dataset = SortingDataset(sequence_length, dataset_size, number_range)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [3]:
for inputs, targets in data_loader:
    print(inputs.shape, targets.shape)

torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([32, 10])
torch.Size([32, 10]) torch.Size([3

In [4]:
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [13]:


# Define the simplified GPT model
class SimpleGPT(nn.Module):
    def __init__(self, input_dim, model_dim, output_dim, num_heads, num_layers):
        super(SimpleGPT, self).__init__()
        self.embedding = nn.Embedding(input_dim, model_dim)
        transformer_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(model_dim, sequence_length)


    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = self.output_layer(x)
        return x.mean(axis=1)


# Define the custom loss function
def custom_sorting_loss_function(outputs, targets):
    #print(outputs.shape, targets.shape)
    loss = torch.mean((outputs - targets) ** 2)
    return loss


In [17]:
def train(model, data_loader, loss_fn, optimizer, epochs=1, device=None, threshold=1.1):
    model.train()
    exp_mean = None
    for epoch in range(epochs):
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to the device
            optimizer.zero_grad()
            outputs = model(inputs)
            # Directly calculate the loss without reshaping
            loss = loss_fn(outputs, targets.float())
            loss.backward()
            optimizer.step()
        if exp_mean is None:
            exp_mean = loss.item()
        else:
            exp_mean = (exp_mean + loss.item()) / 2

        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Exponential mean: {exp_mean}')
        if exp_mean < threshold:
            break


In [18]:

# Model and DataLoader setup
input_dim = 101  # Plus one to accommodate the range
model_dim = 128
output_dim = 10  # This matches the sequence length
num_heads = 4
num_layers = 4

model = SimpleGPT(input_dim=input_dim, model_dim=model_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers)
model = model.to(device)  # Move model to the chosen device
optimizer = optim.Adam(model.parameters())
loss_fn = custom_sorting_loss_function




In [19]:


# Train the model
train(model, data_loader, loss_fn, optimizer, epochs=50, device=device, threshold=2.1)


Epoch 1, Loss: 25.31977081298828, Exponential mean: 25.31977081298828
Epoch 2, Loss: 9.948779106140137, Exponential mean: 17.63427495956421
Epoch 3, Loss: 5.530032634735107, Exponential mean: 11.582153797149658
Epoch 4, Loss: 3.3702354431152344, Exponential mean: 7.476194620132446
Epoch 5, Loss: 5.603341102600098, Exponential mean: 6.539767861366272
Epoch 6, Loss: 3.158351421356201, Exponential mean: 4.849059641361237
Epoch 7, Loss: 1.5010721683502197, Exponential mean: 3.175065904855728
Epoch 8, Loss: 1.5189234018325806, Exponential mean: 2.3469946533441544
Epoch 9, Loss: 5.816882133483887, Exponential mean: 4.0819383934140205
Epoch 10, Loss: 1.5208271741867065, Exponential mean: 2.8013827838003635
Epoch 11, Loss: 1.1238247156143188, Exponential mean: 1.9626037497073412
Epoch 12, Loss: 1.6817134618759155, Exponential mean: 1.8221586057916284
Epoch 13, Loss: 2.340230941772461, Exponential mean: 2.0811947737820446
Epoch 14, Loss: 1.4309026002883911, Exponential mean: 1.7560486870352179


In [23]:
# Demonstrate sorting with the trained model
import pandas as pd


def demonstrate_sorting(model, number_range, sequence_length):
    model.eval()
    with torch.no_grad():
        input_sequence = torch.randint(number_range[0], number_range[1], (sequence_length,))
        input_sequence = input_sequence.to(device)
        sorted_sequence = torch.sort(input_sequence).values
        output = model(input_sequence.unsqueeze(0))
        predicted_sort = output.squeeze(0).cpu()  # Move data back to CPU for printing

        demo_df = pd.DataFrame({
            'Input Sequence': input_sequence.cpu().numpy(),
            'True Sorted Sequence': sorted_sequence.cpu().numpy(),
            'Predicted Sort': predicted_sort.cpu().numpy()
        })
        print(demo_df)



demonstrate_sorting(model, number_range, sequence_length)

   Input Sequence  True Sorted Sequence  Predicted Sort
0              61                     5        4.982429
1              74                    41       42.578876
2              53                    53       53.088623
3               5                    53       53.528297
4              41                    60       60.225136
5              81                    61       60.768818
6              53                    70       71.054192
7              70                    74       74.110855
8              60                    81       81.078911
9              92                    92       92.710854


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class SimpleGPT(nn.Module):
    def __init__(self, input_dim, model_dim, sequence_length, num_heads, num_layers):
        super(SimpleGPT, self).__init__()
        self.embedding = nn.Embedding(input_dim, model_dim)
        transformer_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(model_dim, sequence_length)  # Output dim matches sequence length

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = self.output_layer(x)
        return x


# Dataset class remains unchanged from your current implementation
# Assume SortingDataset is defined as before

# Custom loss function - Adjust as necessary based on your task requirements
def custom_sorting_loss_function(outputs, targets):
    # Mean squared error for direct comparison if outputs and targets are numerical values
    print(outputs.shape, targets.shape)
    loss = torch.mean((outputs - targets) ** 2)
    return loss

# Training function
def train(model, data_loader, loss_fn, optimizer, epochs=1, device=None):
    model.train()
    for epoch in range(epochs):
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets.float())
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Adjustments for device compatibility (CPU or CUDA)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
sequence_length = 10
dataset_size = 30000
number_range = (1, 99)
batch_size = 32

dataset = SortingDataset(sequence_length, dataset_size, number_range)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:

# Model parameters and instantiation
input_dim = 101  # Adjust based on your dataset
model_dim = 64
#output_dim = 1  # Assuming this is your sequence length
num_heads = 2
num_layers = 1

model = SimpleGPT(input_dim=input_dim, model_dim=model_dim, sequence_length=sequence_length, num_heads=num_heads, num_layers=num_layers).to(device)
optimizer = optim.Adam(model.parameters())


In [None]:

# DataLoader setup - Assuming SortingDataset is defined with the appropriate parameters
batch_size = 1
dataset = SortingDataset(sequence_length=10, dataset_size=1000, number_range=(1, 100))
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train the model
train(model, data_loader, custom_sorting_loss_function, optimizer, epochs=5, device=device)

# Ensure you adjust the demonstration function similarly to match the model and task setup


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (10) must match the size of tensor b (32) at non-singleton dimension 1

In [None]:
# Demonstrate sorting with the trained model
def demonstrate_sorting(model, number_range, sequence_length):
    model.eval()
    with torch.no_grad():
        input_sequence = torch.randint(number_range[0], number_range[1], (sequence_length,))
        input_sequence = input_sequence.to(device)
        sorted_sequence = torch.sort(input_sequence).values
        output = model(input_sequence.unsqueeze(0))
        predicted_sort = output.squeeze(0).cpu()  # Move data back to CPU for printing
        print(f"Input sequence: {input_sequence.cpu()}")
        print(f"True sorted sequence: {sorted_sequence}")
        print(f"Model's predicted sort (raw output): {predicted_sort}")

demonstrate_sorting(model, number_range, sequence_length)

Input sequence: tensor([57, 87, 98, 33, 49, 21, 84, 98,  5, 94])
True sorted sequence: tensor([ 5, 21, 33, 49, 57, 84, 87, 94, 98, 98], device='cuda:0')
Model's predicted sort (raw output): tensor([[13.5009, 24.9526, 35.2856, 46.9555, 58.3324, 68.3047, 77.6713, 84.7283,
         92.6870, 97.7248],
        [12.1971, 22.6793, 32.5958, 43.8530, 55.2479, 65.2111, 74.8128, 82.4505,
         91.4200, 97.5098],
        [12.5755, 23.4548, 33.5359, 44.9784, 56.3409, 66.5834, 75.9328, 83.1016,
         91.7038, 97.0161],
        [13.3223, 24.8177, 35.4965, 46.9335, 58.2768, 68.5323, 77.7914, 84.5128,
         92.6191, 97.3526],
        [11.2647, 21.3196, 30.7332, 41.7813, 52.9393, 62.8873, 72.5314, 80.4051,
         89.8875, 96.5314],
        [10.5452, 19.9976, 28.9157, 39.7282, 50.7149, 60.4835, 70.5333, 78.5684,
         88.2440, 95.9225],
        [11.8404, 22.1951, 31.9494, 43.1241, 54.3060, 64.2230, 74.0281, 81.6993,
         90.7938, 97.2541],
        [12.5755, 23.4548, 33.5359, 44.9784, 56