In [97]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

In [98]:
class MysteryDataset(Dataset):
    def __init__(self, csv_file, is_test=False):
        self.is_test = is_test
        df = pd.read_csv(csv_file)
        
        self.ids = df.iloc[:, 0].values
        if self.is_test:
            self.X = df.iloc[:, 1:].values.astype(np.float32)
            self.y = None
        else:
            self.X = df.iloc[:, 1:-1].values.astype(np.float32)
            self.y = df.iloc[:, -1].values.astype(np.int64) 

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features = torch.tensor(self.X[idx])
        
        if self.is_test:
            return features
        else:
            label = torch.tensor(self.y[idx])
            return features, label

In [99]:
# import data from files
full_tensor = MysteryDataset("data/train.csv", 0)
test_data_tensor = MysteryDataset("data/test.csv", 1)

train_size = int(0.8 * len(full_tensor))
val_size = len(full_tensor) - train_size
train_data_tensor, val_data_tensor = random_split(full_tensor, [train_size, val_size])

In [100]:
# create data loaders
train_loader = DataLoader(dataset=train_data_tensor, batch_size=64)
val_loader = DataLoader(dataset=val_data_tensor, batch_size=64)
test_loader = DataLoader(dataset=test_data_tensor, batch_size=64)

In [101]:
for X, y in train_loader:
    print(max(y), end=' ')

tensor(4) tensor(4) tensor(4) tensor(4) tensor(3) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(3) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(3) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) tensor(4) 

In [102]:
class MysteryTransformer(nn.Module):
    def __init__(self, num_features=205, num_classes=5, d_model=64, nhead=4, num_layers=2, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        
        
        
        self.tok_emb = nn.Linear(1, d_model)  
    
        
        self.pos_emb = nn.Parameter(torch.randn(1, num_features, d_model)) 

        
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=4*d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True
        )

        self.transformer = nn.TransformerEncoder(
            enc_layer,
            num_layers=num_layers
        )

        self.ln_f = nn.LayerNorm(d_model)

        self.head = nn.Linear(num_features * d_model, num_classes)

    def forward(self, x):
        # (Batch, Features) -> (32, 205)
        B, S = x.shape 

        # 1. EMBEDDING
        x = x.unsqueeze(-1)       
        tok = self.tok_emb(x)
        
        x = tok + self.pos_emb    

        # 2. TRANSFORMER (No Mask!)
        x = self.transformer(x)

        x = self.ln_f(x)
        
        # 3. CLASSIFICATION HEAD
        x = x.flatten(start_dim=1) 
        
        logits = self.head(x)   
        return logits

In [103]:
input_dim = 205
device = torch.cuda.current_device().type if torch.cuda.is_available() else "cpu"
model = MysteryTransformer().to(device)                        
learning_rate = 1e-3
batch_size = 128
epochs = 5



In [104]:
optimizer = torch.optim.AdamW(                        # AdamW optimizer (common for Transformers)
    model.parameters(),                         # optimize all model parameters
    lr=3e-4,                                    # learning rate
    weight_decay=1e-2                           # weight decay regularization
)

def lm_loss(logits, targets):
    # logits: (B, S, V) and targets: (B, S)
    B, S, V = logits.shape                      # unpack dimensions
    logits_2d = logits.reshape(B*S, V)          # flatten batch+time into one dimension
    targets_1d = targets.reshape(B*S)           # flatten targets similarly
    return F.cross_entropy(logits_2d, targets_1d)  # standard next-token cross entropy

loss_fn = nn.CrossEntropyLoss()

In [105]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train() # Enable Dropout/BatchNorm
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 20 == 0: # Print more often
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def validation_loop(dataloader, model, loss_fn):
    model.eval() # Disable Dropout/BatchNorm
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            # FIX: Move data to GPU
            X, y = X.to(device), y.to(device)
            
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            # FIX: Ensure comparison happens on same device
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Validation Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

def test_loop(dataloader, model, output_file):
    model.eval()
    all_preds = []

    print("Generating predictions...")
    with torch.no_grad():
        for X in dataloader:
            # FIX: Move data to GPU
            X = X.to(device)
            
            preds = model(X)
            
            # FIX: Move back to CPU before converting to Numpy
            # argmax(1) gets the class index (0-4)
            predicted_classes = preds.argmax(1).cpu().numpy()
            all_preds.extend(predicted_classes)

    # Save to CSV
    # We use range() for IDs assuming the loader preserved order (it does with shuffle=False)
    # Ideally, your dataset should return IDs, but this works for now.
    df = pd.DataFrame({
        "id": test_data_tensor.ids, # Use the IDs we saved in __init__!
        "label": all_preds
    })
    df.to_csv(output_file, index=False)
    print(f"Saved {len(all_preds)} predictions to {output_file}")

In [106]:
def get_batch(data, batch_size, block_size, device):
    # Choose random starting indices so each batch is different
    ix = torch.randint(0, len(data) - block_size - 1, (batch_size,))  # (B,)

    # Build input sequences x: tokens [i, ..., i+block_size-1]
    x = torch.stack([data[i:i+block_size] for i in ix])               # (B, S)

    # Build target sequences y: the "next token" for each position
    # y is the same as x but shifted by 1 position in the original text
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])           # (B, S)

    # Move tensors to the chosen device (CPU/GPU)
    return x.to(device), y.to(device)

In [None]:
epochs = 10
print(f"Starting Training on {device}...")

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    validation_loop(val_loader, model, loss_fn)

print("Training Done!")

Starting Training on cpu...
Epoch 1
-------------------------------
loss: 2.281155  [    0/ 6400]
loss: 1.305554  [ 1280/ 6400]
loss: 0.985393  [ 2560/ 6400]
loss: 0.810563  [ 3840/ 6400]
loss: 0.879799  [ 5120/ 6400]
Validation Error: 
 Accuracy: 65.8%, Avg loss: 0.933589 

Epoch 2
-------------------------------
loss: 0.746520  [    0/ 6400]
loss: 1.191886  [ 1280/ 6400]
loss: 0.871822  [ 2560/ 6400]


In [None]:
test_loop(test_loader, model, "submission3.csv")

Generating predictions...
Saved 2000 predictions to submission3.csv
