In [1]:
from joblib import load
import numpy as np
import pandas as pd

# fetch training data (thank you keegan)
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only point this at a trusted, locally produced file.
df = load('../DL_data/results_dataframe/results_dataframe.pkl')
# Quick sanity checks: object type, (rows, columns) shape, and column names.
print(type(df))
print(f'DF {df.shape}')
print(df.columns)

def normalize_P(df):
    """
    Rescale every entry of the 'P' column with the linear map that sends
    -100 to -1 and 100 to 1.

    Parameters:
        df (pd.DataFrame): frame with a 'P' column of numeric sequences

    Returns:
        pd.Series whose elements are plain Python lists of rescaled values
        (the arithmetic is carried out in float32)
    """
    lower, upper = -100, 100

    def norm(x):
        values = np.array(x, dtype=np.float32)
        unit = (values - lower) / (upper - lower)  # position inside [lower, upper]
        return unit * 2 - 1                        # stretch onto [-1, 1]

    return df['P'].apply(lambda entry: norm(entry).tolist())
# Disabled: re-enabling the next line would overwrite df['P'] in place with the
# rescaled values returned by normalize_P.
# df['P'] = normalize_P(df)
# Show the first rows of the (un-normalized) frame.
print(df.head())

<class 'pandas.core.frame.DataFrame'>
DF (10980480, 5)
Index(['n', 'k', 'm', 'result', 'P'], dtype='object')
    n  k  m       result                                                  P
0  10  5  2   266.264524  [-53.30165452517249, 46.36747377721454, 12.267...
1   9  4  2   140.163560  [6.5178894459618135, -6.281201889271188, 99.66...
2  10  4  3   167.692765  [-67.02930762978451, 82.95217006264068, -6.583...
3   9  4  3   565.010999  [91.14110355866848, -15.260304068831033, -47.3...
4   9  6  2  1170.076493  [92.83943954357875, -97.82331459868368, 5.3277...


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
import math

def pad_matrix(P, target_shape=(6, 10)):
    """
    Zero-pad a 2-D matrix P (shape [k, n-k]) into the top-left corner of a
    float32 array of shape `target_shape` (default [6, 10]).

    Parameters:
        P (array-like): 2-D matrix to pad
        target_shape (tuple[int, int]): (rows, cols) of the padded output

    Returns:
        np.ndarray of dtype float32 and shape `target_shape`; entries outside
        the copied block are 0

    Raises:
        ValueError: if P is not 2-D or is larger than `target_shape`
    """
    P = np.asarray(P, dtype=np.float32)
    if P.ndim != 2:
        raise ValueError(f"P must be a 2-D matrix, got shape {P.shape}")

    k, n_k = P.shape
    max_rows, max_cols = target_shape
    if k > max_rows or n_k > max_cols:
        # Fail with an explicit message instead of numpy's broadcast error.
        raise ValueError(
            f"P with shape {P.shape} does not fit into target shape {tuple(target_shape)}"
        )

    padded = np.zeros(target_shape, dtype=np.float32)
    padded[:k, :n_k] = P
    return padded

class CNNTransformerDataset(Dataset):
    """
    Map-style dataset that turns one DataFrame row into model inputs:
    a zero-padded P matrix, the scalar m, and (when available) the target.
    """

    def __init__(self, df, normalize=True, include_m=True):
        """
        Parameters:
            df (pd.DataFrame): must contain 'P', 'm', 'n', 'k', and optionally 'result'
            normalize (bool): scale P values from [-100, 100] to [-1, 1]
            include_m (bool): include scalar m as auxiliary input; when False a
                zero placeholder of the same shape is returned instead
        """
        self.df = df.reset_index(drop=True)
        self.normalize = normalize
        self.include_m = include_m
        self.has_targets = 'result' in df.columns

    def __len__(self):
        """Number of rows (samples) in the wrapped DataFrame."""
        return len(self.df)

    def normalize_P(self, P, k, n_k):
        """
        Reshape P to (k, n_k) as float32 and, if `self.normalize` is set,
        rescale it with the linear map sending -100 to -1 and 100 to 1.
        """
        P = np.array(P, dtype=np.float32).reshape(k, n_k)
        return ((P + 100) / 200) * 2 - 1 if self.normalize else P

    def __getitem__(self, idx):
        """
        Returns:
            (P_tensor, m_tensor, y) when the frame has a 'result' column,
            otherwise (P_tensor, m_tensor).
            P_tensor: float32 tensor with a leading channel axis, [1, 6, 10]
            m_tensor: float32 tensor of shape [1]
            y: float32 scalar tensor
        """
        row = self.df.iloc[idx]
        k, n = int(row['k']), int(row['n'])
        n_k = n - k

        P = self.normalize_P(row['P'], k, n_k)
        P = pad_matrix(P)  # pad to [6, 10]
        P_tensor = torch.tensor(P).unsqueeze(0)  # [1, 6, 10]

        if self.include_m:
            m_tensor = torch.tensor([row['m']], dtype=torch.float32)
        else:
            # DataLoader's default collate cannot batch None, so emit a
            # constant placeholder with the same shape/dtype as a real m.
            m_tensor = torch.zeros(1, dtype=torch.float32)

        if self.has_targets:
            y = torch.tensor(row['result'], dtype=torch.float32)
            return P_tensor, m_tensor, y
        return P_tensor, m_tensor

class PositionalEncoding(nn.Module):
    """
    Adds a fixed sinusoidal position table to a batch-first sequence.

    Even feature columns hold sin(position * freq) and odd columns hold
    cos(position * freq); the table has shape (1, max_len, d_model).
    """

    def __init__(self, d_model, max_len=5000):
        """
        Parameters:
            d_model (int): feature size of each token (even or odd)
            max_len (int): number of positions precomputed in the table
        """
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the frequencies to match the slice width.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Register as a buffer so the table follows model.to(device);
        # persistent=False keeps it out of state_dict, so checkpoint keys
        # are the same as when this was a plain attribute.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)  # (1, max_len, d_model)

    def forward(self, x):
        """
        Parameters:
            x (Tensor): (batch, seq_len, d_model)

        Returns:
            Tensor of the same shape with the first seq_len table rows added.
        """
        # .to() is a no-op when the buffer already lives on x's device.
        return x + self.pe[:, :x.size(1)].to(x.device)


class CNNTransformerModel(nn.Module):
    """
    Regressor that turns the padded P matrix into a token sequence with a CNN,
    prepends a token built from the scalar m, runs a Transformer encoder over
    the sequence, and regresses from the encoder output at the m-token.
    """

    def __init__(self, token_dim=128, d_model=128, nhead=8, num_layers=4, dim_feedforward=256, dropout=0.1):
        """
        Parameters:
            token_dim (int): channel count produced by the CNN per grid cell
            d_model (int): Transformer embedding size
            nhead (int): attention heads per encoder layer
            num_layers (int): number of encoder layers
            dim_feedforward (int): hidden size of each encoder feed-forward block
            dropout (float): dropout rate inside the encoder layers
        """
        super().__init__()
        self.token_dim = token_dim
        self.d_model = d_model

        # Four 3x3 convolutions (padding=1 keeps the spatial size), each followed by ReLU.
        channel_plan = [1, 32, token_dim, token_dim // 2, token_dim]
        conv_stack = []
        for in_ch, out_ch in zip(channel_plan[:-1], channel_plan[1:]):
            conv_stack.append(nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1))
            conv_stack.append(nn.ReLU())
        self.cnn = nn.Sequential(*conv_stack)

        # Lifts the scalar m into the token embedding space.
        self.m_project = nn.Linear(1, d_model)

        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        self.input_proj = nn.Linear(token_dim, d_model)
        half_width = d_model // 2
        self.regressor = nn.Sequential(
            nn.Linear(d_model, half_width),
            nn.ReLU(),
            nn.Linear(half_width, 1),
        )

    def forward(self, P, m):
        """
        Parameters:
            P (Tensor): (B, 1, 6, 10) padded matrices
            m (Tensor): (B, 1) scalar auxiliary input

        Returns:
            Tensor (B, 1): one regression value per sample
        """
        feature_map = self.cnn(P)                              # (B, token_dim, 6, 10)
        grid_tokens = feature_map.flatten(2).transpose(1, 2)   # (B, 60, token_dim)
        grid_tokens = self.input_proj(grid_tokens)             # (B, 60, d_model)

        scalar_token = self.m_project(m).unsqueeze(1)          # (B, 1, d_model)
        sequence = torch.cat([scalar_token, grid_tokens], dim=1)  # (B, 61, d_model)

        hidden = self.transformer(self.pos_encoder(sequence))  # (B, 61, d_model)

        # The m-token sits at position 0 and acts as the summary token.
        return self.regressor(hidden[:, 0])


def ratio_loss(preds, targets, eps=1e-6):
    """
    Mean squared difference between log2(targets) and log2(preds).

    Both tensors are clamped from below at `eps` before taking the logarithm
    so that non-positive values do not produce NaN/inf.

    NOTE(review): torch.clamp passes no gradient for elements below `eps`,
    so predictions that fall under `eps` receive zero gradient from this loss.

    Parameters:
        preds (Tensor): model predictions
        targets (Tensor): ground-truth values
        eps (float): lower clamp bound

    Returns:
        Scalar tensor with the mean squared log2 gap.
    """
    safe_preds = preds.clamp(min=eps)
    safe_targets = targets.clamp(min=eps)
    log_gap = safe_targets.log2() - safe_preds.log2()
    return log_gap.pow(2).mean()
# Module-level loss handle used by the training loop.
criterion = ratio_loss

def train(model, train_loader, val_loader, optimizer, config, epochs=25):
    """
    Train `model` with early stopping, checkpoint the best epoch to disk,
    plot the loss curves, and return the model with its best weights loaded.

    Relies on two module-level names: `criterion` (the loss function) and
    `LENIENT` (selects the early-stopping rule, see below).

    Parameters:
        model (nn.Module): called as model(P_tensor, m_tensor)
        train_loader: yields (P_tensor, m_tensor, target) batches for training
        val_loader: yields (P_tensor, m_tensor, target) batches for validation
        optimizer (torch.optim.Optimizer): optimizer over the model parameters
        config (dict): stored next to the weights in the checkpoint;
            config['flag'] names the file "model3<flag>.pt"
        epochs (int): maximum number of epochs

    Returns:
        The same model object; if any epoch improved the validation loss, the
        weights from the best such epoch are loaded before returning.
    """
    import copy  # local import: needed to snapshot the best weights

    train_losses, val_losses = [], []
    best_val_loss = float('inf')
    best_model_state = None
    no_improve_epochs = 0
    device = next(model.parameters()).device

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        model.train()
        running_loss = 0.0

        for P_tensor, m_tensor, target in train_loader:
            P_tensor, m_tensor, target = P_tensor.to(device), m_tensor.to(device), target.to(device).unsqueeze(1)

            optimizer.zero_grad()
            preds = model(P_tensor, m_tensor)
            loss = criterion(preds, target)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch loss is a per-sample mean.
            running_loss += loss.item() * P_tensor.size(0)

        train_loss = running_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for P_tensor, m_tensor, target in val_loader:
                P_tensor, m_tensor, target = P_tensor.to(device), m_tensor.to(device), target.to(device).unsqueeze(1)
                preds = model(P_tensor, m_tensor)
                loss = criterion(preds, target)
                running_val_loss += loss.item() * P_tensor.size(0)

        val_loss = running_val_loss / len(val_loader.dataset)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"Epoch {epoch+1}/{epochs}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter
            # tensors; without a deep copy later optimizer steps would
            # overwrite this "best" snapshot and the final restore would
            # simply reload the last epoch's weights.
            best_model_state = copy.deepcopy(model.state_dict())
            torch.save({
                'model_state_dict': best_model_state,
                'config': config
            }, f"model3{config['flag']}.pt")
            print(f"Best model saved at epoch {epoch+1} with val loss: {best_val_loss:.4f}")
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1

        # Early stopping. LENIENT tolerates more stale epochs but also bails
        # out when the validation loss is still above 500 after a few epochs.
        if LENIENT:
            if no_improve_epochs > 4: print('terminating training'); break
            if epoch > 2 and val_loss > 500: print('terminating training'); break
        else:
            if no_improve_epochs > 2: print('terminating training'); break

    # Plot losses
    plt.figure()
    plt.plot(train_losses, label="Train loss")
    plt.plot(val_losses, label="Val loss")
    plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.legend()
    plt.title("Training and Validation Losses")
    plt.show()

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model


def test_and_plot(model, test_loader):
    """
    Evaluate `model` on `test_loader`, print the per-sample mean ratio loss,
    and scatter-plot predictions against true targets.

    Parameters:
        model (nn.Module): called as model(P_tensor, m_tensor)
        test_loader: yields (P_tensor, m_tensor, target) batches

    Returns:
        None; the loss is printed and the figure is shown.
    """
    model.eval()
    criterion = ratio_loss
    device = next(model.parameters()).device
    loss_sum = 0
    pred_chunks, target_chunks = [], []

    with torch.no_grad():
        for P_tensor, m_tensor, target in test_loader:
            P_tensor = P_tensor.to(device)
            m_tensor = m_tensor.to(device)
            flat_target = target.to(device).unsqueeze(1).reshape(-1)
            flat_preds = model(P_tensor, m_tensor).reshape(-1)

            batch_loss = criterion(flat_preds, flat_target)
            # Weight by batch size so the final figure is a per-sample mean.
            loss_sum += batch_loss.item() * P_tensor.size(0)

            pred_chunks.append(flat_preds.cpu())
            target_chunks.append(flat_target.cpu())

    all_preds = torch.cat(pred_chunks).numpy()
    all_targets = torch.cat(target_chunks).numpy()

    print(f"Test Ratio Loss: {loss_sum / len(test_loader.dataset):.6f}")

    # Axis limits: minimum to 99th percentile of targets and predictions
    # combined, which crops the largest 1% of values out of the view.
    low, high = np.percentile(np.concatenate([all_targets, all_preds]), [0, 99])
    diagonal = [low, high]

    plt.figure()
    plt.scatter(all_targets, all_preds, alpha=0.7)
    plt.plot(diagonal, diagonal, linestyle='--', label='Ideal')
    plt.xlim(low, high)
    plt.ylim(low, high)
    plt.xlabel("True target")
    plt.ylabel("Predicted")
    plt.title("Test: True vs Predicted")
    plt.legend()
    plt.show()

def set_seed(seed=42):
    """
    Seed every random number generator this notebook can draw from and put
    cuDNN into deterministic mode.

    Parameters:
        seed (int): value used for Python's `random`, numpy, and torch (CPU and CUDA)
    """
    import random  # local import: Python's own RNG was previously left unseeded

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed all RNGs first: the split, weight init, and shuffling below depend on it.
set_seed(1706)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
BATCH_SIZE = 64
EPOCHS = 40
# Disabled sweep scaffolding over (n, k, m) combinations, kept for reference:
# for n, k, m in [(9,4,5), (9,5,4), (9,6,3), (10,4,2), (10,4,6), (10,5,2), (10,5,5), (10,6,4)]:
# for n in [9,10]:
#     for k in [4,5,6]:
#         for m in list(range(n-k+1))[2:]:

# Read as a global by train() to choose the early-stopping rule.
LENIENT = True
# Use the full frame on GPU; on CPU fall back to the first 20000 rows.
if device.type == "cuda":
    data = df
else: # <3 laptop
    data = df.head(20000)
# 65% train / 25% validation / remainder test.
num_train_samples = int(0.65 * len(data))
num_val_samples = int(0.25 * len(data))
num_test_samples = len(data) - num_train_samples - num_val_samples
print("num_train_samples:", num_train_samples,"num_val_samples:", num_val_samples,"num_test_samples:", num_test_samples)
dataset = CNNTransformerDataset(data)
train_data, val_data, test_data = random_split(dataset, [num_train_samples, num_val_samples, num_test_samples])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# NOTE(review): shuffling is not needed for evaluation; the validation and
# test loaders could use shuffle=False (this would change RNG consumption).
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)
# Hyperparameters; 'lr' feeds the optimizer and 'flag' names the checkpoint
# file written by train(), the remaining keys are model constructor arguments.
config = {
    'token_dim': 256,
    'd_model': 256,
    'nhead': 16,
    'num_layers': 4,
    'dim_feedforward': 512,
    'dropout': 0.1,
    'lr': 0.00001,
    'flag': 'a'
}
# Keep only the keys that CNNTransformerModel.__init__ accepts.
model_args = {key: config[key] for key in [
    'token_dim', 'd_model', 'nhead', 'num_layers',
    'dim_feedforward', 'dropout'
]}
model = CNNTransformerModel(**model_args).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
# train() returns the model with its best-validation weights loaded.
model = train(model, train_loader, val_loader, optimizer, config, epochs=EPOCHS)
test_and_plot(model, test_loader)


Using device: cpu
num_train_samples: 13000 num_val_samples: 5000 num_test_samples: 2000
modle created
Epoch 1/40


KeyboardInterrupt: 

# A
'd_model': 512,\
'nhead': 8,\
'num_layers': 4,\
'dim_feedforward': 256,\
'dropout': 0.2,\
'lr': 0.000001,\
'flag': 'a'\
(9,4) Test Loss by m[2:]\
.2, .25, .9, 12.5\
(9,5) Test Loss by m[2:]\
.23, .87, 14.7\
(9,6) Test Loss by m[2:]\
.66, 8.9\
(10,4) Test Loss by m[2:]\
.85, .11, .29, 1.48, 24.6\
(10,5) Test Loss by m[2:]\
.13, .33, .97, 24.5\
(10,6) Test Loss by m[2:]\
.25, .89, 1297

# B
'd_model': 1024,\
'nhead': 16,\
'num_layers': 6,\
'dim_feedforward': 512,\
'dropout': 0.2,\
'lr': 0.0001,\
'flag': 'b'\
(9,4) Test Loss by m[2:]\
735.6, .26, 891, 3.4\
(9,5) Test Loss by m[2:]\
.25, 903, 3.48\
(9,6) Test Loss by m[2:]\
.66, 3.4\
(10,4) Test Loss by m[2:]\
.87, .12, .3, .95, 3.4\
(10,5) Test Loss by m[2:]\
770, .35, .95, 3.4\
(10,6) Test Loss by m[2:]\
.27, .9, 1297

# C
'd_model': 1024,\
'nhead': 32,\
'num_layers': 8,\
'dim_feedforward': 2048,\
'dropout': 0.2,\
'lr': 0.00005,\
'flag': 'c'\
(9,4) Test Loss by m[2:]\
.21, 785, .91, 3.4\
(9,5) Test Loss by m[2:]\
.23, .88, 3.4\
(9,6) Test Loss by m[2:]\
.67, 3.4\
(10,4) Test Loss by m[2:]\
.87, .12, .3, 921, 3.4\
(10,5) Test Loss by m[2:]\
.14, .35, .95, 3.5\
(10,6) Test Loss by m[2:]\
.26, .89, 3.5

# D
'd_model': 1024,\
'nhead': 16,\
'num_layers': 4,\
'dim_feedforward': 2048,\
'dropout': 0.2,\
'lr': 0.00001,\
'flag': 'd'\
(9,4) Test Loss by m[2:]\
.2, 785, .91, 3.4\
(9,5) Test Loss by m[2:]\
.24, .88, \
Terminated training.

The larger models cut down on error at the higher m-values. However, the models generally collapsed into constant (single-value) predictors: rather than learning a pattern, they (often) found the single best value to guess in order to minimize validation loss.

Final ensemble:\
(9,4)[2,3,4] a\
(9,4)[5] c\
(9,5) c\
(9,6) b\
(10,4) b\
(10,5) c\
(10,6) c


In [13]:
import os

def load_ensemble_models(model_dir, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Load every checkpoint named "model2-<n>_<k>_<m>.pt" found in `model_dir`.

    Each checkpoint is expected to hold 'config' and 'model_state_dict'
    entries; the config supplies the TransformerRegression constructor
    arguments. Files that fail to parse or load are reported with a printed
    message and skipped (best effort).

    Parameters:
        model_dir (str): directory to scan
        device (str): device the checkpoints and models are mapped to

    Returns:
        dict mapping (n, k, m) tuples (parsed from the file name) to models
        in eval mode
    """
    prefix, suffix = "model2-", ".pt"
    constructor_keys = (
        'token_dim', 'd_model', 'nhead', 'num_layers',
        'dim_feedforward', 'dropout', 'n', 'k', 'm',
    )

    loaded = {}
    for fname in os.listdir(model_dir):
        if not (fname.startswith(prefix) and fname.endswith(suffix)):
            continue
        try:
            stem = fname[len(prefix):-len(suffix)]
            n, k, m = map(int, stem.split("_"))

            checkpoint = torch.load(os.path.join(model_dir, fname), map_location=device)
            config = checkpoint['config']
            kwargs = {key: config[key] for key in constructor_keys}

            model = TransformerRegression(**kwargs).to(device)
            model.load_state_dict(checkpoint['model_state_dict'])
            model.eval()
            loaded[(n, k, m)] = model
        except Exception as e:
            print(f"Failed to load {fname}: {e}")
    return loaded

def run_ensemble_predictions(df_test, ensemble_models, batch_size=64):
    """
    Predict every row of `df_test` with the model registered for its (n, k, m).

    Rows are grouped by (n, k, m); groups with no matching model are reported
    with a printed message and skipped.

    Parameters:
        df_test (pd.DataFrame): must contain 'n', 'k', 'm' plus whatever
            RowColTokenDataset reads from each row
        ensemble_models (dict): maps (n, k, m) tuples to models
        batch_size (int): inference batch size

    Returns:
        pd.DataFrame: the predicted rows with an added 'prediction' column,
        concatenated across groups; an empty DataFrame if nothing was predicted.
    """
    # Used only for a model that exposes no parameters to read a device from.
    fallback_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Store results
    prediction_results = []

    # Group by n, k, m
    for (n, k, m), group in df_test.groupby(["n", "k", "m"]):
        model = ensemble_models.get((n, k, m))
        if model is None:
            print(f"No model for (n={n}, k={k}, m={m}) — skipping")
            continue

        # Send inputs to wherever this model actually lives; the models may
        # have been loaded onto a device other than the default one.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else fallback_device

        dataset = RowColTokenDataset(group.reset_index(drop=True), n=n, k=k)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        preds = []
        with torch.no_grad():
            for tokens, _ in loader:
                tokens = tokens.to(device)
                pred = model(tokens).squeeze(1)
                preds.extend(pred.cpu().numpy().tolist())

        result_df = group.copy()
        result_df["prediction"] = preds
        prediction_results.append(result_df)

    if prediction_results:
        return pd.concat(prediction_results, ignore_index=True)
    return pd.DataFrame()


In [24]:
def build_test_dataframe(n, k, m, P_list):
    """
    Converts one or more P matrices into a DataFrame compatible with RowColTokenDataset.

    Parameters:
        n (int): total number of columns in original matrix
        k (int): number of rows in each P matrix
        m (int or sequence of int): the m value(s); a scalar (Python or numpy
            integer) is broadcast to every matrix, a sequence needs one entry
            per matrix
        P_list (list or np.ndarray): a list of (k, n-k) matrices, a single
            2-D array, or a 3-D array stacking several matrices

    Returns:
        pd.DataFrame with columns: 'n', 'k', 'm', 'P' (P stored as nested lists)

    Raises:
        ValueError: if P_list has an unsupported type, a matrix has the wrong
            shape, or the number of m values does not match the number of matrices
    """
    if isinstance(P_list, np.ndarray):
        if P_list.ndim == 2:
            P_list = [P_list]  # single matrix case
        elif P_list.ndim == 3:
            P_list = list(P_list)  # stack of matrices along the first axis

    if not isinstance(P_list, list):
        raise ValueError("P_list must be a list of numpy arrays")

    # Accept nested lists as matrices too, so .shape/.tolist() always exist.
    P_list = [np.asarray(P) for P in P_list]

    for i, P in enumerate(P_list):
        if P.shape != (k, n - k):
            raise ValueError(f"P[{i}] has shape {P.shape}, expected ({k}, {n - k})")

    # Broadcast m if scalar; numpy integers (e.g. values read back from a
    # DataFrame) are not instances of int and must be recognised explicitly.
    if isinstance(m, (int, np.integer)):
        m_values = [m] * len(P_list)
    else:
        m_values = m
    if len(m_values) != len(P_list):
        raise ValueError("Length of m must match number of P matrices")

    df = pd.DataFrame({
        "n": [n] * len(P_list),
        "k": [k] * len(P_list),
        "m": m_values,
        "P": [p.tolist() for p in P_list]
    })

    return df

def norm(x):
    """
    Rescale values with the linear map sending -100 to -1 and 100 to 1.

    Parameters:
        x (array-like): numeric values

    Returns:
        np.ndarray (float32) of the rescaled values
    """
    values = np.array(x, dtype=np.float32)
    unit = (values + 100) / 200  # position inside [-100, 100] as a 0..1 fraction
    return unit * 2 - 1

# Smoke test: run the ensemble on one stored sample for a single (n, k, m).
n, k, m = 9, 4, 3
# Scan the current directory for "model2-*.pt" checkpoints.
ensemble_models = load_ensemble_models("./")
# First P entry in df matching this (n, k, m).
P_raw = df[(df['n'] == n) & (df['k'] == k) & (df['m'] == m)].head(1)['P'].values[0]
# Rescale to [-1, 1] and reshape to the (k, n-k) matrix layout.
# NOTE(review): confirm RowColTokenDataset does not normalize P a second time.
P_norm = norm(P_raw).reshape(k, n - k)
df_test = build_test_dataframe(n=n, k=k, m=m, P_list=[P_norm])
ensemble_output_df = run_ensemble_predictions(df_test, ensemble_models)


In [25]:
# Show the first rows of the ensemble output, including the 'prediction' column.
print(ensemble_output_df.head())

   n  k  m                                                  P  prediction
0  9  4  3  [[0.9114111661911011, -0.15260308980941772, -0...  272.356445
