In [None]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Repository layout -----------------------------------------------------
# The project root is taken to be the parent of the current working directory.
path = Path(os.getcwd())
PROJECT_ROOT = path.parent.absolute()
# Path components are passed separately so os.path.join inserts the separator
# of the running OS. A hard-coded "\" only works on Windows; elsewhere it
# becomes part of a single file/directory name.
DATA_PATH = os.path.join(PROJECT_ROOT, "training-data", "data_train_track2_refit.csv")
PLOTS_DIR = os.path.join(PROJECT_ROOT, "main", "plots")
os.makedirs(PLOTS_DIR, exist_ok=True)

print(f"Using project root: {PROJECT_ROOT}")
print(f"Train data path: {DATA_PATH}")
print(f"Plots will be saved to: {PLOTS_DIR}")

# --- Load training data ----------------------------------------------------
# Semicolon-separated file; dates are written day-first.
df = pd.read_csv(DATA_PATH, header=0, sep=';')
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

# Keep the date as the first column; every other column is listed in
# feature_cols (at this point that list still contains the "value" column).
df = df[['date'] + [c for c in df.columns if c != 'date']]
feature_cols = [c for c in df.columns if c != 'date']

# "value" uses a decimal comma. Replace it literally (regex=False keeps the
# behaviour identical across pandas versions) and skip the string step when
# the column was already parsed as numeric, where `.str` would raise.
if not pd.api.types.is_numeric_dtype(df["value"]):
    df["value"] = df["value"].str.replace(",", ".", regex=False)
df["value"] = df["value"].astype(float)

# Chronological order; a stable sort keeps the file order of rows that share
# a date, so the row order is reproducible between runs.
df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

# 1) Linear time index (trend): whole days elapsed since the earliest date.
df["time_idx"] = (df["date"] - df["date"].min()).dt.days

# 2) Day-of-year cyclic features (seasonality): the day number is mapped onto
#    a circle with a period of 365.25 days and encoded as sine/cosine.
df["doy"] = df["date"].dt.dayofyear

df["doy_sin"] = np.sin(2 * np.pi * df["doy"] / 365.25)
df["doy_cos"] = np.cos(2 * np.pi * df["doy"] / 365.25)

print(df.head())
print(f"Rows: {len(df)}")
print(f"Date range: {df['date'].min().date()} → {df['date'].max().date()}")
# NOTE(review): this counts the non-date columns present when feature_cols was
# built (before the engineered columns were added) — confirm that is what
# "surface points" is meant to report.
print(f"Number of surface points: {len(feature_cols)}")

In [None]:
# Chronological train/test split: the most recent 20% of *distinct dates* go to
# the test set. The cut is made on dates rather than on row positions so that
# all rows sharing a date end up on the same side of the split (a row-position
# cut can put part of one day in train and the rest in test). When every date
# has exactly one row this is identical to cutting at int(len(df) * 0.8).
test_size = 0.2  # fraction of the most recent dates held out for testing

unique_dates = df["date"].dropna().drop_duplicates().sort_values().reset_index(drop=True)
n_train_dates = max(0, int(len(unique_dates) * (1 - test_size)))

if n_train_dates >= len(unique_dates):
    # Nothing is left for the test set (e.g. test_size == 0 or an empty frame).
    is_train = pd.Series(True, index=df.index, dtype=bool)
else:
    # First date that belongs to the test set.
    cutoff_date = unique_dates.iloc[n_train_dates]
    is_train = df["date"] < cutoff_date

train_df = df[is_train].copy()   # older data
test_df = df[~is_train].copy()   # most recent dates

# Number of training rows, kept under the original name. It equals the row
# position of the first test row only if df is sorted by date.
split_idx = len(train_df)

print(f"Train date range: {train_df['date'].min()} → {train_df['date'].max()}")
print(f"Test date range:  {test_df['date'].min()} → {test_df['date'].max()}")
print(f"Train rows: {len(train_df)}, Test rows: {len(test_df)}")

In [None]:
# Missing-value count for each column listed in feature_cols, training split only.
print(train_df.loc[:, feature_cols].isna().sum())

In [None]:
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler

TARGET_COL = "value"
DATE_COL = "date"

# Model inputs. The list is defined once and explicitly, so the printout below
# always shows exactly what is fed to the model. The raw "doy" column is left
# out on purpose: only its sin/cos encoding is used.
feature_cols = ["tenor", "maturity", "time_idx", "doy_sin", "doy_cos"]

# Fail early, with a readable message, if an expected column is absent.
missing_cols = [c for c in feature_cols + [TARGET_COL] if c not in train_df.columns]
if missing_cols:
    raise KeyError(f"Columns missing from the training data: {missing_cols}")

print("Features:", feature_cols)

# Make it visible when the data carries columns that are not used as features.
ignored_cols = [
    c for c in df.columns
    if c not in feature_cols and c not in (TARGET_COL, DATE_COL, "doy")
]
if ignored_cols:
    print("Ignored columns:", ignored_cols)

# 1. Numpy arrays (cast to float32 up front)
X_train = train_df[feature_cols].to_numpy(dtype=np.float32)
y_train = train_df[TARGET_COL].to_numpy(dtype=np.float32)
X_test  = test_df[feature_cols].to_numpy(dtype=np.float32)
y_test  = test_df[TARGET_COL].to_numpy(dtype=np.float32)

# 2. Standardize features. The scaler is fitted on the training split only and
#    then applied to the test split, so no test statistics leak into training.
#    The result is cast back to float32 after scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_test  = scaler.transform(X_test).astype(np.float32)

print("X_train type:", type(X_train), "dtype:", X_train.dtype)

# 3. Tensors. from_numpy keeps the float32 dtype of the arrays.
X_train_t = torch.from_numpy(X_train)                   # torch.float32
X_test_t  = torch.from_numpy(X_test)

# Targets as column vectors of shape (n_samples, 1).
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_t  = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [None]:
# Load a train/test dataset through merlin's mnist_digits helper.
# NOTE(review): this cell rebinds X_train, y_train, X_test and y_test, which the
# previous cell bound to the standardized arrays built from train_df/test_df.
# The tensors X_train_t / X_test_t / y_train_t / y_test_t are not touched here.
# Confirm that overwriting those names is intended.
# `torch` is not imported in this cell; it relies on the import in the
# previous cell.
from merlin.datasets import mnist_digits

# Each call returns a (features, labels, metadata) triple; the metadata values
# are not used in this cell.
train_features, train_labels, train_metadata = mnist_digits.get_data_train_percevalquest()
test_features, test_labels, test_metadata = mnist_digits.get_data_test_percevalquest()

# Flatten each sample to one row: the first axis is kept and all remaining axes
# are collapsed (presumably (N, 28, 28) -> (N, 784) — TODO confirm the shape
# returned by the loader).
train_features = train_features.reshape(train_features.shape[0], -1)
test_features = test_features.reshape(test_features.shape[0], -1)

# Convert data to PyTorch tensors: features via FloatTensor, labels via LongTensor.
X_train = torch.FloatTensor(train_features)
y_train = torch.LongTensor(train_labels)
X_test = torch.FloatTensor(test_features)
y_test = torch.LongTensor(test_labels)

print(f"Dataset loaded: {len(X_train)} training samples, {len(X_test)} test samples")

In [None]:
# Environment report: interpreter version, pointer width and library versions.
import platform
import struct

import numpy as np
import torch


def _print_environment():
    """Print the Python version, interpreter bitness and torch/numpy versions."""
    # Size of a C pointer in bytes, times 8, gives the interpreter's bit width.
    pointer_bits = struct.calcsize("P") * 8
    report = (
        ("Python:", platform.python_version()),
        ("Bits:", pointer_bits),
        ("torch:", torch.__version__),
        ("numpy:", np.__version__),
    )
    for label, value in report:
        print(label, value)


_print_environment()
