In [1]:
import pandas as pd
import pickle
import torch

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

GOLD_PARQUET_PATH = '/Users/olievortex/lakehouse/default/Files/gold/igra2/liftedindex_lr'
ARTIFACTS_PATH = '/Users/olievortex/lakehouse/default/Files/gold/igra2/artifacts'

batch_size = 32
epochs = 16
learning_rate = 0.001
learning_rate_gamma = 0.90

In [2]:
def load_dataset():
    X = pd.read_parquet(GOLD_PARQUET_PATH)

    # Remove irrelevant data
    X = X.drop(['id', 'effective_date', 'hour', 'li'], axis=1)
    
    # Scale the X dataset
    ss = MinMaxScaler() 
    X = ss.fit_transform(X)

    with open(f'{ARTIFACTS_PATH}/ae_min_max_scaler.pkl', 'wb') as f:
        pickle.dump(ss, f)
    
    return train_test_split(X, X, test_size=0.2)

x_train, x_test, _, _ = load_dataset()
print (f"Training size: {x_train.size:,}")
print (f"Predict size: {x_test.size:,}")
print (f"Feature count: {len(x_train[0])}")

x_train = torch.from_numpy(x_train).float()
x_test = torch.from_numpy(x_test).float()
y_test = x_test.numpy()
n_batches = x_train.size()[0] // batch_size

Training size: 34,140,648
Predict size: 8,535,162
Feature count: 127


In [3]:
class AutoEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(127, 96),
            torch.nn.ReLU(),
            torch.nn.Linear(96, 64)
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(64, 96),
            torch.nn.ReLU(),
            torch.nn.Linear(96, 127),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded
    
model = AutoEncoder()
optimizer = torch.optim.Adam(model.parameters(), learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learning_rate_gamma)
loss_function = torch.nn.MSELoss()

In [4]:
def train(model, loss, optimizer, inputs, labels):
    optimizer.zero_grad()

    logits = model.forward(inputs)
    output = loss.forward(logits, labels)
    output.backward()
    optimizer.step()

    return output.item()

def predict(model, inputs):
    logits = model.forward(inputs)
    return logits.data

def train_orch():
    for epoch in range(epochs):
        cost = 0
        loader = torch.utils.data.DataLoader(dataset = x_train,
                                             batch_size = batch_size,
                                             shuffle = True)

        for batch in loader:
            cost += train(model, loss_function, optimizer, batch, batch)

        y_pred = predict(model, x_test)
        acc = r2_score(y_pred.numpy(), y_test)
        scheduler.step()

        print(f"Epoch: {epoch+1}, cost: {cost / n_batches:.4f}, acc: {acc:.3f}, lr: {scheduler.get_last_lr()[0]:.2e}")

train_orch()

Epoch: 1, cost: 0.0014, acc: 0.767, lr: 9.00e-04
Epoch: 2, cost: 0.0004, acc: 0.761, lr: 8.10e-04
Epoch: 3, cost: 0.0003, acc: 0.743, lr: 7.29e-04
Epoch: 4, cost: 0.0002, acc: 0.744, lr: 6.56e-04
Epoch: 5, cost: 0.0002, acc: 0.751, lr: 5.90e-04
Epoch: 6, cost: 0.0002, acc: 0.759, lr: 5.31e-04
Epoch: 7, cost: 0.0002, acc: 0.765, lr: 4.78e-04
Epoch: 8, cost: 0.0001, acc: 0.765, lr: 4.30e-04
Epoch: 9, cost: 0.0001, acc: 0.766, lr: 3.87e-04
Epoch: 10, cost: 0.0001, acc: 0.764, lr: 3.49e-04
Epoch: 11, cost: 0.0001, acc: 0.765, lr: 3.14e-04
Epoch: 12, cost: 0.0001, acc: 0.767, lr: 2.82e-04
Epoch: 13, cost: 0.0001, acc: 0.766, lr: 2.54e-04
Epoch: 14, cost: 0.0001, acc: 0.765, lr: 2.29e-04
Epoch: 15, cost: 0.0001, acc: 0.765, lr: 2.06e-04
Epoch: 16, cost: 0.0001, acc: 0.765, lr: 1.85e-04


In [5]:
torch.save(model.state_dict(), f'{ARTIFACTS_PATH}/ae_fnn.pt')