In [6]:
import os
import pandas as pd
import pickle
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Parquet file read by load_dataset() as the autoencoder's input table.
# NOTE(review): the constant is named GOLD but the path sits under .../silver/ — confirm which tier is intended.
GOLD_PARQUET_PATH = '/usr/datalake/silver/igra/liftedindex_lr/gph20s10k_li.parquet'
# Directory that receives the pickled scaler and the saved model weights.
ARTIFACTS_PATH = '/usr/datalake/silver/igra/liftedindex_lr/artifacts'

# Training hyperparameters
batch_size = 256  # rows per mini-batch handed to the DataLoader
epochs = 64  # number of full passes over x_train
learning_rate = 0.001  # initial learning rate given to Adam
learning_rate_gamma = 0.75  # ExponentialLR decay factor; the scheduler is stepped once per epoch

In [7]:
def load_dataset(test_size=0.2, random_state=None):
    """Load the parquet table, split it, and min-max scale the features.

    The split happens BEFORE the scaler is fitted, and the scaler is fitted
    on the training partition only. Fitting on the full table (as was done
    previously) leaks the test set's min/max into both the training features
    and the persisted scaler.

    Args:
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. ``None`` keeps the split random.

    Returns:
        ``[x_train, x_test]`` as NumPy arrays of scaled features.

    Side effects:
        Creates ``ARTIFACTS_PATH`` if needed and writes the fitted scaler to
        ``ae_min_max_scaler.pkl`` inside it.
    """
    features = pd.read_parquet(GOLD_PARQUET_PATH)

    # Remove irrelevant data
    features = features.drop(['id', 'effective_date', 'hour', 'li'], axis=1)

    # Split first so that no test-set statistics reach the scaler.
    train_df, test_df = train_test_split(
        features, test_size=test_size, random_state=random_state
    )

    # Fit on train only; the test partition is merely transformed, so its
    # values may fall slightly outside [0, 1].
    ss = MinMaxScaler()
    x_train = ss.fit_transform(train_df)
    x_test = ss.transform(test_df)

    # Save the transform
    os.makedirs(ARTIFACTS_PATH, exist_ok=True)
    with open(f'{ARTIFACTS_PATH}/ae_min_max_scaler.pkl', 'wb') as f:
        pickle.dump(ss, f)

    return [x_train, x_test]

x_train, x_test = load_dataset()
# NOTE: ndarray.size is the total element count (rows * features), not the row count.
print(f"Training size: {x_train.size:,}")
print(f"Predict size: {x_test.size:,}")
print(f"Feature count: {len(x_train[0])}")

# Move both partitions to the GPU as float32 tensors.
x_train = torch.from_numpy(x_train).float().cuda()
x_test = torch.from_numpy(x_test).float().cuda()
# Ceiling division: the DataLoader (drop_last=False by default) also yields
# the final partial batch, so floor division would under-count the batches.
n_batches = (x_train.size()[0] + batch_size - 1) // batch_size

Training size: 36,478,464
Predict size: 9,119,616
Feature count: 127


In [3]:
class AutoEncoder(torch.nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    Layout: ``n_features -> hidden_size -> latent_size -> hidden_size -> n_features``,
    with ReLU after each hidden layer and no activation on the latent or
    output layers.

    Args:
        n_features: Width of the input (and reconstructed output) vectors.
        hidden_size: Width of the hidden layer in encoder and decoder.
        latent_size: Width of the bottleneck representation.

    The defaults (127, 50, 10) reproduce the original fixed architecture, and
    the ``state_dict`` keys are unchanged.
    """

    def __init__(self, n_features=127, hidden_size=50, latent_size=10):
        super().__init__()

        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(n_features, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, latent_size)
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(latent_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, n_features)
        )

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded
    
# Build the network and move it to the GPU before the optimizer captures its parameters.
model = AutoEncoder()
model = model.cuda()
# Reconstruction error is measured as mean squared error.
loss_function = torch.nn.MSELoss()
# Adam starts at `learning_rate`; the scheduler multiplies it by `learning_rate_gamma` on each step.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=learning_rate_gamma)

In [4]:
def train(model, loss, optimizer, inputs, labels) -> float:
    """Perform one optimisation step on a single batch.

    Args:
        model: Module being trained.
        loss: Callable taking ``(outputs, labels)`` and returning a scalar tensor.
        optimizer: Optimizer that owns ``model``'s parameters.
        inputs: Batch fed to the model.
        labels: Targets the model output is compared against.

    Returns:
        The batch loss as a Python float.
    """
    # Forward pass and error for this batch
    outputs = model(inputs)
    batch_loss = loss(outputs, labels)

    # Drop stale gradients, back-propagate, then update the weights
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return float(batch_loss.item())

def predict(model, inputs):
    """Run ``model`` on ``inputs`` for inference and return its output.

    The forward pass runs under ``torch.no_grad()`` so no autograd graph is
    built for what may be a large evaluation tensor. The model is switched to
    eval mode for the call and its previous train/eval mode is restored
    afterwards. Unlike the earlier version, this function does not touch any
    optimizer: inference needs none, and the old code reached for a global.

    Args:
        model: Module to evaluate.
        inputs: Tensor passed to ``model``.

    Returns:
        The model output tensor (does not require grad).
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(inputs)
    finally:
        # Leave the module in the mode the caller had it in.
        model.train(was_training)

    return logits

def r2_score_manual(preds, target):
    """Coefficient of determination of ``preds`` against ``target``.

    Every element of the tensors is pooled into one sample: the mean is a
    single scalar over all of ``target``, not a per-column mean.

    Args:
        preds: Predicted values.
        target: Reference values, same shape as ``preds``.

    Returns:
        ``1 - SS_res / SS_tot`` as a Python float.
    """
    centered = target - torch.mean(target)
    residual = target - preds

    total_ss = torch.sum(centered ** 2)        # spread of the target around its mean
    residual_ss = torch.sum(residual ** 2)     # squared error left after prediction

    score = 1 - (residual_ss / total_ss)
    return float(score.item())

def train_orch():
    """Train the module-level ``model`` for ``epochs`` epochs and print metrics.

    Each epoch trains on every shuffled mini-batch of ``x_train`` (input and
    target are the same batch, i.e. reconstruction), evaluates R² of the
    reconstruction on ``x_test``, steps the LR scheduler, and prints one line.

    Relies on the notebook globals ``model``, ``loss_function``, ``optimizer``,
    ``scheduler``, ``x_train``, ``x_test``, ``batch_size`` and ``epochs``.
    """
    # The loader does not change between epochs; with shuffle=True a fresh
    # permutation is drawn every time it is iterated.
    loader = torch.utils.data.DataLoader(dataset = x_train,
                                         batch_size = batch_size,
                                         shuffle = True)
    # Average over the batches actually produced (includes a final partial
    # batch); guard against an empty dataset.
    batches_per_epoch = max(len(loader), 1)

    for epoch in range(epochs):
        cost = 0

        for batch in loader:
            cost += train(model, loss_function, optimizer, batch, batch)

        # Evaluation needs no gradients.
        with torch.no_grad():
            preds = predict(model, x_test)
        # r2_score_manual expects (preds, target): the held-out inputs are the target.
        acc = r2_score_manual(preds, x_test)
        scheduler.step()

        # The scheduler was stepped just above, so the lr shown is the one the next epoch will use.
        print(f"Epoch: {epoch+1}, cost: {cost / batches_per_epoch:.4f}, acc: {acc:.3f}, lr: {scheduler.get_last_lr()[0]:.2e}")

# Run the full training loop; one metrics line is printed per epoch.
train_orch()

Epoch: 1, cost: 0.0084, acc: 0.948, lr: 7.50e-04
Epoch: 2, cost: 0.0019, acc: 0.973, lr: 5.63e-04
Epoch: 3, cost: 0.0014, acc: 0.977, lr: 4.22e-04
Epoch: 4, cost: 0.0012, acc: 0.979, lr: 3.16e-04
Epoch: 5, cost: 0.0011, acc: 0.980, lr: 2.37e-04
Epoch: 6, cost: 0.0011, acc: 0.981, lr: 1.78e-04
Epoch: 7, cost: 0.0010, acc: 0.981, lr: 1.33e-04
Epoch: 8, cost: 0.0010, acc: 0.982, lr: 1.00e-04
Epoch: 9, cost: 0.0010, acc: 0.982, lr: 7.51e-05
Epoch: 10, cost: 0.0010, acc: 0.982, lr: 5.63e-05
Epoch: 11, cost: 0.0010, acc: 0.982, lr: 4.22e-05
Epoch: 12, cost: 0.0010, acc: 0.982, lr: 3.17e-05
Epoch: 13, cost: 0.0010, acc: 0.982, lr: 2.38e-05
Epoch: 14, cost: 0.0010, acc: 0.982, lr: 1.78e-05
Epoch: 15, cost: 0.0010, acc: 0.982, lr: 1.34e-05
Epoch: 16, cost: 0.0010, acc: 0.982, lr: 1.00e-05
Epoch: 17, cost: 0.0010, acc: 0.982, lr: 7.52e-06
Epoch: 18, cost: 0.0010, acc: 0.982, lr: 5.64e-06
Epoch: 19, cost: 0.0010, acc: 0.982, lr: 4.23e-06
Epoch: 20, cost: 0.0010, acc: 0.982, lr: 3.17e-06
Epoch: 21

In [5]:
# Persist only the learned weights (state_dict) into ARTIFACTS_PATH, the same directory the scaler pickle is written to.
torch.save(model.state_dict(), f'{ARTIFACTS_PATH}/ae_fnn.pt')