In [6]:
import os
import pandas as pd
# import pickle
import torch
import glob

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset
from torch.nn import Module, Sequential, Linear, ReLU, Tanh, CrossEntropyLoss

MAIDENHEAD_PATH = '/usr/datalake/silver/stormevents/csvfiles/igra_maidenhead'
# STATION_LIST = '/usr/datalake/silver/igra/doc/igra2-station-list.csv'
ARTIFACTS_PATH = '/usr/datalake/silver/stormevents/artifacts/maidenhead_igra_classifier'

In [7]:
class OlieClassifier(Module):
    def __init__(self):
        super().__init__()
        
        self.classify = Sequential(
            Linear(61, 40),
            ReLU(),
            Linear(40, 40),
            ReLU(),
            Linear(40, 20),
            ReLU(),
            Linear(20, 3)
        )

    def forward(self, x):
        result = self.classify(x)

        return result

class OlieDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [8]:
class olie_igra_trainer:
    batch_size = 256
    epochs = 1024
    learning_rate = 0.001
    learning_rate_gamma = 0.99

    def __init__(self, maidenhead_path: str, artifact_path: str, maidenhead_id: str, model):
        self.maidenhead_path = maidenhead_path
        self.maidenhead_id = maidenhead_id
        self.artifact_path = artifact_path
        self.model = model

    def load_transform_dataset(self):
        df = pd.read_csv(f'{self.maidenhead_path}/{self.maidenhead_id}.csv')

        # Split the features/labels
        df_features = df.iloc[:,:-1]
        df_labels = df.iloc[:,-1:]
        
        # Scale the X dataset
        ss = MinMaxScaler()
        df_features = ss.fit_transform(df_features)

#         # Save the transform
#         os.makedirs(self.artifact_path, exist_ok=True)
#         with open(f'{self.artifact_path}/{self.station_id}_scaler.pkl', 'wb') as f:
#             pickle.dump(ss, f)
        
        # Split between train/test set
        x_train, x_test, y_train, y_test = train_test_split(df_features, df_labels, test_size=0.2)
        y_train = pd.get_dummies(df_labels, columns=['LABEL'], dtype=int)

        # Convert to tensors
        self.x_train = torch.from_numpy(x_train).float().cuda()
        self.x_test = torch.from_numpy(x_test).float().cuda()
        self.y_train = torch.from_numpy(y_train.to_numpy()).float().cuda()
        self.y_test = torch.from_numpy(y_test.to_numpy()).float().cuda().flatten()
        self.n_batches = self.x_train.size()[0] // self.batch_size

        print (f"Maidenhead: {self.maidenhead_id}, Training size: {self.x_train.size()[0]:,}, Test size: {self.x_test.size()[0]:,}, Feature count: {self.x_train.size()[1]}, Number of batches: {self.n_batches}")
    
    def train(self, inputs, labels) -> float:
        self.optimizer.zero_grad()

        # Calculate error
        logits = self.model(inputs)
        cost = self.loss_function(logits, labels)

        # Back propagation
        cost.backward()
        self.optimizer.step()

        return float(cost.item())

    def predict(self, inputs):
        self.optimizer.zero_grad()

        # Calculate error
        logits = self.model(inputs).clone().detach()

        return logits
    
    def output_progress(self, epoch: int, cost: float):
        logits = self.predict(self.x_test)
        _, predicted = torch.max(logits, dim=1)
        correct = float(torch.eq(self.y_test, predicted).sum().item())
        acc = correct / len(predicted)

        print(f"Epoch: {epoch+1}, cost: {cost / self.n_batches:.4f}, acc: {acc:.3f}, lr: {self.scheduler.get_last_lr()[0]:.2e}\r", end="")
        
    def train_orch(self):
        self.optimizer = torch.optim.Adam(self.model.parameters(), self.learning_rate)
        self.loss_function = CrossEntropyLoss()
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=self.learning_rate_gamma)
        olie_dataset = OlieDataset(self.x_train, self.y_train)

        for epoch in range(self.epochs):
            cost = 0
            loader = DataLoader(dataset = olie_dataset, batch_size = self.batch_size, shuffle = True)

            for batch_features, batch_labels in loader:
                cost += self.train(batch_features, batch_labels)

            self.scheduler.step()

            if epoch % 32 == 0:
                self.output_progress(epoch, cost)
        
        self.output_progress(epoch, cost)
        print()

#     def save_weights(self):
#         torch.save(self.model.state_dict(), f'{self.artifact_path}/{self.station_id}_fnn.pt')

    def exists_weights(self):
        return os.path.exists(f'{self.artifact_path}/{self.maidenhead_id}_fnn.pt')

    def dispose(self):
        del self.x_train
        del self.x_test
        del self.y_train
        del self.y_test
        del self.optimizer
        del self.loss_function
        del self.scheduler
        del self.model

In [9]:
def process_station(maidenhead_id: str):
    model = OlieClassifier().cuda()
    train = olie_igra_trainer(MAIDENHEAD_PATH, ARTIFACTS_PATH, maidenhead_id, model)

    if train.exists_weights():
        print(f"Maidenhead {maidenhead_id} already processed")
        return

    train.load_transform_dataset()
    train.train_orch()
#     train.save_weights()
    train.dispose()

    del train
    del model

In [10]:
for filepath in glob.glob(f'{MAIDENHEAD_PATH}/*.csv'):
    filename = Path(filepath).name
    maidenhead = filename.split('.')[0]

    process_station(maidenhead)

Maidenhead: EN35, Training size: 4,800, Test size: 1,200, Feature count: 61, Number of batches: 18
Epoch: 1024, cost: 1.0726, acc: 0.337, lr: 3.39e-08
