In [1]:
import numpy as np
import os
import random
import torch
from PD.PlasmaDataset import PlasmaDataset
from PML.PlasmaModel import PlasmaModel
from PML.PMLParameters import PMLParameters

In [2]:
#--- reproducibility ---
#seed every global RNG imported by this notebook so Restart & Run All repeats the same run
#NOTE(review): this only fixes the hyperparameter draw / weight init if PD and PML use these global generators - confirm
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

#--- dataset configuration ---
DATA_SPLIT = [0.7, 0.2, 0.1] #train/test/val splits
DATA_FRAC = 1 #fraction of files to load data from
DATASET_NAME = "main" #tag embedded in the exported CSV file names
HDF5_DATA_DIR = "./jtext_data/low_freq" #data to source hdf5 files from
ORG_DATA_DIR = "./jtext_org" #directory for exporting data CSVs
FEATS = ['dx', 'dy', 'ip', 'ip_error'] #model features

#--- model search configuration ---
MODEL_COUNT = 20 #number of models to randomly generate and train
MODEL_DIR = "./models" #directory to save trained models
MODEL_METRICS = f"{MODEL_DIR}/metrics.json" #metrics file, kept inside MODEL_DIR
HP_SEARCH = 'random' #hyperparameter search mode

In [3]:
#paths of the processed CSVs handed to the modeler, keyed "<split>_<kind>"
#derived from ORG_DATA_DIR and DATASET_NAME so they follow the configuration cell instead of a hard-coded directory
#NOTE(review): presumably these are the files dataset.saveCSV(..., name=DATASET_NAME) writes - confirm against PlasmaDataset
PROCESSED_DATA = {
    f"{split}_{kind}": f"{ORG_DATA_DIR}/{split}/{split}-{kind}-{DATASET_NAME}.csv"
    for split in ("train", "test", "val")
    for kind in ("norm", "labels")
}

In [4]:
#search space and fixed settings for the hyperparameter search
#judging by the hyperparameters logged for the recorded run, one value is drawn from each [low, high] pair
#WARNING: grid search mode is currently broken (and inefficient) - do not use it
PARAMETER_RANGES = {
    #learning rate range
    'lr': [0.001, 0.01],
    #lstm layer count and hidden size ranges
    'lstm_layers': [[200, 400], [80, 120]],
    #linear layer count and neuron ranges
    'linear_layers': [[100, 200], [100, 150]],
    #dropout layer count and dropout probabilities
    'dropout_layers': [[0.05, 0.1]],
}

#settings shared by every generated model
STATIC_PARAMETERS = {
    'batch_size': 16,
    #binary cross entropy loss computed on logits
    #NOTE(review): BCEWithLogitsLoss applies a sigmoid itself; if PlasmaModel passes the
    #sigmoid-activated output ('output_activation' below) to this criterion the sigmoid is applied twice - confirm
    'criterion': torch.nn.BCEWithLogitsLoss(),
    #number of training epochs/model
    'epochs': 40,
    'init': torch.nn.init.xavier_normal_,
    #input size equals the number of features
    'input_size': len(FEATS),
    #LSTM layers activation function
    'lstm_activation': torch.nn.functional.tanh,
    #Linear layers activation function
    'linear_activation': torch.nn.functional.relu,
    #ADAM optimizer class
    'optimizer': torch.optim.Adam,
    #output neuron activation
    'output_activation': torch.nn.functional.sigmoid,
}

In [5]:
def makeDataset(dataset:"PlasmaDataset", split:list, features:list, frac:float=1, preview:bool=False, name:str=None):
    """Run the preprocessing pipeline on `dataset` and export the result to CSV.

    Calls, in order: initialize, sourceFiles, sourceData, calcStats, normalize,
    saveCSV, (optionally) preview, deleteDatasets.

    Args:
        dataset: object exposing the PlasmaDataset methods listed above.
        split: forwarded to dataset.sourceFiles as data_split.
        features: forwarded to dataset.sourceData.
        frac: forwarded to dataset.sourceFiles as data_frac.
        preview: when truthy, dataset.preview() is called after the CSV export.
        name: dataset name forwarded to dataset.saveCSV; when None the
            notebook-level DATASET_NAME is used (looked up at call time).

    Returns:
        None.
    """
    if name is None:
        name = DATASET_NAME #keep the original behaviour for callers that do not pass a name
    dataset.initialize() #creates train/test/val subdatasets
    dataset.sourceFiles(data_split = split, data_frac = frac) #initialize split/datafrac and gather hdf5 file info
    dataset.sourceData(features) #source specified feature data from files
    dataset.calcStats() #calculate data statistics from raw feature data
    dataset.normalize() #use data statistics to normalize data
    dataset.saveCSV(['train', 'test', 'val', 'stats'], name=name) #export dataset to model-loadable CSV
    if preview:
        dataset.preview() #preview datasets
    dataset.deleteDatasets() #remove dataset from memory (since saved to CSV)

In [6]:
def makeModels(
    modeler: "PlasmaModel",
    processed_data: dict,
    parameter_ranges: dict,
    static_parameters: dict,
    model_count: int,
    searchmode: str,
):
    """Generate a hyperparameter set on `modeler`, hand it the data, and run the search.

    Args:
        modeler: object exposing makeHyperparameterSet, prepareData and runModelSearch.
        processed_data: forwarded unchanged to modeler.prepareData.
        parameter_ranges: forwarded to makeHyperparameterSet as param_ranges.
        static_parameters: forwarded to makeHyperparameterSet as static_params.
        model_count: forwarded to makeHyperparameterSet as count.
        searchmode: forwarded to makeHyperparameterSet as mode.

    Returns:
        None.
    """
    #gather the search settings under the keyword names the modeler expects
    search_settings = {
        "static_params": static_parameters,
        "param_ranges": parameter_ranges,
        "count": model_count,
        "mode": searchmode,
    }
    modeler.makeHyperparameterSet(**search_settings)
    modeler.prepareData(processed_data)
    modeler.runModelSearch()

In [7]:
#dataset wrapper reading hdf5 files from HDF5_DATA_DIR and exporting under ORG_DATA_DIR
JTEXT_LOW = PlasmaDataset(
    org_directory=ORG_DATA_DIR,
    h5_source=HDF5_DATA_DIR,
)

#modeler that saves into MODEL_DIR and records metrics to MODEL_METRICS
MODELER = PlasmaModel(
    MODEL_DIR,
    static_parameters=STATIC_PARAMETERS,
    json_save_file=MODEL_METRICS,
)

#step 1: run the preprocessing pipeline, which ends with a CSV export
makeDataset(
    JTEXT_LOW,
    split=DATA_SPLIT,
    features=FEATS,
    frac=DATA_FRAC,
)

#step 2: generate MODEL_COUNT hyperparameter sets and run the model search on the processed CSVs
makeModels(
    MODELER,
    PROCESSED_DATA,
    parameter_ranges=PARAMETER_RANGES,
    static_parameters=STATIC_PARAMETERS,
    model_count=MODEL_COUNT,
    searchmode=HP_SEARCH,
)

{'batch_size': 16, 'criterion': BCEWithLogitsLoss(), 'epochs': 40, 'init': <function xavier_normal_ at 0x10d3742c0>, 'input_size': 4, 'lstm_activation': <function tanh at 0x10d1de160>, 'linear_activation': <function relu at 0x10d1dd760>, 'optimizer': <class 'torch.optim.adam.Adam'>, 'output_activation': <function sigmoid at 0x10d1de200>, 'lr': 0.00580965963889642, 'lstm_layers': [316, 84], 'linear_layers': [164, 106], 'dropout_layers': [0.055421388733220694]}
Epoch [1/40], Training Loss: 0.6952
Epoch [1/40], Validation Loss: 0.6398
Epoch [2/40], Training Loss: 0.6822
Epoch [2/40], Validation Loss: 0.6928
Epoch [3/40], Training Loss: 0.6918
Epoch [3/40], Validation Loss: 0.6913
Epoch [4/40], Training Loss: 0.6706
Epoch [4/40], Validation Loss: 0.6895
Epoch [5/40], Training Loss: 0.6703
Epoch [5/40], Validation Loss: 0.7046
Epoch [6/40], Training Loss: 0.6653
Epoch [6/40], Validation Loss: 0.7033
Epoch [7/40], Training Loss: 0.6517
Epoch [7/40], Validation Loss: 0.6880
Epoch [8/40], Trai