## New Regression Model ##

This notebook runs a pipeline that loads and formats the data, trains the neural network, evaluates its accuracy, and saves the model.


In [11]:
#import tools

import csv
import numpy as np
import json
import math
import matplotlib.pyplot as plt
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
'''
import multiprocessing as mp

from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score

import sklearn.preprocessing as PP


import sys

sys.path.append('/home/ada/Documents/HullParameterization')

from HullParameterization import Hull_Parameterization as HP

In [12]:
#CPU Multithreading

def run_MAP_multiprocessing(func, argument_list, chunksize = None, show_prog = True, pool = None):
    """Run a function over a list of arguments in parallel using ``Pool.map``.

    Parameters
    ----------
    func:          function
                    Python function to run in parallel.
    argument_list: list [N]
                    List of arguments to be passed to the function in each parallel run.
    chunksize:     int or None
                    Number of tasks handed to a worker at once. ``None`` lets
                    ``Pool.map`` pick a value. Default: None.
    show_prog:     boolean
                    If true a progress bar will be displayed. Because ``Pool.map``
                    blocks until every result is available, the bar only advances
                    once all the work is done. Default: True.
    pool:          multiprocessing pool or None
                    Pool used to run the work. When None, a temporary pool is
                    created for this call and shut down afterwards. Default: None.
    Returns
    -------
    output:        list [N,]
                    outputs of the function for the given arguments, in input order.
    """
    if pool is None:
        # Reserve 2 threads for other tasks, but never ask for fewer than one worker.
        with mp.Pool(processes=max(1, mp.cpu_count() - 2)) as own_pool:
            return run_MAP_multiprocessing(func, argument_list, chunksize=chunksize,
                                           show_prog=show_prog, pool=own_pool)

    results = pool.map(func=func, iterable=argument_list, chunksize=chunksize)

    if show_prog:
        results = tqdm(results, total=len(argument_list), position=0, leave=True)

    return list(results)


def run_IMAP_multiprocessing(func, argument_list, chunksize = None, show_prog = True, pool = None):
    """Run a function over a list of arguments in parallel using ``Pool.imap``.

    Results are consumed lazily, so the progress bar advances while the work
    is running.

    Parameters
    ----------
    func:          function
                    Python function to run in parallel.
    argument_list: list [N]
                    List of arguments to be passed to the function in each parallel run.
    chunksize:     int or None
                    Number of tasks handed to a worker at once. ``Pool.imap``
                    needs an integer, so None is treated as 1. Default: None.
    show_prog:     boolean
                    If true a progress bar will be displayed to show progress. Default: True.
    pool:          multiprocessing pool or None
                    Pool used to run the work. When None, a temporary pool is
                    created for this call and shut down afterwards. Default: None.
    Returns
    -------
    output:        list [N,]
                    outputs of the function for the given arguments, in input order.
    """
    if pool is None:
        # Reserve 2 threads for other tasks, but never ask for fewer than one worker.
        with mp.Pool(processes=max(1, mp.cpu_count() - 2)) as own_pool:
            return run_IMAP_multiprocessing(func, argument_list, chunksize=chunksize,
                                            show_prog=show_prog, pool=own_pool)

    # Pool.imap compares chunksize against 1, so passing None raises a TypeError.
    if chunksize is None:
        chunksize = 1

    results = pool.imap(func=func, iterable=argument_list, chunksize=chunksize)

    if show_prog:
        results = tqdm(results, total=len(argument_list), position=0, leave=True)

    # Drain the iterator before returning so an owned pool can be closed safely.
    return list(results)




## Load and Format the Training Data ##

In [13]:
#Load XLimits
# NOTE(review): the paths below are absolute and machine-specific; they are kept
# unchanged so the cell still matches the environment it was run in.

X_LIMITS = np.load('/home/ada/Documents/HullParameterization/HullDiffusion/Restructured_Dataset/X_LIMITS.npy')

print(X_LIMITS.shape)

#Load in Volume Prediction for Now
DesVecName = 'Input_Vectors.csv'
YName = 'GeometricMeasures/Volume.csv'
DS_path = '/home/ada/Documents//Hull_DataSet/'

folder_roots = ['Constrained_Randomized_Set_1',
                'Constrained_Randomized_Set_2',
                'Constrained_Randomized_Set_3',
                'Diffusion_Aug_Set_1',
                'Diffusion_Aug_Set_2']

# Rows are collected as lists of strings and converted to floats in one go below.
DesVec = []

for folder in folder_roots:
    path = DS_path + folder + '/'
    #Location of Design Vectors
    with open(path + DesVecName) as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)      # first row of each file is the header
        DesVec.extend(reader)

# Disabled: loader for the pre-computed volume vectors stored next to each
# design-vector file (kept for reference, it ran inside the folder loop).
#    #Location of Vol Vectors
#    with open(path + YName) as csvfile:
#        reader = csv.reader(csvfile)
#        for count, row in enumerate(reader):
#            if count != 0:
#                YVec.append(row)
#            else:
#                labels = np.array(row)

DesVec = np.array(DesVec)
# Pass the dtype itself; the original passed a float32 scalar instance.
DesVec = DesVec.astype(np.float32)

np.save('DesVec_82k.npy',DesVec)

#YVec = np.array(YVec)
#YVec = YVec.astype(np.float32())

#Normalize Volume to LogScale
#Y_log = np.log10(YVec)

def Performance_Metric(X):
    """Return the hull volumes of design vector ``X`` divided by ``X[0]**3``.

    A hull is built from ``X`` and its volume properties are evaluated with
    101 waterlines and 1000 points per waterline.
    """
    hull = HP(X)
    # The returned heights are not needed here; the call is made because
    # hull.Volumes is read right after it (presumably filled in by this call).
    hull.Calc_VolumeProperties(101,1000)

    length_cubed = X[0]**3.0
    return np.divide(hull.Volumes, length_cubed)

def Calc_GeometricProperties(x):
    '''
    This function takes in a Ship Design Vector and calculates the volumetric properties of the hull

    It returns one flat vector made of ten equally long segments (one value per
    waterline, 101 waterlines), concatenated in this order:

    0  Z / L             -> nondimensionalized vector for the height at which each value was measured
    1  Volume / L^3
    2  Area of Waterplane / L^2
    3  Longitudinal Center of Buoyancy / L
    4  Vertical Center of Buoyancy / L
    5  Longitudinal Center of Flotation / L
    6  Ixx / L^4
    7  Iyy / L^4
    8  Wetted Surface Area / L^2
    9  Waterline Length / L

    where L = LOA of the design vector ( x[0])

    This function is written to be parallelized (it takes a single argument
    and returns a plain numpy array).

    '''

    hull = HP(x)

    Z = hull.Calc_VolumeProperties(NUM_WL = 101, PointsPerWL = 1000)

    L = x[0]

    z = np.divide(Z,L)
    Vol = np.divide(hull.Volumes,L**3.0)
    WP = np.divide(hull.Areas_WP,L**2.0)
    LCF = np.divide(hull.LCFs,L)
    Ixx = np.divide(hull.I_WP[:,0],L**4.0)
    Iyy = np.divide(hull.I_WP[:,1],L**4.0)
    LCB = np.divide(hull.VolumeCentroids[:,0],L)
    # The original read column 0 here as well, which made VCB a copy of LCB.
    # NOTE(review): assumes VolumeCentroids columns are [longitudinal, vertical];
    # confirm against Hull_Parameterization.Calc_VolumeProperties. Any cached
    # GeometricMeasures.npy built with the old code still holds LCB in this slot.
    VCB = np.divide(hull.VolumeCentroids[:,1],L)
    WSA = np.divide(hull.Area_WS,L**2.0)
    WL = np.divide(hull.WL_Lengths,L)


    return np.concatenate((z,Vol,WP,LCB,VCB,LCF,Ixx,Iyy,WSA,WL),axis=0)

(44, 2)


In [14]:
# Now lets clean up X
#
# Each factor column is multiplied by its switch column (31 for the BB group,
# 32 for the SB group), for every design vector at once.
# NOTE(review): this overwrites DesVec in place, so running the cell a second
# time applies the scaling twice. Re-run the loading cell first if needed.

idx_BBFactors = [33,34,35,36,37]
idx_BB = 31

idx_SBFactors = [38,39,40,41,42,43,44]
idx_SB = 32

# Indexing the switch column with a list keeps it 2-D so it broadcasts across
# the factor columns of every row.
DesVec[:, idx_BBFactors] = DesVec[:, [idx_BB]] * DesVec[:, idx_BBFactors]
DesVec[:, idx_SBFactors] = DesVec[:, [idx_SB]] * DesVec[:, idx_SBFactors]



In [15]:
# The expensive hull computation below is disabled by wrapping it in a bare
# string literal, so nothing between the triple quotes is executed.
# When enabled it creates a module-level `pool`, maps Calc_GeometricProperties
# over DesVec with run_IMAP_multiprocessing (chunks of 256) and saves the
# stacked result to 'GeometricMeasures.npy'.
'''
#Compute Y
# Run Multiprocessing to Calculate the Geometric Measures
CHUNKS = 256
print('Calculating Hulls...')

print('Threads: ' + str(mp.cpu_count()))
pool = mp.Pool(processes=mp.cpu_count()-2)
#Y = [Performance_Metric(DesVec[i]) for i in tqdm(range(0,len(DesVec)))]
Y = run_IMAP_multiprocessing(Calc_GeometricProperties, DesVec,chunksize=CHUNKS,show_prog=True)
Y = np.array(Y)
np.save('GeometricMeasures.npy',Y)
print('Hull Calculations Complete!')
'''

# Load the cached geometric measures from the working directory instead of
# recomputing them.
# NOTE(review): nothing here checks that the cached rows were produced from the
# current DesVec (same files, same order); confirm before trusting the pairing.
Y = np.load('GeometricMeasures.npy')
print(Y.shape)

(82168, 1010)


### Quantile Normalize the Design Vectors: ###

In [16]:
class Data_Normalizer:
    """Map design vectors to a normally distributed space and back.

    Data is first scaled linearly from [X_LL_Scaled, X_UL_Scaled] to [-1, 1]
    and then passed through a quantile transformer with a normal output
    distribution. ``scale_X`` undoes both steps.
    """

    def __init__(self, X_LL_Scaled, X_UL_Scaled,datalength):
        # One quantile per 30 samples, clamped to the range [10, 1000].
        quantile_count = datalength // 30
        quantile_count = min(quantile_count, 1000)
        quantile_count = max(quantile_count, 10)

        self.normalizer = PP.QuantileTransformer(
            output_distribution='normal',
            n_quantiles=quantile_count,
            subsample=int(1e9)
            )

        # Lower / upper limits used for the linear scaling step.
        self.X_LL_Scaled = X_LL_Scaled
        self.X_UL_Scaled = X_UL_Scaled

        # Placeholders (one row, one column per limit); not used by the
        # methods of this class.
        n_features = len(X_LL_Scaled)
        self.X_LL_norm = np.zeros((1,n_features))
        self.X_UL_norm = np.zeros((1,n_features))

        self.X_mean = np.zeros((1,n_features))
        self.X_std = np.zeros((1,n_features))

    def _to_symmetric_range(self, X):
        # Linear map of [X_LL_Scaled, X_UL_Scaled] onto [-1, 1].
        span = self.X_UL_Scaled- self.X_LL_Scaled
        return 2.0*(X-self.X_LL_Scaled)/span - 1.0

    def fit_Data(self,X):
        """Fit the quantile transformer on ``X`` and return the transformed data."""
        scaled = self._to_symmetric_range(X)

        self.normalizer.fit(scaled)
        return self.normalizer.transform(scaled)

    def transform_Data(self,X):
        """Transform ``X`` with the already fitted quantile transformer."""
        scaled = self._to_symmetric_range(X)
        return self.normalizer.transform(scaled)

    def scale_X(self,z):
        """Map normalized data ``z`` back to the original design-vector units."""
        unit = self.normalizer.inverse_transform(z)

        # Undo the [-1, 1] scaling.
        span = self.X_UL_Scaled - self.X_LL_Scaled
        return (unit + 1.0) * 0.5 * span + self.X_LL_Scaled

## Build Regression Model Class ##



In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F  



class Regression_ResNet(torch.nn.Module):
    """Fully connected regressor with a single residual connection.

    The input is embedded to ``tdim`` features, passed through a stack of
    Linear -> SiLU -> LayerNorm -> Dropout blocks, added back to the
    embedding and projected to a single output value.

    ``Reg_Dict`` must provide 'xdim' (input width), 'tdim' (embedding width)
    and 'net' (list of hidden widths). The output width is fixed to 1.
    """

    def __init__(self, Reg_Dict):
        super().__init__()

        self.xdim = Reg_Dict['xdim']
        self.ydim = 1
        self.tdim = Reg_Dict['tdim']
        self.net = Reg_Dict['net']

        # tdim -> net[0] -> ... -> net[-1] -> tdim, built in that order.
        hidden = self.net
        blocks = [self.LinLayer(self.tdim, hidden[0])]
        for width_in, width_out in zip(hidden[:-1], hidden[1:]):
            blocks.append(self.LinLayer(width_in, width_out))
        blocks.append(self.LinLayer(hidden[-1], self.tdim))
        self.fc = nn.ModuleList(blocks)

        # Output head, then the input embedding (creation order kept so that
        # parameter names and seeded initialisation are unchanged).
        self.finalLayer = nn.Sequential(nn.Linear(self.tdim, self.ydim))
        self.X_embed = nn.Linear(self.xdim, self.tdim)

    def LinLayer(self, dimi, dimo):
        """Build one Linear -> SiLU -> LayerNorm -> Dropout(0.1) block."""
        stages = [
            nn.Linear(dimi,dimo),
            nn.SiLU(),
            nn.LayerNorm(dimo),
            nn.Dropout(p=0.1),
        ]
        return nn.Sequential(*stages)

    def forward(self, x):
        """Return the prediction for a batch ``x`` whose last dimension is ``xdim``."""
        embedded = self.X_embed(x)

        hidden = embedded
        for block in self.fc:
            hidden = block(hidden)

        # Residual connection around the whole stack.
        return self.finalLayer(torch.add(hidden, embedded))


## Make Training Environment ##

In [18]:
class Regressor_Training_Env:
    """Training harness for the regression network.

    Holds the quantile-normalized design vectors (``X``), the per-waterline
    targets (``Y``), the model and its AdamW optimizer, and provides the
    training loop plus save / load helpers.
    """

    def __init__(self, Reg_Dict, DesVec, Y):
        """
        Reg_Dict: configuration dictionary. Keys read here: 'device_name',
                  'batch_size', 'Training_Epochs', 'lr', 'weight_decay'
                  (the model reads its own keys from the same dictionary).
        DesVec:   2-D array of design vectors; column 0 is dropped before
                  normalization.
        Y:        2-D array of targets, one row per design vector and one
                  column per waterline step.
        """

        self.Reg_Dict = Reg_Dict
        self.DesVec = DesVec

        # Limits come from the module-level X_LIMITS array (column 0 = lower, 1 = upper).
        self.QT = Data_Normalizer(X_LIMITS[:,0],X_LIMITS[:,1],len(DesVec))

        self.X = np.copy(DesVec[:,1:])

        # Quantile Transform X:
        self.X = self.QT.fit_Data(self.X)

        self.Y = np.copy(Y)

        self.model = Regression_ResNet(self.Reg_Dict)
        self.device =torch.device(self.Reg_Dict['device_name'])

        self.model.to(self.device)

        self.data_length = len(self.X)
        self.batch_size = self.Reg_Dict['batch_size']
        self.num_epochs = self.Reg_Dict['Training_Epochs']

        lr = self.Reg_Dict['lr']
        self.init_lr = lr
        weight_decay = self.Reg_Dict['weight_decay']

        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=weight_decay)


    # ==============================================================================
    # Base Regression Training Functions
    # ==============================================================================

    def run_regressor_step(self,x,y):
        """Run one optimizer step on batch (x, y) and return the MSE loss tensor."""
        self.optimizer.zero_grad()

        predicted_y = self.model(x)

        loss =  F.mse_loss(predicted_y, y)

        loss.backward()
        self.optimizer.step()

        return loss

    def run_train_regressors_loop(self,batches_per_epoch=64, subsample_per_batch = 64, num_WL_Steps = 101):
        """Train the model, then print an R2 score on 10000 random samples.

        batches_per_epoch:   upper bound on the number of random batches drawn
                             per epoch (also capped by data_length // batch_size).
        subsample_per_batch: number of random waterline draws made for each batch.
        num_WL_Steps:        number of waterline columns in ``Y``; the waterline
                             input is taken from linspace(0, 1, num_WL_Steps).
        """

        num_batches = self.data_length // self.batch_size

        # Batches are drawn with replacement, so a dataset smaller than one
        # batch can still be trained on; without the floor no step would run.
        batches_per_epoch = min(max(num_batches, 1), batches_per_epoch)

        T_vec = np.linspace(0,1,num_WL_Steps)

        # A previous call leaves the model in eval mode; switch dropout back on.
        self.model.train()
        loss = None

        print('Regressor Model Training...')

        for epoch in tqdm(range(0,self.num_epochs)):

            for _ in range(0,batches_per_epoch):

                A = np.random.randint(0,self.data_length,self.batch_size)
                x_batch = torch.tensor(self.X[A]).float().to(self.device)

                for _ in range(0,subsample_per_batch):
                    #Random Waterline
                    t = np.random.randint(0,num_WL_Steps,(self.batch_size,))
                    t_tens = torch.tensor(T_vec[t,np.newaxis]).float().to(self.device)

                    # Targets are read directly at the sampled waterline index,
                    # so no interpolation is needed during training.
                    y = self.Y[A,t]
                    y_batch = torch.tensor(y[:,np.newaxis]).float().to(self.device)

                    x = torch.cat((x_batch,t_tens),dim=1)

                    loss = self.run_regressor_step(x,y_batch)
            if epoch % 1000 == 0:
                print('Epoch: ' + str(epoch) + ' Loss: ' + str(loss))

        print('Regression Model Training Complete!')

        self.model.eval()
        eval_size = 10000

        A = np.random.randint(0,self.data_length,eval_size)

        # Evaluation uses continuous waterline positions in [0, 1).
        t = np.random.random((eval_size,1))
        t_tens = torch.tensor(t).float().to(self.device)

        x_eval = torch.tensor(self.X[A]).float().to(self.device)

        x_eval = torch.cat((x_eval, t_tens),dim=1)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            Y_pred = self.model(x_eval)
        Y_pred = Y_pred.to(torch.device('cpu')).detach().numpy()

        y = self.Y[A]

        # Reference values are interpolated from the tabulated waterlines.
        Y_calc = np.array([HP.interp(y[i],T_vec,t[i]) for i in range(0,len(t))])

        Rsq = r2_score(Y_calc, Y_pred)
        print("R2 score of Y:" + str(Rsq))


    # SAVE FUNCTIONS

    def load_trained_model(self):
        """Load weights from the file named by Reg_Dict['Model_Path'] into the model."""
        label = self.Reg_Dict['Model_Path']
        # map_location lets weights saved on another device load onto this one.
        self.model.load_state_dict(torch.load(label, map_location=self.device))
        self.model.to(self.device)


    def Save_model(self,PATH):
        '''
        Saves the model weights as <PATH><Model_Label>.pth and the configuration
        dictionary as <PATH><Model_Label>_Dict.json.

        PATH is the path to the folder to store this in, including '/' at the end.
        The folder is created if it does not exist yet.
        '''
        import os

        stem = PATH + self.Reg_Dict['Model_Label']

        out_dir = os.path.dirname(stem)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        torch.save(self.model.state_dict(), stem + '.pth')

        with open(stem + '_Dict.json', 'w') as f:
            f.write(json.dumps(self.Reg_Dict))

## Set Up Model Training ##

In [19]:
#Regression model Dict
nodes = 512

Reg_Dict = {
        'xdim' : len(DesVec[0])-1 + 1,              # Dimension of parametric design vector (first column dropped, waterline input added)
        'ydim': 1,                              # trains regression model for each objective
        'tdim': nodes,                            # dimension of latent variable
        'net': [nodes,nodes,nodes],                       # network architecture        
        'Training_Epochs': 30000,               # number of training epochs
        'batch_size': 1024,                       # batch size
        'Model_Label': 'Regressor_WSA',         # labels for regressors
                    
        'lr' : 0.001,                          # learning rate
        'weight_decay': 0.0,                   # weight decay
        'device_name': 'cuda:0'}    


num_WL_Steps = 101

# Y is laid out as consecutive blocks of num_WL_Steps columns in the order
# returned by Calc_GeometricProperties; block 8 is the wetted surface area.
WSA_BLOCK = 8

Y_set = np.log10(Y[:,WSA_BLOCK*num_WL_Steps:(WSA_BLOCK+1)*num_WL_Steps])
#Y_set = Y[:,8*num_WL_Steps:9*num_WL_Steps]

# log10 gives nan for negative inputs and -inf for zeros; catch both, since a
# single non-finite target would poison the MSE loss.
idx = np.where(~np.isfinite(Y_set))
print(idx)

Y_set[idx] = -6.0 #fix non-finite entries to dummy value

print(Y_set.shape)

REG = Regressor_Training_Env(Reg_Dict, DesVec,Y_set)

REG.run_train_regressors_loop(batches_per_epoch=8, subsample_per_batch = 8, num_WL_Steps = num_WL_Steps)

REG.Save_model('./TrainedModels/')

(array([], dtype=int64), array([], dtype=int64))
(82168, 101)


Regressor Model Training...


  0%|          | 2/30000 [00:00<25:00, 20.00it/s]

Epoch: 0 Loss: tensor(0.1320, device='cuda:0', grad_fn=<MseLossBackward0>)


  3%|▎         | 1003/30000 [00:47<22:58, 21.03it/s]

Epoch: 1000 Loss: tensor(0.0012, device='cuda:0', grad_fn=<MseLossBackward0>)


  7%|▋         | 2005/30000 [01:35<22:05, 21.12it/s]

Epoch: 2000 Loss: tensor(0.0010, device='cuda:0', grad_fn=<MseLossBackward0>)


 10%|█         | 3004/30000 [02:22<21:16, 21.16it/s]

Epoch: 3000 Loss: tensor(0.0008, device='cuda:0', grad_fn=<MseLossBackward0>)


 13%|█▎        | 4003/30000 [03:09<20:31, 21.10it/s]

Epoch: 4000 Loss: tensor(0.0008, device='cuda:0', grad_fn=<MseLossBackward0>)


 17%|█▋        | 5005/30000 [03:57<19:41, 21.15it/s]

Epoch: 5000 Loss: tensor(0.0008, device='cuda:0', grad_fn=<MseLossBackward0>)


 20%|██        | 6004/30000 [04:44<18:57, 21.09it/s]

Epoch: 6000 Loss: tensor(0.0007, device='cuda:0', grad_fn=<MseLossBackward0>)


 23%|██▎       | 7003/30000 [05:31<18:17, 20.95it/s]

Epoch: 7000 Loss: tensor(0.0004, device='cuda:0', grad_fn=<MseLossBackward0>)


 27%|██▋       | 8005/30000 [06:19<17:15, 21.23it/s]

Epoch: 8000 Loss: tensor(0.0004, device='cuda:0', grad_fn=<MseLossBackward0>)


 30%|███       | 9004/30000 [07:06<16:47, 20.84it/s]

Epoch: 9000 Loss: tensor(0.0007, device='cuda:0', grad_fn=<MseLossBackward0>)


 33%|███▎      | 10005/30000 [07:53<15:42, 21.22it/s]

Epoch: 10000 Loss: tensor(0.0005, device='cuda:0', grad_fn=<MseLossBackward0>)


 37%|███▋      | 11004/30000 [08:41<15:03, 21.03it/s]

Epoch: 11000 Loss: tensor(0.0007, device='cuda:0', grad_fn=<MseLossBackward0>)


 40%|████      | 12003/30000 [09:28<14:23, 20.84it/s]

Epoch: 12000 Loss: tensor(0.0007, device='cuda:0', grad_fn=<MseLossBackward0>)


 43%|████▎     | 13005/30000 [10:16<13:24, 21.11it/s]

Epoch: 13000 Loss: tensor(0.0006, device='cuda:0', grad_fn=<MseLossBackward0>)


 47%|████▋     | 14004/30000 [11:03<12:57, 20.58it/s]

Epoch: 14000 Loss: tensor(0.0011, device='cuda:0', grad_fn=<MseLossBackward0>)


 50%|█████     | 15003/30000 [11:51<11:50, 21.12it/s]

Epoch: 15000 Loss: tensor(0.0004, device='cuda:0', grad_fn=<MseLossBackward0>)


 53%|█████▎    | 16005/30000 [12:38<11:04, 21.05it/s]

Epoch: 16000 Loss: tensor(0.0006, device='cuda:0', grad_fn=<MseLossBackward0>)


 57%|█████▋    | 17004/30000 [13:25<10:08, 21.35it/s]

Epoch: 17000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 60%|██████    | 18003/30000 [14:13<09:28, 21.09it/s]

Epoch: 18000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 63%|██████▎   | 19005/30000 [15:00<08:41, 21.09it/s]

Epoch: 19000 Loss: tensor(0.0004, device='cuda:0', grad_fn=<MseLossBackward0>)


 67%|██████▋   | 20004/30000 [15:47<07:57, 20.92it/s]

Epoch: 20000 Loss: tensor(0.0013, device='cuda:0', grad_fn=<MseLossBackward0>)


 70%|███████   | 21003/30000 [16:35<07:07, 21.05it/s]

Epoch: 21000 Loss: tensor(0.0006, device='cuda:0', grad_fn=<MseLossBackward0>)


 73%|███████▎  | 22005/30000 [17:22<06:16, 21.23it/s]

Epoch: 22000 Loss: tensor(0.0015, device='cuda:0', grad_fn=<MseLossBackward0>)


 77%|███████▋  | 23004/30000 [18:10<05:31, 21.08it/s]

Epoch: 23000 Loss: tensor(0.0009, device='cuda:0', grad_fn=<MseLossBackward0>)


 80%|████████  | 24003/30000 [18:57<04:44, 21.12it/s]

Epoch: 24000 Loss: tensor(0.0005, device='cuda:0', grad_fn=<MseLossBackward0>)


 83%|████████▎ | 25005/30000 [19:44<03:56, 21.10it/s]

Epoch: 25000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 87%|████████▋ | 26004/30000 [20:32<03:06, 21.38it/s]

Epoch: 26000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 90%|█████████ | 27003/30000 [21:19<02:21, 21.24it/s]

Epoch: 27000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 93%|█████████▎| 28005/30000 [22:06<01:33, 21.26it/s]

Epoch: 28000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


 97%|█████████▋| 29004/30000 [22:54<00:47, 21.18it/s]

Epoch: 29000 Loss: tensor(0.0003, device='cuda:0', grad_fn=<MseLossBackward0>)


100%|██████████| 30000/30000 [23:41<00:00, 21.11it/s]

Regression Model Training Complete!
R2 score of Y:0.9939168330224027





In [29]:
# Evaluate the trained regressor on random design vectors at random,
# continuous waterline positions and report the errors in log and linear scale.
REG.model.eval()

sample_size = 100000

T_vec = np.linspace(0,1,num_WL_Steps)
A = np.random.randint(0,len(DesVec),sample_size)

t = np.random.random((sample_size,1))
t_tens = torch.tensor(t).float().to(REG.device)

x_eval = torch.tensor(REG.X[A]).float().to(REG.device)

x_eval = torch.cat((x_eval, t_tens),dim=1) 

# Inference only: skip building the autograd graph for the 100k-row batch.
with torch.no_grad():
    Y_pred = REG.model(x_eval)
Y_pred = Y_pred.to(torch.device('cpu')).detach().numpy() 

y = REG.Y[A]

# Reference values interpolated from the tabulated waterlines at each sampled t.
Y_calc = np.array([HP.interp(y[i],T_vec,t[i]) for i in range(0,len(t))])

# Mean absolute error as a fraction of the reference value, in log10 space.
#MAEP = np.mean(np.abs(np.power(10,Y_calc)-np.power(10,Y_pred)/np.power(10,Y_calc)))
MAEP = np.mean(np.abs(Y_calc-Y_pred)/np.abs(Y_calc))

# Undo the log10 applied to the targets before training.
Y_scaled_calc = 10**Y_calc
Y_scaled_pred = 10**Y_pred  


print('Log scale MAEP: ' + str(MAEP*100.0) + '%')

MAEP_scaled = np.mean(np.abs(Y_scaled_calc-Y_scaled_pred)/np.abs(Y_scaled_calc))
print('Scaled MAEP: ' + str(MAEP_scaled*100.0) + '%')
Rsq = r2_score(Y_scaled_calc, Y_scaled_pred)

print("R2 score of Scaled WSA Prediction: " + str(Rsq))



Log scale MAEP: 3.7725849970698446%
Scaled MAEP: 6.909024703541988%
R2 score of Scaled WSA Prediction: 0.986877063170483
