In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
import collections
import torch.distributions as tdists
import math
from misc_funcs import indexes, get_samples
from tabular_dataset import TabularDataset
import seaborn as sns
from torch.autograd import grad as torch_grad
import ot
import pandas as pd

# Global matplotlib styling: base font size, then larger axis labels,
# axis titles and legend text.
plt.rc("font", size=13)
plt.rc("axes", labelsize=18, titlesize=21)
plt.rc("legend", fontsize=18)

# Lightweight (x, y) pair used to hold the inputs and targets of one data split.
LabelledData = collections.namedtuple("LabelledData", "x y")

In [2]:
# Import data: configuration constants followed by loading of the
# train / test / val splits into `splits`.
DATASET_PATH = './datasets/aero/'
DATASET_NAME = 'TwrBsMyt_ST_DEL'
CHANNEL_NAME = 'TwrBsMyt_[kN-m] ST_DEL'
# For saving plots
PLOT_PATH = './plots_server'
PLT_DATASET_NAME = 'aero_TwrBsMyt_ST_DEL'


# path for saving parameters of model
PARAM_PATH = './param_best'
FILE_NAME = 'aero_wcgan'

# CHANGE DIMENSIONS OF DATA ACCORDINGLY
# Each CSV row is split as: first X_DIM columns -> x, remaining columns -> y.
X_DIM = 3
Y_DIM = 1
dataset_dir = os.path.join(DATASET_PATH,DATASET_NAME)
assert os.path.exists(dataset_dir),("dataset folder {} does not exist".format(dataset_dir))

splits = {}
# Set to a truthy value to show a quick-look scatter plot of every split.
scatter_plot = 0

for split in ("train","test","val"):
    data_path = os.path.join(dataset_dir,"{}.csv".format(split))
    assert os.path.exists(data_path),"data file {} does not exist".format(data_path)

    data = np.genfromtxt(data_path,delimiter=",")
    if scatter_plot:
        # `plt.figure` without parentheses was a no-op; actually open a new figure.
        plt.figure()
        # Plot the first input column against the first target column.
        # Slicing `[:, :1]` vs `[:, 1:]` only has matching sizes when the file
        # has exactly two columns, and fails for X_DIM > 1.
        plt.scatter(data[:, 0], data[:, X_DIM], c='k')
        plt.xlabel("x")
        plt.ylabel('y')
        plt.title(split)
        plt.show()
    # Keep everything on the CPU as float32 tensors.
    torch_data = torch.tensor(data, device="cpu").float()
    splits[split] = LabelledData(x=torch_data[:,:X_DIM],y=torch_data[:,X_DIM:])

train_data = splits["train"]
val_data = splits['val']
test_data = splits['test']

# train_tab = TabularDataset(train_data)
# train_tab[0]

In [3]:
# Import the raw test data and keep the three input columns plus the
# target channel selected by CHANNEL_NAME.
# The path is built from DATASET_PATH so it stays consistent with the
# dataset location configured above.
raw_test_path = os.path.join(DATASET_PATH, "raw_data", "test", "data_raw.dat")
df_test = pd.read_csv(raw_test_path, header = 0, index_col = 0)
aero_test_raw = df_test.loc[:, ["URef", "PLExp", "IECturbc", CHANNEL_NAME]]
aero_test_raw_np = aero_test_raw.to_numpy()
test_raw = LabelledData(x=aero_test_raw_np[:, :X_DIM], y=aero_test_raw_np[:, X_DIM:])

# Unique input rows of the test split, reordered by first appearance
# (np.unique returns them sorted, `return_index` gives the first occurrence).
x_values_scale, x_values_index = np.unique(test_data.x, axis = 0, return_index=True)
x_values = np.unique(test_raw.x, axis = 0)
sort =np.argsort(x_values_index)
x_values_scale = x_values_scale[sort]
x_values_index = x_values_index[sort]
# NOTE(review): applying the same permutation to the raw unique rows assumes
# that the raw and scaled unique rows sort in the same order and have the
# same count -- confirm against the preprocessing that produced test.csv.
x_values = x_values[sort]

# Extract real samples and store in matrix:
# one column per unique input row, one row per real sample.
num_samples_real = 300
real_samples = np.zeros((num_samples_real,len(x_values_scale)))
for i, idx in enumerate(x_values_index):
    # `indexes` comes from misc_funcs; it is used here to select the rows of
    # test_data.x that match the reference row test_data.x[idx].
    tmp = indexes(test_data.x[idx], test_data.x)
    group_y = test_data.y[tmp].squeeze()
    # Fail loudly on an unexpected group size: a single matching row would
    # otherwise be silently broadcast over the whole column.
    if group_y.numel() != num_samples_real:
        raise ValueError(
            "expected {} real samples for unique input #{} but found {}".format(
                num_samples_real, i, group_y.numel()))
    real_samples[:,i] = group_y
real_samples_std = np.std(real_samples, axis=0)


In [9]:
def _normalised_wasserstein(sample_paths, real_samples, real_samples_std):
    """Shared worker for the ``wasserstein_dist_*`` functions.

    For every CSV in ``sample_paths`` the generated samples are loaded and
    compared against ``real_samples`` with ``ot.wasserstein_1d(..., p=2)``;
    the square root of that result is stored column-wise.

    Parameters
    ----------
    sample_paths : sequence of str
        Paths to the ``samples.csv`` files, one per configuration.
    real_samples : array-like
        Real samples passed as the second argument of ``ot.wasserstein_1d``.
    real_samples_std : array-like
        Per-column scale used to normalise the distances; its length fixes
        the number of rows of the returned matrices.

    Returns
    -------
    M : ndarray, shape (len(real_samples_std), len(sample_paths))
        Square root of the ``wasserstein_1d`` result for each configuration.
    M_normalised : ndarray, same shape
        ``M`` divided by ``real_samples_std``.
    M_normalised_mean : ndarray, shape (len(sample_paths),)
        Mean of ``M_normalised`` over axis 0.
    """
    M = np.zeros((len(real_samples_std), len(sample_paths)))
    M_normalised = np.zeros_like(M)

    for col, path in enumerate(sample_paths):
        gen_samples = np.genfromtxt(path, delimiter=',')
        # Presumably wasserstein_1d returns the p-th power of the distance,
        # so the square root yields the 2-Wasserstein distance -- see POT docs.
        M[:, col] = ot.wasserstein_1d(gen_samples, real_samples, p=2) ** 0.5
        M_normalised[:, col] = M[:, col] / real_samples_std

    M_normalised_mean = np.mean(M_normalised, axis=0)
    return M, M_normalised, M_normalised_mean


def wasserstein_dist_nd(real_samples_std, list, real_samples):
    """Normalised distances for the runs stored under ``different_nd``.

    ``list`` holds the ``nd`` values to evaluate (the parameter name shadows
    the builtin but is kept for backward compatibility). Samples are read from
    ``PLOT_PATH/PLT_DATASET_NAME/different_nd/aero_wcgan_nd_<nd>/samples.csv``.

    Returns ``(M, M_normalised, M_normalised_mean)`` with one column / entry
    per element of ``list``.
    """
    sample_paths = [
        os.path.join(PLOT_PATH, PLT_DATASET_NAME, 'different_nd',
                     'aero_wcgan_nd_{}'.format(nd), 'samples.csv')
        for nd in list
    ]
    return _normalised_wasserstein(sample_paths, real_samples, real_samples_std)


def wasserstein_dist_lambda(real_samples_std, list, nd, real_samples):
    """Normalised distances for the runs stored under ``different_lambda``.

    ``list`` holds the lambda labels (used verbatim in the folder name) and
    ``nd`` selects the ``nd_<nd>`` sub-folder. Samples are read from
    ``PLOT_PATH/PLT_DATASET_NAME/different_lambda/nd_<nd>/``
    ``aero_wcgan_nd_<nd>_lambda_<lambda>/samples.csv``.

    Returns ``(M, M_normalised, M_normalised_mean)`` with one column / entry
    per element of ``list``.
    """
    sample_paths = [
        os.path.join(PLOT_PATH, PLT_DATASET_NAME, 'different_lambda',
                     'nd_{}'.format(nd),
                     'aero_wcgan_nd_{}_lambda_{}'.format(nd, lambda_val),
                     'samples.csv')
        for lambda_val in list
    ]
    return _normalised_wasserstein(sample_paths, real_samples, real_samples_std)


def wasserstein_dist_arch(real_samples_std, list, real_samples, label, nd=20):
    """Normalised distances for the runs stored under ``different_NN_sizes``.

    ``list`` holds the architecture version labels and ``label`` selects the
    sub-folder (the notebook uses ``'layers'`` and ``'neurons'``). ``nd`` was
    previously hard-coded to ``'20'`` in the folder name; it is now an
    optional argument whose default reproduces the same path. Samples are
    read from ``PLOT_PATH/PLT_DATASET_NAME/different_NN_sizes/<label>/``
    ``aero_wcgan_nd_<nd>_v<arch>/samples.csv``.

    Returns ``(M, M_normalised, M_normalised_mean)`` with one column / entry
    per element of ``list``.
    """
    sample_paths = [
        os.path.join(PLOT_PATH, PLT_DATASET_NAME, 'different_NN_sizes',
                     label, 'aero_wcgan_nd_{}_v{}'.format(nd, arch),
                     'samples.csv')
        for arch in list
    ]
    return _normalised_wasserstein(sample_paths, real_samples, real_samples_std)

In [12]:
# Average normalised 2-Wasserstein distance for every experiment family.
# The first return value (the un-normalised matrix) is discarded each time.

# --- sweep over n.d. ---
list_of_nd = [1, 2, 3, 5, 8, 10, 20]
(_,
 normalised_wasserstein_nd,
 normalised_wasserstein_nd_mean) = wasserstein_dist_nd(
    real_samples_std, list_of_nd, real_samples)

# --- sweep over lambda, nd = 2 ---
list_of_lambda = ['1', '2e-1', '2e-2']
(_,
 normalised_wasserstein_lambda_nd_2,
 normalised_wasserstein_lambda_nd_2_mean) = wasserstein_dist_lambda(
    real_samples_std, list_of_lambda, 2, real_samples)

# --- sweep over lambda, nd = 5 (list_of_lambda is deliberately re-bound) ---
list_of_lambda = ['1', '2e-2', '2e-3']
(_,
 normalised_wasserstein_lambda_nd_5,
 normalised_wasserstein_lambda_nd_5_mean) = wasserstein_dist_lambda(
    real_samples_std, list_of_lambda, 5, real_samples)

# --- network-size variants stored under 'layers' ---
list_of_arch = ['1', '3']
(_,
 normalised_wasserstein_nn_layers,
 normalised_wasserstein_nn_layers_mean) = wasserstein_dist_arch(
    real_samples_std, list_of_arch, real_samples, 'layers')

# --- network-size variants stored under 'neurons' (list_of_arch re-bound) ---
list_of_arch = ['2']
(_,
 normalised_wasserstein_nn_neurons,
 normalised_wasserstein_nn_neurons_mean) = wasserstein_dist_arch(
    real_samples_std, list_of_arch, real_samples, 'neurons')
