In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn

from tools.torch_lib import *

from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import copy
from torchmetrics.regression import MeanAbsolutePercentageError

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
gpu = torch.device('cuda')
cpu = torch.device('cpu')
device = cpu

if torch.cuda.is_available():
    device = gpu
    # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
    # in PyTorch 1.12 and later.
    torch.backends.cuda.matmul.allow_tf32 = True
    # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
    torch.backends.cudnn.allow_tf32 = True

print(device)

cuda


### Load dataframe

In [4]:
dataset_dir = "dataset/"
dataset_file_name = "pz_3L.csv"
plots_dir = "plots/"
test_plots_dir = "test_plots/"

In [5]:
df = pd.read_csv(dataset_dir + dataset_file_name)
df.head()

Unnamed: 0,ro_well,ro_formation,invasion_zone_h,invasion_zone_ro,PZ,r_well
0,0.01,0.1,0.0,0.1,0.119657,0.04
1,0.01,0.1,0.0,0.1,0.120473,0.042
2,0.01,0.1,0.0,0.1,0.121212,0.044
3,0.01,0.1,0.0,0.1,0.121874,0.046
4,0.01,0.1,0.0,0.1,0.12246,0.048


In [6]:
df.columns

Index(['ro_well', 'ro_formation', 'invasion_zone_h', 'invasion_zone_ro', 'PZ',
       'r_well'],
      dtype='object')

In [7]:
# print attribute's min max

In [8]:
for column in df.columns:
    print(f"{column}: min={df[column].min()} max={df[column].max()}")

ro_well: min=0.01 max=1000.0
ro_formation: min=0.1 max=10000.0
invasion_zone_h: min=0.0 max=2.4
invasion_zone_ro: min=0.1 max=10000.0
PZ: min=0.0915816 max=15345.2
r_well: min=0.04 max=0.2


In [9]:
# attributes in logarithmic scale:
for column in df.columns:
    if column == 'd_well':
        continue
    print(f"{column}: min={np.log(df[column].min())} max={np.log(df[column].max())}")

ro_well: min=-4.605170185988091 max=6.907755278982137
ro_formation: min=-2.3025850929940455 max=9.210340371976184
invasion_zone_h: min=-inf max=0.8754687373538999
invasion_zone_ro: min=-2.3025850929940455 max=9.210340371976184
PZ: min=-2.3905249008422538 max=9.63855800053032
r_well: min=-3.2188758248682006 max=-1.6094379124341003


  print(f"{column}: min={np.log(df[column].min())} max={np.log(df[column].max())}")


### Add dataframe transforms

In [10]:
inputs = np.array(['ro_well', 'ro_formation', 'r_well', 'invasion_zone_h', 'invasion_zone_ro'])
outputs = np.array(['PZ']) # 'A02M01N' dropped

In [11]:
logarithmic_columns = ['ro_formation', 'ro_well', 'invasion_zone_ro']
# normalize data ('min/max' normalization):
interval_th = [-1, 1]     # normalization interval for 'th' activation function
interval_sigmoid = [0, 1] # normalization interval for 'sigmoid' activation function
normalize_interval = interval_sigmoid

attributes_transform_dict = {}
df_transformed = df.copy()

# transform output attributes:
for output_attr in outputs:
    attr_transformer = attributes_transform_dict[output_attr] = AttributeTransformer(df_transformed[output_attr].to_numpy())

    # logarithmic transform
    forward, backward = np.log, np.exp
    df_transformed[output_attr] = attr_transformer.transform(forward, backward)
    # scaling transform
    #forward, backward = get_standard_scaler_transform(attr_transformer.data)
    #df_transformed[output_attr] = attr_transformer.transform(forward, backward)
    # # normalize transform
    forward, backward = get_normalize_transforms(attr_transformer.data, normalize_interval)
    df_transformed[output_attr] = attr_transformer.transform(forward, backward)

# logarithm resistance:
for col in logarithmic_columns:
    if col in outputs:
        continue
    df_transformed[col] = df_transformed[col].apply(np.log)

# add normalization
for attribute in df_transformed.columns:
    if attribute in outputs:
        continue
    transform, _ = get_normalize_transforms(df_transformed[attribute].to_numpy(), normalize_interval)
    #transform, _ = get_standard_scaler_transform(df_transformed[attribute].to_numpy())  # use scaling instead of min-max norm
    df_transformed[attribute] = transform(df_transformed[attribute].to_numpy())

df_transformed

Unnamed: 0,ro_well,ro_formation,invasion_zone_h,invasion_zone_ro,PZ,r_well
0,0.0,0.0,0.0,0.0,0.022229,0.00000
1,0.0,0.0,0.0,0.0,0.022794,0.01250
2,0.0,0.0,0.0,0.0,0.023303,0.02500
3,0.0,0.0,0.0,0.0,0.023756,0.03750
4,0.0,0.0,0.0,0.0,0.024154,0.05000
...,...,...,...,...,...,...
10881670,1.0,1.0,1.0,1.0,0.974364,0.56250
10881671,1.0,1.0,1.0,1.0,0.972347,0.62500
10881672,1.0,1.0,1.0,1.0,0.970280,0.68750
10881673,1.0,1.0,1.0,1.0,0.965016,0.84375


In [12]:
def print_inference_statistic(attributes, df_):
    means = []
    stds = []
    mins = []
    maxes = []

    for column in attributes:
        col_data = df_[column].to_numpy()

        if column in logarithmic_columns or column in outputs:
            col_data = np.log(col_data)  # first transform - log

        # col_mean = np.mean(col_data)
        # col_std = np.std(col_data)

        # means.append(col_mean)
        # stds.append(col_std)
        #
        # col_data = (col_data - col_mean) / col_std

        mins.append(np.min(col_data))
        maxes.append(np.max(col_data))

    # print(f"means={means}")
    # print(f"stds={stds}")
    print(f"mins={mins}")
    print(f"maxes={maxes}")

In [13]:
print_inference_statistic(inputs, df)

mins=[-4.605170185988091, -2.3025850929940455, 0.04, 0.0, -2.3025850929940455]
maxes=[6.907755278982137, 9.210340371976184, 0.2, 2.4, 9.210340371976184]


In [14]:
print_inference_statistic(outputs, df)

mins=[-2.3905249008422538]
maxes=[9.63855800053032]


### Build Datasets and create dataloaders

In [15]:
class SimpleDataset(Dataset):
    def __init__(self, df_, inputs, outputs, device):
        self.df = df_
        self.inputs = torch.from_numpy(df_[inputs].to_numpy()).float().to(device)
        self.outputs = torch.from_numpy(df_[outputs].to_numpy()).float().to(device)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        item, label = self.inputs[idx], self.outputs[idx]

        return item, label


In [16]:
batch_size = 1000

train_df, test_df = train_test_split(df_transformed, shuffle=True, test_size=0.3)
test_df, validation_df = train_test_split(test_df, shuffle=True, test_size=0.33)

train_dataset = SimpleDataset(train_df, inputs, outputs, device)
test_dataset = SimpleDataset(test_df, inputs, outputs, device)
validation_dataset = SimpleDataset(validation_df, inputs, outputs, device)
full_dataset = SimpleDataset(df_transformed, inputs, outputs, device)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
full_dataset_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)

### Build models

In [17]:
class WeightedMAE(nn.Module):
    def __init__(self, weights):
        super(WeightedMAE, self).__init__()
        self.mae = nn.L1Loss()
        self.weights = weights

    def forward(self, inputs, targets):
        weighted_inputs = inputs * self.weights

        return self.mae(weighted_inputs, targets)

    def to(self, device):
        super().to(device)
        self.weights = self.weights.to(device)


class LinearModel(nn.Module):
    def __init__(self, layers_dims, act_str_list, output_dim):
        super().__init__()
        layers_count = len(layers_dims)
        assert layers_count > 0

        module_list = []
        for i in range(layers_count - 1):
            module_list.append(nn.Linear(layers_dims[i], layers_dims[i + 1]))
        module_list.append(nn.Linear(layers_dims[layers_count - 1], output_dim))

        activations_list = []
        for i in range(layers_count):
            activations_list.append(activations[act_str_list[i]])

        self.linears = nn.ModuleList(module_list)
        self.activations = nn.ModuleList(activations_list)

    def forward(self, x):
        y = x

        for lin, act in zip(self.linears, self.activations):
            y = lin(y)
            y = act(y)

        return y

# add batch normalization
class LinearNormalizedModel(nn.Module):
    def __init__(self, layers_dims, act_str_list, output_dim):
        super().__init__()
        layers_count = len(layers_dims)
        assert layers_count > 0

        linears_list = []
        batch_norm_list = []

        for i in range(layers_count - 1):
            in_features, out_features = layers_dims[i], layers_dims[i + 1]
            linears_list.append(nn.Linear(in_features, out_features))
            batch_norm_list.append(nn.BatchNorm1d(out_features))

        linears_list.append(nn.Linear(layers_dims[layers_count - 1], output_dim))
        batch_norm_list.append(nn.BatchNorm1d(output_dim))

        activations_list = []
        for i in range(layers_count):
            activations_list.append(activations[act_str_list[i]])

        self.linears = nn.ModuleList(linears_list)
        self.activations = nn.ModuleList(activations_list)
        self.batch_normalizations = nn.ModuleList(batch_norm_list)

    def forward(self, x):
        y = x

        for lin, act, norm in zip(self.linears, self.activations, self.batch_normalizations):
            y = lin(y)
            y = norm(y)
            y = act(y)

        return y


### Train model

In [18]:
layers_dims = [len(inputs), 50, 120, 1200, 120, 10]
layers_count = len(layers_dims)
activations_string_list = ['leaky-relu' for i in range(layers_count)]
#activations_string_list[-1] = 'sigmoid'

linear_model = LinearModel(layers_dims, activations_string_list, len(outputs)).to(device)
#linear_bn_model = LinearBNormModel(layers_dims, activations_string_list, len(outputs)).to(device)
#linear_ln_model = LinearLNormModel(layers_dims, activations_string_list, len(outputs)).to(device)

model = linear_model
model_name = "linear_model"
linear_model

LinearModel(
  (linears): ModuleList(
    (0): Linear(in_features=5, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=120, bias=True)
    (2): Linear(in_features=120, out_features=1200, bias=True)
    (3): Linear(in_features=1200, out_features=120, bias=True)
    (4): Linear(in_features=120, out_features=10, bias=True)
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
  (activations): ModuleList(
    (0-5): 6 x LeakyReLU(negative_slope=0.01)
  )
)

In [19]:
learning_rate = 0.0001
epoch_count = 200

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

#loss_function = WeightedMAE(torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0], dtype=float))
loss_function = nn.L1Loss()

In [20]:
epoch_validation = True
train_loss_threshold = 0.0003

train_loss_list, validation_loss_list = train_model(epoch_count, model, optimizer, loss_function, train_loader, validation_loader, True, train_loss_threshold)
plot_loss(train_loss_list, "train loss")

Epoch: 0; train loss=0.010797; validation loss=0.003467
Epoch: 1; train loss=0.002717; validation loss=0.001840
Epoch: 2; train loss=0.002131; validation loss=0.001664
Epoch: 3; train loss=0.002149; validation loss=0.001293
Epoch: 4; train loss=0.001960; validation loss=0.002604
Epoch: 5; train loss=0.001670; validation loss=0.003044
Epoch: 6; train loss=0.001625; validation loss=0.001737
Epoch: 7; train loss=0.001514; validation loss=0.001930


KeyboardInterrupt: 

In [None]:
test_loss = test_loop(test_loader, model, loss_function)
print(f"test loss={test_loss}")

In [None]:
plot_loss(validation_loss_list, "test loss")

### Plot predictions

In [None]:
for _, (X, y) in enumerate(train_loader):
    print(model(X))
    break

In [None]:
# plot_predictions(outputs, full_dataset_loader, linear_model)

In [None]:
#plot_actual_predictions(outputs, full_inference_dataset_loader, linear_model, attributes_transform_dict, df)

In [None]:
plot_relative_errors(outputs, full_dataset_loader, model, attributes_transform_dict,
                     df, 0.01, device, plots_dir, mode='default+hist', bin_count=100)

In [None]:
# plot test predictions:
plot_relative_errors(outputs, test_loader, model, attributes_transform_dict,
                     df, 0.01, device, plots_dir + test_plots_dir, mode='default+hist', bin_count=100)

#### check predictions manually

In [None]:
predictor = Predictor(full_dataset_loader, df, attributes_transform_dict, model, inputs, outputs)
predictions_dict, actuals_dict = predictor.predict(device)

In [None]:
def compare_prediction(idx: int, prediction_dict, actuals_dict, attribute):
    predicted = prediction_dict[attribute][idx]
    actual = actuals_dict[attribute][idx]
    relative_error = abs(actual - predicted) / actual
    print(f"{idx}: predicted={predicted}; actual={actual}; relative error={relative_error}")

### Save model

In [None]:
model.to(cpu)    # attach model to cpu before scripting and saving to prevent cuda meta information saved
scripted_model = torch.jit.script(model)
model_file_name = "saved_models/" + model_name + str(round(test_loss, 7)).replace('.', '_')

scripted_model.save(model_file_name + ".pt") # save torch script model which compatible with pytorch c++ api
torch.save(model, model_file_name + ".pth")   # save model in python services specific format

# attach model back to device:
model.to(device)

In [None]:
scripted_model(torch.tensor([0.6, 0.362372, 0.04]))

In [None]:
model(torch.tensor([0.6, 0.362372, 0.04], device=device))