In [1]:
import os
import numpy as np
import pandas as pd
import plotly as plt
from io import StringIO
import torch
from Pytorch.Local.tools.torch_lib import plot_relative_error_hist

In [2]:
# Directory that holds the simulation CSV exports read by the cells below.
data_dir = "data/"
# Root directory under which the plotting helpers create one sub-directory per dataset.
plots_dir = "plots/"
# Names (not full paths) of every entry found in data_dir.
files = os.listdir(data_dir)

In [None]:
# Maps each CSV file name found in data_dir to the DataFrame parsed from it.
dataframes = dict()

# sorted() makes the load/print order deterministic; os.listdir gives no ordering guarantee.
for file_name in sorted(files):
    # os.listdir also reports sub-directories and non-CSV entries
    # (for example notebook checkpoints); pd.read_csv would fail on those.
    if not file_name.lower().endswith(".csv"):
        continue

    cur_df = dataframes[file_name] = pd.read_csv(data_dir + file_name)

    print(file_name + ":")
    print(cur_df.head())
    print()

In [4]:
# def get_by_prefix(prefix:str, directory:str):
#     file_names = os.listdir(directory)
#     result = []
#
#     for name in file_names:
#         if name.startswith(prefix):
#             result.append(name)
#
#     return result
#
#
# with open(data_dir + "1D_3L_(2).csv", 'r') as f:
#     first_line = True
#     data = []
#
#     for line in f:
#         tokens = line.replace("\"", "").replace("\n", "").split(",")
#
#         if first_line:
#             columns = tokens
#             first_line = False
#         else:
#             data.append(tokens)
#
#     cur_df = dataframes['1D_3L_(2).csv'] = pd.DataFrame(columns=columns, data=data)
#
#     print(cur_df.head())


In [5]:
# dataframes['1D_3L_(2).csv'].to_csv(data_dir + '1D_3L_(3).csv', index=False)

In [76]:
# Location of the reference ("actual") datasets the simulated rows are compared against.
origin_datasets_dir = "D:/ML/data/NN-simulation1D/"
# Reuse the constant above so that the listing can never point at a different directory.
actual_names = os.listdir(origin_datasets_dir)

# Output (target) column names, keyed by reference dataset file name.
outputs_dict = {
    "1D_1L.csv": np.array(['IL1', 'IL_6F1', 'IL_8I14']),
    "1D_2A.csv": np.array(['A04M01N', 'A10M01N', 'A20M05N', 'A40M05N', 'A80M10N']),
    "1D_2L.csv": np.array(['A04M01N', 'A10M01N', 'A20M05N', 'A40M05N', 'A80M10N']),
    "1D_2L_chart.csv": np.array(['rok']),
    "1D_3L.csv": np.array(['A04M01N', 'A10M01N', 'A20M05N', 'A40M05N', 'A80M10N']),
    "1D_3L_chart.csv": np.array(['rok']),
    "pz_2a.csv": np.array(['rok']),
    "pz_3L.csv": np.array(['PZ']),
}

# Input (feature) column names, keyed by reference dataset file name.
# These are the columns used to match simulated rows against reference rows.
inputs_dict = {
    "1D_1L.csv": np.array(['ro_well', 'ro_formation', 'd_well', 'invasion_zone_h', 'invasion_zone_ro']),
    "1D_2A.csv": np.array(['ro_well', 'ro_formation', 'rad_well', 'kanisotrop']),
    "1D_2L.csv": np.array(['ro_well', 'ro_formation', 'd_well']),
    "1D_2L_chart.csv": np.array(['AO/d', 'lambda', 'ro_formation']),
    "1D_3L.csv": np.array(['ro_well', 'ro_formation', 'd_well', 'invasion_zone_ro', 'invasion_zone_h']),
    "1D_3L_chart.csv": np.array(['AO/d', 'ro_formation', 'invasion_zone_ro', 'D/d']),
    "pz_2a.csv": np.array(['ro_well', 'ro_formation', 'r_well', 'lambda1']),
    "pz_3L.csv": np.array(['ro_well', 'ro_formation', 'r_well', 'invasion_zone_h', 'invasion_zone_ro']),
}

In [7]:
def plot_comparison(dataset_name: str, simulation_data_dict: dict, outputs_dict: dict):
    """Save one relative-error histogram per output column of a dataset.

    The reference dataset is read from ``origin_datasets_dir + dataset_name``.
    For every entry of ``simulation_data_dict`` a window of reference rows is
    paired with the first rows of the simulated frame; all pairs are stacked
    and one figure per output column is written to
    ``plots_dir + <dataset name up to the first dot> + "/" + <output> + ".pdf"``.

    Args:
        dataset_name: File name of the reference dataset; also the key used
            to look up the output columns in ``outputs_dict``.
        simulation_data_dict: Mapping whose values are dicts with the keys
            ``'dataset'`` (DataFrame with simulated rows), ``'st_pos'`` (first
            reference row of the window) and ``'count'`` (number of rows to
            compare). The mapping's own keys are not used.
        outputs_dict: Mapping from dataset name to the output column names.

    Raises:
        ValueError: If ``simulation_data_dict`` is empty.
        AssertionError: If a reference window and its simulated counterpart
            have different shapes.
    """
    if not simulation_data_dict:
        raise ValueError("simulation_data_dict must contain at least one entry")

    df = pd.read_csv(origin_datasets_dir + dataset_name)
    outputs = outputs_dict[dataset_name]

    def extract_data(simulation_dict: dict):
        """Return the (reference, simulated) output tensors for one entry."""
        simulation_dataset = simulation_dict['dataset']
        st_pos = simulation_dict['st_pos']
        count = simulation_dict['count']

        actual_data = torch.tensor(df[st_pos:st_pos + count][outputs].to_numpy())
        simulation_data = torch.tensor(simulation_dataset[:count][outputs].to_numpy())

        assert actual_data.size() == simulation_data.size()

        return actual_data, simulation_data

    # Collect the per-entry tensors and concatenate once, instead of growing
    # the result from an empty float32 seed tensor.
    actual_chunks = []
    simulated_chunks = []
    for simulation_data in simulation_data_dict.values():
        act, sim = extract_data(simulation_data)
        actual_chunks.append(act)
        simulated_chunks.append(sim)

    actuals = torch.concat(actual_chunks)
    simulated = torch.concat(simulated_chunks)
    assert actuals.size() == simulated.size()

    # makedirs also creates plots_dir itself when it is missing, and
    # exist_ok=True removes the check-then-create race of isdir + mkdir.
    target_dir = plots_dir + dataset_name.split(".")[0]
    os.makedirs(target_dir, exist_ok=True)

    for i, output in enumerate(outputs):
        # NOTE(review): 0.01 is forwarded as the third positional argument; its
        # meaning is defined by plot_relative_error_hist - confirm there.
        fig = plot_relative_error_hist(actuals[:, i], simulated[:, i], 0.01, dataset_name + ": simulation")
        fig.write_image(target_dir + "/" + output + ".pdf")


In [8]:
# first = '1D_3L_1.csv'
# second = '1D_3L_2.csv'
#
# sim_data_dict = {
#     first : {
#         'dataset': pd.read_csv(data_dir + first),
#         'st_pos': 0,
#         'count' : 36
#     },
#     second: {
#         'dataset': pd.read_csv(data_dir + second),
#         'st_pos': 516612,
#         'count' : 37
#     }
# }
# #plot_comparison('1D_3L.csv', sim_data_dict, outputs_dict)

In [62]:
def select_existing_subset(dataset_name, simulation_df, st_pos, count):
    """Pair simulated rows with the reference rows that have identical inputs.

    A window of ``count`` data rows, starting at data row ``st_pos`` (0-based,
    header excluded), is read from ``origin_datasets_dir + dataset_name``.
    Every row of ``simulation_df`` is then looked up in that window by exact
    equality on the columns listed in ``inputs_dict[dataset_name]``.
    Simulated rows without a counterpart in the window are skipped.

    Args:
        dataset_name: File name of the reference dataset; also the key into
            ``inputs_dict``.
        simulation_df: DataFrame with simulated rows; must contain the input
            columns of the dataset.
        st_pos: Number of reference data rows to skip before the window.
        count: Maximum number of reference rows in the window.

    Returns:
        ``(actual, simulated)``: two DataFrames whose i-th rows belong
        together. ``actual`` keeps the window-relative index labels and
        ``simulated`` keeps the labels of ``simulation_df``. Both are empty
        (columns preserved) when nothing matches.

    Raises:
        AssertionError: If a simulated row matches more than one reference row.
    """
    inputs = inputs_dict[dataset_name]

    # Keep the header (line 0) and skip the first st_pos data rows. This reads
    # the same window as skiprows=st_pos, but in a single pass and without
    # consuming a data row as a throw-away header.
    actual_df = pd.read_csv(
        origin_datasets_dir + dataset_name,
        skiprows=range(1, st_pos + 1),
        nrows=count,
    )

    sim_in_data = simulation_df[inputs].to_numpy()
    # Loop-invariant: extract the reference inputs once, not per simulated row.
    actual_inputs = actual_df[inputs].values

    matched_actual_rows = []
    matched_sim_positions = []

    for i, row in enumerate(sim_in_data):
        selected_rows = actual_df[np.all(actual_inputs == row, axis=1)]

        if selected_rows.shape[0] == 0:  # these input values are not present in the reference window
            continue
        assert selected_rows.shape[0] == 1, "expected exactly one reference row per input combination"

        matched_actual_rows.append(selected_rows)
        matched_sim_positions.append(i)

    if not matched_actual_rows:
        # pd.concat rejects an empty list; return empty frames that still carry the columns.
        return actual_df.iloc[0:0], simulation_df.iloc[0:0]

    # One concat / one positional selection instead of growing frames row by row.
    result_subset_actual = pd.concat(matched_actual_rows, axis=0)
    result_subset_simulated = simulation_df.iloc[matched_sim_positions]

    return result_subset_actual, result_subset_simulated

pydev debugger: Unable to find real location for: C:\Users\Andrey\AppData\Local\Temp\ipykernel_420\1373300899.py
pydev debugger: Unable to find real location for: C:\Users\Andrey\AppData\Local\Temp\ipykernel_420\1007791582.py


KeyboardInterrupt: 

# 1D_3L

In [10]:
first = pd.read_csv(data_dir + '1D_3L_1.csv')
second = pd.read_csv(data_dir + '1D_3L_2.csv')
subset_actual, subset_simulated = select_existing_subset('1D_3L.csv', first, 0, 1000)
subset_actual2, subset_simulated2 = select_existing_subset('1D_3L.csv', second, 516612, 1000)

In [None]:
subset_simulated = pd.concat([subset_simulated, subset_simulated2], axis=0).reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_actual = pd.concat([subset_actual, subset_actual2], axis=0).reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated[inputs_dict['1D_3L.csv']] == subset_actual[inputs_dict['1D_3L.csv']]

In [14]:
def plot_comparisons(dataset_name: str, actual_df, simulated_df):
    """Save one relative-error histogram per output column of a dataset.

    The output columns are looked up in ``outputs_dict[dataset_name]``. Each
    figure is written to
    ``plots_dir + <dataset name up to the first dot> + "/" + <output> + ".pdf"``.

    Args:
        dataset_name: Key into ``outputs_dict``; also used for the figure
            title and the target directory name.
        actual_df: DataFrame with the reference rows.
        simulated_df: DataFrame with the simulated rows, in the same row order
            as ``actual_df``.

    Raises:
        AssertionError: If the two frames yield output tensors of different shapes.
    """
    outputs = outputs_dict[dataset_name]
    actuals = torch.tensor(actual_df[outputs].to_numpy())
    simulated = torch.tensor(simulated_df[outputs].to_numpy())
    # Shape agreement does not depend on the output column, so check it once.
    assert actuals.size() == simulated.size()

    # makedirs also creates plots_dir itself when it is missing, and
    # exist_ok=True removes the check-then-create race of isdir + mkdir.
    target_dir = plots_dir + dataset_name.split(".")[0]
    os.makedirs(target_dir, exist_ok=True)

    for i, output in enumerate(outputs):
        # NOTE(review): 0.01 is forwarded as the third positional argument; its
        # meaning is defined by plot_relative_error_hist - confirm there.
        fig = plot_relative_error_hist(actuals[:, i], simulated[:, i], 0.01, dataset_name + ": simulation")
        fig.write_image(target_dir + "/" + output + ".pdf")

# Write the relative-error histograms for the matched 1D_3L rows (one PDF per output column).
plot_comparisons('1D_3L.csv', subset_actual, subset_simulated)

# 1D_2A

In [32]:
first = pd.read_csv(data_dir + '1D_2A.csv')
second = pd.read_csv(data_dir + '1D_2A_2.csv')
subset_actual, subset_simulated = select_existing_subset('1D_2A.csv', first, 0, 1000)
subset_actual2, subset_simulated2 = select_existing_subset('1D_2A.csv', second, 520696, 1000)

In [None]:
subset_actual

In [28]:
subset_actual2

In [None]:
subset_simulated = pd.concat([subset_simulated, subset_simulated2], axis=0).reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_actual = pd.concat([subset_actual, subset_actual2], axis=0).reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated[inputs_dict['1D_2A.csv']] == subset_actual[inputs_dict['1D_2A.csv']]

In [36]:
plot_comparisons('1D_2A.csv', subset_actual, subset_simulated)

# 1D_2L

In [44]:
first = pd.read_csv(data_dir + '1D_2L.csv')
subset_actual, subset_simulated = select_existing_subset('1D_2L.csv', first, 0, 1000)

In [None]:
subset_actual = subset_actual.reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated = subset_simulated.reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_simulated[inputs_dict['1D_2L.csv']] == subset_actual[inputs_dict['1D_2L.csv']]

In [48]:
plot_comparisons('1D_2L.csv', subset_actual, subset_simulated)

# 1D_2L_chart

In [49]:
first = pd.read_csv(data_dir + '1D_2L_chart.csv')
subset_actual, subset_simulated = select_existing_subset('1D_2L_chart.csv', first, 0, 1000)

In [None]:
subset_actual = subset_actual.reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated = subset_simulated.reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_simulated[inputs_dict['1D_2L_chart.csv']] == subset_actual[inputs_dict['1D_2L_chart.csv']]

In [53]:
plot_comparisons('1D_2L_chart.csv', subset_actual, subset_simulated)

# 1D_3L_chart

In [63]:
first = pd.read_csv(data_dir + '1D_3L_chart.csv').head(1000)
subset_actual, subset_simulated = select_existing_subset('1D_3L_chart.csv', first, 0, 1000)

In [None]:
subset_actual = subset_actual.reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated = subset_simulated.reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_simulated[inputs_dict['1D_3L_chart.csv']] == subset_actual[inputs_dict['1D_3L_chart.csv']]

In [67]:
plot_comparisons('1D_3L_chart.csv', subset_actual, subset_simulated)

# pz_2a

In [77]:
first = pd.read_csv(data_dir + 'pz_2a.csv').head(1000)
first.rename(columns={'d_well':'r_well'}, inplace=True)
first['r_well'] = first['r_well'] / 2
subset_actual, subset_simulated = select_existing_subset('pz_2a.csv', first, 0, 1000)

In [None]:
subset_actual = subset_actual.reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated = subset_simulated.reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_simulated[inputs_dict['pz_2a.csv']] == subset_actual[inputs_dict['pz_2a.csv']]

In [81]:
plot_comparisons('pz_2a.csv', subset_actual, subset_simulated)

# pz_3L

In [82]:
first = pd.read_csv(data_dir + 'pz_3L.csv').head(1000)
subset_actual, subset_simulated = select_existing_subset('pz_3L.csv', first, 0, 1000)

In [None]:
subset_actual = subset_actual.reset_index().drop(['index'], axis=1)
subset_actual

In [None]:
subset_simulated = subset_simulated.reset_index().drop(['index'], axis=1)
subset_simulated

In [None]:
subset_simulated[inputs_dict['pz_3L.csv']] == subset_actual[inputs_dict['pz_3L.csv']]

In [86]:
plot_comparisons('pz_3L.csv', subset_actual, subset_simulated)