# Preprocessing

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import torch

In [30]:
from src.vfm.connection import Connection
from src.vfm.preprocessor import Preprocessor
import pandas as pd
from src.vfm.model.reg.gradient_boost import GradientBoost
from src.vfm.model.nn.swt_tft import SWTTFTModel
from src.utils.descriptive_analysis import *
from src.vfm.model.physics.physics_informed import PhysicsInformedHybridModel, PhysicsModel
from src.utils import *
import matplotlib.pyplot as plt
import os

In [5]:
# Instantiate the project's data-access wrapper (src.vfm.connection.Connection).
connection = Connection()
# Obtain the client handle that is passed to `connection.get_data` when loading wells.
client = connection.get_client()

In [52]:
# Identifiers of the wells to load.
wells = ["W06", "W08", "W10", "W11", "W15", "W18", "W19"]
# Fetch the data for all listed wells through the client into one frame.
# NOTE(review): rows appear to be tagged per well via a `well_id` column — confirm in Connection.get_data.
df_all_wells = connection.get_data(client, wells=wells)


DatetimeIndex(['2017-03-06 02:38:30', '2017-03-06 10:21:00',
               '2017-04-09 06:46:30', '2017-04-12 18:15:30',
               '2017-04-13 21:09:30', '2017-04-16 00:44:30',
               '2017-05-25 21:52:00', '2017-05-27 18:24:00',
               '2017-06-07 01:46:00', '2017-06-09 03:55:00',
               ...
               '2022-10-03 22:07:30', '2022-10-04 01:40:00',
               '2022-11-05 06:40:00', '2023-03-05 12:03:00',
               '2023-03-05 18:38:30', '2023-03-05 21:48:00',
               '2023-03-06 07:55:30', '2023-03-06 10:41:00',
               '2023-05-11 06:11:00', '2023-07-27 17:00:30'],
              dtype='datetime64[ns]', length=1964, freq=None)


In [7]:
# List the columns present in the loaded multi-well frame.
df_all_wells.columns


Index(['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test',
       'qg_well_test', 'qw_well_test', 'qo_mpfm', 'qg_mpfm', 'wc_mpfm',
       'well_id'],
      dtype='object')

In [8]:
# Target columns handed to the model as its dependent variables (the `*_well_test` columns).
dependent_vars=["qo_well_test", "qg_well_test", "qw_well_test"]
# Input columns handed to the model as its independent variables.
# The `*_mpfm` columns of the loaded frame are intentionally absent from both lists.
independent_vars=["dhp", "dht", "whp", "wht", "choke", "dcp"]

In [53]:
# Run the project's time-series preprocessing over the raw multi-well frame.
# The result is bound to a new name so the raw frame stays available for re-runs.
preprocessor = Preprocessor()
df_all_wells_preprocessed = preprocessor.preprocess_timeseries(df=df_all_wells)
# Display the number of rows in the preprocessed frame.
len(df_all_wells_preprocessed)

Preprocessing well W06... with columns ['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test', 'qg_well_test', 'qw_well_test', 'qo_mpfm', 'qg_mpfm', 'wc_mpfm', 'well_id']
DatetimeIndex(['2017-03-06 02:38:30', '2017-03-06 10:21:00',
               '2017-04-09 06:46:30', '2017-04-12 18:15:30',
               '2017-04-13 21:09:30', '2017-04-16 00:44:30',
               '2017-05-25 21:52:00', '2017-05-27 18:24:00',
               '2017-06-07 01:46:00', '2017-06-09 03:55:00',
               ...
               '2023-01-13 06:24:00', '2023-01-13 10:18:00',
               '2023-01-13 23:46:00', '2023-01-15 05:22:00',
               '2023-01-15 14:25:00', '2023-01-16 03:25:00',
               '2023-01-16 05:04:00', '2023-01-16 21:25:00',
               '2023-01-16 23:23:00', '2023-01-17 20:24:00'],
              dtype='datetime64[ns]', length=340, freq=None)
Preprocessing well W08... with columns ['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test', 'qg_well_test', 'qw_well_test', '

179091

In [54]:
# Show null counts per column of the preprocessed frame.
# NOTE(review): `summarize_null` arrives via one of the star imports — confirm which module provides it.
summarize_null(df_all_wells_preprocessed)

dhp                  0
dht                  0
whp                  0
wht                  0
dcp                  0
choke                0
well_id              0
qo_well_test    177990
qg_well_test    177974
qw_well_test    178286
time_idx             0
dtype: int64

In [55]:
# Keep only the rows where every target column is present. The filter reuses
# `dependent_vars` (defined with the feature/target lists earlier in the
# notebook) instead of re-typing the column names, so it cannot drift from the
# targets the model is actually trained and scored on.
df_with_targets = df_all_wells_preprocessed.dropna(subset=dependent_vars)
# Display (rows, columns) of the labelled subset.
df_with_targets.shape

(802, 11)

In [56]:
# Sanity check: after dropping rows without targets, no column should report nulls.
summarize_null(df_with_targets)

dhp             0
dht             0
whp             0
wht             0
dcp             0
choke           0
well_id         0
qo_well_test    0
qg_well_test    0
qw_well_test    0
time_idx        0
dtype: int64

In [57]:
def lowo_with_partial_calibration(
    df,
    n_calibration_points=2,
    model_factory=None,
):
    """Leave-one-well-out evaluation with a few calibration rows from the held-out well.

    For each distinct ``well_id`` in ``df`` a fresh model is fitted on every
    row of the *other* wells plus the first ``n_calibration_points`` rows of
    the held-out well, and then scored on the remaining rows of that well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``well_id`` column plus whatever columns the model's
        ``fit`` and ``score`` read. The calibration rows are simply the first
        rows of each well in ``df``'s current order — presumably the frame is
        in chronological order per well; verify against the preprocessor.
    n_calibration_points : int, default 2
        Number of leading rows of the held-out well that are moved into the
        training set. ``0`` gives a plain leave-one-well-out split.
    model_factory : callable, optional
        Zero-argument callable returning a new, unfitted object that exposes
        ``fit(df)`` and ``score(df)``. When omitted, a
        ``PhysicsInformedHybridModel`` is built from the notebook-level
        ``dependent_vars`` / ``independent_vars`` lists, exactly as before.

    Returns
    -------
    list of dict
        One ``{"well": <well_id>, "scores": <model.score(...) result>}`` entry
        per evaluated well, in order of first appearance in ``df``. Wells that
        have no rows left to score after the calibration rows are taken are
        skipped with a warning instead of being scored on an empty frame.

    Raises
    ------
    ValueError
        If ``n_calibration_points`` is negative (a negative slice bound would
        silently turn "first n rows" into "all but the last n rows").
    """
    # Local import keeps the cell self-contained without touching the import cell.
    import warnings

    if n_calibration_points < 0:
        raise ValueError(
            f"n_calibration_points must be >= 0, got {n_calibration_points}"
        )

    if model_factory is None:
        # Default model; relies on the notebook globals `dependent_vars` and
        # `independent_vars`. `dependant_vars` is the keyword spelling the
        # original call used, so it is kept as-is.
        def model_factory():
            return PhysicsInformedHybridModel(
                dependant_vars=dependent_vars,
                independent_vars=independent_vars,
            )

    results = []

    for test_well in df["well_id"].unique():
        is_test_well = df["well_id"] == test_well
        df_well = df[is_test_well]
        df_train_other = df[~is_test_well]

        # Leading rows calibrate the model to the held-out well; the rest are
        # the evaluation set.
        df_cal = df_well.iloc[:n_calibration_points]
        df_test = df_well.iloc[n_calibration_points:]

        if df_test.empty:
            # Nothing to evaluate: avoid an expensive fit followed by scoring
            # on an empty frame.
            warnings.warn(
                f"Skipping well {test_well!r}: it has {len(df_well)} row(s), "
                f"none left to score after taking {n_calibration_points} "
                "calibration point(s).",
                stacklevel=2,
            )
            continue

        df_train = pd.concat([df_train_other, df_cal])

        # A fresh model per fold so no fitted state leaks between wells.
        model = model_factory()
        model.fit(df_train)

        results.append({
            "well": test_well,
            "scores": model.score(df_test)
        })

    return results


In [58]:
# Run the leave-one-well-out evaluation on the labelled rows, using the default
# of two calibration rows per held-out well; the per-well score dicts are displayed.
lowo_with_partial_calibration(df=df_with_targets)

[{'well': 'W06',
  'scores': {'qo': {'r2': -0.2281908927633327,
    'mae': 29.737729413367624,
    'rmse': 40.44478667307502},
   'qw': {'r2': -0.042938939490254846,
    'mae': 11.674526242244594,
    'rmse': 38.17406736646774},
   'qg': {'r2': -0.016399957457111647,
    'mae': 3390.6679473170702,
    'rmse': 4552.2182855309375}}},
 {'well': 'W08',
  'scores': {'qo': {'r2': -5.986040281176761,
    'mae': 20.68990709438542,
    'rmse': 25.350820115441273},
   'qw': {'r2': -1.9865985308589238,
    'mae': 8.292195140661612,
    'rmse': 13.43591891635634},
   'qg': {'r2': -6.300500887046505,
    'mae': 3063.2233971429973,
    'rmse': 4349.502291000623}}},
 {'well': 'W10',
  'scores': {'qo': {'r2': 0.006618460550163796,
    'mae': 25.655125286415704,
    'rmse': 35.529800229212185},
   'qw': {'r2': -16.726316183499755,
    'mae': 11.219200827851006,
    'rmse': 17.06179214060311},
   'qg': {'r2': 0.42204935025396073,
    'mae': 1783.920662526583,
    'rmse': 3192.6096509158438}}},
 {'well':