# Preprocessing

In [153]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [154]:
%autoreload 2

In [155]:
import torch

In [156]:
from src.vfm.connection import Connection
from src.vfm.preprocessor import Preprocessor
import pandas as pd
from src.vfm.model.reg.gradient_boost import GradientBoost
from src.vfm.model.nn.swt_tft import SWTTFTModel
from src.utils.descriptive_analysis import *
from src.vfm.model.physics.physics_informed import PhysicsInformedHybridModel, PhysicsModel
from src.utils import *
import matplotlib.pyplot as plt
import os

In [157]:
# Instantiate the project's Connection wrapper (no arguments are passed here).
connection = Connection()
# Get the client; it is handed to connection.get_data() when fetching well data.
client = connection.get_client()

In [220]:
# Wells included in this study; the list is passed straight to get_data.
wells = ["W06", "W08", "W10", "W11", "W15", "W18", "W19"]
# Fetch the data for the selected wells using the client obtained earlier.
df_all_wells = connection.get_data(client, wells=wells)


DatetimeIndex(['2017-03-06 02:38:30', '2017-03-06 10:21:00',
               '2017-04-09 06:46:30', '2017-04-12 18:15:30',
               '2017-04-13 21:09:30', '2017-04-16 00:44:30',
               '2017-05-25 21:52:00', '2017-05-27 18:24:00',
               '2017-06-07 01:46:00', '2017-06-09 03:55:00',
               ...
               '2022-10-03 22:07:30', '2022-10-04 01:40:00',
               '2022-11-05 06:40:00', '2023-03-05 12:03:00',
               '2023-03-05 18:38:30', '2023-03-05 21:48:00',
               '2023-03-06 07:55:30', '2023-03-06 10:41:00',
               '2023-05-11 06:11:00', '2023-07-27 17:00:30'],
              dtype='datetime64[ns]', length=1964, freq=None)


In [221]:
# Show the column names of the raw frame returned for the selected wells.
df_all_wells.columns


Index(['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test',
       'qg_well_test', 'qw_well_test', 'qo_mpfm', 'qg_mpfm', 'wc_mpfm',
       'well_id'],
      dtype='object')

In [222]:
# Model targets: the three *_well_test columns.
dependent_vars=["qo_well_test", "qg_well_test", "qw_well_test"]
# Model inputs. "well_code" is not among the raw columns of df_all_wells; it only
# shows up in the preprocessed frame, so these lists apply after preprocessing.
independent_vars=["well_code", "dhp", "dht", "whp", "wht", "choke", "dcp"]

In [223]:
# Run the project's time-series preprocessing on the combined multi-well frame.
preprocessor = Preprocessor()
df_all_wells_preprocessed = preprocessor.preprocess_timeseries(df=df_all_wells)
# Display the number of rows in the preprocessed frame.
len(df_all_wells_preprocessed)

Preprocessing well W06... with columns ['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test', 'qg_well_test', 'qw_well_test', 'qo_mpfm', 'qg_mpfm', 'wc_mpfm', 'well_id']
DatetimeIndex(['2017-03-06 02:38:30', '2017-03-06 10:21:00',
               '2017-04-09 06:46:30', '2017-04-12 18:15:30',
               '2017-04-13 21:09:30', '2017-04-16 00:44:30',
               '2017-05-25 21:52:00', '2017-05-27 18:24:00',
               '2017-06-07 01:46:00', '2017-06-09 03:55:00',
               ...
               '2023-01-13 06:24:00', '2023-01-13 10:18:00',
               '2023-01-13 23:46:00', '2023-01-15 05:22:00',
               '2023-01-15 14:25:00', '2023-01-16 03:25:00',
               '2023-01-16 05:04:00', '2023-01-16 21:25:00',
               '2023-01-16 23:23:00', '2023-01-17 20:24:00'],
              dtype='datetime64[ns]', length=340, freq=None)
Preprocessing well W08... with columns ['whp', 'wht', 'dhp', 'dht', 'choke', 'dcp', 'qo_well_test', 'qg_well_test', 'qw_well_test', '

179091

In [224]:
# Null count per column of the preprocessed frame. summarize_null is not imported
# by name, so it presumably comes from one of the star imports at the top.
summarize_null(df_all_wells_preprocessed)

dhp                  0
dht                  0
whp                  0
wht                  0
dcp                  0
choke                0
well_id              0
qo_well_test    177990
qg_well_test    177974
qw_well_test    178286
time_idx             0
well_code            0
dtype: int64

In [225]:
# Keep only the rows where every target is present. The subset reuses
# dependent_vars instead of repeating the column names, so this filter and the
# targets handed to the model cannot drift apart.
df_with_targets = df_all_wells_preprocessed.dropna(subset=dependent_vars)
# Display (rows, columns) of the labelled subset.
df_with_targets.shape

(802, 12)

In [226]:
# Sanity check: after dropping rows with missing targets, no column should
# report any nulls.
summarize_null(df_with_targets)

dhp             0
dht             0
whp             0
wht             0
dcp             0
choke           0
well_id         0
qo_well_test    0
qg_well_test    0
qw_well_test    0
time_idx        0
well_code       0
dtype: int64

In [227]:
# Split the labelled rows into train and test frames. The helper is not imported
# by name, so it presumably comes from one of the star imports at the top.
df_train, df_test = get_train_test_split_per_well(df_with_targets)
# Display the sizes of both partitions.
len(df_train), len(df_test)

(639, 163)

In [228]:
# Build the hybrid model from the target/feature lists defined earlier. The
# keyword is spelled `dependant_vars` in this call, which the constructor accepts.
model = PhysicsInformedHybridModel(dependant_vars=dependent_vars, independent_vars=independent_vars)
# Fit on the training partition; the cell's displayed value is the model repr.
model.fit(df_train)

<src.vfm.model.physics.physics_informed.PhysicsInformedHybridModel at 0x25059f93110>

In [229]:
# Score on the held-out partition via physics_score. The result is a nested dict
# keyed by well, then by qo/qw/qg, each holding r2, mae and rmse.
# NOTE(review): most r2 values in the saved output are negative.
model.physics_score(df_test)

{'W06': {'qo': {'r2': -0.8224005569059707,
   'mae': 35.80843285637026,
   'rmse': 53.156522828058684},
  'qw': {'r2': -6.696884571626496,
   'mae': 6.2818979707033815,
   'rmse': 6.717313912585744},
  'qg': {'r2': -0.3734346182458206,
   'mae': 2004.1774272781224,
   'rmse': 3563.0188393299672}},
 'W08': {'qo': {'r2': -1.7181301234491877,
   'mae': 14.24233547028173,
   'rmse': 17.509704160996545},
  'qw': {'r2': -0.706085007865364,
   'mae': 7.241911623281017,
   'rmse': 8.187558417575092},
  'qg': {'r2': -0.08361655140704438,
   'mae': 964.7084486341304,
   'rmse': 1242.213886586777}},
 'W10': {'qo': {'r2': 0.020989525054240676,
   'mae': 26.999691167047384,
   'rmse': 41.296776157750585},
  'qw': {'r2': -0.020172938731340073,
   'mae': 5.601144377281184,
   'rmse': 6.378408051295666},
  'qg': {'r2': 0.262843629151044,
   'mae': 2428.092828284985,
   'rmse': 3395.3599118952334}},
 'W11': {'qo': {'r2': -1.211308656540576,
   'mae': 37.003194779055754,
   'rmse': 44.05203278604815},
 

In [230]:
# Score on the held-out partition via hybrid_score. The result has the same
# nested layout as physics_score: well -> qo/qw/qg -> r2, mae, rmse.
# NOTE(review): most r2 values in the saved output are negative here as well.
model.hybrid_score(df_test)

{'W06': {'qo': {'r2': -1.0510001316661994,
   'mae': 40.42710486797777,
   'rmse': 56.4747217643287},
  'qw': {'r2': -1.4905635832561002,
   'mae': 3.2480552817160233,
   'rmse': 3.856716446064688},
  'qg': {'r2': -0.38161582123589355,
   'mae': 2057.7987948766163,
   'rmse': 3626.820408633688}},
 'W08': {'qo': {'r2': -2.5545403149681385,
   'mae': 16.177711583157514,
   'rmse': 20.900734214709626},
  'qw': {'r2': -2.9590637674675206,
   'mae': 11.603972324303156,
   'rmse': 12.589356424831836},
  'qg': {'r2': -0.08426868183982594,
   'mae': 1028.2439547889778,
   'rmse': 1294.9722683771954}},
 'W10': {'qo': {'r2': -0.09462477563564953,
   'mae': 29.399400472230514,
   'rmse': 44.121619857941724},
  'qw': {'r2': -0.5426233239585607,
   'mae': 6.550119086091819,
   'rmse': 7.9222705381954786},
  'qg': {'r2': 0.26519739943523035,
   'mae': 2451.9260530293327,
   'rmse': 3426.445880508281}},
 'W11': {'qo': {'r2': -0.8887036570902098,
   'mae': 35.110197362980706,
   'rmse': 41.12358909268

In [235]:
def lowo_with_partial_calibration(
    df,
    n_calibration_points=2,
):
    """Leave-one-well-out evaluation with a few calibration rows per held-out well.

    For every distinct ``well_id`` in ``df`` a fresh
    ``PhysicsInformedHybridModel`` is fitted on all rows of the other wells plus
    the first ``n_calibration_points`` rows of the held-out well, and is then
    scored with ``hybrid_score`` on the held-out well's remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``well_id`` column. Calibration rows are chosen by position
        (``iloc``), so ``df`` must already be ordered the way calibration points
        should be picked.
        NOTE(review): presumably chronological within each well -- confirm.
    n_calibration_points : int, default 2
        Number of leading rows of the held-out well added to the training set.

    Returns
    -------
    list of dict
        One ``{"well": <well_id>, "scores": <hybrid_score result>}`` entry per
        evaluated well, in order of first appearance in ``df``. Wells that have
        no rows left for testing after calibration are skipped with a warning.

    Notes
    -----
    The model is configured from the notebook-level ``dependent_vars`` and
    ``independent_vars`` lists, which must be defined before this is called.
    """
    import warnings

    results = []

    for test_well in df["well_id"].unique():
        # Compute the membership mask once and reuse it for both partitions.
        is_test_well = df["well_id"] == test_well
        df_well = df[is_test_well]
        df_train_other = df[~is_test_well]

        df_cal = df_well.iloc[:n_calibration_points]
        df_test = df_well.iloc[n_calibration_points:]

        # A well with too few rows leaves nothing to evaluate on. Skip it before
        # paying for a fit, instead of scoring the model on an empty frame.
        if df_test.empty:
            warnings.warn(
                f"Skipping well {test_well!r}: it has {len(df_well)} row(s), so "
                f"none remain for testing after taking {n_calibration_points} "
                "calibration point(s).",
                stacklevel=2,
            )
            continue

        df_train = pd.concat([df_train_other, df_cal])

        model = PhysicsInformedHybridModel(dependant_vars=dependent_vars, independent_vars=independent_vars)
        # model.phys_model.fit_pres = False
        model.fit(df_train)

        results.append({
            "well": test_well,
            "scores": model.hybrid_score(df_test)
        })

    return results


In [237]:
# Disabled on purpose: this refits one model per well, so uncomment only when the
# leave-one-well-out results are actually needed.
# lowo_with_partial_calibration(df=df_with_targets)