# Setup

In [25]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
%autoreload 2

In [4]:
import torch

In [27]:
from src.vfm.connection import Connection
from src.vfm.preprocessor import Preprocessor
import pandas as pd
from src.vfm.model.reg.gradient_boost import GradientBoost
from src.vfm.model.nn.swt_tft import SWTTFTModel
from src.utils.descriptive_analysis import *
from src.vfm.model.physics.physics_informed import PhysicsInformedHybridModel, PhysicsModel

In [6]:
# Create the project data connection (Connection comes from src.vfm.connection).
connection = Connection()
# Obtain the client object; it is passed as the first argument to connection.get_data() below.
client = connection.get_client()

In [7]:
# Fetch the data for one well, limited to the 2019-01-01 .. 2023-01-31 window.
well = "W06"
df_raw = connection.get_data(
    client,
    well=well,
    start=pd.Timestamp("2019-01-01"),
    end=pd.Timestamp("2023-01-31"),
)
# Variant without a date window, kept for reference:
# df_raw = connection.get_data(client, well=well)
df_raw.shape

(2085243, 13)

In [8]:
# Well-test columns passed to the models as their dependent (target) variables.
dependent_vars = [
    "qo_well_test",
    "qg_well_test",
    "qw_well_test",
]

# Measurement columns passed to the models as their independent (input) variables.
independent_vars = [
    "dhp",
    "dht",
    "whp",
    "wht",
    "choke",
    "dcp",
]

In [33]:
# Count NaNs per column in the raw frame, before any preprocessing.
summarize_na(df_raw)

whp             2085138
wht             2085138
dhp             2085138
dht             2085138
choke           2085138
dcp             2085138
qo_well_test    2085004
qg_well_test    2085003
qw_well_test    2085102
qo_mpfm         2085138
qg_mpfm         2085138
wc_mpfm         2085138
well_id               0
dtype: int64

In [9]:
# Write the raw extract to CSV inside the metrics/ directory.
# A forward slash is used so the path means "file df_raw.csv in directory metrics"
# on every OS; the earlier r"metrics\df_raw.csv" only resolved that way on Windows
# and elsewhere created a file literally named "metrics\df_raw.csv".
# index=True writes the frame's index as the first CSV column.
# NOTE: the metrics/ directory must already exist relative to the working directory.
df_raw.to_csv("metrics/df_raw.csv", index=True)

In [7]:
def _earliest_label(present):
    """Return the smallest index label at which the boolean column is True."""
    return present.index[present].min()


# Earliest index label holding a non-NaN value, computed column by column.
oldest_ts_per_column = df_raw.notna().apply(_earliest_label)
oldest_ts_per_column

whp            2019-01-30 18:22:00
wht            2019-01-30 18:22:00
dhp            2019-01-30 18:22:00
dht            2019-01-30 18:22:00
choke          2019-01-30 18:22:00
dcp            2019-01-30 18:22:00
qo_well_test   2019-01-30 20:39:00
qg_well_test   2019-01-30 20:39:00
qw_well_test   2019-01-30 20:39:00
qo_mpfm        2019-01-30 18:22:00
qg_mpfm        2019-01-30 18:22:00
wc_mpfm        2019-01-30 18:22:00
dtype: datetime64[ns]

In [8]:
# Latest index label holding a non-NaN value, computed column by column.
#
# This mirrors the "oldest" computation (max instead of min of the labels whose
# value is present). The previous notna().iloc[::-1].idxmax() form had two traps:
#   * for a column with no data at all, idxmax on an all-False mask returns the
#     first row of the reversed frame, i.e. the last index label, as if a value
#     had been recorded there;
#   * it picks the last row by position, which is only the latest timestamp when
#     the index is sorted.
# With max(), an all-NaN column yields a missing value instead of a bogus label.
latest_ts_per_column = df_raw.notna().apply(lambda col: col.index[col].max())
latest_ts_per_column

whp            2023-01-17 11:19:00
wht            2023-01-17 11:19:00
dhp            2023-01-17 11:19:00
dht            2023-01-17 11:19:00
choke          2023-01-17 11:19:00
dcp            2023-01-17 11:19:00
qo_well_test   2023-01-17 20:24:00
qg_well_test   2023-01-17 20:24:00
qw_well_test   2023-01-17 20:24:00
qo_mpfm        2023-01-17 11:19:00
qg_mpfm        2023-01-17 11:19:00
wc_mpfm        2023-01-17 11:19:00
dtype: datetime64[ns]

In [12]:
# Index labels of every row that has a non-NaN qo_well_test value, as a plain list.
timestamps = df_raw["qo_well_test"].dropna().index.to_list()
timestamps

[Timestamp('2019-01-30 20:00:00'),
 Timestamp('2019-01-31 04:00:00'),
 Timestamp('2019-01-31 13:00:00'),
 Timestamp('2019-03-02 15:00:00'),
 Timestamp('2019-03-04 08:00:00'),
 Timestamp('2019-03-10 17:00:00'),
 Timestamp('2019-04-14 16:00:00'),
 Timestamp('2019-04-15 16:00:00'),
 Timestamp('2019-04-15 23:00:00'),
 Timestamp('2019-04-16 07:00:00'),
 Timestamp('2019-05-22 17:00:00'),
 Timestamp('2019-05-24 05:00:00'),
 Timestamp('2019-05-24 07:00:00'),
 Timestamp('2019-05-24 10:00:00'),
 Timestamp('2019-05-24 17:00:00'),
 Timestamp('2019-05-25 20:00:00'),
 Timestamp('2019-05-26 05:00:00'),
 Timestamp('2019-05-26 06:00:00'),
 Timestamp('2019-05-26 10:00:00'),
 Timestamp('2019-05-26 18:00:00'),
 Timestamp('2019-05-27 01:00:00'),
 Timestamp('2019-05-27 08:00:00'),
 Timestamp('2019-07-05 09:00:00'),
 Timestamp('2019-07-05 22:00:00'),
 Timestamp('2019-08-27 21:00:00'),
 Timestamp('2019-08-28 01:00:00'),
 Timestamp('2019-08-28 06:00:00'),
 Timestamp('2019-08-28 08:00:00'),
 Timestamp('2019-09-

In [9]:
# Run the project preprocessing over the raw frame. The resulting `df` is the frame
# used by the later summary, filtering and modelling cells.
# (What preprocess_timeseries does is defined in src.vfm.preprocessor, not here.)
preprocessor = Preprocessor(df=df_raw)
df = preprocessor.preprocess_timeseries()
# Show (rows, columns) of the preprocessed frame.
df.shape

(2085207, 11)

In [10]:
# Count NaNs per column of the preprocessed frame.
summarize_na(df)

dhp                   0
dht                   0
whp                   0
wht                   0
choke                 0
dcp                   0
qo_well_test    2084968
qg_well_test    2084967
qw_well_test    2085066
well_id               0
time_idx              0
dtype: int64

In [12]:
# Write the preprocessed frame to CSV inside the metrics/ directory.
# A forward slash is used so the path means "file df.csv in directory metrics" on
# every OS; the earlier r"metrics\df.csv" only resolved that way on Windows and
# elsewhere created a file literally named "metrics\df.csv".
# index=True writes the frame's index as the first CSV column.
# NOTE: the metrics/ directory must already exist relative to the working directory.
df.to_csv("metrics/df.csv", index=True)

In [None]:
# Minimum, maximum and non-NaN count of every input column.
summary = df[independent_vars].agg(['min', 'max', 'count'])
# Leave the frame as the cell's last expression so Jupyter renders it as a table;
# print() produced a plain-text dump that wrapped the columns across several lines.
summary

                dhp           dht           whp           wht         choke  \
min    0.000000e+00      0.000000  2.557475e+06     54.168702      0.255000   
max    1.635270e+07     81.917253  5.659456e+06     64.859727      0.999988   
count  3.475500e+04  34755.000000  3.475500e+04  34755.000000  34755.000000   

                dcp  
min    2.083724e+06  
max    2.287057e+06  
count  3.475500e+04  


In [47]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the input columns.
df.loc[:, independent_vars].describe()

Unnamed: 0,dhp,dht,whp,wht,choke,dcp
count,2085207.0,2085207.0,2085207.0,2085207.0,2085207.0,2085207.0
mean,150.1453,81.80338,46.90378,60.40038,0.4873686,21.70532
std,5.022029,0.2438979,3.747964,2.381579,0.1080299,0.3698608
min,133.5974,79.94653,34.91722,55.48766,0.255,21.03906
25%,146.5018,81.80055,43.87321,58.15537,0.4406993,21.44862
50%,153.0465,81.84642,49.14673,59.68187,0.4406993,21.44862
75%,153.0465,81.89228,49.14673,62.74834,0.585,22.03028
max,163.527,81.91725,56.59456,64.85973,0.8234959,22.87057


In [48]:
# Descriptive statistics of the target columns; count reflects non-NaN rows only.
df.loc[:, dependent_vars].describe()

Unnamed: 0,qo_well_test,qg_well_test,qw_well_test
count,239.0,240.0,141.0
mean,118.747419,14647.574982,14.180483
std,39.642301,4907.191782,39.440247
min,0.0,0.0,0.0
25%,101.207059,12956.385172,0.120711
50%,129.630833,15724.561697,5.986846
75%,152.494692,18614.24998,9.668928
max,168.084944,20454.892971,221.120117


In [19]:
# Count NaNs per column of the preprocessed frame again, right before the rows
# without targets are dropped.
summarize_na(df)

dhp                   0
dht                   0
whp                   0
wht                   0
choke                 0
dcp                   0
qo_well_test    2084968
qg_well_test    2084967
qw_well_test    2085066
well_id               0
time_idx              0
dtype: int64

In [20]:
# Keep only the rows where every target column has a value: dropna with `subset`
# drops a row as soon as any of the listed columns is NaN.
# The subset is taken from `dependent_vars` (defined in the setup cells) rather than
# a hand-copied list, so the filter cannot drift from the targets given to the models.
df_with_targets = df.dropna(subset=dependent_vars)
# Show (rows, columns) of the filtered frame.
df_with_targets.shape

(141, 11)

In [21]:
# Count NaNs per column of the filtered frame; the target columns should now show 0.
summarize_na(df_with_targets)

dhp             0
dht             0
whp             0
wht             0
choke           0
dcp             0
qo_well_test    0
qg_well_test    0
qw_well_test    0
well_id         0
time_idx        0
dtype: int64

In [15]:
# Write the rows that have all targets to CSV inside the metrics/ directory.
# A forward slash is used so the path means "file df_with_targets.csv in directory
# metrics" on every OS; the earlier r"metrics\df_with_targets.csv" only resolved
# that way on Windows and elsewhere created a file with the backslash in its name.
# index=True writes the frame's index as the first CSV column.
# NOTE: the metrics/ directory must already exist relative to the working directory.
df_with_targets.to_csv("metrics/df_with_targets.csv", index=True)

# Gradient Boost

In [69]:
# Build the gradient-boost model from the target/input column lists and the
# frame restricted to rows that have every target.
gb_model = GradientBoost(
    dependent_vars=dependent_vars,
    independent_vars=independent_vars,
    df_target=df_with_targets,
)

In [70]:
# Train the model; the saved output reports one R2 score per target column.
gb_model.train()

qo_well_test surrogate model trained, R2 score: 0.334
qg_well_test surrogate model trained, R2 score: 0.448
qw_well_test surrogate model trained, R2 score: -9.363


In [45]:
# Call generate_dense_well_rates on the full preprocessed frame (not only the rows
# with targets). NOTE(review): presumably this fills in rates for every row using
# the trained model — confirm against GradientBoost.
df_dense = gb_model.generate_dense_well_rates(df=df)

In [72]:
# Display the dense-rate frame.
# NOTE(review): the saved output is a NameError, so in that session this cell ran
# (In [72]) without df_dense being defined; re-run the assignment cell first.
df_dense

NameError: name 'df_dense' is not defined

# Physics Model

In [29]:
# Physics-only baseline: fit on the rows that have every target, then report R2.
# NOTE(review): score() receives the same frame that fit() used, so the printed
# values are in-sample.
phys_model = PhysicsModel()
phys_model.fit(df_with_targets)
physics_r2 = phys_model.score(df_with_targets)
print("Physics R2:", physics_r2)

Physics R2: {'r2_qo': 0.27136854184012915, 'r2_qw': 0.004011253289519123, 'r2_qg': 0.0918296797983964}


# Physics-Informed Hybrid Model

In [28]:
# Fit the physics-informed hybrid model on the rows that have every target, then
# report two scores side by side: the physics part alone and the full hybrid.
# NOTE(review): both scores are computed on the same frame that fit() used, so
# they are in-sample values.
hybrid_model = PhysicsInformedHybridModel()
hybrid_model.fit(df_with_targets)
# The label was misspelled "Pysics R2:"; corrected so it matches the label used
# by the physics-only cell.
print("Physics R2:", hybrid_model.physics_score(df_with_targets))
print("Hybrid R2:", hybrid_model.score(df_with_targets))



Pysics R2: {'r2_qo': 0.27136854184012915, 'r2_qw': 0.004011253289519123, 'r2_qg': 0.0918296797983964}
Hybrid R2: {'r2_qo': 0.37307558580849554, 'r2_qw': 0.057271139309233, 'r2_qg': 0.3491561954280137}


# Sparse Well-Test Temporal Fusion Transformer Model - Well 06

In [None]:
# Build the sparse-well-test TFT model on the full preprocessed frame.
# NOTE(review): the saved output of this cell ends in
# "AssertionError: filters should not remove entries all entries - check
# encoder/decoder lengths and lags", so construction failed in that run.
swt_tft_model = SWTTFTModel(
    dependent_variables=dependent_vars,
    independent_variables=independent_vars,
    df=df,
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


AssertionError: filters should not remove entries all entries - check encoder/decoder lengths and lags

In [93]:
# Start training the TFT model.
# NOTE(review): in the saved run this was interrupted by hand during epoch 0
# (KeyboardInterrupt at step 496 of 521204, with an ETA of about 21 hours), and
# Lightning warned that no validation dataloader was configured.
swt_tft_model.train()

c:\Users\perer\Documents\Code\pcperera\vfm\.venv\Lib\site-packages\lightning\pytorch\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params | Mode  | FLOPs
--------------------------------------------------------------------------------------------------------
0  | loss                               | MultiLoss                       | 0      | train | 0    
1  | logging_metrics                    | ModuleList                      | 0      | train | 0    
2  | input_embeddings                   | MultiEmbedding                  | 1      | train | 0    
3  | prescalers                         | ModuleDict                      | 480    | train | 0    
4  | static_variable_selection          | VariableSelectionNetwork        | 14.0 K | train | 0    
5  | encoder_variable_selection         | VariableSel

<class 'pytorch_forecasting.models.temporal_fusion_transformer._tft.TemporalFusionTransformer'>


c:\Users\perer\Documents\Code\pcperera\vfm\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:434: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=21` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 496/521204 [01:14<21:49:57,  6.63it/s, v_num=2, train_loss_step=2124.5] 


Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import matplotlib.pyplot as plt

# Draw the prediction plot for the first series only (range(1) yields index 0).
# NOTE(review): best_tft, raw_predictions and time_varying_known_reals are not
# assigned in any cell visible in this notebook — confirm they exist before running.
for well_idx in range(1):
    fig, ax = plt.subplots(figsize=(10, 4))
    best_tft.plot_prediction(
        x=raw_predictions.x,
        out=raw_predictions.output,
        idx=well_idx,
        add_loss_to_title=False,
        ax=ax,
    )
    # Put a line naming the well and the known-real inputs above the title that
    # plot_prediction generated.
    title_prefix = f' {well} qg_mpfm - known reals {time_varying_known_reals} \n'
    ax.set_title(title_prefix + ax.get_title())