# Preprocessing

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import torch

In [6]:
from src.vfm.connection import Connection
from src.vfm.preprocessor import Preprocessor
import pandas as pd
from src.vfm.model.reg.gradient_boost import GradientBoost
from src.vfm.model.nn.swt_tft import SWTTFTModel
from src.utils.descriptive_analysis import *
from src.vfm.model.physics.physics_informed import PhysicsInformedHybridModel, PhysicsModel
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


  from tqdm.autonotebook import tqdm


In [7]:
# Instantiate the project's Connection wrapper (imported from src.vfm.connection).
connection = Connection()
# Obtain the client handle; it is passed to connection.get_data() when loading well data.
client = connection.get_client()

In [9]:
# Identifier of the well to load; also passed to the preprocessor further down.
# Plain string literal: the previous f-string had no placeholders to interpolate.
well = "W10"
# Fetch the raw frame for this well through the client created in the previous cell.
df_raw = connection.get_data(client, well=well)


In [8]:
# Inspect the column labels of the raw frame.
# NOTE(review): the saved NameError is an execution-order artifact — this cell ran
# as In [8], before the loading cell (In [9]) had defined df_raw. Re-run top to bottom.
df_raw.columns


NameError: name 'df_raw' is not defined

In [55]:
# Target columns (treated as dependent variables): the three *_well_test columns.
dependent_vars = [
    "qo_well_test",
    "qg_well_test",
    "qw_well_test",
]
# Input columns (treated as independent variables).
independent_vars = [
    "dhp",
    "dht",
    "whp",
    "wht",
    "choke",
    "dcp",
]

In [56]:
# Per-column missing-value counts for the raw frame, before any preprocessing.
summarize_na(df_raw)

whp             349
wht             349
dhp             349
dht             349
choke           349
dcp             349
qo_well_test    224
qg_well_test    223
qw_well_test    339
qo_mpfm         353
qg_mpfm         353
wc_mpfm         349
well_id           0
dtype: int64

In [57]:
# df_raw = df_raw.loc["2017-03-06":"2021-04-05"]

In [58]:
# Run the project's Preprocessor over the raw frame for the selected well.
preprocessor = Preprocessor(df=df_raw)
df = preprocessor.preprocess_timeseries(well_id=well)
# Show (rows, columns) of the preprocessed frame.
df.shape

(27973, 11)

In [59]:
# Count NaNs per column of the preprocessed frame.
# In the recorded output the input columns have no NaNs left, while the
# *_well_test target columns are NaN for the vast majority of rows.
summarize_na(df)

dhp                 0
dht                 0
whp                 0
wht                 0
dcp                 0
choke               0
well_id             0
qo_well_test    27629
qg_well_test    27628
qw_well_test    27741
time_idx            0
dtype: int64

In [60]:
# df.to_csv(rf"metrics\df-{well}.csv", index=True)

In [61]:
# Summary statistics for the input columns listed in independent_vars.
df[independent_vars].describe()

Unnamed: 0,dhp,dht,whp,wht,choke,dcp
count,27973.0,27973.0,27973.0,27973.0,27973.0,27973.0
mean,141.791053,81.055531,40.782727,61.47813,0.676316,21.788824
std,10.359022,0.761339,7.087309,1.662333,0.195233,0.482602
min,121.658707,72.431594,26.002028,54.923113,0.278,20.721562
25%,131.559452,81.180241,33.772407,60.489831,0.525179,21.390806
50%,141.284636,81.261085,40.179898,61.74429,0.619627,21.728417
75%,151.101341,81.293269,47.112865,62.734602,0.871,22.14291
max,162.327916,81.692168,55.559251,66.433394,1.0,24.0486


In [62]:
# Summary statistics for the well-test target columns listed in dependent_vars.
# describe() counts non-NaN values only, so `count` here is the number of labelled rows.
df[dependent_vars].describe()

Unnamed: 0,qo_well_test,qg_well_test,qw_well_test
count,344.0,345.0,232.0
mean,114.492712,14404.582732,6.276743
std,39.12292,4366.833044,4.057383
min,0.0,0.0,0.0
25%,98.33412,13056.580069,3.793262
50%,124.892796,15189.890628,6.474224
75%,142.690884,17416.969702,8.961888
max,169.959143,20971.904163,19.985577


In [63]:
# NOTE(review): this repeats the earlier NaN summary of df; no visible cell modifies
# df in between and the recorded output is identical — likely redundant, confirm.
summarize_na(df)

dhp                 0
dht                 0
whp                 0
wht                 0
dcp                 0
choke               0
well_id             0
qo_well_test    27629
qg_well_test    27628
qw_well_test    27741
time_idx            0
dtype: int64

In [70]:
# Keep only the rows where every well-test target is present. Reusing
# dependent_vars keeps this filter in sync with the target list defined earlier
# instead of repeating the three column names by hand.
df_with_targets = df.dropna(subset=dependent_vars)
# Show (rows, columns) of the labelled subset.
df_with_targets.shape

(232, 11)

In [71]:
# Confirm that no NaNs remain in any column of the labelled subset.
summarize_na(df_with_targets)

dhp             0
dht             0
whp             0
wht             0
dcp             0
choke           0
well_id         0
qo_well_test    0
qg_well_test    0
qw_well_test    0
time_idx        0
dtype: int64

# Transfer Learning – Physics-Informed Hybrid Model

In [1]:
# Split the labelled rows into a training frame and a held-out test frame.
# NOTE(review): get_train_test_split is not imported by name in the visible cells —
# presumably it comes from the star import of src.utils.descriptive_analysis; confirm.
# The saved NameError (execution count 1) indicates this cell was run on a fresh
# kernel before the import cell; re-run the notebook top to bottom.
train_df, test_df = get_train_test_split(df_with_targets)

NameError: name 'get_train_test_split' is not defined

In [77]:
# Load the previously saved hybrid model. The path is assembled with os.path.join
# so it resolves on POSIX as well as Windows (the original raw string hard-coded a
# backslash separator, which only works on Windows).
# NOTE(review): the .pkl extension suggests pickle-based loading — only load model
# files that come from a trusted source.
hybrid = PhysicsInformedHybridModel.load(
    os.path.join("models", "physics_informed_hybrid_model.pkl")
)

# Refit the physics component on the training split of the selected well.
# NOTE(review): the original comment and the name pred_well07 refer to WELL-07,
# while `well` is set to "W10" earlier in the notebook — confirm the intended well.
hybrid.phys_model.fit(train_df)

# OPTIONAL: retrain ML residuals if enough data exists
hybrid.fit(train_df)

# Predict on the held-out split. The variable name is kept unchanged so that any
# later cell referring to it keeps working.
pred_well07 = hybrid.predict(test_df)

pred_well07

Unnamed: 0,qo_pred,qg_pred,qw_pred,wc_pred
2021-12-02 22:00:00,156.93308,19526.936419,9.119674,0.055025
2020-03-20 04:00:00,118.734046,15593.267295,4.160638,0.046245
2019-04-30 06:00:00,53.178826,11854.416226,1.375546,0.046198
2020-12-28 02:00:00,145.688988,16926.032237,9.472891,0.060273
2019-06-16 20:00:00,100.625656,15585.459889,2.721884,0.035352
2021-02-19 04:00:00,131.125872,15168.869581,7.410774,0.053001
2019-07-06 06:00:00,111.124087,13082.833412,3.347698,0.057373
2021-02-15 00:00:00,159.943307,19762.908802,13.096601,0.071509
2020-12-04 10:00:00,127.98681,15051.271994,8.1112,0.047743
2020-09-22 06:00:00,147.473758,17547.638074,1.883845,0.035361


In [None]:
# Physics-only baseline: score the physics component alone on the held-out split.
physics_metrics = hybrid.physics_score(test_df)

# Hybrid metrics: score the full hybrid model on the same held-out split.
hybrid_metrics = hybrid.score(test_df)

# Print both results for comparison. In the recorded output each is a dict keyed
# by qo / qw / qg holding r2, mae and rmse.
print("Physics-only:", physics_metrics)
print("Hybrid (Transfer):", hybrid_metrics)


Physics-only: {'qo': {'r2': 0.2778920339776356, 'mae': 18.68775803896581, 'rmse': 23.150642448823866}, 'qw': {'r2': 0.1010394229940329, 'mae': 2.832649663121567, 'rmse': 3.524439535724878}, 'qg': {'r2': -0.07050310093674361, 'mae': 3004.216011789079, 'rmse': 3615.326891069403}}
Hybrid (Transfer): {'qo': {'r2': 0.36204698905248855, 'mae': 13.272970734891274, 'rmse': 21.728127680642885}, 'qw': {'r2': 0.33338471714337936, 'mae': 2.3428733754481788, 'rmse': 2.9580329246378163}, 'qg': {'r2': 0.39903699083579003, 'mae': 1633.6323024407716, 'rmse': 2705.6256288869426}}


: 