In [2]:
# Setup cell: imports, project path registration, plotting defaults and
# environment loading. Every later cell depends on this one having run.
import os
import mlflow
import optuna
import joblib
import warnings
# 'always' makes every matching warning print each time it is triggered.
warnings.filterwarnings('always')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from copy import deepcopy
from pathlib import Path
from dotenv import load_dotenv
import sys

# Make the project package importable from this notebook.
# NOTE(review): this is an absolute, machine-specific path - the notebook will
# not import `source` on another machine unless the repo lives at this location.
source_path = '/home/github/cnooc_vfm_ai'
if source_path not in sys.path:
    sys.path.append(source_path)

# NOTE(review): wildcard import. Names used later that are not defined in this
# notebook (DOTENV_PATH, DRAWN_INSTANCE_LABEL, SIMULATED_INSTANCE_LABEL)
# presumably come from here - confirm and import them explicitly.
from source.constants import *
from source.model_parameterizer.classifier import ClassifierParameterizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.outliers import ArbitraryOutlierCapper
from feature_engine.timeseries.forecasting import WindowFeatures
from minio import Minio
import matplotlib
# Global matplotlib defaults applied to every figure in the notebook.
matplotlib.rcParams['font.size'] = 15
matplotlib.rcParams['lines.markersize'] = 8.5

# Load environment variables (the MinIO settings are read from os.environ in
# the next code cell) from the dotenv file located at DOTENV_PATH.
load_dotenv(dotenv_path=DOTENV_PATH)



True

# Download data from bucket

In [21]:
# MinIO connection settings come from the environment (populated by the
# load_dotenv call in the setup cell). The access/secret keys are deliberately
# NOT printed: anything echoed here is stored in the notebook output and ends
# up in version control.
URL_MINIO = os.environ.get("URL_MINIO")
ACESS_KEY_MINIO = os.environ.get("ACESS_KEY_MINIO")
SECRET_KEY_MINIO = os.environ.get("SECRET_KEY_MINIO")

# Fail early with a clear message instead of letting the client break later
# on a None endpoint/credential.
_missing_minio_settings = [
    name
    for name, value in (
        ("URL_MINIO", URL_MINIO),
        ("ACESS_KEY_MINIO", ACESS_KEY_MINIO),
        ("SECRET_KEY_MINIO", SECRET_KEY_MINIO),
    )
    if not value
]
if _missing_minio_settings:
    raise EnvironmentError(
        "Missing MinIO settings in the environment: "
        + ", ".join(_missing_minio_settings)
    )

print(URL_MINIO)

# NOTE(review): secure=False means plain HTTP - credentials travel unencrypted.
client = Minio(
    URL_MINIO,
    access_key=ACESS_KEY_MINIO,
    secret_key=SECRET_KEY_MINIO,
    secure=False,
)

# Dataset location: objects live in bucket `minio.bucket` under the `version` prefix.
dataset = {
    'name': '3w',
    'version': 'public',
    'minio': {
        'bucket': '3wdataset',
    }
}

# Labelling scheme; the download cell enables "not of interest" events only
# when the name is 'hnoa'.
scheme = {
  'name': 'hn'
}

# Random state passed to train_test_split.
seed = 1234

# Event treated as the positive class.
# NOTE(review): this is an int, while the helper functions in this notebook
# declare `positive_event` as a string ('3') - convert with str() when passing it.
positive_event = 3
include_simulated = True

# Fraction of instances held out for the test split.
test_size = 0.2
# NOTE(review): not used in the cells visible here - presumably a validation
# fraction for later model selection; confirm.
validation_size = 0.2

# Resampling period; the loaders interpret an int as seconds.
periods_undersample = 60

# NOTE(review): not used in the cells visible here - presumably the rolling
# window length for feature extraction; confirm.
window_size = 120

# NOTE(review): not used in the cells visible here - presumably the number of
# hyper-parameter search trials; confirm.
number_trials = 5

10.218.43.83:9000/
<redacted>
<redacted>


List all the files in the bucket

In [11]:
# Enumerate every object stored in the dataset bucket (recursing into prefixes)
bucket_name = dataset['minio']['bucket']
objects = client.list_objects(bucket_name, recursive=True)

# Report name, size and modification time for each object, one record per block
for obj in objects:
    print(
        f"Object Name: {obj.object_name}",
        f"Size: {obj.size} bytes",
        f"Last Modified: {obj.last_modified}",
        '-' * 40,
        sep='\n',
    )


Object Name: public/0/WELL-00001_20170201020207.csv
Size: 2180707 bytes
Last Modified: 2024-08-09 19:56:32.677000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_20170201070114.csv
Size: 2187173 bytes
Last Modified: 2024-08-09 19:56:32.708000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_20170201120124.csv
Size: 2185953 bytes
Last Modified: 2024-08-09 19:56:32.805000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_20170201170311.csv
Size: 2172899 bytes
Last Modified: 2024-08-09 19:56:32.717000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_20170201220228.csv
Size: 2178145 bytes
Last Modified: 2024-08-09 19:56:32.809000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_20170202030343.csv
Size: 2168995 bytes
Last Modified: 2024-08-09 19:56:32.831000+00:00
----------------------------------------
Object Name: public/0/WELL-00001_2

In [78]:
def download_event_data_from_minio(
    minio_connection,
    bucket,
    data_version,
    instances_to_remove: dict = None,
    positive_event: str = '3',
    include_simulated: bool = True,
    include_noa: bool = False,
    download_dir: str = './bucket',
    verbose: bool = True
):
    """
    Download event data from a Minio bucket, filtering and structuring the
    files into local directories based on the event type (e.g., '0', '3').

    Objects are expected to be named ``<data_version>/<event>/<file>``; names
    with fewer than three ``/``-separated parts, and directory placeholders
    (names ending in ``/``), are skipped. Each kept object is written to
    ``<download_dir>/<data_version>/<event>/<file>``.

    Parameters:
        minio_connection: Minio client connection object (must provide
            ``list_objects`` and ``fget_object``).
        bucket: The bucket name from which to download data.
        data_version: The prefix (folder) in the bucket where the dataset is stored.
        instances_to_remove: Dictionary mapping an event folder name to an
            iterable of substrings; an object of that event whose name contains
            any of them is excluded. ``None`` (default) excludes nothing.
        positive_event: Event type that signifies positive examples (default is '3').
            Ints are accepted and converted to their string form, because event
            folder names are compared as strings.
        include_simulated: Whether to include simulated instances in the download.
        include_noa: Whether to include events that are not of interest
            (events other than '0' and positive_event).
        download_dir: Local directory where files will be downloaded and organized.
        verbose: Print one progress line per processed/downloaded object
            (default True, matching the previous behaviour).

    Returns:
        None. Files are written to disk as a side effect.
    """
    def _log(message):
        if verbose:
            print(message)

    # None instead of a mutable default; treat it as "nothing to remove".
    instances_to_remove = instances_to_remove or {}

    # Folder names are strings, so an int positive_event (e.g. 3) would never
    # match '3' and every positive instance would be dropped as "not of interest".
    positive_event = str(positive_event)
    events_of_interest = {'0', positive_event}

    # The labels depend only on data_version, so resolve them once.
    drawn_label = DRAWN_INSTANCE_LABEL(data_version)
    simulated_label = SIMULATED_INSTANCE_LABEL(data_version)

    objects = minio_connection.list_objects(bucket, prefix=data_version, recursive=True)

    for obj in objects:
        address = obj.object_name
        _log(f"Processing object: {address}")

        # Split the address based on '/' to get instance and event folder
        split_address = address.split('/')

        # Check if the expected structure exists
        if len(split_address) < 3:
            _log(f"Skipping {address}, as it does not match the expected folder structure.")
            continue

        instance = split_address[-1]  # Get the file name
        event = split_address[1]  # Get the event folder name (e.g., '0', '3')

        # A name ending in '/' is a directory placeholder, not a file; passing
        # it on would ask fget_object to write to a directory path.
        if not instance:
            _log(f"Skipping {address}, as it does not match the expected folder structure.")
            continue

        _log(f"Extracted event: {event}, instance: {instance}")

        # Drawn instances are always excluded; simulated ones only on request.
        is_drawn = drawn_label in address
        is_unwanted_simulated = (not include_simulated) and simulated_label in address

        # Events other than '0' and the positive event are dropped unless include_noa.
        is_unwanted_noa = (not include_noa) and event not in events_of_interest

        # Explicit per-event exclusion list (substring match on the object name).
        is_blacklisted = any(
            inst in address for inst in instances_to_remove.get(event, ())
        )

        if is_drawn or is_unwanted_simulated or is_unwanted_noa or is_blacklisted:
            continue

        # Create the local directory structure if it doesn't exist
        local_event_dir = os.path.join(download_dir, data_version, event)
        os.makedirs(local_event_dir, exist_ok=True)

        # Define the local file path
        local_file_path = os.path.join(local_event_dir, instance)

        # Download the file to the specified directory
        minio_connection.fget_object(bucket, address, local_file_path)
        _log(f"Downloaded {instance} to {local_file_path}")

In [79]:
# "Not of interest" events are only downloaded for the 'hnoa' scheme.
include_noa = scheme['name'] == 'hnoa'

# Reuse the MinIO client and the dataset/filter settings from the configuration
# cell instead of repeating them as literals, so a config change actually takes
# effect here. positive_event is configured as an int, while event folders are
# compared as strings, hence the str().
download_event_data_from_minio(
    minio_connection=client,
    bucket=dataset['minio']['bucket'],
    data_version=dataset['version'],
    positive_event=str(positive_event),
    include_simulated=include_simulated,
    include_noa=include_noa
)

Processing object: public/0/WELL-00001_20170201020207.csv
Extracted event: 0, instance: WELL-00001_20170201020207.csv
Downloaded WELL-00001_20170201020207.csv to ./bucket/public/0/WELL-00001_20170201020207.csv
Processing object: public/0/WELL-00001_20170201070114.csv
Extracted event: 0, instance: WELL-00001_20170201070114.csv
Downloaded WELL-00001_20170201070114.csv to ./bucket/public/0/WELL-00001_20170201070114.csv
Processing object: public/0/WELL-00001_20170201120124.csv
Extracted event: 0, instance: WELL-00001_20170201120124.csv
Downloaded WELL-00001_20170201120124.csv to ./bucket/public/0/WELL-00001_20170201120124.csv
Processing object: public/0/WELL-00001_20170201170311.csv
Extracted event: 0, instance: WELL-00001_20170201170311.csv
Downloaded WELL-00001_20170201170311.csv to ./bucket/public/0/WELL-00001_20170201170311.csv
Processing object: public/0/WELL-00001_20170201220228.csv
Extracted event: 0, instance: WELL-00001_20170201220228.csv
Downloaded WELL-00001_20170201220228.csv t

In [80]:
def load_local_data(download_dir, positive_event='3'):
    """
    Index the CSV files found under ``download_dir``.

    The directory tree is walked recursively; for every ``*.csv`` file the name
    of its parent directory is taken as the event, and the binary label is
    '1' when that event equals ``positive_event`` and '0' otherwise.

    Parameters:
        download_dir: Root directory to scan.
        positive_event: Event (directory name) labelled as positive. Ints are
            accepted and compared by their string form.

    Returns:
        Four parallel lists ``(addresses, instances, events, y)`` sorted by
        address: full file path, file name, event directory name and label
        ('0'/'1' strings). All four are empty when no CSV file is found.
    """
    positive_event = str(positive_event)
    records = []

    for root, _dirs, files in os.walk(download_dir):
        # Event is the directory holding the file (e.g., '0', '3', etc.);
        # basename works with any path separator, unlike splitting on '/'.
        event = os.path.basename(root)
        label = '1' if event == positive_event else '0'
        for file in files:
            if file.endswith('.csv'):
                records.append((os.path.join(root, file), file, event, label))

    # os.walk order is not guaranteed; sort by address for a reproducible index.
    records.sort(key=lambda record: record[0])

    # zip(*[]) yields nothing to unpack, so handle "no files" explicitly.
    if not records:
        return [], [], [], []

    addresses, instances, events, y = (list(column) for column in zip(*records))
    return addresses, instances, events, y

In [81]:
# Index the downloaded CSV files, then split whole instances (files) into
# train/test sets, stratified on the binary label.
download_dir = './bucket/public'  # Path where the CSV files are stored
addresses, instances, events, y = load_local_data(download_dir, positive_event='3')

(
    addresses_train, addresses_test,
    instances_train, instances_test,
    events_train, events_test,
    y_train, y_test,
) = train_test_split(
    addresses, instances, events, y,
    stratify=y,
    test_size=test_size,
    random_state=seed,
)

# One row per instance: train rows first, then test rows, tagged with their set.
_train_test_pairs = [
    (addresses_train, addresses_test),
    (instances_train, instances_test),
    (events_train, events_test),
    (y_train, y_test),
    (['Train'] * len(y_train), ['Test'] * len(y_test)),
]
data_instances_info = np.column_stack(
    [np.concatenate(pair, axis=0) for pair in _train_test_pairs]
)

df_instances_info = pd.DataFrame(
    data=data_instances_info,
    columns=['Address', 'Instance', 'Event', 'Class', 'Set'],
)
# Sequential integer id per instance, following the row order above.
df_instances_info['id'] = np.arange(len(df_instances_info))
df_instances_info.head()

Unnamed: 0,Address,Instance,Event,Class,Set,id
0,./bucket/public/0/WELL-00006_20170507230053.csv,WELL-00006_20170507230053.csv,0,0,Train,0
1,./bucket/public/3/SIMULATED_00048.csv,SIMULATED_00048.csv,3,1,Train,1
2,./bucket/public/0/WELL-00006_20170222100000.csv,WELL-00006_20170222100000.csv,0,0,Train,2
3,./bucket/public/0/WELL-00006_20170822130000.csv,WELL-00006_20170822130000.csv,0,0,Train,3
4,./bucket/public/0/WELL-00002_20170809210000.csv,WELL-00002_20170809210000.csv,0,0,Train,4


# Preprocessing local data

In [82]:
# Instance ids belonging to each split
ids_train = df_instances_info.loc[df_instances_info['Set'] == 'Train', 'id'].values
ids_test = df_instances_info.loc[df_instances_info['Set'] == 'Test', 'id'].values

# Share of positive instances per split (labels are '0'/'1' strings)
mean_y_train = np.array(y_train).astype(float).mean()
print(mean_y_train)
mean_y_test = np.array(y_test).astype(float).mean()
print(mean_y_test)

0.1512455516014235
0.14893617021276595


In [83]:
def load_data_local(
    local_path: str,
    positive_event: str = '3',
    periods_undersample: "int | str | None" = None):
    """
    Load one instance CSV and turn it into a model-ready time-indexed frame.

    Steps, in order:
      1. read the CSV and use its ``timestamp`` column as a DatetimeIndex;
      2. binarise ``class``: 1 where the label is the positive event, 0 otherwise
         (missing labels become 0);
      3. forward- then backward-fill gaps in every column;
      4. add one ``<col>__is_missing`` column per non-class column, set to 1 when
         that column has no value at all in the file and 0 otherwise;
      5. replace the remaining NaNs (fully missing columns) with 0;
      6. optionally resample, keeping the last row of each period.

    Parameters:
        local_path: Path of the CSV file. It must contain ``timestamp`` and
            ``class`` columns.
        positive_event: Numeric event code of the positive class, as a string
            or int (default '3').
        periods_undersample: Resampling period. An int is interpreted as a
            number of seconds, a string is passed to ``DataFrame.resample``
            as is, and None (default) disables resampling.

    Returns:
        pandas.DataFrame indexed by timestamp.

    Raises:
        ValueError: if ``positive_event`` cannot be converted to an int.
    """
    df = pd.read_csv(local_path)

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('timestamp')

    # Compare labels numerically. The previous substring test
    # (`positive_event in str(label)`) also fired on unrelated codes - e.g.
    # '1' is a substring of '103.0' - and raised TypeError for an int event.
    # NOTE(review): code + 100 is kept as positive because the substring test
    # matched it too (e.g. 103 for event 3); this assumes the 3W convention
    # that transient labels are event code + 100 - TODO confirm.
    event_code = int(positive_event)
    label_codes = pd.to_numeric(df['class'], errors='coerce')
    df['class'] = label_codes.isin([event_code, event_code + 100]).astype(int)

    df = df.ffill().bfill()

    # After ffill/bfill a column is still NaN only if it never had a value.
    # Compute that once for all columns instead of re-scanning the whole frame
    # for every column.
    feature_columns = [col for col in df.columns if col != 'class']
    never_observed = df[feature_columns].isna().all()
    for col in feature_columns:
        df[f"{col}__is_missing"] = int(never_observed[col])

    df = df.fillna(0)

    if isinstance(periods_undersample, (int, str)):
        if isinstance(periods_undersample, int):
            periods_undersample = f'{periods_undersample}s'

        df = df.resample(periods_undersample).last()

    return df


In [84]:
def load_df_from_local_instances(
    addresses, instances,
    events, ids,
    positive_event: str = '3',
    periods_undersample: "int | str | None" = None
):
    """
    Load several instance files and stack them into a single DataFrame.

    Each file in ``addresses`` is loaded with ``load_data_local`` and tagged
    with the matching entry of ``ids`` in an ``id`` column; the per-file frames
    are then concatenated row-wise in the given order.

    Parameters:
        addresses: Local CSV paths, one per instance.
        instances: Unused; kept for interface compatibility.
        events: Unused; kept for interface compatibility.
        ids: Identifier of each instance, parallel to ``addresses``.
        positive_event: Forwarded to ``load_data_local``.
        periods_undersample: Forwarded to ``load_data_local``.

    Returns:
        pandas.DataFrame with the rows of every instance plus an ``id`` column.

    Raises:
        ValueError: if ``addresses`` and ``ids`` differ in length, or if there
            is nothing to load.
    """
    addresses = list(addresses)
    ids = list(ids)

    # Only addresses and ids drive the loop. Zipping the unused `instances` /
    # `events` as well would silently drop files whenever one of them is
    # shorter than `addresses`.
    if len(addresses) != len(ids):
        raise ValueError(
            f"addresses and ids must have the same length "
            f"(got {len(addresses)} and {len(ids)})"
        )
    if not addresses:
        raise ValueError("No instances to load: 'addresses' is empty")

    df_lst = []
    for address, id_ in zip(addresses, ids):
        df_temp = load_data_local(
            local_path=address,
            positive_event=positive_event,
            periods_undersample=periods_undersample
        )

        df_temp['id'] = id_
        df_lst.append(df_temp)

    return pd.concat(df_lst, axis=0)


In [87]:
local_data_dir = './bucket/public'

# Build the training frame from the train-split instances. The event and the
# resampling period come from the configuration cell rather than repeated
# literals; positive_event is configured as an int, the loader expects a string.
df_train = load_df_from_local_instances(
    addresses=addresses_train,
    instances=instances_train,
    events=events_train,
    ids=ids_train,
    positive_event=str(positive_event),
    periods_undersample=periods_undersample
)
display(df_train.head())

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class,P-PDG__is_missing,P-TPT__is_missing,T-TPT__is_missing,P-MON-CKP__is_missing,T-JUS-CKP__is_missing,P-JUS-CKGL__is_missing,T-JUS-CKGL__is_missing,QGL__is_missing,id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2017-05-07 23:00:00,44858050.0,18064670.0,118.5555,2496341.0,79.55852,6238379.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0
2017-05-07 23:01:00,44858050.0,18062630.0,118.5676,2496278.0,79.55851,6238633.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0
2017-05-07 23:02:00,44858050.0,18073250.0,118.5775,2496214.0,79.55849,6238886.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0
2017-05-07 23:03:00,44858050.0,18069900.0,118.5658,2496151.0,79.55848,6239140.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0
2017-05-07 23:04:00,44858050.0,18066550.0,118.5858,2496088.0,79.55846,6239393.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0


In [88]:
local_data_dir = './bucket/public'

# Build the test frame from the test-split instances. `events_test` (not the
# full, unsplit `events` list) is the sequence parallel to the other test-split
# arguments. The event and the resampling period come from the configuration
# cell; positive_event is configured as an int, the loader expects a string.
df_test = load_df_from_local_instances(
    addresses=addresses_test,
    instances=instances_test,
    events=events_test,
    ids=ids_test,
    positive_event=str(positive_event),
    periods_undersample=periods_undersample
)
display(df_test.head())

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class,P-PDG__is_missing,P-TPT__is_missing,T-TPT__is_missing,P-MON-CKP__is_missing,T-JUS-CKP__is_missing,P-JUS-CKGL__is_missing,T-JUS-CKGL__is_missing,QGL__is_missing,id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-04-11 04:43:00,14704650.0,7654939.0,112.2108,4002365.0,60.48167,0.0,0.0,0.0,1,0,0,0,0,0,1,1,1,562
2018-04-11 04:44:00,14818890.0,7747736.0,112.4392,4002360.0,60.71173,0.0,0.0,0.0,1,0,0,0,0,0,1,1,1,562
2018-04-11 04:45:00,14958110.0,7863992.0,112.1221,4002362.0,59.71706,0.0,0.0,0.0,1,0,0,0,0,0,1,1,1,562
2018-04-11 04:46:00,15123500.0,7994116.0,112.2366,4002368.0,59.75391,0.0,0.0,0.0,1,0,0,0,0,0,1,1,1,562
2018-04-11 04:47:00,15240230.0,8090635.0,112.3785,4002364.0,59.87905,0.0,0.0,0.0,1,0,0,0,0,0,1,1,1,562


In [91]:
# Sanity check: column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 193898 entries, 2017-05-07 23:00:00 to 2017-02-20 03:00:00
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   P-PDG                   193898 non-null  float64
 1   P-TPT                   193898 non-null  float64
 2   T-TPT                   193898 non-null  float64
 3   P-MON-CKP               193898 non-null  float64
 4   T-JUS-CKP               193898 non-null  float64
 5   P-JUS-CKGL              193898 non-null  float64
 6   T-JUS-CKGL              193898 non-null  float64
 7   QGL                     193898 non-null  float64
 8   class                   193898 non-null  int64  
 9   P-PDG__is_missing       193898 non-null  int64  
 10  P-TPT__is_missing       193898 non-null  int64  
 11  T-TPT__is_missing       193898 non-null  int64  
 12  P-MON-CKP__is_missing   193898 non-null  int64  
 13  T-JUS-CKP__is_missing   193898 non-null 

In [92]:
# Sanity check: column dtypes and non-null counts of the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 53293 entries, 2018-04-11 04:43:00 to 2017-08-11 12:57:00
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   P-PDG                   53293 non-null  float64
 1   P-TPT                   53293 non-null  float64
 2   T-TPT                   53293 non-null  float64
 3   P-MON-CKP               53293 non-null  float64
 4   T-JUS-CKP               53293 non-null  float64
 5   P-JUS-CKGL              53293 non-null  float64
 6   T-JUS-CKGL              53293 non-null  float64
 7   QGL                     53293 non-null  float64
 8   class                   53293 non-null  int64  
 9   P-PDG__is_missing       53293 non-null  int64  
 10  P-TPT__is_missing       53293 non-null  int64  
 11  T-TPT__is_missing       53293 non-null  int64  
 12  P-MON-CKP__is_missing   53293 non-null  int64  
 13  T-JUS-CKP__is_missing   53293 non-null  int64  
 14  P-J