# Modifying Data and ML modeling

## Loading library and data

In [208]:
import sys
import os
import copy
import numpy as np
import pandas as pd
import matplotlib
import scipy.spatial
import sklearn.preprocessing
import datashader as ds
import colorcet as cc
from findpeaks import findpeaks
import seaborn as sns
import missingno
from statsmodels.graphics.tsaplots import acf
import kydlib
import scipy.stats as stats
import seaborn as sns
import tkinter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# sys.path.append(os.path.join('..', '..'))
sys.path.append(os.path.join(os.getcwd(), '..'))
import toolkit as tk
# /home/ainbahar/dataproject/3W-research-project/toolkit
color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [209]:
# Section: Load Data
real_instances, simulated_instances, drawn_instances = tk.get_all_labels_and_files()

In [210]:
# Table of Instance
toi = tk.create_table_of_instances(real_instances, simulated_instances, drawn_instances)
toi

SOURCE,REAL,SIMULATED,HAND-DRAWN,TOTAL
INSTANCE LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0 - Normal Operation,594,0,0,594
1 - Abrupt Increase of BSW,5,114,10,129
2 - Spurious Closure of DHSV,22,16,0,38
3 - Severe Slugging,32,74,0,106
4 - Flow Instability,344,0,0,344
5 - Rapid Productivity Loss,11,439,0,450
6 - Quick Restriction in PCK,6,215,0,221
7 - Scaling in PCK,5,0,10,15
8 - Hydrate in Production Line,0,81,0,81
TOTAL,1019,939,20,1978


### Input instance label index

In [211]:
# Prompt user to input the number of instances
try:
    instance_n = int(input("Enter the number of instances: "))
    print(f"Number of instances set to: {toi.index[instance_n]}")
except ValueError:
    print("Invalid input. Please enter a valid integer.")


Number of instances set to: 7 - Scaling in PCK


In [212]:
df = pd.read_csv(f'trainDataset/train_df_instance_{instance_n}.csv', index_col='timestamp', parse_dates=True)
df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-06-11 02:12:18,0.0,8.713634e+06,109.556000,2.142981e+06,68.811070,7.960069e+06,0.0,0.0
2018-06-11 02:12:19,0.0,8.713652e+06,109.556150,2.142981e+06,68.807995,7.960069e+06,0.0,0.0
2018-06-11 02:12:20,0.0,8.713669e+06,109.556300,2.142981e+06,68.804920,7.960069e+06,0.0,0.0
2018-06-11 02:12:21,0.0,8.713686e+06,109.556450,2.142981e+06,68.801842,7.960069e+06,0.0,0.0
2018-06-11 02:12:22,0.0,8.713704e+06,109.556620,2.142981e+06,68.798764,7.960069e+06,0.0,0.0
...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8.722798e+06,109.494825,2.187673e+06,68.287542,8.490644e+06,0.0,7.0
2018-06-11 14:08:28,0.0,8.722796e+06,109.494842,2.187677e+06,68.287542,8.490657e+06,0.0,7.0
2018-06-11 14:08:29,0.0,8.722793e+06,109.494859,2.187680e+06,68.287542,8.490671e+06,0.0,7.0
2018-06-11 14:08:30,0.0,8.722791e+06,109.494876,2.187683e+06,68.287542,8.490684e+06,0.0,7.0


In [213]:
cols_to_check = df.columns.difference(['class'])

## Time Windowing

In [214]:
def time_windowing(df, window_size=60, step_size=15):
    windows = [df.iloc[i:i + window_size] for i in range(0, len(df), window_size)]
    # Select every 'step_size' window
    selected_windows = windows[::step_size]
    
     # Add window ID
    for window_id, window in enumerate(windows, start=1):
        window['id'] = window_id

    # Combine the selected windows into a single DataFrame
    result_df = pd.concat(selected_windows)

    return result_df

In [215]:
resampled_df = time_windowing(df)
resampled_df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class,id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-11 02:12:18,0.0,8.713634e+06,109.556000,2.142981e+06,68.811070,7.960069e+06,0.0,0.0,1
2018-06-11 02:12:19,0.0,8.713652e+06,109.556150,2.142981e+06,68.807995,7.960069e+06,0.0,0.0,1
2018-06-11 02:12:20,0.0,8.713669e+06,109.556300,2.142981e+06,68.804920,7.960069e+06,0.0,0.0,1
2018-06-11 02:12:21,0.0,8.713686e+06,109.556450,2.142981e+06,68.801842,7.960069e+06,0.0,0.0,1
2018-06-11 02:12:22,0.0,8.713704e+06,109.556620,2.142981e+06,68.798764,7.960069e+06,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...
2018-06-11 13:58:13,0.0,8.723662e+06,109.489778,2.190326e+06,68.274614,8.484217e+06,0.0,7.0,706
2018-06-11 13:58:14,0.0,8.723661e+06,109.489796,2.190323e+06,68.274666,8.484227e+06,0.0,7.0,706
2018-06-11 13:58:15,0.0,8.723660e+06,109.489814,2.190320e+06,68.274718,8.484236e+06,0.0,7.0,706
2018-06-11 13:58:16,0.0,8.723660e+06,109.489832,2.190317e+06,68.274770,8.484245e+06,0.0,7.0,706


## Standard Scaling

In [216]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

features_value = resampled_df[cols_to_check]
target_values = resampled_df['class']
window_id = resampled_df['id']


# Fit the scaler on the training data and transform the training data
features_scaled = scaler.fit_transform(features_value)

# Combine scaled features with the labels
resampled_df_scaled = pd.DataFrame(features_scaled, index=features_value.index, columns=features_value.columns)
resampled_df_scaled['class'] = target_values.values
resampled_df_scaled['id'] = window_id.values

resampled_df_scaled

Unnamed: 0_level_0,P-JUS-CKGL,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-TPT,class,id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-11 02:12:18,-0.896085,-0.395597,0.0,-1.361673,0.0,0.767983,2.257823,0.0,1
2018-06-11 02:12:19,-0.896085,-0.395597,0.0,-1.355200,0.0,0.754849,2.262924,0.0,1
2018-06-11 02:12:20,-0.896085,-0.395597,0.0,-1.348728,0.0,0.741714,2.268026,0.0,1
2018-06-11 02:12:21,-0.896085,-0.395597,0.0,-1.342347,0.0,0.728569,2.273128,0.0,1
2018-06-11 02:12:22,-0.896085,-0.395597,0.0,-1.335930,0.0,0.715419,2.278909,0.0,1
...,...,...,...,...,...,...,...,...,...
2018-06-11 13:58:13,1.839905,2.161575,0.0,2.347225,0.0,-1.523448,0.005571,7.0,706
2018-06-11 13:58:14,1.839952,2.161394,0.0,2.346996,0.0,-1.523226,0.006183,7.0,706
2018-06-11 13:58:15,1.840000,2.161228,0.0,2.346749,0.0,-1.523004,0.006802,7.0,706
2018-06-11 13:58:16,1.840047,2.161078,0.0,2.346484,0.0,-1.522780,0.007428,7.0,706


In [217]:
resampled_df_scaled['id'].unique()

array([  1,  16,  31,  46,  61,  76,  91, 106, 121, 136, 151, 166, 181,
       196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, 361, 376,
       391, 406, 421, 436, 451, 466, 481, 496, 511, 526, 541, 556, 571,
       586, 601, 616, 631, 646, 661, 676, 691, 706])

## Feature Extraction and Selection

In [218]:
resampled_df_scaled = resampled_df_scaled.reset_index()

# Rename the 'timestamp' column to 'time'
resampled_df_scaled.rename(columns={'timestamp': 'time'}, inplace=True)

In [219]:
resampled_df_scaled

Unnamed: 0,time,P-JUS-CKGL,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-TPT,class,id
0,2018-06-11 02:12:18,-0.896085,-0.395597,0.0,-1.361673,0.0,0.767983,2.257823,0.0,1
1,2018-06-11 02:12:19,-0.896085,-0.395597,0.0,-1.355200,0.0,0.754849,2.262924,0.0,1
2,2018-06-11 02:12:20,-0.896085,-0.395597,0.0,-1.348728,0.0,0.741714,2.268026,0.0,1
3,2018-06-11 02:12:21,-0.896085,-0.395597,0.0,-1.342347,0.0,0.728569,2.273128,0.0,1
4,2018-06-11 02:12:22,-0.896085,-0.395597,0.0,-1.335930,0.0,0.715419,2.278909,0.0,1
...,...,...,...,...,...,...,...,...,...,...
2875,2018-06-11 13:58:13,1.839905,2.161575,0.0,2.347225,0.0,-1.523448,0.005571,7.0,706
2876,2018-06-11 13:58:14,1.839952,2.161394,0.0,2.346996,0.0,-1.523226,0.006183,7.0,706
2877,2018-06-11 13:58:15,1.840000,2.161228,0.0,2.346749,0.0,-1.523004,0.006802,7.0,706
2878,2018-06-11 13:58:16,1.840047,2.161078,0.0,2.346484,0.0,-1.522780,0.007428,7.0,706


In [220]:
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

def extract_and_impute_features(data, id_column='id', timestamp_column='timestamp', drop_columns=['class'], custom_fc_parameters=None):
    
    if custom_fc_parameters is None:
        custom_fc_parameters = {
            'mean': None,
            'median': None,
            'standard_deviation': None,
            'variance': None,
            'maximum': None,
            'minimum': None
        }

    # Rename the timestamp column
    data = data.rename(columns={timestamp_column: 'time'})

    # Extract features
    extracted_features = extract_features(
        data.drop(columns=drop_columns),
        column_id=id_column,
        column_sort='time',
        default_fc_parameters=custom_fc_parameters
    )

    # Impute missing values
    selected_features = impute(extracted_features)

    return selected_features


In [221]:
resampled_df_scaled

Unnamed: 0,time,P-JUS-CKGL,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-TPT,class,id
0,2018-06-11 02:12:18,-0.896085,-0.395597,0.0,-1.361673,0.0,0.767983,2.257823,0.0,1
1,2018-06-11 02:12:19,-0.896085,-0.395597,0.0,-1.355200,0.0,0.754849,2.262924,0.0,1
2,2018-06-11 02:12:20,-0.896085,-0.395597,0.0,-1.348728,0.0,0.741714,2.268026,0.0,1
3,2018-06-11 02:12:21,-0.896085,-0.395597,0.0,-1.342347,0.0,0.728569,2.273128,0.0,1
4,2018-06-11 02:12:22,-0.896085,-0.395597,0.0,-1.335930,0.0,0.715419,2.278909,0.0,1
...,...,...,...,...,...,...,...,...,...,...
2875,2018-06-11 13:58:13,1.839905,2.161575,0.0,2.347225,0.0,-1.523448,0.005571,7.0,706
2876,2018-06-11 13:58:14,1.839952,2.161394,0.0,2.346996,0.0,-1.523226,0.006183,7.0,706
2877,2018-06-11 13:58:15,1.840000,2.161228,0.0,2.346749,0.0,-1.523004,0.006802,7.0,706
2878,2018-06-11 13:58:16,1.840047,2.161078,0.0,2.346484,0.0,-1.522780,0.007428,7.0,706


In [222]:
y = resampled_df_scaled.groupby('id')['class'].first()
X = extract_and_impute_features(resampled_df_scaled)

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 263.61it/s]


In [223]:
print(X.head())
print(y.head())

    P-JUS-CKGL__mean  P-JUS-CKGL__median  P-JUS-CKGL__standard_deviation  \
1          -0.896085           -0.896085                    1.110223e-16   
16         -0.885541           -0.885543                    4.143325e-04   
31         -0.868763           -0.868768                    4.445877e-04   
46         -0.836679           -0.836674                    5.562732e-04   
61         -0.801743           -0.801716                    1.205338e-03   

    P-JUS-CKGL__variance  P-JUS-CKGL__maximum  P-JUS-CKGL__minimum  \
1           1.232595e-32            -0.896085            -0.896085   
16          1.716714e-07            -0.884834            -0.886245   
31          1.976582e-07            -0.867995            -0.869513   
46          3.094399e-07            -0.835740            -0.837635   
61          1.452840e-06            -0.799741            -0.803843   

    P-MON-CKP__mean  P-MON-CKP__median  P-MON-CKP__standard_deviation  \
1         -0.408539          -0.395597           

In [224]:
processed_data = pd.concat([X, y], axis=1)
# model_data = processed_data[processed_data['class'] != 7]
model_data = processed_data
model_data

Unnamed: 0,P-JUS-CKGL__mean,P-JUS-CKGL__median,P-JUS-CKGL__standard_deviation,P-JUS-CKGL__variance,P-JUS-CKGL__maximum,P-JUS-CKGL__minimum,P-MON-CKP__mean,P-MON-CKP__median,P-MON-CKP__standard_deviation,P-MON-CKP__variance,...,T-JUS-CKP__variance,T-JUS-CKP__maximum,T-JUS-CKP__minimum,T-TPT__mean,T-TPT__median,T-TPT__standard_deviation,T-TPT__variance,T-TPT__maximum,T-TPT__minimum,class
1,-0.896085,-0.896085,1.110223e-16,1.2325950000000001e-32,-0.896085,-0.896085,-0.408539,-0.395597,0.021751,0.000473,...,0.03049081,0.767983,0.238172,2.186833,2.229633,0.121185,0.014686,2.327431,1.965559,0.0
16,-0.885541,-0.885543,0.0004143325,1.716714e-07,-0.884834,-0.886245,-0.855952,-0.857024,0.006558,4.3e-05,...,3.247339e-06,0.273917,0.266663,2.610011,2.607677,0.006759,4.6e-05,2.624352,2.60174,0.0
31,-0.868763,-0.868768,0.0004445877,1.976582e-07,-0.867995,-0.869513,-0.709083,-0.710023,0.002611,7e-06,...,0.0003817121,0.721989,0.658324,2.306829,2.304432,0.005371,2.9e-05,2.318819,2.301961,0.0
46,-0.836679,-0.836674,0.0005562732,3.094399e-07,-0.83574,-0.837635,-0.521733,-0.520007,0.003964,1.6e-05,...,0.0002849335,1.494487,1.4346,1.812628,1.81348,0.004221,1.8e-05,1.819434,1.804589,0.0
61,-0.801743,-0.801716,0.001205338,1.45284e-06,-0.799741,-0.803843,-0.558425,-0.559009,0.004024,1.6e-05,...,5.486716e-05,1.902829,1.877594,1.830739,1.830503,0.00536,2.9e-05,1.839244,1.821407,0.0
76,-0.76523,-0.765237,0.0007603757,5.781712e-07,-0.763923,-0.766513,-0.423002,-0.423446,0.003582,1.3e-05,...,3.971649e-05,1.61873,1.59602,1.937297,1.936576,0.007788,6.1e-05,1.951509,1.925094,0.0
91,-0.746186,-0.746187,5.716091e-06,3.267369e-11,-0.746174,-0.746193,-0.493687,-0.487595,0.010399,0.000108,...,2.112374e-06,1.36805,1.363227,1.220057,1.219773,0.014843,0.00022,1.246684,1.195443,0.0
106,-0.758909,-0.758903,0.0007603757,5.781712e-07,-0.757626,-0.760216,-0.91065,-0.910537,0.005583,3.1e-05,...,0.0001597691,1.836049,1.794148,0.627857,0.628625,0.006461,4.2e-05,0.637082,0.615918,0.0
121,-0.79728,-0.797303,0.001201091,1.442619e-06,-0.795198,-0.799284,-1.055357,-1.055936,0.007972,6.4e-05,...,3.309602e-07,2.089596,2.087472,0.790488,0.793082,0.004851,2.4e-05,0.794868,0.77908,0.0
136,-0.829212,-0.829215,0.0001834467,3.36527e-08,-0.828895,-0.82952,-1.150281,-1.14991,0.00385,1.5e-05,...,0.0001276392,1.787406,1.748915,0.762861,0.76241,0.004113,1.7e-05,0.769842,0.756557,0.0


In [225]:
def update_class_label(instance_n, class_column):
    class_mappings = {
        1: {0: 0, 101: 1, 1: 1},
        2: {0: 0, 102: 1, 2: 1},
        3: {0: 0, 103: 1, 3: 1},
        4: {0: 0, 104: 1, 4: 1},
        5: {0: 0, 105: 1, 5: 1},
        6: {0: 0, 106: 1, 6: 1},
        7: {0: 0, 107: 1, 7: 1},
        8: {0: 0, 108: 1, 8: 1}
    }
    
    if instance_n in class_mappings:
        return class_column.replace(class_mappings[instance_n])
    else:
        print("instance_number is out of range")

In [226]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Assuming 'class' is your target variable
X = model_data.drop(columns=['class'])  # Features
y = model_data['class']  # Target
y = update_class_label(instance_n, y)

# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled features and target
processed_data_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Display the new class distribution
print("Resampled class distribution:", Counter(processed_data_resampled['class']))


Resampled class distribution: Counter({0.0: 28, 1.0: 28})


In [227]:
from sklearn.model_selection import train_test_split

X = processed_data_resampled.drop(columns=['class'])  # Features
y = processed_data_resampled['class']  # Target
# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42, stratify=y_resampled)


# Random Forest

In [228]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize model
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_de

In [229]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

def evaluate_model(model, X_test, y_test):
    """
    Evaluates the given model using the provided test data and returns key performance metrics.
    """
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc_roc = roc_auc_score(y_test, y_prob)
    auc_prc = average_precision_score(y_test, y_prob)
    
    # Classification report
    report = classification_report(y_test, y_pred)

    # Compile all metrics into a dictionary
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "auc_roc": auc_roc,
        "auc_prc": auc_prc,
        "classification_report": report
    }
    
    return metrics, y_pred



In [230]:
X_train.columns.to_list()

['P-JUS-CKGL__mean',
 'P-JUS-CKGL__median',
 'P-JUS-CKGL__standard_deviation',
 'P-JUS-CKGL__variance',
 'P-JUS-CKGL__maximum',
 'P-JUS-CKGL__minimum',
 'P-MON-CKP__mean',
 'P-MON-CKP__median',
 'P-MON-CKP__standard_deviation',
 'P-MON-CKP__variance',
 'P-MON-CKP__maximum',
 'P-MON-CKP__minimum',
 'P-PDG__mean',
 'P-PDG__median',
 'P-PDG__standard_deviation',
 'P-PDG__variance',
 'P-PDG__maximum',
 'P-PDG__minimum',
 'P-TPT__mean',
 'P-TPT__median',
 'P-TPT__standard_deviation',
 'P-TPT__variance',
 'P-TPT__maximum',
 'P-TPT__minimum',
 'QGL__mean',
 'QGL__median',
 'QGL__standard_deviation',
 'QGL__variance',
 'QGL__maximum',
 'QGL__minimum',
 'T-JUS-CKP__mean',
 'T-JUS-CKP__median',
 'T-JUS-CKP__standard_deviation',
 'T-JUS-CKP__variance',
 'T-JUS-CKP__maximum',
 'T-JUS-CKP__minimum',
 'T-TPT__mean',
 'T-TPT__median',
 'T-TPT__standard_deviation',
 'T-TPT__variance',
 'T-TPT__maximum',
 'T-TPT__minimum']

In [231]:
# Get the best model from grid search
best_model_rf = grid_search_rf.best_estimator_

# Test model
metrics_rf, y_pred_rf = evaluate_model(best_model_rf, X_test, y_test)

# Print the metrics
print("Accuracy:", metrics_rf['accuracy'])
print("Precision:", metrics_rf['precision'])
print("Recall:", metrics_rf['recall'])
print("F1 Score:", metrics_rf['f1_score'])
print("AUC-ROC:", metrics_rf['auc_roc'])
print("AUC-PRC:", metrics_rf['auc_prc'])
print("\nClassification Report:\n", metrics_rf['classification_report'])

# Save the best model
joblib.dump(best_model_rf, f'./predictionModel/{instance_n}/best_random_forest_model.pkl')
print("Model saved as 'best_random_forest_model.pkl'")


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC-ROC: 1.0
AUC-PRC: 1.0

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        12
         1.0       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Model saved as 'best_random_forest_model.pkl'


In [232]:
def save_or_update_metrics(model_name, metrics, index, filename):
    """
    Save or update the metrics dictionary to a CSV file with a specified index.
    """
    # Prepare the metrics dictionary with the proper format
    metrics_formatted = {
        f'{model_name}_accuracy': metrics['accuracy'],
        f'{model_name}_precision': metrics['precision'],
        f'{model_name}_recall': metrics['recall'],
        f'{model_name}_f1': metrics['f1_score'],
        f'{model_name}_auc_roc': metrics['auc_roc'],
        f'{model_name}_auc_prc': metrics['auc_prc']
    }

    # Convert metrics dictionary to DataFrame with the specified index
    metrics_df = pd.DataFrame([metrics_formatted], index=[index])

    # Get the current working directory
    current_directory = os.getcwd()
    
    filepath = os.path.join(current_directory, filename)

    # Check if the CSV file already exists
    if os.path.exists(filepath):
        # Read existing CSV file
        existing_df = pd.read_csv(filepath, index_col=0)
        if index in existing_df.index:
            # Update existing columns or add new columns for the same index
            for col in metrics_df.columns:
                existing_df.loc[index, col] = metrics_df.loc[index, col]
            updated_df = existing_df
        else:
            # Append new metrics if the index doesn't exist
            updated_df = pd.concat([existing_df, metrics_df])
    else:
        # Create new DataFrame if file does not exist
        updated_df = metrics_df

    # Save DataFrame to CSV
    updated_df.to_csv(filepath)


In [233]:
# Save or update the metrics to CSV
save_or_update_metrics('rf',metrics_rf, instance_n, 'model_metrics.csv')

In [234]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve

def plot_metrics(y_test, y_prob):
    """
    Plots ROC Curve and Precision-Recall Curve.
    """
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    prc_auc = auc(recall, precision)

    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall curve (area = {prc_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')

    plt.tight_layout()
    plt.show()

# Get the best model from grid search
best_model_rf = grid_search_rf.best_estimator_

# Test model
metrics_rf, y_pred_rf = evaluate_model(best_model_rf, X_test, y_test)

# Plot the ROC and Precision-Recall curves
plot_metrics(y_test, best_model_rf.predict_proba(X_test)[:, 1])


In [235]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


# Loop through the first 10 trees in the random forest
for i, tree in enumerate(best_model_rf.estimators_[:10]):
    if i == 9:
        plt.figure(figsize=(20, 10))
        plot_tree(tree, filled=True, feature_names=X_test.columns, class_names=['normal', 'faulty'], rounded=True)
        plt.title(f"Decision Tree from Random Forest", fontsize=16)
        plt.show()



In [236]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

# Add title and labels
plt.title('Confusion Matrix for Random Forest Classifier', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.xticks(ticks=[0.5, 1.5], labels=['Normal', 'Faulty'], fontsize=12)  # Adjust labels according to your classes
plt.yticks(ticks=[0.5, 1.5], labels=['Normal', 'Faulty'], fontsize=12)

# Show plot
plt.show()


In [237]:
import shap

# Create a SHAP explainer
explainer = shap.TreeExplainer(best_model_rf)
shap_values = explainer.shap_values(X_test)

# Plot SHAP values
shap.summary_plot(shap_values, X_test)


# XGBoost


In [238]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Define parameter grid
param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 6, 9],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}

# Create a pipeline with XGBoost
pipeline = Pipeline([
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])
# Initialize model
xg_boost = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search_xgb.fit(X_train,y_train)

# Best parameters and model
best_params_xgb = grid_search_xgb.best_params_
print("Best Parameters:", best_params_xgb)
best_model_xgb = grid_search_xgb.best_estimator_

# Make predictions
y_pred_xgb = best_model_xgb.predict(X_test)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100, 'xgb__subsample': 0.8}


In [239]:
best_model_xgb

In [240]:
# Get the best model from grid search
best_model_xgb = grid_search_xgb.best_estimator_

# Test model
metrics_xgb, y_pred_xgb = evaluate_model(best_model_xgb, X_test, y_test)

# Print the metrics
# Print the metrics
print("Accuracy:", metrics_xgb['accuracy'])
print("Precision:", metrics_xgb['precision'])
print("Recall:", metrics_xgb['recall'])
print("F1 Score:", metrics_xgb['f1_score'])
print("AUC-ROC:", metrics_xgb['auc_roc'])
print("AUC-PRC:", metrics_xgb['auc_prc'])
print("\nClassification Report:\n", metrics_xgb['classification_report'])


# Save the best model

joblib.dump(best_model_rf, f'./predictionModel/{instance_n}/best_xgboost_model.pkl')
print("Model saved as 'best_xgboost_model.pkl'")

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC-ROC: 1.0
AUC-PRC: 1.0

Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        12
         1.0       1.00      1.00      1.00        11

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23

Model saved as 'best_xgboost_model.pkl'


In [241]:
# Save or update the metrics to CSV
save_or_update_metrics('xgb',metrics_xgb, instance_n, 'model_metrics.csv')

# LOF

In [242]:
X_lof = processed_data.drop(columns=['class'])
y_lof = update_class_label(instance_n, processed_data['class'])

# Separate the data based on the class
X_class_0 = X_lof[y_lof == 0]
y_class_0 = y_lof[y_lof == 0]

X_class_1 = X_lof[y_lof == 1]
y_class_1 = y_lof[y_lof == 1]

# Split class 0 data into training and testing sets (60% train, 40% test)
X_class_0_train, X_class_0_test, y_class_0_train, y_class_0_test = train_test_split(
    X_class_0, y_class_0, test_size=0.40, random_state=42
)

# 60% of train dataset
X_lof_train = X_class_0_train
y_lof_train = y_class_0

# 40% of test dataset
X_lof_test = pd.concat([X_class_0_test, X_class_1])
y_lof_test = pd.concat([y_class_0_test, y_class_1])

# Print sizes of the resulting splits for verification
print(f"Training data size: {X_lof_train.shape}")
print(f"Testing data size: {X_lof_test.shape}")


Training data size: (12, 42)
Testing data size: (36, 42)


In [243]:
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_neighbors': [5, 10, 20, 30],  # Explore different numbers of neighbors
    'contamination': [0.05, 0.1, 0.15] , # Explore different contamination levels
    'novelty': [True]
}

# Initialize the LOF model
lof = LocalOutlierFactor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lof, param_grid=param_grid, 
                           scoring='f1',  # Choose an appropriate scoring metric
                           cv=3)  # 3-fold cross-validation

# Fit the grid search on the training data
grid_search.fit(X_lof_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_lof = grid_search.best_estimator_

print("Best Parameters:", best_params)

Best Parameters: {'contamination': 0.05, 'n_neighbors': 5, 'novelty': True}


In [244]:
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_neighbors': [5, 10, 20, 30],  # Explore different numbers of neighbors
    'contamination': [0.05, 0.1, 0.15] , # Explore different contamination levels
    'novelty': [True]
}

# Initialize the LOF model
lof = LocalOutlierFactor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lof, param_grid=param_grid, 
                           scoring='f1',  # Choose an appropriate scoring metric
                           cv=3)  # 3-fold cross-validation

# Fit the grid search on the training data
grid_search.fit(X_lof_train)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_lof = grid_search.best_estimator_

print("Best Parameters:", best_params)

Best Parameters: {'contamination': 0.05, 'n_neighbors': 5, 'novelty': True}


In [245]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict outliers on the test data
lof_test_predictions = best_lof.predict(X_lof_test)
lof_test_predictions = np.where(lof_test_predictions == 1, 0, 1)

# Convert predictions to DataFrame if needed
X_lof_test['LOF_Score'] = lof_test_predictions


# Generate the confusion matrix and classification report
conf_matrix = confusion_matrix(y_lof_test, lof_test_predictions)
class_report = classification_report(y_lof_test, lof_test_predictions)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[ 5  3]
 [ 5 23]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.62      0.56         8
         1.0       0.88      0.82      0.85        28

    accuracy                           0.78        36
   macro avg       0.69      0.72      0.70        36
weighted avg       0.80      0.78      0.79        36



In [246]:
# Calculate metrics
accuracy = accuracy_score(y_lof_test, lof_test_predictions)
precision = precision_score(y_lof_test, lof_test_predictions)
recall = recall_score(y_lof_test, lof_test_predictions)
f1 = f1_score(y_lof_test, lof_test_predictions)

# Initialize AUC metrics
roc_auc = None
prc_auc = None

if lof_test_predictions is not None:
    roc_auc = roc_auc_score(y_lof_test, lof_test_predictions)
    prc_auc = average_precision_score(y_lof_test, lof_test_predictions)

# Create a dictionary of metrics
metrics_lof = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'auc_roc': roc_auc,
    'auc_prc': prc_auc
}

print(metrics_lof)

{'accuracy': 0.7777777777777778, 'precision': 0.8846153846153846, 'recall': 0.8214285714285714, 'f1_score': 0.8518518518518519, 'auc_roc': 0.7232142857142858, 'auc_prc': 0.8655372405372406}


In [247]:
# Save or update the metrics to CSV
save_or_update_metrics('lof',metrics_lof, instance_n, 'model_metrics.csv')

# Validation

## Preparing validation dataset


In [248]:
def load_real_instance_n(instance_n):
    files_labeled= [path for label, path in real_instances if label == instance_n]

    # Read the CSV files into DataFrames
    dataframes = [pd.read_csv(file) for file in files_labeled]

    # assign names to these DataFrames for easy access
    df_dict= {file.stem: pd.read_csv(file) for file in files_labeled}
    
    print(f"Number of instances set to: {toi.index[instance_n]}")
    return df_dict

In [249]:
df_dict = load_real_instance_n(instance_n)

Number of instances set to: 7 - Scaling in PCK


In [250]:
instance_list_1 = ['WELL-00001_20140124093303', 'WELL-00006_20170731180930','WELL-00006_20170731220432', 'WELL-00006_20180617200257']
instance_list_2 = ['WELL-00010_20171218190131' ,'WELL-00002_20131104014101','WELL-00009_20170313160804']
instance_list_5 = ['WELL-00015_20170620040530','WELL-00017_20140319031616','WELL-00017_20140314135248']
instance_list_6 = ['WELL-00002_20140325170304','WELL-00002_20140301151700', 'WELL-00002_20140212170333']
instance_list_7 = ['WELL-00018_20180611021218','WELL-00001_20170226140146','WELL-00006_20180617181315', 'WELL-00018_20190403023307', 'WELL-00006_20180620155728']


In [251]:
def handle_missing_data(df, columns) :  
# Subset the DataFrame to only include the specified columns
    print(df.columns)
    subset_df = df[columns]

    # Calculate the percentage of null values for each column in the subset
    null_percentages = subset_df.isnull().mean() * 100

    # List the columns with more than 18% null values
    columns_with_high_nulls = null_percentages[null_percentages > 18].index.tolist()

    # Drop the columns with high null values from the subset DataFrame
    modified_df = subset_df.drop(columns=columns_with_high_nulls)

    # Forward fill na values in the 'class' column
    if 'class' in modified_df.columns:
        modified_df['class'] = modified_df['class'].fillna(method='ffill')
        
    
    # modified_df = modified_df[modified_df['class'].notnull()]

    # # replace 'class' with 107.0 as 7.0
    # modified_df['class'] = modified_df['class'].replace(107.0, 7.0)

    # Smooth the DataFrame using a moving average 
    window_size = 1800
    smoothed_df = modified_df.copy()
    sensor_columns = modified_df.columns.difference(['class'])
    smoothed_df[sensor_columns] = modified_df[sensor_columns].rolling(window=window_size, min_periods=1).mean()

    return(modified_df, smoothed_df)

In [252]:
def z_score_outlier(df, columns):
    from scipy.stats import zscore
    

    # Calculate Z-scores for each column
    df_zscores = df[columns].apply(zscore)

    # Set a threshold for Z-scores to identify outliers
    threshold = 3

    # Identify outliers
    outliers = (np.abs(df_zscores) > threshold).any(axis=1)

    # Replace outliers with rolling average
    window_size = 3  # Set window size for rolling average
    z_score_df = df.copy()

    for col in columns:
        rolling_avg = df[col].rolling(window=window_size, min_periods=1).mean()
        z_score_df.loc[outliers, col] = rolling_avg[outliers]


    return(z_score_df)



In [253]:
def validation_metrics(model, X, y):
    y_pred = model.predict(X)

     # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    auc_roc = roc_auc_score(y, y_pred)
    auc_prc = average_precision_score(y, y_pred)

    # Compile all metrics into a dictionary
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "auc_roc": auc_roc,
        "auc_prc": auc_prc
    }
    
    return y_pred, metrics


## Validation functions

In [254]:
def get_instance_list_var(instance_n):
    # Mapping of instance_n to the corresponding instance list
    instance_lists = {
        1: instance_list_1,
        2: instance_list_2,
        5: instance_list_5,
        6: instance_list_6,
        7: instance_list_7
    }
    
    # Check if instance_n is valid
    if instance_n not in instance_lists:
        print(f"Error: instance_n {instance_n} is not valid")
        return None

    # Get the corresponding instance list
    instance_list = instance_lists[instance_n]

    return (instance_list)

In [255]:
rf_metrics_list = []
xgb_metrics_list = []
lof_metrics_list = []

train_df = get_instance_list_var(instance_n)[0]
list = get_instance_list_var(instance_n)

for instances in df_dict:
    val_df= df_dict[instances]
    # print(instances)
    # print(val_df.head())
    if instances != train_df and instances in list :
        print(instances)
        # print(val_df.head())
        # print(val_df['timestamp'].head())
        val_df['timestamp'] = pd.to_datetime(val_df['timestamp'])
        val_df = val_df.set_index('timestamp').sort_index()    
        # print(val_df.head())

        clean_vdf, smooth_vdf = handle_missing_data(val_df, val_df.columns)
        smooth_vdf = smooth_vdf[smooth_vdf['class'].notnull()]
        # print(smooth_vdf.head())
        z_score_vdf = z_score_outlier(smooth_vdf, cols_to_check)
        resample_vdf = time_windowing(smooth_vdf)
        # print(resample_vdf.head())



        print(resample_vdf.head())
        if 'class' in resample_vdf.columns:
            X_vdf = resample_vdf[cols_to_check]
            y_vdf = resample_vdf['class']
            window = resample_vdf['id']

            scale_vdf = scaler.fit_transform(X_vdf)
            vdf = pd.DataFrame(scale_vdf, index = X_vdf.index, columns = X_vdf.columns)
            vdf['class'] = y_vdf.values
            vdf['id'] = window.values
            vdf = vdf.reset_index()
            vdf.rename(columns={'timestamp':'time'}, inplace = True)

            X_val = extract_and_impute_features(vdf)
            y_val = vdf.groupby('id')['class'].first()
            y_val = update_class_label(instance_n, y_val)

            y_pred_rf, rf_metrics = validation_metrics(best_model_rf, X_val, y_val)
            y_pred_xgb, xgb_metrics = validation_metrics(best_model_xgb, X_val, y_val)

            y_pred_lof = best_lof.predict(X_val)
            y_pred_lof = np.where(y_pred_lof == 1, 0, 1)

            accuracy = accuracy_score(y_val, y_pred_lof)
            precision = precision_score(y_val, y_pred_lof)
            recall = recall_score(y_val, y_pred_lof)
            f1 = f1_score(y_val, y_pred_lof)

            # Initialize AUC metrics
            roc_auc = None
            prc_auc = None

            if lof_test_predictions is not None:
                roc_auc = roc_auc_score(y_val, y_pred_lof)
                prc_auc = average_precision_score(y_val, y_pred_lof)


            # Create a dictionary of metrics
            lof_metrics = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'auc_roc': roc_auc,
                'auc_prc': prc_auc
            }

            rf_metrics_list.append(rf_metrics)
            xgb_metrics_list.append(xgb_metrics)
            lof_metrics_list.append(lof_metrics)


# Convert list of metrics to a DataFrame

rf_metrics_df = pd.DataFrame(rf_metrics_list)
print("Random Forest \n" , rf_metrics_df)

xgb_metrics_df = pd.DataFrame(xgb_metrics_list)
print("XGBoost \n" , xgb_metrics_df)

lof_metrics_df = pd.DataFrame(lof_metrics_list)
print("Local Outlier Factor \n" , lof_metrics_df)

   

WELL-00001_20170226140146
Index(['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL',
       'T-JUS-CKGL', 'QGL', 'class'],
      dtype='object')
                     P-PDG         P-TPT     T-TPT     P-MON-CKP  T-JUS-CKP  \
timestamp                                                                     
2017-02-26 14:01:46    0.0  1.331329e+07  118.1245  5.333321e+06  74.215710   
2017-02-26 14:01:47    0.0  1.331333e+07  118.1245  5.333460e+06  74.215945   
2017-02-26 14:01:48    0.0  1.331337e+07  118.1245  5.333599e+06  74.216177   
2017-02-26 14:01:49    0.0  1.331342e+07  118.1245  5.333737e+06  74.216410   
2017-02-26 14:01:50    0.0  1.331346e+07  118.1245  5.333876e+06  74.216644   

                       P-JUS-CKGL  QGL  class  id  
timestamp                                          
2017-02-26 14:01:46  3.382723e+06  0.0    0.0   1  
2017-02-26 14:01:47  3.382724e+06  0.0    0.0   1  
2017-02-26 14:01:48  3.382724e+06  0.0    0.0   1  
2017-02-26 14:01:49  3.382

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 154.14it/s]

WELL-00006_20180617181315





Index(['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL',
       'T-JUS-CKGL', 'QGL', 'class'],
      dtype='object')
                            P-PDG         P-TPT      T-TPT     P-MON-CKP  \
timestamp                                                                  
2018-06-17 18:13:15 -1.180116e+42  2.081367e+07  117.84760  1.013681e+07   
2018-06-17 18:13:16 -1.180116e+42  2.081364e+07  117.84755  1.013675e+07   
2018-06-17 18:13:17 -1.180116e+42  2.081361e+07  117.84750  1.013669e+07   
2018-06-17 18:13:18 -1.180116e+42  2.081358e+07  117.84745  1.013662e+07   
2018-06-17 18:13:19 -1.180116e+42  2.081356e+07  117.84740  1.013656e+07   

                     T-JUS-CKP  P-JUS-CKGL  QGL  class  id  
timestamp                                                   
2018-06-17 18:13:15  70.839520   4040100.0  0.0    0.0   1  
2018-06-17 18:13:16  70.839510   4040100.0  0.0    0.0   1  
2018-06-17 18:13:17  70.839503   4040100.0  0.0    0.0   1  
2018-06-17 18:13:18  70.8394

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 61.16it/s]


WELL-00018_20190403023307
Index(['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL',
       'T-JUS-CKGL', 'QGL', 'class'],
      dtype='object')
                     P-PDG       P-TPT       T-TPT     P-MON-CKP  T-JUS-CKP  \
timestamp                                                                     
2019-04-03 02:33:07    0.0  8431399.00  109.856400  1.367659e+06   73.72916   
2019-04-03 02:33:08    0.0  8431364.00  109.856100  1.368448e+06   73.72916   
2019-04-03 02:33:09    0.0  8431329.00  109.855833  1.369236e+06   73.72916   
2019-04-03 02:33:10    0.0  8431293.75  109.855575  1.369631e+06   73.72916   
2019-04-03 02:33:11    0.0  8431258.60  109.855320  1.369867e+06   73.72916   

                     P-JUS-CKGL  QGL  class  id  
timestamp                                        
2019-04-03 02:33:07   8810764.0  0.0    0.0   1  
2019-04-03 02:33:08   8810764.0  0.0    0.0   1  
2019-04-03 02:33:09   8810764.0  0.0    0.0   1  
2019-04-03 02:33:10   8810764.0  0.0

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 238.24it/s]


WELL-00006_20180620155728
Index(['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL',
       'T-JUS-CKGL', 'QGL', 'class'],
      dtype='object')
                            P-PDG         P-TPT       T-TPT     P-MON-CKP  \
timestamp                                                                   
2018-06-20 16:29:03 -1.180116e+42  2.108536e+07  117.632425  1.185283e+07   
2018-06-20 16:29:04 -1.180116e+42  2.108539e+07  117.632493  1.185298e+07   
2018-06-20 16:29:05 -1.180116e+42  2.108541e+07  117.632561  1.185313e+07   
2018-06-20 16:29:06 -1.180116e+42  2.108544e+07  117.632629  1.185328e+07   
2018-06-20 16:29:07 -1.180116e+42  2.108547e+07  117.632696  1.185344e+07   

                     T-JUS-CKP    P-JUS-CKGL  QGL  class  id  
timestamp                                                     
2018-06-20 16:29:03  63.292364  1.459317e+06  0.0    0.0   1  
2018-06-20 16:29:04  63.294436  1.459340e+06  0.0    0.0   1  
2018-06-20 16:29:05  63.296509  1.459362e+06  0.

Feature Extraction: 100%|██████████| 10/10 [00:00<00:00, 100.91it/s]

Random Forest 
    accuracy  precision    recall  f1_score   auc_roc   auc_prc
0  0.914634   0.915034  0.914634  0.914698  0.915072  0.894446
1  0.924528   0.938570  0.924528  0.926146  0.943662  0.962796
2  0.760000   0.852903  0.760000  0.760000  0.806452  0.612903
3  0.595238   0.908009  0.595238  0.662304  0.770270  0.945302
XGBoost 
    accuracy  precision    recall  f1_score   auc_roc   auc_prc
0  0.597561   0.638374  0.597561  0.582397  0.612440  0.611619
1  0.566038   0.812485  0.566038  0.548112  0.676056  0.786075
2  0.740000   0.845625  0.740000  0.738647  0.790323  0.593750
3  0.571429   0.906832  0.571429  0.640306  0.756757  0.942085
Local Outlier Factor 
    accuracy  precision    recall  f1_score   auc_roc   auc_prc
0  0.524390   0.538462  0.795455  0.642202  0.502990  0.538078
1  0.669811   0.669811  1.000000  0.802260  0.500000  0.669811
2  0.660000   0.527778  1.000000  0.690909  0.725806  0.527778
3  0.880952   0.880952  1.000000  0.936709  0.500000  0.880952





In [256]:
# Calculate the mean for each metric in each model's DataFrame
rf_mean_metrics = rf_metrics_df.mean().to_frame().T
xgb_mean_metrics = xgb_metrics_df.mean().to_frame().T
lof_mean_metrics = lof_metrics_df.mean().to_frame().T

# Add a 'model' column to each DataFrame to identify the model
rf_mean_metrics['model'] = 'rf'
xgb_mean_metrics['model'] = 'xgb'
lof_mean_metrics['model'] = 'lof'

# Concatenate the DataFrames into a single DataFrame
mean_metrics_df = pd.concat([rf_mean_metrics, xgb_mean_metrics, lof_mean_metrics], ignore_index=True)

# Reorder the columns so 'model' is the first column
mean_metrics_df = mean_metrics_df[['model', 'accuracy', 'precision', 'recall', 'f1_score', 'auc_roc', 'auc_prc']]

# Display the resulting DataFrame
print(mean_metrics_df)


  model  accuracy  precision    recall  f1_score   auc_roc   auc_prc
0    rf  0.798600   0.903629  0.798600  0.815787  0.858864  0.853862
1   xgb  0.618757   0.800829  0.618757  0.627366  0.708894  0.733382
2   lof  0.683788   0.654251  0.948864  0.768020  0.557199  0.654155


In [257]:
# Calculate the standard deviation for each metric in each model's DataFrame
rf_std_metrics = rf_metrics_df.std().to_frame().T
xgb_std_metrics = xgb_metrics_df.std().to_frame().T
lof_std_metrics = lof_metrics_df.std().to_frame().T

# Add a 'model' column to each DataFrame to identify the model
rf_std_metrics['model'] = 'rf'
xgb_std_metrics['model'] = 'xgb'
lof_std_metrics['model'] = 'lof'

# Concatenate the DataFrames into a single DataFrame
std_metrics_df = pd.concat([rf_std_metrics, xgb_std_metrics, lof_std_metrics], ignore_index=True)

# Reorder the columns so 'model' is the first column
std_metrics_df = std_metrics_df[['model', 'accuracy', 'precision', 'recall', 'f1_score', 'auc_roc', 'auc_prc']]

# Display the resulting DataFrame
print(std_metrics_df)


  model  accuracy  precision    recall  f1_score   auc_roc   auc_prc
0    rf  0.155100   0.036255  0.155100  0.127321  0.083558  0.163234
1   xgb  0.081993   0.115139  0.081993  0.083376  0.080215  0.163968
2   lof  0.147244   0.164356  0.102273  0.130900  0.112414  0.164446
