In [338]:
import sys
import os
import copy
import numpy as np
import pandas as pd
import matplotlib
import scipy.spatial
import sklearn.preprocessing
import datashader as ds
import colorcet as cc
from findpeaks import findpeaks
import seaborn as sns
import missingno
from statsmodels.graphics.tsaplots import acf
import kydlib
import scipy.stats as stats
import seaborn as sns
import tkinter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# sys.path.append(os.path.join('..', '..'))
sys.path.append(os.path.join(os.getcwd(), '..'))
import toolkit as tk
# /home/ainbahar/dataproject/3W-research-project/toolkit
color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [339]:
# Section: Load Data
real_instances, simulated_instances, drawn_instances = tk.get_all_labels_and_files()

## Load Data

In [340]:
# Table of Instance
toi = tk.create_table_of_instances(real_instances, simulated_instances, drawn_instances)
toi

SOURCE,REAL,SIMULATED,HAND-DRAWN,TOTAL
INSTANCE LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0 - Normal Operation,594,0,0,594
1 - Abrupt Increase of BSW,5,114,10,129
2 - Spurious Closure of DHSV,22,16,0,38
3 - Severe Slugging,32,74,0,106
4 - Flow Instability,344,0,0,344
5 - Rapid Productivity Loss,11,439,0,450
6 - Quick Restriction in PCK,6,215,0,221
7 - Scaling in PCK,5,0,10,15
8 - Hydrate in Production Line,0,81,0,81
TOTAL,1019,939,20,1978


# Filter Rare Undesirable Events
Filter events that occur less frequently than the defined threshold.


In [341]:
# Rare undesirable events
threshold = 0.05
rue = tk.filter_rare_undesirable_events(toi, threshold)
rue

SOURCE,REAL,SIMULATED,HAND-DRAWN,TOTAL
INSTANCE LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 - Abrupt Increase of BSW,5,114,10,129
2 - Spurious Closure of DHSV,22,16,0,38
3 - Severe Slugging,32,74,0,106
5 - Rapid Productivity Loss,11,439,0,450
6 - Quick Restriction in PCK,6,215,0,221
7 - Scaling in PCK,5,0,10,15
8 - Hydrate in Production Line,0,81,0,81
TOTAL,81,939,20,1040


In [342]:
# Section: Reference Table
tags = {'P-PDG':'Pa', 
        'P-TPT':'Pa', 'T-TPT':'degC',
         'P-MON-CKP':'Pa', 'T-JUS-CKP':'degC', 
         'P-JUS-CKGL':'Pa', 'T-JUS-CKGL':'degC',
         'QGL': 'sm^3/s'}

names = ['Pressure at the PDG',
         'Pressure at the TPT',
         'Temperature at the TPT',
         'Pressure upstream of the PCK',
         'Temperature downstream of the PCK',
         'Pressure downstream of the GLCK',
         'Temperature downstream of the GLCK',
         'Gas lift flow rate']

reference_table = pd.DataFrame(index=np.arange(1,len(names)+1))
reference_table.index.name='Number'
reference_table['Tag'] = tags.keys()
reference_table['Name'] = names
reference_table['Unit'] = tags.values()
reference_table

Unnamed: 0_level_0,Tag,Name,Unit
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,P-PDG,Pressure at the PDG,Pa
2,P-TPT,Pressure at the TPT,Pa
3,T-TPT,Temperature at the TPT,degC
4,P-MON-CKP,Pressure upstream of the PCK,Pa
5,T-JUS-CKP,Temperature downstream of the PCK,degC
6,P-JUS-CKGL,Pressure downstream of the GLCK,Pa
7,T-JUS-CKGL,Temperature downstream of the GLCK,degC
8,QGL,Gas lift flow rate,sm^3/s


### Input instance label index

In [343]:
# Prompt user to input the number of instances
try:
    instance_n = int(input("Enter the number of instances: "))
    print(f"Number of instances set to: {toi.index[instance_n]}")
except ValueError:
    print("Invalid input. Please enter a valid integer.")


Number of instances set to: 7 - Scaling in PCK


In [344]:
def load_real_instance_n(instance_n):
    files_labeled= [path for label, path in real_instances if label == instance_n]

    # Read the CSV files into DataFrames
    dataframes = [pd.read_csv(file) for file in files_labeled]

    # assign names to these DataFrames for easy access
    df_dict= {file.stem: pd.read_csv(file) for file in files_labeled}
    
    print(f"Number of instances set to: {toi.index[instance_n]}")
    return df_dict

In [345]:
df_dict = load_real_instance_n(instance_n)
df_dict

Number of instances set to: 7 - Scaling in PCK


{'WELL-00018_20180611021218':                         timestamp  P-PDG      P-TPT     T-TPT  P-MON-CKP  \
 0      2018-06-11 02:12:18.000000    0.0  8713634.0  109.5560  2142981.0   
 1      2018-06-11 02:12:19.000000    0.0  8713669.0  109.5563  2142981.0   
 2      2018-06-11 02:12:20.000000    0.0  8713704.0  109.5566  2142981.0   
 3      2018-06-11 02:12:21.000000    0.0  8713738.0  109.5569  2142981.0   
 4      2018-06-11 02:12:22.000000    0.0  8713773.0  109.5573  2142981.0   
 ...                           ...    ...        ...       ...        ...   
 42969  2018-06-11 14:08:27.000000    0.0  8721284.0  109.5114  2193460.0   
 42970  2018-06-11 14:08:28.000000    0.0  8721244.0  109.5116  2192671.0   
 42971  2018-06-11 14:08:29.000000    0.0  8721204.0  109.5117  2191883.0   
 42972  2018-06-11 14:08:30.000000    0.0  8721164.0  109.5119  2191094.0   
 42973  2018-06-11 14:08:31.000000    0.0  8721124.0  109.5121  2190305.0   
 
        T-JUS-CKP  P-JUS-CKGL  T-JUS-CKGL  QG

In [346]:
def check_dataframe_statistics(df_dict):
    for df_name, df in df_dict.items():
        print(f"Statistics for DataFrame: {df_name}")
        print("="*50)
        
        # Column statistics
        print("\nColumn Statistics:")
        print(df.describe(include='all'))  # Includes statistics for numerical and categorical columns

        # # Column distribution
        # print("\nColumn Distribution:")
        # for col in df.columns:
        #     print(f"\nDistribution of '{col}':")
        #     if df[col].dtype == 'object' or df[col].dtype == 'category':
        #         print(df[col].value_counts())
        #     else:
        #         print(df[col].hist(bins=10))
        
        # Column missing value check
        print("\nMissing Values per Column:")
        print(df.isna().sum())

        print("\n" + "="*50 + "\n")
        


In [347]:
check_dataframe_statistics(df_dict)

Statistics for DataFrame: WELL-00018_20180611021218

Column Statistics:
                         timestamp    P-PDG         P-TPT         T-TPT  \
count                        42974  42974.0  4.297400e+04  42974.000000   
unique                       42974      NaN           NaN           NaN   
top     2018-06-11 02:12:18.000000      NaN           NaN           NaN   
freq                             1      NaN           NaN           NaN   
mean                           NaN      0.0  8.717519e+06    109.487770   
std                            NaN      0.0  3.081159e+03      0.030193   
min                            NaN      0.0  8.710634e+06    109.420800   
25%                            NaN      0.0  8.715189e+06    109.466600   
50%                            NaN      0.0  8.716880e+06    109.484000   
75%                            NaN      0.0  8.719709e+06    109.503500   
max                            NaN      0.0  8.727151e+06    109.594200   

           P-MON-CKP     T-

In [348]:
instance_list_1 = ['WELL-00001_20140124093303', 'WELL-00002_20140126161944', 'WELL-00006_20170731180930','WELL-00006_20170731220432', 'WELL-00006_20180617200257']
instance_list_2 = ['WELL-00010_20171218190131' ,'WELL-00002_20131104014101','WELL-00009_20170313160804']
instance_list_5 = ['WELL-00015_20170620040530','WELL-00016_20180426105723', 'WELL-00017_20140317225927', 'WELL-00016_20180517184536', 'WELL-00017_20140319020130'
                 , 'WELL-00017_20140318130603', 'WELL-00016_20180404222255', 'WELL-00016_20180426102531', 'WELL-00017_20140317123419']
instance_list_6 = ['WELL-00002_20140325170304','WELL-00004_20171031194452', 'WELL-00004_20171031181509', 'WELL-00004_20171031190706']
instance_list_7 = ['WELL-00018_20180611021218','WELL-00001_20170226140146','WELL-00006_20180617181315.csv', 'WELL-00006_20180620155728', 'WELL-00018_20190403023307']


In [349]:
def get_dataframe_by_index(df_dict, instance_n, index):
    # Mapping of instance_n to the corresponding instance list
    instance_lists = {
        1: instance_list_1,
        2: instance_list_2,
        5: instance_list_5,
        6: instance_list_6,
        7: instance_list_7
    }
    
    # Check if instance_n is valid
    if instance_n not in instance_lists:
        print(f"Error: instance_n {instance_n} is not valid")
        return None

    # Get the corresponding instance list
    instance_list = instance_lists[instance_n]

    try:
        # Retrieve the instance based on the provided index
        instance = instance_list[index]
        if instance in df_dict:
            return df_dict[instance]
        else:
            print(f"Warning: {instance} not found in df_dict")
            return None
    except IndexError:
        print(f"Error: Index {index} is out of range for the selected instance list")
        return None

In [350]:
df = get_dataframe_by_index(df_dict, instance_n, 0) 
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp').sort_index()   
df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-11 02:12:18,0.0,8713634.0,109.5560,2142981.0,68.81107,7960069.0,,0.0,0.0
2018-06-11 02:12:19,0.0,8713669.0,109.5563,2142981.0,68.80492,7960069.0,,0.0,0.0
2018-06-11 02:12:20,0.0,8713704.0,109.5566,2142981.0,68.79877,7960069.0,,0.0,0.0
2018-06-11 02:12:21,0.0,8713738.0,109.5569,2142981.0,68.79261,7960069.0,,0.0,0.0
2018-06-11 02:12:22,0.0,8713773.0,109.5573,2142981.0,68.78645,7960069.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8721284.0,109.5114,2193460.0,68.17934,8506944.0,,0.0,7.0
2018-06-11 14:08:28,0.0,8721244.0,109.5116,2192671.0,68.17917,8506944.0,,0.0,7.0
2018-06-11 14:08:29,0.0,8721204.0,109.5117,2191883.0,68.17899,8506944.0,,0.0,7.0
2018-06-11 14:08:30,0.0,8721164.0,109.5119,2191094.0,68.17882,8506944.0,,0.0,7.0


In [351]:
# matplotlib.use('TkAgg')

In [352]:
def count_ts_per_interval(df, interval='hour'):
    # Ensure the DataFrame index is a DateTimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be a DateTimeIndex.")
    
    # Create a new column based on the interval
    if interval == 'hour':
        df['hour'] = df.index.hour
        group_col = 'hour'
    elif interval == 'day':
        df['day'] = df.index.date
        group_col = 'day'
    else:
        raise ValueError("Interval must be either 'hour' or 'day'.")

    # Group by the interval and class, then count occurrences
    interval_class_counts = df.groupby([group_col, 'class']).size().unstack(fill_value=0)

    # Plotting with seaborn heatmap
    plt.figure(figsize=(15, 8))
    sns.heatmap(interval_class_counts, annot=True, fmt="d", cmap="YlGnBu")
    plt.title(f'Data Presence for Each Class by {interval.capitalize()}')
    plt.xlabel('Class')
    plt.ylabel(interval.capitalize())
    plt.show()

In [353]:
count_ts_per_interval(df, 'day')   


In [354]:
count_ts_per_interval(df, 'hour')  # For hourly interval

In [355]:
# Add background color shading based on 'class'
unique_classes_real = df['class'].unique()
unique_classes_real = unique_classes_real.tolist()
print(unique_classes_real)

[0.0, nan, 107.0, 7.0]


## Time Series Visualization

In [356]:
def time_series_chart(df, columns, time_scale):
   
    # Section: Time Series Visualization
    import plotly.graph_objects as go
    import plotly.express as px

    # Sample data preparation (ensure 'class' column exists)
    sensor_data = df[columns]
    df_resampled = sensor_data.resample(f'{time_scale}T').ffill()
    df_resampled_ffill = df_resampled.ffill()

    # Determine min and max timestamps
    min_timestamp = df_resampled.index.min()
    max_timestamp = df_resampled.index.max()

    # Plotting with Plotly
    fig = go.Figure()

    # Add traces for each column in your time series data
    for column in df_resampled.columns:
        if column != 'class':  # Skip the 'class' column
            fig.add_trace(go.Scatter(x=df_resampled.index, y=df_resampled[column], mode='lines', name=column))


    # Add background color shading based on 'class'
    unique_classes = df_resampled['class'].unique()
    colors = px.colors.qualitative.Plotly  # Use Plotly color palette

    for i, cls in enumerate(unique_classes):
        class_df = df_resampled[df_resampled['class'] == cls]
        fig.add_vrect(
            x0=class_df.index.min(), x1=class_df.index.max(),
            fillcolor=colors[i % len(colors)], opacity=0.2,
            layer="below", line_width=0,
            name=f'Class: {cls}'
        )

    # Update layout for better readability
    fig.update_layout(
        title=f'Time Series Data Visualization (Resampled to {time_scale} Minutes)',
        xaxis_title='Timestamp',
        yaxis_title='Value',
        legend_title='Legend',
        legend=dict(x=0, y=-0.2, orientation='h'),
        xaxis_rangeslider=dict(
            visible=True,
            range=[min_timestamp, max_timestamp]  # Set slider range from min to max timestamp
        )
    )

    fig.show()


In [357]:
# Define attributes columns
well_metric = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP', 'P-JUS-CKGL', 'T-JUS-CKGL','QGL', 'class']

In [358]:
time_scale = 5
time_series_chart(df, well_metric, time_scale )

## Line Graph for each variable
Create line graphs for different classes to visualize distribution and patterns.

In [359]:
def sensor_line_graph (df, columns_to_plot, unique_classes):
    plt.figure(figsize=(15, 20))

    for i, column in enumerate(columns_to_plot, 1):
        plt.subplot(len(columns_to_plot), 1, i)
        for unique_class in unique_classes:
            subset = df[df['class'] == unique_class]
            plt.plot(subset.index, subset[column], label=f'class {unique_class}', alpha=0.5)

        plt.xlabel('Datetime')
        plt.ylabel(column)
        plt.title(f'Line Graph of {column} vs Datetime by Class')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

In [360]:
sensor_columns = ['P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP','T-JUS-CKP', 'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL']

sensor_line_graph(df, sensor_columns, unique_classes_real)

## Box plot 
Create box plot for different classes 

In [361]:
def box_plot(df, unique_classes, columns): 

    # Set up the plotting environment for box plots
    plt.figure(figsize=(15, 20))

    # Create box plots for each column
    for i, column in enumerate(columns, 1):
        plt.subplot(len(columns), 1, i)  # Create subplot for each column
        
        # Prepare data for box plot
        data_to_plot = [df[df['class'] == unique_class][column].dropna() for unique_class in unique_classes]
        
        # Plot box plot
        plt.boxplot(data_to_plot, labels=[f'class {unique_class}' for unique_class in unique_classes])
        
        # Adding labels and title for each subplot
        plt.ylabel(column)
        plt.title(f'Box Plot of {column} by Class')
        plt.grid(True)

    # Adjust layout to prevent overlap
    plt.tight_layout()

    # Show plot
    plt.show()


In [362]:
# Section: Box plot of sensor data by each class

box_plot(df, unique_classes_real, sensor_columns)

## Class label distribution

In [363]:
def class_distribution(df):
    # Count the occurrences of each unique value in the 'class' column, including NA values
    class_counts = df['class'].value_counts(dropna=False)

    # Print the counts for inspection
    print("Class distribution:")
    print(class_counts)

    # Plot the distribution
    plt.figure(figsize=(10, 6))
    plt.bar(class_counts.index.astype(str), class_counts.values, color='skyblue')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.title('Distribution of Classes')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Show the plot
    plt.show()

In [364]:
class_distribution(df)

Class distribution:
class
107.0    23477
0.0      17509
7.0       1659
NaN        329
Name: count, dtype: int64


# Modify

## Handling missing data

In [365]:
def handle_missing_data(df, columns) :  
# Subset the DataFrame to only include the specified columns
    subset_df = df[columns]

    # Calculate the percentage of null values for each column in the subset
    null_percentages = subset_df.isnull().mean() * 100

    # List the columns with more than 18% null values
    columns_with_high_nulls = null_percentages[null_percentages > 18].index.tolist()

    # Drop the columns with high null values from the subset DataFrame
    modified_df = subset_df.drop(columns=columns_with_high_nulls)

    # Forward fill na values in the 'class' column
    if 'class' in modified_df.columns:
        modified_df['class'] = modified_df['class'].fillna(method='ffill')

    modified_df = modified_df[modified_df['class'].notnull()]


    # Smooth the DataFrame using a moving average 
    window_size = 1800
    smoothed_df = modified_df.copy()
    sensor_columns = modified_df.columns.difference(['class'])
    smoothed_df[sensor_columns] = modified_df[sensor_columns].rolling(window=window_size, min_periods=1).mean()

    return(modified_df, smoothed_df)

In [366]:
df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class,day,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-06-11 02:12:18,0.0,8713634.0,109.5560,2142981.0,68.81107,7960069.0,,0.0,0.0,2018-06-11,2
2018-06-11 02:12:19,0.0,8713669.0,109.5563,2142981.0,68.80492,7960069.0,,0.0,0.0,2018-06-11,2
2018-06-11 02:12:20,0.0,8713704.0,109.5566,2142981.0,68.79877,7960069.0,,0.0,0.0,2018-06-11,2
2018-06-11 02:12:21,0.0,8713738.0,109.5569,2142981.0,68.79261,7960069.0,,0.0,0.0,2018-06-11,2
2018-06-11 02:12:22,0.0,8713773.0,109.5573,2142981.0,68.78645,7960069.0,,0.0,0.0,2018-06-11,2
...,...,...,...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8721284.0,109.5114,2193460.0,68.17934,8506944.0,,0.0,7.0,2018-06-11,14
2018-06-11 14:08:28,0.0,8721244.0,109.5116,2192671.0,68.17917,8506944.0,,0.0,7.0,2018-06-11,14
2018-06-11 14:08:29,0.0,8721204.0,109.5117,2191883.0,68.17899,8506944.0,,0.0,7.0,2018-06-11,14
2018-06-11 14:08:30,0.0,8721164.0,109.5119,2191094.0,68.17882,8506944.0,,0.0,7.0,2018-06-11,14


In [367]:
clean_real_df, smoothed_df = handle_missing_data(df, well_metric)
clean_real_df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-06-11 02:12:18,0.0,8713634.0,109.5560,2142981.0,68.81107,7960069.0,0.0,0.0
2018-06-11 02:12:19,0.0,8713669.0,109.5563,2142981.0,68.80492,7960069.0,0.0,0.0
2018-06-11 02:12:20,0.0,8713704.0,109.5566,2142981.0,68.79877,7960069.0,0.0,0.0
2018-06-11 02:12:21,0.0,8713738.0,109.5569,2142981.0,68.79261,7960069.0,0.0,0.0
2018-06-11 02:12:22,0.0,8713773.0,109.5573,2142981.0,68.78645,7960069.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8721284.0,109.5114,2193460.0,68.17934,8506944.0,0.0,7.0
2018-06-11 14:08:28,0.0,8721244.0,109.5116,2192671.0,68.17917,8506944.0,0.0,7.0
2018-06-11 14:08:29,0.0,8721204.0,109.5117,2191883.0,68.17899,8506944.0,0.0,7.0
2018-06-11 14:08:30,0.0,8721164.0,109.5119,2191094.0,68.17882,8506944.0,0.0,7.0


In [368]:
smoothed_df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-06-11 02:12:18,0.0,8.713634e+06,109.556000,2.142981e+06,68.811070,7.960069e+06,0.0,0.0
2018-06-11 02:12:19,0.0,8.713652e+06,109.556150,2.142981e+06,68.807995,7.960069e+06,0.0,0.0
2018-06-11 02:12:20,0.0,8.713669e+06,109.556300,2.142981e+06,68.804920,7.960069e+06,0.0,0.0
2018-06-11 02:12:21,0.0,8.713686e+06,109.556450,2.142981e+06,68.801842,7.960069e+06,0.0,0.0
2018-06-11 02:12:22,0.0,8.713704e+06,109.556620,2.142981e+06,68.798764,7.960069e+06,0.0,0.0
...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8.722798e+06,109.494825,2.187673e+06,68.287542,8.490644e+06,0.0,7.0
2018-06-11 14:08:28,0.0,8.722796e+06,109.494842,2.187677e+06,68.287542,8.490657e+06,0.0,7.0
2018-06-11 14:08:29,0.0,8.722793e+06,109.494859,2.187680e+06,68.287542,8.490671e+06,0.0,7.0
2018-06-11 14:08:30,0.0,8.722791e+06,109.494876,2.187683e+06,68.287542,8.490684e+06,0.0,7.0


In [369]:
class_distribution(clean_real_df)

Class distribution:
class
107.0    23608
0.0      17707
7.0       1659
Name: count, dtype: int64


In [370]:
class_distribution(smoothed_df)

Class distribution:
class
107.0    23608
0.0      17707
7.0       1659
Name: count, dtype: int64


In [371]:
cols_to_check = clean_real_df.columns.difference(['class'])

In [372]:
sensor_line_graph(clean_real_df,cols_to_check, clean_real_df['class'].unique() )

In [373]:
sensor_line_graph(smoothed_df,cols_to_check, clean_real_df['class'].unique() )

In [374]:
time_series_chart(clean_real_df, clean_real_df.columns, 5)

In [375]:
time_series_chart(smoothed_df, smoothed_df.columns, 5)

In [376]:
def z_score_outlier(df, columns):
    from scipy.stats import zscore
    

    # Calculate Z-scores for each column
    df_zscores = df[columns].apply(zscore)

    # Set a threshold for Z-scores to identify outliers
    threshold = 3

    # Identify outliers
    outliers = (np.abs(df_zscores) > threshold).any(axis=1)

    # Replace outliers with rolling average
    window_size = 3  # Set window size for rolling average
    z_score_df = df.copy()

    for col in columns:
        rolling_avg = df[col].rolling(window=window_size, min_periods=1).mean()
        z_score_df.loc[outliers, col] = rolling_avg[outliers]

    # # Visualize original and cleaned data
    # plt.figure(figsize=(15, 8))
    # for col in columns:
    #     plt.plot(df.index, df[col], label=f'Original {col}')
    #     plt.plot(z_score_df.index, z_score_df[col], label=f'Cleaned {col}', linestyle='--')
    # plt.legend()
    # plt.show()

    return(z_score_df)



In [377]:
z_score_real_df = z_score_outlier(smoothed_df, cols_to_check) 
z_score_real_df

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-06-11 02:12:18,0.0,8.713634e+06,109.556000,2.142981e+06,68.811070,7.960069e+06,0.0,0.0
2018-06-11 02:12:19,0.0,8.713652e+06,109.556150,2.142981e+06,68.807995,7.960069e+06,0.0,0.0
2018-06-11 02:12:20,0.0,8.713669e+06,109.556300,2.142981e+06,68.804920,7.960069e+06,0.0,0.0
2018-06-11 02:12:21,0.0,8.713686e+06,109.556450,2.142981e+06,68.801842,7.960069e+06,0.0,0.0
2018-06-11 02:12:22,0.0,8.713704e+06,109.556620,2.142981e+06,68.798764,7.960069e+06,0.0,0.0
...,...,...,...,...,...,...,...,...
2018-06-11 14:08:27,0.0,8.722798e+06,109.494825,2.187673e+06,68.287542,8.490644e+06,0.0,7.0
2018-06-11 14:08:28,0.0,8.722796e+06,109.494842,2.187677e+06,68.287542,8.490657e+06,0.0,7.0
2018-06-11 14:08:29,0.0,8.722793e+06,109.494859,2.187680e+06,68.287542,8.490671e+06,0.0,7.0
2018-06-11 14:08:30,0.0,8.722791e+06,109.494876,2.187683e+06,68.287542,8.490684e+06,0.0,7.0


In [378]:
box_plot(z_score_real_df, z_score_real_df['class'].unique(), cols_to_check)

In [379]:
sensor_line_graph(z_score_real_df,cols_to_check, z_score_real_df['class'].unique() )

## Saving resampled data for modeling

In [380]:
z_score_real_df.to_csv(f'trainDataset/train_df_instance_{instance_n}.csv', index=True)

