In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from natsort import natsorted
import tsfresh as tf

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [3]:
data_path = Path('data_transformed')
events_names = {0: 'Normal',
                1: 'Abrupt Increase of BSW',
                2: 'Spurious Closure of DHSV',
                3: 'Severe Slugging',
                4: 'Flow Instability',
                5: 'Rapid Productivity Loss',
                6: 'Quick Restriction in PCK',
                7: 'Scaling in PCK',
                8: 'Hydrate in Production Line'
               }
columns = ['P-PDG',
           'P-TPT',
           'T-TPT',
           'P-MON-CKP',
           'T-JUS-CKP',
           'QGL',
           'class']
rare_threshold = 0.01

In [4]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [5]:
real_instances = list(class_and_file_generator(data_path, real=True, simulated=False, drawn=False))
simulated_instances = list(class_and_file_generator(data_path, real=False, simulated=True, drawn=False))

In [7]:
#simulirani primjeri za znacajku 'well' imaju vrijednost -1

def load_instance(instance):
    class_code, instance_path = instance
    try:
        well, instance_id = instance_path.stem.split('_')
        if 'WELL' in well:
            well = well.split('-')[1]
        if 'SIMULATED' in well:
            well = '-1'
        df = pd.read_csv(instance_path, index_col='timestamp', parse_dates=['timestamp'])
        assert (df.columns == columns).all(), "invalid columns in the file {}: {}".format(str(instance_path), str(df.columns.tolist()))
        df['class_code'] = class_code
        df['well'] = well
        df['instance_id'] = instance_id
        df = df[['class_code', 'well', 'instance_id'] + columns]
        return df
    except Exception as e:
        raise Exception('error reading file {}: {}'.format(instance_path, e))
        
def load_instances(instances):
    pool = ThreadPool()
    all_df = []
    try:
        for df in pool.imap_unordered(load_instance, instances):
            all_df.append(df)
    finally:
        pool.terminate()
    return all_df

In [8]:
#ucitavanje stvarnih i simuliranih csv primjera u df-ove

instances = real_instances + simulated_instances
data = load_instances(instances)

In [9]:
#pretvorba identifikatora dogadaja i oznake busotine u integer tipove podataka

for df in data:
    df['instance_id'] = pd.to_numeric(df['instance_id'])
    df['well'] = pd.to_numeric(df['well'])
    
print(df.dtypes)
data[0].head()

class_code       int64
well             int64
instance_id      int64
P-PDG          float64
P-TPT          float64
T-TPT          float64
P-MON-CKP      float64
T-JUS-CKP      float64
QGL            float64
class          float64
dtype: object


Unnamed: 0_level_0,class_code,well,instance_id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-09-22 06:01:20,3,14,20170922060124,23820150.0,14180220.0,56.328092,1924727.0,24.868565,1.237343,3.0
2017-09-22 06:01:30,3,14,20170922060124,23822750.0,14186100.0,56.325837,1903833.0,24.904043,1.241226,3.0
2017-09-22 06:01:40,3,14,20170922060124,23826090.0,14194770.0,56.322797,1879878.0,24.94839,1.230881,3.0
2017-09-22 06:01:50,3,14,20170922060124,23830080.0,14202470.0,56.321162,1860787.0,24.99274,1.231334,3.0
2017-09-22 06:02:00,3,14,20170922060124,23833720.0,14211920.0,56.316696,1836318.0,25.037085,1.273969,3.0


In [10]:
#koliko je null vrijednosti po znacajci u skupu podataka

x = None
for dataFrame in data:
    if x is None:
        x = dataFrame.isna().sum(axis = 0)
    else:
        x += dataFrame.isna().sum(axis = 0)
print(x)

class_code           0
well                 0
instance_id          0
P-PDG              545
P-TPT              567
T-TPT           581249
P-MON-CKP       112190
T-JUS-CKP       170341
QGL            3717542
class              446
dtype: int64


In [11]:
#pretvorba null vrijednosti u konacne vrijednosti uporabom funkcije impute(...)
#dokumentacija https://tsfresh.readthedocs.io/en/latest/api/tsfresh.utilities.html#tsfresh.utilities.dataframe_functions.impute

import tsfresh as tf
print(data[0].dtypes)
imputed_data = []
for df in data:
    df2 = tf.utilities.dataframe_functions.impute(df.loc[:, df.columns != 'timestamp'])
    imputed_df = df2
    imputed_data.append(imputed_df)

class_code       int64
well             int64
instance_id      int64
P-PDG          float64
P-TPT          float64
T-TPT          float64
P-MON-CKP      float64
T-JUS-CKP      float64
QGL            float64
class          float64
dtype: object




In [12]:
#provjera jesu li uklonjene sve null vrijednosti

x = None
for df in imputed_data:
    if x is None:
        x = df.isna().sum(axis = 0)
    else:
        x += df.isna().sum(axis = 0)
print(x)

class_code     0
well           0
instance_id    0
P-PDG          0
P-TPT          0
T-TPT          0
P-MON-CKP      0
T-JUS-CKP      0
QGL            0
class          0
dtype: int64


In [14]:
#stvaranje prozora, svaki prozor sadrzi 30 redaka

data_windowed = []

for df in imputed_data:
    for i in range(df.shape[0]//30):
        window = df.iloc[i*30:(i+1)*30]
        data_windowed.append(window)

In [15]:
print(df.shape)
df.head()


(2881, 10)


Unnamed: 0_level_0,class_code,well,instance_id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-05-13 07:34:40,2,-1,16,28766210.0,20500730.0,125.862543,4063385.0,98.3296,0.0,0.0
2018-05-13 07:34:50,2,-1,16,28766195.0,20500740.0,125.86255,4063385.0,98.329567,0.0,0.0
2018-05-13 07:35:00,2,-1,16,28766195.0,20500740.0,125.86255,4063385.0,98.32953,0.0,0.0
2018-05-13 07:35:10,2,-1,16,28766195.0,20500740.0,125.86255,4063385.0,98.329494,0.0,0.0
2018-05-13 07:35:20,2,-1,16,28766195.0,20500740.0,125.86255,4063385.0,98.329457,0.0,0.0


In [26]:
data_windowed[0].head()

Unnamed: 0_level_0,class_code,well,instance_id,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-09-22 06:01:20,3,14,20170922060124,23820150.0,14180220.0,56.328092,1924727.0,24.868565,1.237343,3.0
2017-09-22 06:01:30,3,14,20170922060124,23822750.0,14186100.0,56.325837,1903833.0,24.904043,1.241226,3.0
2017-09-22 06:01:40,3,14,20170922060124,23826090.0,14194770.0,56.322797,1879878.0,24.94839,1.230881,3.0
2017-09-22 06:01:50,3,14,20170922060124,23830080.0,14202470.0,56.321162,1860787.0,24.99274,1.231334,3.0
2017-09-22 06:02:00,3,14,20170922060124,23833720.0,14211920.0,56.316696,1836318.0,25.037085,1.273969,3.0


In [46]:
df = data_windowed[0]

maximum = tf.feature_extraction.feature_calculators.maximum(df)
minimum = tf.feature_extraction.feature_calculators.minimum(df)
median = tf.feature_extraction.feature_calculators.median(df)
quantile = tf.feature_extraction.feature_calculators.quantile(df, 0.2)
mean_change = tf.feature_extraction.feature_calculators.mean_change(df)
avg_scnd_derivative = tf.feature_extraction.feature_calculators.mean_second_derivative_central(df)

#ValueError: shapes (30,6) and (30,) not aligned: 6 (dim 1) != 30 (dim 0)
#mean = tf.feature_extraction.feature_calculators.fft_aggregated(df, [{'aggtype':'centroid'}])
#variance = tf.feature_extraction.feature_calculators.fft_aggregated(df, [{'aggtype':'variance'}])
#skewness = tf.feature_extraction.feature_calculators.fft_aggregated(df, [{'aggtype':'skew'}])
#kurtosis = tf.feature_extraction.feature_calculators.fft_aggregated(df, [{'aggtype':'kurtosis'}])
abs_fft = tf.feature_extraction.feature_calculators.fft_coefficient(df, [{'coeff':1, 'attr':'abs'}])

#ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
coeff_of_variation = tf.feature_extraction.feature_calculators.variation_coefficient(df)

#ValueError: If using all scalar values, you must pass an index
#langevin_1 = tf.feature_extraction.feature_calculators.friedrich_coefficients(df, [{'m':1, 'r':0.2, 'coeff':0}])
#langevin_3 = tf.feature_extraction.feature_calculators.friedrich_coefficients(df, [{'m':3, 'r':0.2, 'coeff':1}])

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().