# Priprema za projekt

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
from pathlib import Path
from multiprocessing.dummy import Pool as ThreadPool
from collections import defaultdict
from natsort import natsorted

In [20]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# 1. Učitavanje podataka

In [21]:
data_path = Path('data')
events_names = {0: 'Normal',
                1: 'Abrupt Increase of BSW',
                2: 'Spurious Closure of DHSV',
                3: 'Severe Slugging',
                4: 'Flow Instability',
                5: 'Rapid Productivity Loss',
                6: 'Quick Restriction in PCK',
                7: 'Scaling in PCK',
                8: 'Hydrate in Production Line'
               }
columns = ['P-PDG',
           'P-TPT',
           'T-TPT',
           'P-MON-CKP',
           'T-JUS-CKP',
           'P-JUS-CKGL',
           'T-JUS-CKGL',
           'QGL',
           'class']
rare_threshold = 0.01

In [22]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [23]:
real_instances = list(class_and_file_generator(data_path, real=True, simulated=False, drawn=False))
simulated_instances = list(class_and_file_generator(data_path, real=False, simulated=True, drawn=False))
drawn_instances = list(class_and_file_generator(data_path, real=False, simulated=False, drawn=True))


In [24]:


cols = pd.read_csv(real_instances[0][1]).columns
col_na_val_num = [0]*cols.shape[0]
val_num = 0
num_examples = 0
num_examples_with_nan_val = [0]*cols.shape[0]
print(cols)
for i, instance in enumerate(real_instances[::5]):
    df = pd.read_csv(instance[1])
    val_num += df.shape[0]
    num_examples += 1
    for i, col in enumerate(cols):
        num_na = df[col].isna().sum()
        if num_na > 0:
            num_examples_with_nan_val[i] += 1 
        col_na_val_num[i] += num_na
df2 = pd.read_csv(real_instances[0][1])
df2.head()

print("processed_real")

cols2 = pd.read_csv(simulated_instances[0][1]).columns
col_na_val_num2 = [0]*cols2.shape[0]
val_num2 = 0
num_examples2 = 0
num_examples_with_nan_val2 = [0]*cols2.shape[0]
for i, instance in enumerate(simulated_instances[::5]):
    df = pd.read_csv(instance[1])
    val_num2 += df.shape[0]
    num_examples2 += 1
    for i, col in enumerate(cols):
        num_na2 = df[col].isna().sum()
        if num_na2 > 0:
            num_examples_with_nan_val2[i] += 1 
        col_na_val_num2[i] += num_na2
df_sim = pd.read_csv(simulated_instances[0][1])
df_sim.head()

print("processed_simulated")

cols3 = pd.read_csv(drawn_instances[0][1]).columns
col_na_val_num3 = [0]*cols3.shape[0]
val_num3 = 0
num_examples3 = 0
num_examples_with_nan_val3 = [0]*cols3.shape[0]
for i, instance in enumerate(drawn_instances[::5]):
    df = pd.read_csv(instance[1])
    val_num3 += df.shape[0]
    num_examples3 += 1
    for i, col in enumerate(cols):
        num_na3 = df[col].isna().sum()
        if num_na3 > 0:
            num_examples_with_nan_val3[i] += 1 
        col_na_val_num3[i] += num_na3
df_drawn = pd.read_csv(drawn_instances[0][1])
df_drawn.head()

print("processed_drawn")

zipped = list(zip(cols, col_na_val_num, num_examples_with_nan_val
                 , col_na_val_num2, num_examples_with_nan_val2
                 , col_na_val_num3, num_examples_with_nan_val3))

Index(['timestamp', 'P-PDG', 'P-TPT', 'T-TPT', 'P-MON-CKP', 'T-JUS-CKP',
       'P-JUS-CKGL', 'T-JUS-CKGL', 'QGL', 'class'],
      dtype='object')
processed_real
processed_simulated
processed_drawn


In [25]:

df = pd.DataFrame(zipped)

df.columns = ['value', 'real_na_num','examples_with_na', 
              'sim_na_num','sim_with_na', 
              'drawn_na_num','drawn_with_na',] 
print("real examples:", num_examples)
print("simulated examples", num_examples2)
print("drawn examples", num_examples3)
df.head(10)

real examples: 205
simulated examples 188
drawn examples 4


Unnamed: 0,value,real_na_num,examples_with_na,sim_na_num,sim_with_na,drawn_na_num,drawn_with_na
0,timestamp,0,0,0,0,0,0
1,P-PDG,1087,5,0,0,0,0
2,P-TPT,1179,5,0,0,0,0
3,T-TPT,1190,5,1160957,43,0,0
4,P-MON-CKP,140515,11,0,0,0,0
5,T-JUS-CKP,276373,20,0,0,0,0
6,P-JUS-CKGL,775870,77,6927621,188,705602,4
7,T-JUS-CKGL,2933662,205,6927621,188,705602,4
8,QGL,647187,59,6927621,188,705602,4
9,class,982,10,0,0,0,0


In [6]:
df_real = pd.read_csv(real_instances[0][1])

In [7]:
df_real.head()

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-09-25 01:00:31.000000,23423770.0,13704100.0,56.55643,2912794.0,36.08957,20146160.0,,1.629969,3
1,2017-09-25 01:00:32.000000,23423620.0,13704040.0,56.55665,2918400.0,36.0875,20146040.0,,1.629156,3
2,2017-09-25 01:00:33.000000,23423470.0,13703970.0,56.55687,2924006.0,36.08543,20145920.0,,1.628343,3
3,2017-09-25 01:00:34.000000,23423320.0,13703610.0,56.55733,2929612.0,36.08337,20145800.0,,1.62753,3
4,2017-09-25 01:00:35.000000,23423170.0,13703240.0,56.55778,2935218.0,36.08131,20145680.0,,1.626717,3


In [8]:
print(df_real['QGL'].isna().sum())
df_real.tail()

0


Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
17965,2017-09-25 05:59:56.000000,24411670.0,16584760.0,41.25545,1221508.0,33.09589,19826650.0,,1.235328,3
17966,2017-09-25 05:59:57.000000,24411210.0,16584340.0,41.24736,1221281.0,33.09562,19826650.0,,1.232593,3
17967,2017-09-25 05:59:58.000000,24411020.0,16584660.0,41.23891,1221053.0,33.09534,19826650.0,,1.229858,3
17968,2017-09-25 05:59:59.000000,24410830.0,16584970.0,41.23046,1220826.0,33.09507,19826650.0,,1.227123,3
17969,2017-09-25 06:00:00.000000,24410650.0,16585030.0,41.22222,1221356.0,33.09479,19826650.0,,1.224388,3


In [9]:
df_simulated = pd.read_csv("data/1/SIMULATED_00001.csv")

In [28]:
print(df_simulated['QGL'].isna().sum())
print(df_simulated.shape)
df_simulated.head()

45599
(45599, 10)


Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2018-10-06 03:57:02.000000,22520410.0,13068630.0,96.93275,1049626.0,70.33402,,,,0
1,2018-10-06 03:57:03.000000,22520430.0,13068650.0,96.93279,1049626.0,70.33425,,,,0
2,2018-10-06 03:57:04.000000,22520440.0,13068640.0,96.93283,1049626.0,70.33449,,,,0
3,2018-10-06 03:57:05.000000,22520420.0,13068600.0,96.93287,1049626.0,70.33473,,,,0
4,2018-10-06 03:57:06.000000,22520390.0,13068560.0,96.9329,1049626.0,70.33496,,,,0


In [30]:
df_simulated2 = pd.read_csv("data/1/SIMULATED_00004.csv")
print(df_simulated2['QGL'].isna().sum())

59999


In [25]:
df_real[0:1]

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,P-JUS-CKGL,T-JUS-CKGL,QGL,class
0,2017-02-01 02:02:07.000000,0.0,10092110.0,119.0944,1609800.0,84.59782,1564147.0,,0.0,0


In [None]:
grouped = df_

In [27]:
df_real.dtypes

timestamp      object
P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float64
QGL           float64
class           int64
dtype: object

In [43]:
df_real['timestamp']

0        2017-02-01 02:02:07.000000
1        2017-02-01 02:02:08.000000
2        2017-02-01 02:02:09.000000
3        2017-02-01 02:02:10.000000
4        2017-02-01 02:02:11.000000
                    ...            
17869    2017-02-01 06:59:56.000000
17870    2017-02-01 06:59:57.000000
17871    2017-02-01 06:59:58.000000
17872    2017-02-01 06:59:59.000000
17873    2017-02-01 07:00:00.000000
Name: timestamp, Length: 17874, dtype: object

In [40]:
df_real['P-PDG']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
17869    0.0
17870    0.0
17871    0.0
17872    0.0
17873    0.0
Name: P-PDG, Length: 17874, dtype: float64

In [55]:
df_real.iloc(3)

ValueError: No axis named 3 for object type DataFrame