# Overview
This notebook contains examples for the `wtphm` readme file

In [18]:
# change to the home directory, so the relative path in the next cell resolves
cd

C:\Users\leahy


In [19]:
# NOTE: machine-specific path to the author's local wtphm folder (relative to
# the home directory) - adjust it to wherever wtphm lives on your machine
cd google drive/ucc/phd/code/modules/wtphm

C:\Users\leahy\google drive\ucc\phd\code\modules\wtphm


# `get_grouped_event_data`

The `get_grouped_event_data` function groups together similar faults or events, and turns them into the "same" event.

An example would be faults across different pitch motors on different turbine blades being grouped as the same type of fault. This is useful, as there are typically _very_ few fault samples on wind turbines, so treating these as three separate types of faults would give even fewer samples for each class.

In [24]:
import wtphm
import pandas as pd

# widen the columns so that long descriptions are displayed in full
pd.set_option('display.max_colwidth', 100)

# read the example events, parsing the on/off timestamps as datetimes
events = pd.read_csv(
    'examples/events_data.csv', parse_dates=['time_on', 'time_off'])
# convert the duration column to timedeltas
events['duration'] = pd.to_timedelta(events['duration'])

events.head()

Unnamed: 0,turbine_num,code,time_on,time_off,duration,stop_cat,description
0,20,9,2015-06-01 02:51:01,2015-06-01 02:54:13,00:03:12,ok,description anonymised
1,18,9,2015-06-01 02:51:38,2015-06-01 02:53:29,00:01:51,ok,description anonymised
2,18,84,2015-06-01 02:53:29,2015-06-01 03:00:57,00:07:28,ok,description anonymised
3,20,84,2015-06-01 02:54:13,2015-06-01 03:09:24,00:15:11,ok,description anonymised
4,18,9,2015-06-01 03:00:57,2015-06-01 03:03:46,00:02:49,ok,description anonymised


Note the `events` data used in the examples here is anonymised - all codes have been mapped to a random set of numbers, and descriptions have been removed

In [25]:
# codes that cause the turbine to come to a stop
stop_codes = events.loc[
    events.stop_cat.isin(['fault', 'maintenance', 'test', 'sensor', 'grid']),
    'code'].unique()

# these are groups of codes, where each group represents a set of pitch-related
# events, where each member of the set represents the same event but along a
# different blade axis
pitch_code_groups = [
    [300, 301, 302],
    [400, 401],
    [501, 502, 503],
    [601, 602],
    [701, 702, 703],
]

# show the first few events whose code belongs to any of the pitch groups
events[events.code.isin(
    [code for group in pitch_code_groups for code in group])].head()

Unnamed: 0,turbine_num,code,time_on,time_off,duration,stop_cat,description
8,20,502,2015-06-01 03:53:30,2015-06-01 04:09:36,00:16:06,fault,description anonymised pitch axis 3
10,20,601,2015-06-01 03:53:31,2015-06-01 03:54:44,00:01:13,fault,description anonymised pitch axis 2
13,20,300,2015-06-01 03:53:40,2015-06-01 04:09:36,00:15:56,fault,description anonymised pitch axis 1
15,20,302,2015-06-01 03:53:40,2015-06-01 04:09:36,00:15:56,fault,description anonymised pitch axis 3
335,8,502,2015-06-01 20:53:42,2015-06-01 21:03:10,00:09:28,fault,description anonymised pitch axis 3


As can be seen, the events data has a number of different codes for data along different pitch axes.

Below, we group these together as the same code:

In [26]:
# group the pitch codes, so each group is represented by a single code
grouped_events, grouped_stop_codes = wtphm.batch.get_grouped_event_data(
    event_data=events,
    code_groups=pitch_code_groups,
    fault_codes=stop_codes,
)

# show the first few pitch-related events of the grouped data
grouped_events[grouped_events.code.isin(
    [code for group in pitch_code_groups for code in group])].head()

Unnamed: 0,turbine_num,code,time_on,time_off,duration,stop_cat,description
8,20,501,2015-06-01 03:53:30,2015-06-01 04:09:36,00:16:06,fault,description anonymised pitch axis 2/3 (original codes 501/502/503)
10,20,601,2015-06-01 03:53:31,2015-06-01 03:54:44,00:01:13,fault,description anonymised pitch axis 2/3 (original codes 601/602)
13,20,300,2015-06-01 03:53:40,2015-06-01 04:09:36,00:15:56,fault,description anonymised pitch axis 1/2/3 (original codes 300/301/302)
15,20,300,2015-06-01 03:53:40,2015-06-01 04:09:36,00:15:56,fault,description anonymised pitch axis 1/2/3 (original codes 300/301/302)
335,8,501,2015-06-01 20:53:42,2015-06-01 21:03:10,00:09:28,fault,description anonymised pitch axis 2/3 (original codes 501/502/503)


# Create the batches

Now, we get the batches. These represent groups of events all linked to the same down-time event, as described in [1]. More information can be found in the documentation for the `batch.get_batch_data` function.

In [30]:
# build the batches from the grouped events and grouped stop codes
batches = wtphm.batch.get_batch_data(
    event_data=grouped_events,
    fault_codes=grouped_stop_codes,
    ok_code=207,
    t_sep_lim='1 hours',
)

batches.head()

Unnamed: 0,turbine_num,fault_start_codes,all_start_codes,start_time,fault_end_time,down_end_time,fault_dur,down_dur,fault_event_ids,all_event_ids
0,20,"(144, 501)","(144, 501)",2015-06-01 03:53:30,2015-06-01 04:09:47,2015-06-01 04:13:53,0 days 00:16:17,0 days 00:20:23,"Int64Index([8, 9, 10, 11, 12, 13, 14, 15, 17, 21, 20, 22, 23, 25, 26, 27], dtype='int64')","Int64Index([ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  25, ..."
1,20,"(73,)","(73, 141)",2015-06-01 17:13:59,2015-06-01 17:13:59,2015-06-01 17:14:51,0 days 00:00:00,0 days 00:00:52,"Int64Index([318], dtype='int64')","Int64Index([317, 318, 319, 320, 321], dtype='int64')"
2,20,"(0,)","(0,)",2015-06-02 14:08:59,2015-06-02 14:20:21,2015-06-02 14:22:30,0 days 00:11:22,0 days 00:13:31,"Int64Index([972, 973, 992], dtype='int64')","Int64Index([972, 973, 974, 975, 976, 992, 993, 1001, 1002, 1003], dtype='int64')"
3,20,"(68, 113, 144, 501)","(68, 113, 144, 501)",2015-06-04 16:22:24,2015-06-04 16:27:42,2015-06-04 16:44:33,0 days 00:05:18,0 days 00:22:09,"Int64Index([1910, 1909, 1908, 1907, 1911, 1912, 1913, 1916, 1920, 1919, 1918,  1917, ...","Int64Index([1910, 1909, 1908, 1907, 1911, 1912, 1913, 1914, 1915, 1916, 1917,  1918, ..."
4,20,"(155,)","(155,)",2015-06-05 09:29:21,2015-06-06 16:37:37,2015-06-06 16:38:02,1 days 07:08:16,1 days 07:08:41,"Int64Index([1985, 3090], dtype='int64')","Int64Index([1985, 1986, 1987, 1988, 1993, 1995, 2003, 2337, 2346, 2356, 2359,  2358, ..."


# Labelling the SCADA data for classification

The main use for this package is to label SCADA data for classification purposes. Here, we use the `classification.scada_labelling.label_stoppages` function.

In order to do this, first the batches must be labelled. This is not covered in this example.

In [33]:
# every stop category present in the events data
events['stop_cat'].unique()

array(['ok', 'fault', 'test', 'maintenance', 'sensor', 'curtailed',
       'grid'], dtype=object)

In [39]:
# stop categories recorded against code 207
grouped_events[grouped_events['code'] == 207]['stop_cat'].unique()

array(['ok'], dtype=object)

In [94]:
from collections import Counter

def _get_batch_fault_start_cats(fault_start_codes, events_data):
    """
    Gets the ``stop_cat`` for the fault start codes of a batch
    
    Args
    ----
    fault_start_codes
    """
    fs_cats = tuple()
    for fsc in fault_start_codes:
        cat = events_data.loc[events_data.code == fsc, 'stop_cat']\
            .unique()[0]
        fs_cats += tuple([cat])
    return fs_cats

def _apply_most_common(batch_fault_start_cats):
    batch_fault_cat_counts = batch_fault_start_cats.apply(
        lambda x: Counter(fault for fault in x))

def label_batch_fault_cats(batches, events_data):
    """
    Gets the ``stop_cat`` of each fault start code for every batch

    Args
    ----
    batches
        Batches dataframe with a ``fault_start_codes`` column
    events_data
        Events dataframe with ``code`` and ``stop_cat`` columns, used to look
        up the category of each code

    Returns
    -------
    Series
        For each batch, a tuple with the ``stop_cat`` of each of its
        ``fault_start_codes``
    """
    # use the events passed in, rather than a global, for the lookup
    return batches.fault_start_codes.apply(
        _get_batch_fault_start_cats, events_data=events_data)
    

# stop categories of the fault start codes of every batch
batch_fault_start_cats = batches.fault_start_codes.apply(
    _get_batch_fault_start_cats, events_data=grouped_events)

In [117]:
# count each stop category per batch, then pick out the batch at index 19
cr = (
    batch_fault_start_cats
    .apply(lambda cats: Counter(cats))
    .loc[19]
)

In [118]:
# track the category with the highest count seen so far, printing each
# category and its count along the way
cur_val = 0
cur_key = None
for k, v in cr.items():
    if v > cur_val:
        cur_key, cur_val = k, v
    print(k, v)
    

fault 4
grid 3


In [113]:
# batches whose fault start categories match this exact sequence
batch_fault_start_cats[
    batch_fault_start_cats
    == ('fault', 'grid', 'grid', 'grid', 'fault', 'fault', 'fault')
]

19      (fault, grid, grid, grid, fault, fault, fault)
186     (fault, grid, grid, grid, fault, fault, fault)
522     (fault, grid, grid, grid, fault, fault, fault)
936     (fault, grid, grid, grid, fault, fault, fault)
1196    (fault, grid, grid, grid, fault, fault, fault)
Name: fault_start_codes, dtype: object

In [None]:


def _get_batch_fault_cat_counts(batch_fault_start_cats):
    """
    Gets the count of each stop_cat for each batch
    """
    
        
        

In [92]:
# stop categories of the fault start codes of every batch
batch_fault_start_cats = batches.fault_start_codes.apply(
    _get_batch_fault_start_cats, events_data=grouped_events)

In [104]:

# count of each stop category among the first batch's fault start codes
Counter(batch_fault_start_cats.iloc[0])


Counter({'fault': 2})

In [101]:
# stop categories of the first batch's fault start codes
batch_fault_start_cats.iloc[0]

('fault', 'fault')

In [100]:
# ``count`` must be called on a tuple with the value to look for; here, the
# number of 'fault' entries among the first batch's fault start categories
batch_fault_start_cats.iloc[0].count('fault')

2

In [59]:
# NOTE(review): no cell visible in this notebook adds a ``fault_start_cats``
# column to ``batches`` - this presumably relies on state left over from an
# earlier session; confirm before re-running
batches.fault_start_cats.values

array([{'fault'}, {'sensor'}, {'maintenance'}, ..., {'test'}, {'test'},
       {'fault'}], dtype=object)