In [1]:
import glob
import numpy as np
import pandas as pd
from math import sqrt
from pickle import dump
from pickle import load
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


from pyts.classification import BOSSVS

In [2]:
# Name of the alarm column to model. It is passed to create_df() to select
# the target column and is read by preprocessed_df() when naming the saved
# scaler file.
alarm = '1031_H'

In [3]:
def create_df(alarm_name, data_glob=r"S:\SRH\BDBA_Sem_2\Case_study_1\data\*.pkl"):
    """Load all pickled frames and keep a single alarm column.

    Every file matching ``data_glob`` is read with ``pd.read_pickle``, the
    frames are stacked row-wise in sorted key order, the five alarm columns
    other than ``alarm_name`` are dropped and missing values are set to 0.

    Args:
        alarm_name: alarm column to keep; must be one of the names listed in
            ``alarms`` below.
        data_glob: glob pattern of the pickle files to load. Defaults to the
            location that was previously hard-coded.

    Returns:
        pandas.DataFrame with the remaining columns of all files and the
        selected alarm column. The row index carries the per-file key as an
        extra outer level (a consequence of concatenating a dict).

    Raises:
        ValueError: if ``alarm_name`` is not a known alarm or if no file
            matches ``data_glob``.
    """
    alarms = ['1031_H', '1031_L', '1034_H', '1034_L', '1037_H', '1037_L']
    if alarm_name not in alarms:
        raise ValueError(
            f"Unknown alarm {alarm_name!r}; expected one of {alarms}")

    # SECURITY NOTE: unpickling can execute arbitrary code, so only point
    # ``data_glob`` at files from a trusted source.
    # Each frame is keyed by the 4th-from-last "_"-separated token of its
    # path. NOTE(review): files that share this token silently overwrite each
    # other in the dict - confirm the naming scheme guarantees unique keys.
    dfs = {file.split("_")[-4]: pd.read_pickle(file)
           for file in glob.glob(data_glob)}
    if not dfs:
        raise ValueError(f"No pickle files found matching {data_glob!r}")

    # Sort by key so the row order does not depend on directory listing order.
    df_single = pd.concat(dict(sorted(dfs.items())), axis=0)

    other_alarms = [name for name in alarms if name != alarm_name]
    return df_single.drop(other_alarms, axis=1).fillna(0)

In [4]:
# Build the modelling frame: all loaded files stacked together, with only the
# selected alarm column kept and missing values filled with 0.
df_alarm = create_df(alarm)

In [5]:
def input_sequence(data, past_seq_len, future_window, full_horizon_only=False):
    """Cut a frame into sliding input windows with a binary look-ahead label.

    The last column of ``data`` is the target; every other column is a
    feature. Sample ``i`` consists of the feature rows
    ``i .. i + past_seq_len - 1`` and is labelled 1 when any target value in
    the following ``future_window`` rows is greater than 0, else 0.

    Args:
        data: input dataframe whose last column is the target.
        past_seq_len: number of rows in each input window (non-negative int).
        future_window: number of rows after the input window that are
            inspected for a positive target (non-negative int).
        full_horizon_only: when False (default) ``len(data) - past_seq_len - 1``
            samples are produced, so the last samples are labelled from a
            look-ahead window that is cut short by the end of the data. When
            True only samples with a complete ``future_window`` are produced.

    Returns:
        Tuple ``(in_array, out_array)`` of float32 arrays with shapes
        ``(n_samples, past_seq_len, n_features)`` and ``(n_samples, 1)``.
        With too few rows both arrays are empty but keep these ranks.

    Raises:
        ValueError: if ``past_seq_len`` or ``future_window`` is negative.
    """
    if past_seq_len < 0 or future_window < 0:
        raise ValueError("past_seq_len and future_window must be non-negative")

    n_rows = len(data)
    # Convert once up front instead of slicing the DataFrame per sample.
    features = data.iloc[:, :-1].to_numpy()
    alarm_on = (data.iloc[:, -1] > 0).to_numpy()

    if full_horizon_only:
        n_samples = n_rows - past_seq_len - future_window + 1
    else:
        n_samples = n_rows - past_seq_len - 1
    n_samples = max(n_samples, 0)

    # Row indices of every window, shape (n_samples, past_seq_len); indexing
    # with it gathers all windows in a single step.
    starts = np.arange(n_samples)
    window_idx = starts[:, None] + np.arange(past_seq_len)[None, :]
    in_array = features[window_idx].astype(np.float32)

    # hits[k] = number of positive targets in rows [0, k), so the count inside
    # rows [first, last) is hits[last] - hits[first].
    hits = np.concatenate(([0], np.cumsum(alarm_on)))
    first = starts + past_seq_len
    # Clip at the end of the data: a look-ahead window may be shorter there.
    last = np.minimum(first + future_window, n_rows)
    out_array = ((hits[last] - hits[first]) > 0).astype(np.float32)

    return in_array, out_array.reshape(-1, 1)

In [6]:
def preprocessed_df(df, val_pct, alarm_name=None):
    """Split ``df`` by row position into train/test and min-max scale the features.

    The last ``round(len(df) * val_pct)`` rows form the test set and the rows
    before them the training set. A ``MinMaxScaler`` is fitted on the training
    feature columns only (every column except the last), pickled to disk and
    applied to both parts.

    Args:
        df: dataframe whose last column is the alarm target.
        val_pct: fraction of rows held out as the test set (float).
        alarm_name: suffix used in the scaler file name. Defaults to the
            notebook-level ``alarm`` variable.

    Returns:
        Tuple ``(df_train, df_test)``. Each frame holds the scaled features
        under integer column names plus the unscaled target in a column
        named ``'alarm'``.

    Raises:
        ValueError: if the split would leave the train or the test part empty.
    """
    if alarm_name is None:
        # Fall back to the notebook-level setting, as this function always did.
        alarm_name = alarm

    n_rows = df.shape[0]
    test_data_size = round(n_rows * val_pct)
    # Use an explicit split index: slicing with ``-test_data_size`` misbehaves
    # when the size is 0 (``df[:-0]`` is empty and ``df[-0:]`` is everything).
    split_at = n_rows - test_data_size
    if not 0 < split_at < n_rows:
        raise ValueError(
            f"val_pct={val_pct!r} leaves an empty train or test set "
            f"for a frame with {n_rows} rows")

    train_data = df.iloc[:split_at]
    test_data = df.iloc[split_at:]

    # Fit the scaler on the training features only, so that no statistics of
    # the test rows leak into the scaling.
    scalar = MinMaxScaler()
    scalar.fit(train_data.iloc[:, :-1])

    # Save the scaler. The path keeps its Windows-style separator; the raw
    # string avoids the invalid "\s" escape, and ``with`` closes the file.
    with open(r'model_objects\scaler_cls_knn' + alarm_name + '.pkl', 'wb') as scaler_file:
        dump(scalar, scaler_file)

    train_scaled = scalar.transform(train_data.iloc[:, :-1])
    test_scaled = scalar.transform(test_data.iloc[:, :-1])

    # Re-attach the unscaled target as the last column of each part.
    df_train = pd.DataFrame(train_scaled)
    df_train['alarm'] = train_data.iloc[:, -1].values
    df_test = pd.DataFrame(test_scaled)
    df_test['alarm'] = test_data.iloc[:, -1].values

    return df_train, df_test

In [7]:
# Hold out the last 30% of rows as the test set; the feature scaler is fitted
# on the remaining (training) rows only.
df_train, df_test = preprocessed_df(df_alarm, 0.3)

In [8]:
# Window configuration, counted in rows of the input frame:
#   n_steps       - number of past rows that form one input sample
#   future_window - number of following rows checked for an active alarm
n_steps = 15
future_window = 20

# Cut each split into (samples, n_steps, features) windows with 0/1 labels
X_train, y_train = input_sequence(df_train, n_steps, future_window)
X_test, y_test = input_sequence(df_test, n_steps, future_window)

In [9]:
# Sanity check of the window layout: (samples, n_steps, features)
X_train.shape

(16673, 15, 6)

In [10]:
# Flatten every (time step, feature) window into a single feature vector per
# sample. Letting reshape infer the second dimension (-1) keeps this cell safe
# to re-run: executed again on the already flat array it is a no-op instead of
# failing on a three-way shape unpack.
X_train = X_train.reshape(X_train.shape[0], -1)

In [11]:
# Flatten the (n, 1) training label column returned by input_sequence() into a 1-D vector
y_train = y_train.ravel()

In [12]:
# Flatten the (n, 1) test label column returned by input_sequence() into a 1-D vector
y_test = y_test.ravel()

In [13]:
# Flatten the test windows the same way as the training windows. Letting
# reshape infer the second dimension (-1) keeps this cell safe to re-run.
X_test = X_test.reshape(X_test.shape[0], -1)

In [14]:
# Confirm that the flattened 2-D feature matrices and the 1-D label vectors
# have matching sample counts before fitting.
print(f"X_train.shape: {X_train.shape},y_train.shape: {y_train.shape}\n"
      f"X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}")

X_train.shape: (16673, 90),y_train.shape: (16673,)
X_test.shape: (7137, 90), y_test.shape: (7137,)


In [15]:
# Earlier experiment, currently disabled:
# clf = KNeighborsClassifier(metric='dtw')

# Grid-search the BOSSVS `window_size` over three candidates, using 5-fold
# cross-validation on the training samples.
# NOTE(review): neighbouring samples are overlapping sliding windows, so the
# folds are presumably not independent - confirm whether a time-aware
# splitter would be more appropriate.
# NOTE(review): each row of X is a flattened (time step x feature) window, so
# values of different features are interleaved within one row - confirm this
# layout is what BOSSVS is meant to receive.
clf = GridSearchCV(BOSSVS(),{'window_size': ([4, 12, 15])}, cv=5)

clf.fit(X_train, y_train)
# Score on the held-out test windows (presumably mean accuracy, the usual
# default classifier score - TODO confirm).
accuracy = clf.score(X_test, y_test)
# NOTE(review): the recorded value (~0.43) is below 0.5 on a two-class
# target, i.e. worse than always guessing one class - worth investigating.
accuracy

0.4308532997057587

In [16]:
# Parameter combination selected by the grid search
clf.best_params_

{'window_size': 4}