## This notebook demonstrates the usage of the Fuzzy Inference System pipeline. It trains a FIS on the gas consumption data and shows the performance on a small test batch.

In [1]:
import numpy as np
import csv
import os
import sys
import matplotlib.pyplot as plt

# add path of needed code
sys.path.append(os.getcwd()+'/Code/')

from Fuzzification import cluster, scale
import FIS

%matplotlib inline

In [2]:
def read_data(path, file):
    '''
    Reads in the gas data and returns it as a float type numpy array.

    The file ``path + file + '.csv'`` is parsed as comma separated
    values. The first two columns of every row (row ID and date) are
    removed and the remaining cells are converted to float. A row that
    contains a cell which cannot be converted (normally the header line)
    is left out of the result; its remaining columns are printed instead.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated as-is, so it has to end
        with a path separator.
    file : str
        File name without the '.csv' extension.

    Returns
    -------
    numpy.ndarray
        One row per successfully parsed line, one column per kept
        csv column.
    '''
    data_train = []
    with open(path+file+'.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            try:
                data_train.append([float(x) for x in row[2:]])
            except ValueError:
                # Only a failed float conversion marks a non-data row.
                # Anything else (KeyboardInterrupt, MemoryError, ...) must
                # propagate instead of being reported as a header.
                header = row
                print('The header: ')
                print(header[2:])
    return np.array(data_train)

In [3]:
# Location and base name (without '.csv') of the training set
path = 'Data/gas/'
file = 'train1'
data = read_data(path, file)
# for this data set, the target column is the first column
target_col = 0

The header: 
['gas', 'before1', 'before2', 'peak5', 'sum5', 'peak24', 'sum24', 'mean15', 'hour', 'FH', 'T', 'Q', 'U', 'peak5T', 'diffT', 'std_day', 'std_year', 'kwh', 'kwhpeak5', 'day_year', 'day_week', 'next_day_week']


In [4]:
# specify overlap of your sets, or the variance of the gaussian
overlap = 0.07
# shape of the sets: Gaussian, triangle or trapezoid
mf = 'Gaussian'
# number of clusters per feature (22 entries, as many as the columns printed
# for train1; presumably in column order - confirm against FIS.train)
Ncentroids = [11,7,7,7,9,9,11,9,11,11,11,11,11,9,11,9,9,9,9,11,7,7]
# A name for the FIS file
FIS_name='Demo_WM_gas'


## Training

In [5]:
# train the FIS on the training data with the settings chosen above
# (the next cell reads FIS_name + '.FIS', presumably written by this call)
FIS.train(FIS_name, data, target_col, mf, Ncentroids, overlap)

In [6]:
# read in the trained FIS
# note: mf and overlap are rebound here to the values returned by FIS.read
method, mf, overlap, target_centroids, feature_centroids, RB = FIS.read(FIS_name +'.FIS')

## Compute the threshold

In [7]:
# Batch of outliers; slices of it are used below to compute the threshold
# and to count true positives
path = 'Data/gas/'
file = 'NTH_outliers_2008-2012'
anomalies = read_data(path, file)
# NOTE(review): the header printed for this file starts at 'before1' (no 'gas'
# entry, 21 names versus 22 for train1) - confirm that target_col = 0 refers
# to the same quantity here as in the training data

The header: 
['before1', 'before2', 'peak5', 'sum5', 'peak24', 'sum24', 'mean15', 'hour', 'FH', 'T', 'Q', 'U', 'peak5T', 'diffT', 'std_day', 'std_year', 'kwh', 'kwhpeak5', 'day_year', 'day_week', 'next_day_week']


In [8]:
# compute threshold (NB takes a lot of time so reduce data size)
# if no threshold is given, FIS.test returns the mean firing strength of the
# given data as its third value; only the first 100 outlier rows are used here
_, _, threshold = FIS.test(anomalies[0:100], mf, overlap, target_centroids, feature_centroids, RB, target_col)

In [9]:
# show the value that is passed as `threshold` to FIS.test in the cells below
print('The mean fring strength of the outliers batch is : ', threshold )

('The mean fring strength of the outliers batch is : ', 3.1357921961082006e-16)


## Anomaly Classification

### True Positives

In [11]:
# Anomaly classification (NB takes a lot of time so reduce data size)
# with a different batch of anomalies: rows 100-199, whereas the threshold
# was computed on rows 0-99
_, _, true_positives = FIS.test(anomalies[100:200], mf, overlap, target_centroids, feature_centroids, RB, 
                                target_col, threshold = threshold )

In [12]:
# third value returned by FIS.test for the outlier slice classified above
print('The number of true positives is: ', true_positives)

('The number of true positives is: ', 89)


### False positives

In [13]:
# Load the 'test1' set with the same reader that was used for the training data
path = 'Data/gas/'
file = 'test1'
data_test = read_data(path, file)

The header: 
['gas', 'before1', 'before2', 'peak5', 'sum5', 'peak24', 'sum24', 'mean15', 'hour', 'FH', 'T', 'Q', 'U', 'peak5T', 'diffT', 'std_day', 'std_year', 'kwh', 'kwhpeak5', 'day_year', 'day_week', 'next_day_week']


In [14]:
# Anomaly classification (NB takes a lot of time so reduce data size)
# on rows 100-199 of the test set loaded from 'test1' (not the outliers
# batch), using the same threshold as for the anomalies
_, _, false_positives = FIS.test(data_test[100:200], mf, overlap, target_centroids, feature_centroids, RB, 
                                target_col, threshold = threshold )

In [15]:
# The label matches the variable: this is the count obtained for the test
# slice, stored in false_positives
print('The number of false positives is: ', false_positives)

('The number of true positives is: ', 1)
