# #03 - TEDA-KNN Algorithm

## 1. Modules import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from random import randint, sample

from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, recall_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

#### Importing knn

In [2]:
from sklearn import neighbors

#### Importing our modules:

In [3]:
from teda import TEDA
from treating import dataTreating, dataSpliting, dataProcessing, dataSplit

In [4]:
def find_indices(list_to_check, item_to_find):
    indices = []
    for idx, value in enumerate(list_to_check):
        if value == item_to_find:
            indices.append(idx)
    return indices

## 2. Data managing

### 2.1 Power data:

In [5]:
data = pd.read_csv("Electric_Production.csv")
data.rename(columns={'DATE':'Date', 'IPG2211A2N':'Production'}, inplace=True)
data.head()

Unnamed: 0,Date,Production
0,1/1/1985,72.5052
1,2/1/1985,70.672
2,3/1/1985,62.4502
3,4/1/1985,57.4714
4,5/1/1985,55.3151


In [6]:
series = "Production"
date = "Date"

### 2.2 Inserting outliers:

In [7]:
data['label'] = 0

In [8]:
outliers_percentage = 8/100

#Outliers do tipo zero
#z = o[0:np.round(len(o)/2).astype(int)]
z = [61,37,287,230,214,162,98,30,278,28,25,314,348,195,113,387]

#Outliers do tipo pico
#p = o[np.round(len(o)/2).astype(int): -1]
p = [125,242,291,377,34,206,249,118,389,12,376,375,74,146,272,368]

#### Peak-type outliers will have **twice** the magnitude. Therefore:

In [9]:
data['Outlier'] = data[series]

data['Outlier'].iloc[p] = 1.5*data[series].iloc[p]
data['label'].iloc[p] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


#### Zero-type outliers has value equals **zero**:

In [10]:
data['Outlier'].iloc[z] = 0
data['label'].iloc[z] = 1

### Let's see the new curve:

In [11]:
figure = make_subplots(specs=[[{"secondary_y": True}]])

figure.update_layout(
    title="Dataset"
)

figure.add_trace(
    go.Scatter(y=data['Production'], name='Without Outliers'),
    secondary_y=False
)

figure.update_xaxes(title='Time (months)')
figure.update_yaxes(title='Eletric Production', secondary_y=False)
figure.show()

In [12]:
figure = make_subplots(specs=[[{"secondary_y": True}]])

figure.update_layout(
    title="Dataset with outliers"
)

figure.add_trace(
    go.Scatter(y=data['Outlier'], name='With Outliers'),
    secondary_y=False
)

figure.update_xaxes(title='Time (months)')
figure.update_yaxes(title='Eletric Production', secondary_y=False)
figure.show()

In [13]:
RMSE_full = []
MAE_full = []

window = 3
forecasting = 2
n_neighbors = 10

# Data processing
df, df_train, df_test, X_train, y_train, X_test, y_test = dataSplit(data["Outlier"], data["label"], window=window, forecasting=forecasting, N_splits=4)

#TEDA
teda = TEDA(threshold=1.2)
N_outlier_max = 3    # Consecutive outlier correction flagN_outlier_max   # Maximum of outliers corrected by TEDAForecasting
correction = True    # Consecutive outlier correction flag

FLAG = []

# kNN Model
n_neighbors = 10
knn_model = neighbors.KNeighborsRegressor(n_neighbors, weights="uniform")

#fit
knn_model.fit(X_train, y_train)


Y_KNN = []
Ya_py = []          # Vector of auxiliary predicted values
Y_py = []           # Vector of predicted values
Y_v = []            # Vector of virtual predicted values
DB = []             # Vector of saved values (database)
outlier_count = 0   # Outlier count 
y_knn_pred_py = 0.0     # Initial predicted value
X_ant = np.zeros(window).tolist()         # Initial previous input


for i in X_test:
    ## Acquiring the data input

    valor_atual = []
    for j in X_ant[1:]:
        valor_atual.append(j)
    valor_atual.append(i[-1])

    #Checking if it is an outlier
    flag = teda.run(valor_atual[-1])
    FLAG.append(flag)

    ## First Stage: Correcting outlier
    if (flag == 1):      
        outlier_count = outlier_count + 1
        valor_atual[-1] = y_knn_pred_py[0]
    else:
        outlier_count = 0
    
    #Evaluating the flood of outliers
    if(correction == True and outlier_count == N_outlier_max + 1):
        valor_atual[-1] = i[0]
    
    #Saving 
    Ya_py.append(y_knn_pred_py)

    input_vector_reshaped = np.array(valor_atual).reshape(1,-1)
    #Predicting the next value
    y_knn = knn_model.predict(input_vector_reshaped)
    y_knn_pred_py = y_knn[0]

    Y_KNN.append(y_knn_pred_py)

    
    # Second stage
    if (flag == 1):
        valor_atual[-1] = y_knn_pred_py[0]

    if(correction == True and outlier_count == N_outlier_max + 1):
        outlier_count = 0
        valor_atual[-1] = i[0]
        
    DB.append(valor_atual)

    #Updating previous values vector
    X_ant = valor_atual


df_test['flag_py'] = FLAG
df_test['save_py'] = DB
df_test['ya_pred_py'] = Ya_py
df_test['y_knn_pred_py'] = Y_KNN

numberOfOutliersPython = df_test['flag_py'].sum()
outliersIndexPython = find_indices(df_test['flag_py'], 1)

# TEDA Metrics:
f1 = f1_score(df_test['label'], df_test['flag_py'])
accuracy = accuracy_score(df_test['label'], df_test['flag_py'])
recall = recall_score(df_test['label'], df_test['flag_py'])


# Forecasting
RMSE_KNN = []
for i in range(0, forecasting):
    rmse = mean_squared_error(y_test[0:-1, i], np.array(Y_KNN)[1:, i], squared=False)
    RMSE_KNN.append(rmse)
rmse_pred = mean_squared_error(y_test[0:-1, 0], np.array(Y_KNN)[1:, 0], squared=False)
RMSE_full.append(RMSE_KNN)
RMSE_mean = np.mean(RMSE_KNN)
RMSE_max = np.max(RMSE_KNN)
RMSE_min = np.min(RMSE_KNN)
RMSE_std = np.std(RMSE_KNN)

MAE_KNN = []
for i in range(0, forecasting):
    mae = mean_absolute_error(y_test[0:-1, i], np.array(Y_KNN)[1:, i])
    MAE_KNN.append(mae)
mae_pred = mean_absolute_error(y_test[0:-1, 0], np.array(Y_KNN)[1:, 0])
MAE_full.append(MAE_KNN)
MAE_mean = np.mean(MAE_KNN)
MAE_max = np.max(MAE_KNN)
MAE_min = np.min(MAE_KNN)
MAE_std = np.std(MAE_KNN)


# Visualization

In [14]:
figure = make_subplots(specs=[[{"secondary_y": True}]])

figure.update_layout(
    title="Dataset with outliers"
)

figure.add_trace(
    go.Scatter(y=y_test[0:-1, 0], name='Production With Outliers'),
    secondary_y=False
)

figure.add_trace(
    go.Scatter(y=np.array(Y_KNN)[1:, 0], name='kNN'),
    secondary_y=False
)


figure.update_xaxes(title='Time (months)')
figure.update_yaxes(title='Eletric Production', secondary_y=False)
figure.show()