<a href="https://colab.research.google.com/github/mhdadizadeh/Time-series-data-anomaly-detection/blob/main/Time_series_anomaly_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /drive; the input/output paths defined later in this
# notebook live under /drive/MyDrive. Only works inside Google Colab.
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [None]:
# trend neighbor residual product test
import os
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.tsa.seasonal import seasonal_decompose
# from dateutil.parser import parse
import math

#++++++++++++++
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
#--------------

# input_path = os.environ.get('DATA_104_PATH') + '/test_data'
# output_path = '/output'

#+++++++++++++++++
# Folder for per-file result CSVs (currently only referenced by the
# commented-out to_csv call in the main loop).
output_path = '//drive//MyDrive//data_days//output'
# Folder that is listed with os.listdir to find the input CSV files.
input_path = "//drive//MyDrive//data_days//dataset//"
#-----------------


  #+++++++++++++ 
class mh_plot:
    """Callable helper that draws a wide scatter plot of a slice of a frame."""

    def __call__(self, df, x_axis, y_axis, hue_by, start=0, end=100000):
        """Scatter-plot rows ``start:end`` of ``df`` and show the figure.

        Args:
            df: Frame holding the columns named by the other arguments.
            x_axis: Column name used for the x coordinate.
            y_axis: Column name used for the y coordinate.
            hue_by: Column name used to colour the points.
            start: First row (positional slice start) to include.
            end: Slice end; rows at or beyond it are not drawn.
        """
        # One wide, short canvas per call.
        plt.figure(figsize=(30, 5))
        window = df[start:end]
        sns.scatterplot(
            data=window,
            x=x_axis,
            y=y_axis,
            hue=hue_by,
            legend='full',
        )
        plt.show()
  #--------------

class TimeSeriesAnomalyDetector:
    """Label anomalous rows of a univariate time series.

    Calling an instance runs three detectors and ORs their verdicts:

    * residuals around a fitted degree-2 polynomial of ``timestamp``,
    * distance of each point to its nearer immediate neighbour,
    * residuals of an additive seasonal decomposition.

    Each detector reduces its signal to a boolean mask through the same
    percentile-fence rule, differing only in the percentiles used.
    Input frames must provide ``timestamp`` and ``value`` columns.
    """

    # Seasonal period (in rows) passed to seasonal_decompose.
    DECOMPOSE_PERIOD = 30

    def __call__(self, df):
        """Return ``timestamp``, ``value`` and an integer 0/1 ``label`` column.

        Args:
            df: Frame with ``timestamp`` and ``value`` columns. It is not
                modified; all work happens on a copy.

        Returns:
            A frame with columns ``timestamp``, ``value`` and ``label``
            (1 = flagged by at least one detector, 0 otherwise).
        """
        main_df = df.copy()
        # Shift the series so that every value is >= 1.
        # NOTE(review): the returned 'value' column carries this shift, as it
        # always has; confirm whether callers expect the original values.
        main_df['value'] += abs(main_df['value'].min()) + 1

        # Each detector runs exactly once; masks are combined positionally.
        predict_1 = np.asarray(self.regression_outlier_model(main_df), dtype=bool)
        predict_2 = np.asarray(self.neighborhood_distance(main_df), dtype=bool)
        predict_3 = np.asarray(self.decomposition_model(main_df), dtype=bool)

        # astype(int) gives a stable integer dtype regardless of pandas version.
        main_df['label'] = (predict_1 | predict_2 | predict_3).astype(int)

        return main_df[['timestamp', 'value', 'label']]

    @staticmethod
    def _midpoint_percentile(values, q):
        """Return the q-th percentile of ``values`` using midpoint interpolation."""
        try:
            return np.percentile(values, q, method='midpoint')
        except TypeError:
            # Older NumPy releases only accept the former keyword name.
            return np.percentile(values, q, interpolation='midpoint')

    def _fence_outliers(self, input_values, low_pct, high_pct):
        """Flag values lying strictly outside the percentile fences.

        The fences are ``P_low - 1.5 * spread`` and ``P_high + 1.5 * spread``
        with ``spread = P_high - P_low``. Comparisons are strict so that a
        zero spread (e.g. a constant input) does not flag every point.

        Args:
            input_values: pandas Series or any array-like of numbers.
            low_pct: Lower percentile (0-100).
            high_pct: Upper percentile (0-100).

        Returns:
            A boolean Series (same index) for Series input, otherwise a
            boolean NumPy array.
        """
        if isinstance(input_values, pd.Series):
            values = input_values
        else:
            values = np.asarray(input_values)

        # np.percentile cannot handle an empty input; nothing to flag anyway.
        if len(values) == 0:
            return values.astype(bool)

        low = self._midpoint_percentile(values, low_pct)
        high = self._midpoint_percentile(values, high_pct)
        margin = 1.5 * (high - low)

        return (values > high + margin) | (values < low - margin)

    def linear_detect_outliers(self, input_values):
        """Fence rule on the 25th/75th percentiles (classic IQR fences)."""
        return self._fence_outliers(input_values, 25, 75)

    def decompose_detect_outliers(self, input_values):
        """Fence rule on the 5th/95th percentiles (decomposition residuals)."""
        return self._fence_outliers(input_values, 5, 95)

    def neighbor_detect_outliers(self, input_values):
        """Fence rule on the 3rd/97th percentiles (neighbour distances)."""
        return self._fence_outliers(input_values, 3, 97)

    def neighborhood_distance(self, input_df):
        """Flag points that are far from both of their immediate neighbours.

        Args:
            input_df: Frame with a ``value`` column.

        Returns:
            Boolean Series named ``predict`` on the index of ``input_df``.
        """
        values = input_df['value']
        left_dist = (values - values.shift(1)).abs()
        right_dist = (values - values.shift(-1)).abs()
        # Row-wise min skips the NaN produced by shift() at either end, so the
        # first/last rows use their only available neighbour.
        neighbor_dist = pd.concat([left_dist, right_dist], axis=1).min(axis=1)

        return self.neighbor_detect_outliers(neighbor_dist).rename('predict')

    def linear_regression(self, input_df):
        """Fit ``value ~ timestamp`` and return the fitted line as an (n, 1) array.

        Prints the fitted line as a side effect.
        """
        train_x = np.array(input_df[['timestamp']])
        train_y = np.array(input_df[['value']])

        regr = linear_model.LinearRegression()
        regr.fit(train_x, train_y)

        regression_line = regr.coef_[0][0] * train_x + regr.intercept_[0]
        print('line: ', regression_line)

        return regression_line

    def Polynomial_line(self, input_df):
        """Fit a degree-2 polynomial of ``timestamp`` to ``value``.

        Returns:
            The fitted curve evaluated at every row's timestamp, as an
            (n, 1) array.
        """
        train_x = np.array(input_df[['timestamp']])
        train_y = np.array(input_df[['value']])

        train_x_poly = PolynomialFeatures(degree=2).fit_transform(train_x)

        regr = linear_model.LinearRegression()
        regr.fit(train_x_poly, train_y)

        # Evaluate at the same x values the model was fitted on; using row
        # positions instead would only be right when timestamps are 0..n-1.
        regression_line = regr.predict(train_x_poly)

        return np.reshape(regression_line, (len(input_df.index), 1))

    def regression_outlier_model(self, input_df):
        """Flag outliers in the residuals around the polynomial trend.

        Returns:
            A list of booleans, one per row of ``input_df``.
        """
        no_slope_value = input_df['value'] - self.Polynomial_line(input_df)[:, 0]
        return list(self.linear_detect_outliers(no_slope_value))

    def trend_line(self, input_df):
        """Return the additive-decomposition trend as an (n, 1) array.

        The first and last ``DECOMPOSE_PERIOD`` entries are overwritten with
        the nearest entry just inside that margin, which also replaces the
        NaN values the moving-average trend has at both ends.
        """
        period = self.DECOMPOSE_PERIOD
        series = input_df[['timestamp', 'value']].set_index('timestamp')

        additive_decomposition = seasonal_decompose(series, model='additive', period=period)

        # Work on a copy so the decomposition result itself is left untouched.
        trend = additive_decomposition.trend.copy()
        trend.iloc[:period] = trend.iloc[period + 1]
        # An open-ended slice is required here: [-period:0] selects nothing.
        trend.iloc[-period:] = trend.iloc[-period - 1]

        trend = np.array(trend)
        return np.reshape(trend, (len(trend), 1))

    def decomposition_model(self, input_df):
        """Flag outliers in the residual of an additive seasonal decomposition.

        Missing residuals are replaced by the residual mean before the fence
        rule is applied.

        Returns:
            A boolean NumPy array, one entry per row of ``input_df``.
        """
        series = input_df[['timestamp', 'value']].set_index('timestamp')

        additive_decomposition = seasonal_decompose(
            series, model='additive', period=self.DECOMPOSE_PERIOD
        )

        residual = additive_decomposition.resid
        residual = residual.fillna(residual.mean())

        # Hand over a plain array: a timestamp-indexed Series would be aligned
        # by label when the caller stores the mask next to other columns.
        return self.decompose_detect_outliers(residual.to_numpy())

          





# Detector instance that the main loop calls once per input file.
anomaly_detector = TimeSeriesAnomalyDetector()
#+++++++++++++
# Scatter-plot helper; only referenced by the commented-out plotting calls.
plotter = mh_plot()
#--------------

if __name__ == '__main__':
    # Score every CSV in input_path against its ground-truth 'label' column
    # and print per-file F1 scores followed by summary statistics.
    #+++++++++++++
    score_dict = {}
    score_list = []
    # Collected alongside score_list so names and scores can never get out of
    # step (a second directory listing is not guaranteed to match the first).
    score_index = []
    #--------------
    # Small debug subset; not used by the loop below.
    filename_list = ['27.csv', '28.csv']
    for filename in os.listdir(input_path):
        # Skip entries that are not CSV files (e.g. notebook checkpoint
        # folders); read_csv would fail on them.
        if not filename.lower().endswith('.csv'):
            continue

        input_df = pd.read_csv(os.path.join(input_path, filename))
        #+ print(filename, len(input_df))
        result = anomaly_detector(input_df)
        #+++++++++++++
        # print('orginal')
        # plotter(df = input_df, x_axis = 'timestamp', y_axis = 'value', hue_by = 'label')
        # print('predict')
        # plotter(df = result, x_axis = 'timestamp', y_axis = 'value', hue_by = 'label')

        # print(input_df['label'].value_counts())
        # print(result['label'].value_counts())
        f1 = f1_score(input_df['label'], result['label'])
        score_list.append(f1)
        score_index.append(filename.split('.')[0])
        print('|', f1, end=' ')
        print(filename, len(input_df))
        #--------------

        #+ result.to_csv(os.path.join(output_path, filename))
        #- print(f'item {filename} processed.')

    #+++++++++++++
    # One row per processed file, indexed by the file name without extension.
    result = pd.DataFrame(data=score_list, index=score_index, columns=['F1 score'])
    print(result)
    print('mean',result.mean())
    print('median', result.median())
    print(result.value_counts())
    #--------------

| 0.6666666666666666 12.csv 1680
| 0.8333333333333333 15.csv 1680
| 1.0 13.csv 1680
| 1.0 16.csv 1680
| 0.14423752635277584 0.csv 128562
| 0.09379247847757137 1.csv 129128
| 1.0 18.csv 1680
| 0.888888888888889 19.csv 1680
| 1.0 17.csv 1680
| 0.6666666666666666 11.csv 1680
| 0.9696969696969697 14.csv 1680
| 0.888888888888889 10.csv 1680
| 1.0 34.csv 1680
| 0.967741935483871 44.csv 1680
| 0.6555760936537277 2.csv 146254
| 0.6820512820512821 47.csv 8863
| 0.888888888888889 23.csv 1680
| 0.1935483870967742 45.csv 1680
| 0.00045063243932001116 46.csv 128679
| 0.19878603945371773 4.csv 147668
| 0.018691588785046728 41.csv 1680
| 0.0 42.csv 1680
| 1.0 20.csv 1680
| 0.47058823529411764 40.csv 1680
| 1.0 25.csv 1680
| 0.3636363636363636 35.csv 1680
| 0.053231939163498096 39.csv 1680
| 1.0 32.csv 1680
| 0.7272727272727273 29.csv 1680
| 0.5333333333333333 27.csv 1680
| 1.0 21.csv 1680
| 0.028571428571428574 36.csv 1680
| 0.5 31.csv 1680
| 0.6226893802102211 3.csv 146253
| 0.888888888888889 24.csv

In [None]:
# Scratch cell: a flat, three-element integer array to experiment with.
a = np.asarray((0, 1, 2))

In [None]:
# Show the shape of the array created in the previous cell; the notebook
# echoes the value of a cell's last expression.
a.shape

(3,)

In [None]:
# Turn the flat array into a single column of shape (3, 1), the same layout
# the detector's line/trend helpers reshape their results into.
a = np.reshape(a, (3, 1)) # C-like index ordering
# Echo the reshaped array as the cell output.
a

array([[0],
       [1],
       [2]])