In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

import warnings
# Blanket filter: every warning raised after this line is suppressed.
warnings.filterwarnings('ignore')

## Data Preparation

### EDA

#### Read Data

In [2]:
# Load the training set; the path is relative to the current working directory.
df = pd.read_csv('data/train.csv')
df.head(3)  # preview the first three rows

Unnamed: 0,feature1,feature2,feature3,label
0,156,326,275,0
1,0,372,870,0
2,0,362,8113,0


#### Duplicates

In [3]:
# Preview rows that exactly repeat an earlier row.
df.loc[df.duplicated()].head(3)

Unnamed: 0,feature1,feature2,feature3,label
312,0,293,253,0
388,0,304,1380,0
423,0,305,2507,0


In [4]:
# Keep only the first occurrence of each repeated row.
df = df.drop_duplicates()

#### Basic Info

In [5]:
# Column dtypes, non-null counts and memory usage of the de-duplicated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56651 entries, 0 to 80000
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   feature1  56651 non-null  int64
 1   feature2  56651 non-null  int64
 2   feature3  56651 non-null  int64
 3   label     56651 non-null  int64
dtypes: int64(4)
memory usage: 2.2 MB


In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) for every column.
df.describe()

Unnamed: 0,feature1,feature2,feature3,label
count,56651.0,56651.0,56651.0,56651.0
mean,0.891176,273.270957,5070.166,0.000282
std,16.963084,785.547903,21341.2,0.016803
min,0.0,0.0,0.0,0.0
25%,0.0,217.0,751.0,0.0
50%,0.0,249.0,1868.0,0.0
75%,0.0,306.0,5251.0,0.0
max,1484.0,54540.0,4018362.0,1.0


#### Label distribution

In [7]:
# Number of samples per class label.
df['label'].value_counts()

0    56635
1       16
Name: label, dtype: int64

### Training data

In [8]:
# Only non-anomalous data

# Shuffle with a fixed seed. Sorting by index first restores the original row
# order, so re-running this cell reproduces the same shuffle instead of
# reshuffling an already-shuffled frame (which would change the train/val split).
df = df.sort_index().sample(frac=1, random_state=0)
# Skip the first 50 shuffled normal rows; the validation cell takes those.
train_df = df.loc[df.label == 0][50:]
train_df.head(3)

Unnamed: 0,feature1,feature2,feature3,label
16570,0,204,3010,0
77592,0,239,4664,0
3616,0,327,479,0


### Validation data

In [9]:
normal_val = df.loc[df.label == 0][:50]  # 50 negative samples (non-anomalous)
anomalous_val = df.loc[df.label == 1]  # 16 positive samples (anomalous)
# Stack both groups, then shuffle them together with a fixed seed.
val_df = pd.concat([anomalous_val, normal_val], axis=0).sample(frac=1, random_state=0)
val_df.head(3)

Unnamed: 0,feature1,feature2,feature3,label
64513,0,217,6084,0
67426,0,339,997,0
62766,0,287,999,0


## Model

In [10]:
class AnomalyDetection():
    """Gaussian density-based anomaly detector.

    A Gaussian is fitted to training data; samples whose estimated density
    falls below a threshold are labelled 1 (anomalous), all others 0.

    Parameters
    ----------
    kind : str, default 'default'
        'multi' fits a single multivariate Gaussian with a full covariance
        matrix. Any other value fits an independent univariate Gaussian to
        each feature and multiplies the per-feature densities.
    """

    def __init__(self, kind='default'):
        self.kind = kind

    def estimateGaussian(self, data):
        """Estimate the Gaussian parameters from ``data``.

        Parameters
        ----------
        data : ndarray of shape (n_samples, n_features)

        Sets ``self.mean`` and ``self.sigma``. In 'multi' mode ``self.sigma``
        is the covariance matrix and ``self.sigma_det`` / ``self.sigma_inv``
        hold its determinant and inverse; otherwise ``self.sigma`` is the
        per-feature standard deviation.
        """
        self.mean = data.mean(axis=0)
        if self.kind == 'multi':
            # np.cov returns a 0-d array for a single feature; keep it a
            # 1x1 matrix so det/inv below still work.
            self.sigma = np.atleast_2d(np.cov(data.T))
            self.sigma_det = np.linalg.det(self.sigma)
            self.sigma_inv = np.linalg.inv(self.sigma)
        else:
            self.sigma = np.std(data, axis=0)

    def univariate(self, data):
        """Return the product of per-feature Gaussian densities for each row.

        ``data`` is expected to be two-dimensional (rows are samples); the
        product is taken along axis 1. A feature whose fitted standard
        deviation is zero leads to a division by zero here.
        """
        upper_term = (np.exp((-((data - self.mean)**2) / (2*(self.sigma**2)))))
        lower_term = (np.sqrt(2*np.pi) * self.sigma)

        prob = upper_term / lower_term

        return (np.prod(prob, axis=1))

    def multivariateGaussian(self, data):
        """Evaluate the multivariate Gaussian density.

        For a single sample of shape (1, n_features) the result has shape
        (1, 1). For m rows the result is an (m, m) matrix and only its
        diagonal holds the per-sample densities; use ``getProb`` for batches.
        """
        n = data.shape[1]

        upper_term = (np.exp(-(((data - self.mean).dot(self.sigma_inv)).dot((data - self.mean).T)) / 2))
        lower_term = (((2*np.pi)**(n/2))*((self.sigma_det)**(0.5)))

        prob = upper_term / lower_term

        return prob

    def _multivariateBatch(self, data):
        """Per-row multivariate density of an (m, n) array, returned as shape (m,)."""
        n = data.shape[1]
        diff = data - self.mean
        # Row-wise quadratic form diff_i . sigma_inv . diff_i, computed without
        # building the (m, m) product of all sample pairs.
        quad_form = np.sum(diff.dot(self.sigma_inv) * diff, axis=1)
        norm_const = ((2*np.pi)**(n/2)) * ((self.sigma_det)**(0.5))
        return np.exp(-quad_form / 2) / norm_const

    def getProb(self, data):
        """Return one density value per sample under the fitted model.

        In 'multi' mode ``data`` is converted to a float array (a 1-D input
        is treated as a single sample) and a 1-D array is returned.
        Otherwise ``data`` is passed unchanged to ``univariate``.
        """
        if self.kind == 'multi':
            samples = np.asarray(data, dtype=float)
            if samples.size == 0:
                return np.array([])
            return self._multivariateBatch(np.atleast_2d(samples))
        else:
            return self.univariate(data)

    def optimalThreshold(self, val_x, val_y, n_steps=1000):
        """Search for the density threshold that maximises F1 on validation data.

        Candidate thresholds are spaced evenly from the smallest validation
        density up towards the largest, in ``n_steps`` steps. Candidates are
        scanned from largest to smallest, so ties go to the largest threshold.

        Parameters
        ----------
        val_x : array of validation features
        val_y : array of validation labels (1 = anomalous)
        n_steps : int, default 1000
            Number of steps the density range is divided into.

        Returns
        -------
        (threshold, f1) : the best threshold and the F1 score it achieves.

        Raises
        ------
        ValueError
            If the validation densities do not span a positive, finite range
            (for example all densities are equal or contain NaN), since no
            threshold grid can be built from them.
        """
        prob = self.getProb(val_x)

        lowest, highest = np.min(prob), np.max(prob)
        stepsize = (highest - lowest) / n_steps
        if not np.isfinite(stepsize) or stepsize <= 0:
            raise ValueError(
                "Cannot search for a threshold: validation densities are "
                "identical or non-finite, or n_steps is not positive."
            )

        epsilon = np.arange(lowest, highest, stepsize)
        epsilon = epsilon[::-1]

        # One row of predictions per candidate threshold.
        pred = np.where(prob < epsilon[:, np.newaxis], 1, 0)

        F1 = np.array([f1_score(val_y, row) for row in pred])

        return epsilon[F1.argmax()], F1.max()

    def predict(self, val_x, threshold):
        """Label each sample 1 if its density is below ``threshold``, else 0."""
        prob = self.getProb(val_x)

        return np.where(prob < threshold, 1, 0)

## Training

#### Train

In [11]:
a = AnomalyDetection(kind='multi') # multivariate Gaussian

In [12]:
# Fit on the feature columns only; the label column is not a model input.
train_features = train_df.drop(columns=['label']).to_numpy()
a.estimateGaussian(train_features)

#### Get best threshold

In [13]:
val_features = val_df.drop(columns=['label']).to_numpy()
val_labels = val_df['label'].to_numpy()
# Threshold that maximises F1 on the validation set, with the score it reaches.
threshold, F1_score = a.optimalThreshold(val_features, val_labels)
print(threshold, F1_score)

4.5165663660187224e-10 1.0


#### Prediction

In [14]:
# 1 marks a sample whose density is below the chosen threshold.
pred = a.predict(val_df.drop(columns=['label']).to_numpy(), threshold)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0])

#### Evaluation

In [15]:
# Scored on the same validation rows that were used to pick the threshold,
# so this F1 is an optimistic estimate.
f1_score(val_df['label'].to_numpy(), pred)

1.0