In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [2]:
# Load the semicolon-separated training set. The relative path uses forward
# slashes so the notebook reads the file on Windows, Linux and macOS alike.
data = pd.read_csv("data/ML-MATT-CompetitionQT2021_train.csv", sep=";")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Time,CellName,PRBUsageUL,PRBUsageDL,meanThr_DL,meanThr_UL,maxThr_DL,maxThr_UL,meanUE_DL,meanUE_UL,maxUE_DL,maxUE_UL,maxUE_UL+DL,Unusual
0,10:45,3BLTE,12.3848,1.4019,0.3927,0.0438,16.6522,0.6806,1.1293,1.0491,5,3,8,1
1,9:45,1BLTE,22.0438,2.0016,0.562,0.2697,10.3994,1.1771,1.448,1.163,6,5,11,1
2,7:45,9BLTE,0.5105,0.4258,0.0152,0.0106,0.2755,0.1685,1.0379,1.0535,1,2,3,1
3,2:45,4ALTE,1.9963,1.1513,0.9908,0.0245,64.7465,0.8747,1.0766,1.0526,3,2,5,1
4,3:30,10BLTE,0.303,0.404,0.016,0.013,0.348,0.168,1.011,1.011,2,1,3,0


In [4]:
# Split the "H:MM" strings in `Time` into integer `hour` / `minute` columns.
data[["hour", "minute"]] = data["Time"].str.split(":", expand=True).astype(int)

# 0/1 part-of-day indicators derived from `hour`:
#   morning   -> hours 6..10
#   night     -> hours 1..5
#   afternoon -> every remaining hour (11..23 and 0)
data["morning"] = (data["hour"].ge(6) & data["hour"].lt(11)).astype(int)
data["afternoon"] = (~(data["hour"].ge(1) & data["hour"].lt(11))).astype(int)
data["night"] = (data["hour"].ge(1) & data["hour"].lt(6)).astype(int)

In [5]:
# Separate the `Unusual` label from the remaining feature columns.
y = data["Unusual"]
X = data.drop(columns="Unusual")

### Statistical approach

In [6]:
def sigma3_outlier_detector(input_data, cols, treshhold):
    """Flag rows whose log-scaled features lie far from their group mean.

    Rows are grouped by part of day (the ``morning`` / ``afternoon`` /
    ``night`` indicator columns) and by ``CellName``. Inside each group the
    selected columns are transformed with ``log1p`` and a row is marked as an
    outlier when, for at least one column, its absolute deviation from the
    group mean exceeds ``treshhold`` times the group standard deviation
    (pandas default ``std``).

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain every name in ``cols`` plus ``CellName``, ``morning``,
        ``afternoon`` and ``night``. It is not modified.
    cols : list of str
        Feature columns to test.
    treshhold : float
        Number of standard deviations above which a value counts as an
        outlier (parameter spelling kept so existing callers keep working).

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` with an extra float column ``prediction``:
        1.0 for outliers, 0.0 otherwise, NaN for rows whose three
        part-of-day indicators are all different from 1.
    """
    periods = ["morning", "afternoon", "night"]
    cols = list(cols)
    cells = input_data["CellName"].unique()

    out_data = input_data.copy()
    out_data["prediction"] = np.nan

    for period in periods:
        # Statistics are computed separately for each part of the day.
        period_data = input_data[input_data[period] == 1]
        for cell in cells:
            cell_data = period_data.loc[period_data["CellName"] == cell, cols]
            if cell_data.empty:
                # This cell has no rows in this part of the day.
                continue
            cell_data = np.log1p(cell_data)
            # A zero or undefined std gives NaN/inf scores; NaN compares as
            # False, so constant groups are reported as non-outliers.
            z_scores = (cell_data - cell_data.mean()).abs() / cell_data.std()
            preds = (z_scores > treshhold).any(axis=1).astype(float)

            # `.loc` (not the scalar accessor `.at`) is the label-based
            # indexer that supports assigning a whole block of rows.
            out_data.loc[preds.index, "prediction"] = preds

    return out_data

In [7]:
# Feature columns handed to the detectors. Only the single-column list is
# active; the two commented-out lists are alternative feature sets.
# pred_cols = ["PRBUsageUL", "PRBUsageDL", "meanThr_DL", "meanThr_UL", "maxThr_DL", "maxThr_UL", "meanUE_DL", "meanUE_UL"]
pred_cols = ["PRBUsageUL"]
# pred_cols = ["PRBUsageUL", "PRBUsageDL", "meanThr_DL", "meanThr_UL", "meanUE_DL", "meanUE_UL"]
# Cut-off passed to the detectors as their `treshhold` argument
# (the spelling matches the parameter name used by the detector functions).
treshhold = 3

In [8]:
# Run the mean / standard-deviation detector on the feature frame.
sigma3_preds = sigma3_outlier_detector(
    input_data=X,
    cols=pred_cols,
    treshhold=treshhold,
)

In [9]:
# Count predicted normal (0.0) vs. outlier (1.0) rows; NaN rows are excluded.
sigma3_preds["prediction"].value_counts(dropna=True)

0.0    36688
1.0      216
Name: prediction, dtype: int64

In [10]:
# (precision, recall, ROC AUC) of the sigma-3 predictions against the labels.
tuple(
    metric(y, sigma3_preds["prediction"])
    for metric in (precision_score, recall_score, roc_auc_score)
)

(0.19444444444444445, 0.004124521260925071, 0.49880639445778935)

In [11]:
def modified_sigma_detector(input_data, cols, treshhold, two_sided=False):
    """Flag rows using a median / MAD based deviation score.

    Rows are grouped by part of day (the ``morning`` / ``afternoon`` /
    ``night`` indicator columns) and by ``CellName``. Inside each group the
    selected columns are transformed with ``log1p``; for every column the
    group median and the median absolute deviation (MAD) are computed, and a
    row is marked as an outlier when ``(value - median) / MAD`` exceeds
    ``treshhold`` for at least one column.

    The MAD is computed separately for each column, so features with
    different scales do not share one pooled MAD.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain every name in ``cols`` plus ``CellName``, ``morning``,
        ``afternoon`` and ``night``. It is not modified.
    cols : list of str
        Feature columns to test.
    treshhold : float
        Score above which a value counts as an outlier (parameter spelling
        kept so existing callers keep working).
    two_sided : bool, default False
        When False only values *above* the group median can be flagged.
        When True the absolute deviation is used, so unusually low values
        are flagged as well.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` with an extra float column ``prediction``:
        1.0 for outliers, 0.0 otherwise, NaN for rows whose three
        part-of-day indicators are all different from 1.
    """
    periods = ["morning", "afternoon", "night"]
    cols = list(cols)
    cells = input_data["CellName"].unique()

    out_data = input_data.copy()
    out_data["prediction"] = np.nan

    for period in periods:
        # Statistics are computed separately for each part of the day.
        period_data = input_data[input_data[period] == 1]
        for cell in cells:
            cell_data = period_data.loc[period_data["CellName"] == cell, cols]
            if cell_data.empty:
                # This cell has no rows in this part of the day.
                continue
            cell_data = np.log1p(cell_data)

            centered = cell_data - cell_data.median()
            # Column-wise MAD. np.median() on a DataFrame would flatten all
            # columns into one array and return a single pooled value.
            mad_data = centered.abs().median()
            deviation = centered.abs() if two_sided else centered
            # NOTE(review): a MAD of 0 yields +/-inf or NaN scores, so in a
            # group where most values are identical every value above the
            # median is flagged. Confirm this is the intended behaviour.
            preds = (deviation / mad_data > treshhold).any(axis=1).astype(float)

            # `.loc` (not the scalar accessor `.at`) is the label-based
            # indexer that supports assigning a whole block of rows.
            out_data.loc[preds.index, "prediction"] = preds

    return out_data

In [12]:
# Run the median / MAD detector on the same features and threshold.
sigma_m_preds = modified_sigma_detector(
    input_data=X,
    cols=pred_cols,
    treshhold=treshhold,
)

In [13]:
# Count predicted normal (0.0) vs. outlier (1.0) rows; NaN rows are excluded.
sigma_m_preds["prediction"].value_counts(dropna=True)

0.0    35371
1.0     1533
Name: prediction, dtype: int64

In [14]:
# (precision, recall, ROC AUC) of the median / MAD predictions against the labels.
tuple(
    metric(y, sigma_m_preds["prediction"])
    for metric in (precision_score, recall_score, roc_auc_score)
)

(0.2700587084148728, 0.04065599528626142, 0.4993894100154222)