# Attack and anomaly detection in IoT sensors in IoT sites using machine learning approaches

Article information:
* Authors: Mahmudul Hasan, Milon Islam, Ishrak Islam Zarif and M.M.A.Hashem
* Publication: 20 May 2019
* DOI: https://doi.org/10.1016/j.iot.2019.100059

## Objective

In this notebook we test different approaches to encode categorical values.

In [None]:
# import numpy and pandas to load the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

# graphics
from pylab import rcParams
%matplotlib inline 
rcParams['figure.figsize'] = (8, 8)

# Load the dataset; the first row of the CSV is the header.
# Malformed rows are skipped instead of aborting the load.
# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in
# pandas 2.0; `on_bad_lines='skip'` is its replacement (needs pandas >= 1.3).
df = pd.read_csv(
    'https://media.githubusercontent.com/media/mariolpantunes/ml101/main/datasets/mainSimulationAccessTraces.csv',
    on_bad_lines='skip',
    header=0,
)

## Data preprocessing

In [None]:
# Fill the gaps in one pass: a missing accessed-node type is labelled
# 'Malicious' and a missing value becomes 0.0
df = df.fillna({'accessedNodeType': 'Malicious', 'value': 0.0})

# Map the malformed textual entries of column `value` to numbers
replace_values = {'true': 1.0, 'false': 0.0, 'twenty': 20.0, 'none': 0.0}
df = df.replace({'value': replace_values})

# Fix other errors not mentioned in the paper: entries starting with 'org'
# are replaced by the text after the '@', parsed as a hexadecimal integer.
# Non-string entries yield NaN in the match result, hence the fillna(False).
org_rows = df['value'].str.contains('^org').fillna(False)
df.loc[org_rows, 'value'] = df.loc[org_rows, 'value'].apply(
    lambda raw: int(raw.split('@')[1], 16)
)

# Rename the target classes so they are consistent with the paper
replace_values = {
    'anomalous(DoSattack)': 'DoS',
    'anomalous(scan)': 'SC',
    'anomalous(malitiousControl)': 'M.C',
    'anomalous(malitiousOperation)': 'M.O',
    'anomalous(spying)': 'SP',
    'anomalous(dataProbing)': 'D.P',
    'anomalous(wrongSetUp)': 'W.S',
    'normal': 'NL',
}
df = df.replace({'normality': replace_values})

# Report whether any NaN survived the clean-up
missing = df.isna().to_numpy().any()
print('Missing Values ? {}'.format(missing))

In [None]:
# Convert the timestamp column (interpreted as milliseconds) to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Sort the samples by the timestamp.
# sort_values returns a new DataFrame, so the result has to be assigned back
# (the sort was previously discarded). A stable sort keeps the original
# relative order of rows that share the same timestamp.
df = df.sort_values('timestamp', kind='mergesort')

# Drop the non-relevant feature (keyword form: the positional `axis`
# argument of DataFrame.drop was removed in pandas 2.0)
df = df.drop(columns='timestamp')

df.info()

## Encoding categorical values

In [None]:
print('Dataset datatypes (before transformation):')
print(df.dtypes)

# Convert value to a numeric dtype
df['value'] = pd.to_numeric(df['value'])

# Convert target variable to categorical
df['normality'] = df['normality'].astype('category')

# Categorical feature columns to encode (everything except `value` and the
# target `normality`). The encoding is applied directly with pandas.
columns = [
    'sourceID', 'sourceAddress', 'sourceType', 'sourceLocation',
    'destinationServiceAddress', 'destinationServiceType', 'destinationLocation',
    'accessedNodeAddress', 'accessedNodeType', 'operation',
]

# Ordinal encoder (label encoding): each category is replaced by its integer code
for column in columns:
    df[column] = df[column].astype('category').cat.codes

# OneHot encoder: expand every encoded column into indicator columns, then
# swap the ordinal columns for them. `value` and `normality` are removed from
# the dummies frame so the join does not duplicate them.
# (keyword `columns=`: the positional `axis` argument of DataFrame.drop was
# removed in pandas 2.0)
dfDummies = pd.get_dummies(df, columns=columns, prefix=columns).drop(columns=['value', 'normality'])
df = df.drop(columns=columns).join(dfDummies)

print('------------------------------------------------------')
print('Dataset datatypes (after transformation):')
print(df.dtypes)

## Dataset conversion

In [None]:
# Import the preprocessing libraries
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Split the pandas dataframe into the feature matrix X and the target vector y.
# `le` maps the class names in `normality` to integer labels; it is kept so
# the integer labels can be translated back to names later.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df.normality)
df_y = df['normality'].copy()
# keyword `columns=`: the positional `axis` argument of DataFrame.drop was
# removed in pandas 2.0
df = df.drop(columns='normality')
X = df.copy()

# Scale only the numerical feature (value); the remaining columns are
# indicator columns and are left untouched
scaler = StandardScaler()
feature = scaler.fit_transform(X[['value']].values)
# `feature` has shape (n_samples, 1); assign its single column as a 1-D array
X['value'] = feature[:, 0]

## Models

In [None]:
import math

# Helper functions to compute performance metrics
def decompose_cm(cm, c=0):
    """Reduce a multi-class confusion matrix to one-vs-rest counts for class `c`.

    The matrix is expected in the scikit-learn layout: rows are the true
    classes and columns are the predicted classes.

    Parameters:
        cm: square 2-D numpy array of counts.
        c:  index of the class treated as the positive class.

    Returns:
        The tuple (TP, TN, FP, FN) for class `c`.
    """
    TP = cm[c, c]
    # Row c: samples that truly belong to c; those not predicted as c are misses
    FN = np.sum(cm[c, :]) - TP
    # Column c: samples predicted as c; those not truly c are false alarms
    FP = np.sum(cm[:, c]) - TP
    # Everything outside row c and column c
    TN = np.sum(cm) - TP - FP - FN
    return TP, TN, FP, FN


def compute_performance_metrics(tp, tn, fp, fn):
    """Compute accuracy, precision, recall, F1 and MCC from binary counts.

    Parameters:
        tp, tn, fp, fn: true-positive, true-negative, false-positive and
            false-negative counts.

    Returns:
        The tuple (accuracy, precision, recall, f1, mcc). Precision, recall
        and F1 are reported as 0 when there are no true positives, and the
        MCC is reported as 0.0 when its denominator is zero (i.e. when one
        of the marginal sums tp+fp, tp+fn, tn+fp or tn+fn is zero).
    """
    mcc_den = math.sqrt(float(tp+fp)*float(tp+fn)*float(tn+fp)*float(tn+fn))
    # Guard the degenerate case instead of raising ZeroDivisionError
    mcc = float(tp*tn - fp*fn) / mcc_den if mcc_den else 0.0

    # No correct prediction at all: every metric except MCC is zero
    if (tp+tn) == 0:
        return 0, 0, 0, 0, mcc

    acc = (tp+tn)/(tp+tn+fp+fn)

    # Without true positives precision/recall/F1 are zero (and tp+fp or
    # tp+fn may be zero, so the ratios below must not be evaluated)
    if tp == 0:
        return acc, 0, 0, 0, mcc

    pre = tp/(tp+fp)
    rec = tp/(tp+fn)
    f1 = (2*tp)/(2*tp + fp + fn)
    return acc, pre, rec, f1, mcc

In [None]:
# Library used to speedup computation
from joblib import parallel_backend

# Import the necessary models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Import the stratified KFold library
from sklearn.model_selection import StratifiedKFold

# RoC related libraries
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Prepare the dataset to be used in 5-fold cross validation
skf = StratifiedKFold(n_splits=5)

# List of models names and models algorithms.
# NOTE: the logistic-regression loss of SGDClassifier is called 'log_loss'
# since scikit-learn 1.1; the old name 'log' was removed in 1.3.
models = [
    ('LR',  SGDClassifier(loss='log_loss')),
    ('SVM', SGDClassifier(loss='hinge')),
    ('DT',  DecisionTreeClassifier()),
    ('RF',  RandomForestClassifier()),
    ('ANN', MLPClassifier())]

# Dictionary that will contain the confusion matrix for each model
# (summed over all folds)
cm = {}

# Dictionary that will contain the RoC scores
roc = {}

# fold counter
k = 0

# FPR for ROC curve: common grid on which every fold's curve is interpolated
mean_fpr = np.linspace(0, 1, 100)

# Every encoded class label present in y. Passing this fixed list to
# confusion_matrix guarantees that all folds produce matrices with the same
# shape and class order, so they can be summed safely.
class_ids = np.unique(y)

# Use the parallel backend to speedup the fit operation
with parallel_backend('threading'):

    # For each fold
    for train_index, test_index in skf.split(X, y):
        k += 1
        print('K-Fold: {}'.format(k))

        X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_test = y[train_index], y[test_index]

        # For each model
        for model in models:
            name, estimator = model
            print('\tModel: {}'.format(name))
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)

            # Accumulate the confusion matrix of this fold
            fold_cm = confusion_matrix(y_test, y_pred, labels=class_ids)
            cm[name] = cm[name] + fold_cm if name in cm else fold_cm

            # Store the information for the RoC curve: one curve per class,
            # computed one-vs-rest. Only hard predictions are available here,
            # so both the truth and the score are binarised against class i
            # (raw multi-class label ids are not meaningful as scores).
            tpr = []
            for i in class_ids:
                f, t, _ = roc_curve((y_test == i).astype(int),
                                    (y_pred == i).astype(int))
                interp_tpr = np.interp(mean_fpr, f, t)
                interp_tpr[0] = 0.0
                tpr.append(interp_tpr)

            roc.setdefault(name, {'tpr': []})['tpr'].append(tpr)

# Compute the mean values for TPR (average over the folds, per class)
for model in models:
    d = roc[model[0]]
    d['tpr'] = np.array(d['tpr']).mean(0)

## Comparison Table

In [None]:
print('Evaluation  Classifiers')
print('Metrics     LR    SVM   DT    RF    ANN')

# One list per metric; each receives one macro-averaged score per model,
# in the order of `models`
accuracy, precision, recall, f1, mcc = [], [], [], [], []
metric_lists = (accuracy, precision, recall, f1, mcc)

for model in models:
    matrix = cm[model[0]]
    n_classes = len(matrix)

    # Running sums of (accuracy, precision, recall, f1, mcc) over the classes
    totals = [0, 0, 0, 0, 0]
    for class_idx in range(n_classes):
        scores = compute_performance_metrics(*decompose_cm(matrix, class_idx))
        totals = [running + score for running, score in zip(totals, scores)]

    # Macro average: plain mean of the per-class scores
    for bucket, total in zip(metric_lists, totals):
        bucket.append(total / n_classes)

# One table row per metric, one column per model
table_rows = (
    ('Accuracy    {:4.3f} {:4.3f} {:4.3f} {:4.3f} {:4.3f}', accuracy),
    ('Precision   {:4.3f} {:4.3f} {:4.3f} {:4.3f} {:4.3f}', precision),
    ('Recall      {:4.3f} {:4.3f} {:4.3f} {:4.3f} {:4.3f}', recall),
    ('F1 score    {:4.3f} {:4.3f} {:4.3f} {:4.3f} {:4.3f}', f1),
    ('MCC         {:4.3f} {:4.3f} {:4.3f} {:4.3f} {:4.3f}', mcc),
)
for template, values in table_rows:
    print(template.format(*values))

## Confusion Matrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot one row-normalised confusion matrix per model
for model in models:
    name = model[0]
    counts = cm[name].astype('float')

    # Class names for the axes, taken from this model's own matrix size
    # (previously derived from a loop variable left over by an earlier cell)
    labels = le.inverse_transform(np.arange(len(counts)))

    # Normalise: divide every row by its total so each row sums to 1
    normalized_cm = counts / counts.sum(axis=1, keepdims=True)

    disp = ConfusionMatrixDisplay(confusion_matrix=normalized_cm, display_labels=labels)
    disp.plot(cmap='cividis')
    # Title the figure so the plots can be told apart
    disp.ax_.set_title(name)

## Conclusion

In this notebook we tried two different encodings for categorical data: ordinal (similar to label encoding in scikit-learn) and one-hot.
Again, we used the MCC to evaluate them.

The best values of MCC were obtained with the one hot encoding scheme.