# Armis Data Hack Challenge - Solution

### Imports and Consts

In [27]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import IsolationForest
# File name of the device inventory CSV (not referenced by the cells shown here).
DEVICES_PATH = "all_devices.csv"
# Common prefix of the traffic chunk files; the chunk number and '.csv' are appended.
CHUNK_PATH = "chunk-" # + '1','2',...,'23' + '.csv'
# Result table with one row per (network_id, device_id) and its anomaly confidence;
# it starts empty and is filled by the chunk loop further down.
submission = pd.DataFrame(columns=["network_id", "device_id", "confidence"])
# Seeded NumPy random state handed to IsolationForest as its random_state.
rng = np.random.RandomState(42)

### Feature exploration

In [None]:
# Load the first chunk (the only file with a header row) for exploration.
chunk1 = pd.read_csv(CHUNK_PATH + '1' + '.csv',index_col=[0]) #index_col=[0] to use 1st col as index
cols = chunk1.columns
# Collapse the raw rows to one row per device: distinct counts for the
# categorical-like columns, sums for the traffic counters.
# (The original literal listed 'packets_count' twice; a repeated dict key is
# silently overwritten, so it is listed once here.)
chunk1 = chunk1.groupby(['network_id','device_id']).aggregate({
    'host': 'nunique',
    'packets_count': 'sum',
    'inbound_bytes_count': 'sum',
    'outbound_bytes_count': 'sum',
    'port_dst': 'nunique',
    'service_device_id': 'nunique',
    'packet_loss': 'sum',
    'retransmit_count': 'sum',
    'latency': 'sum',
    'session_count': 'sum',
})


In [None]:
# Ratio of inbound to outbound bytes per device. The denominator must be
# outbound_bytes_count (not packets_count) so that the explored feature is the
# same one that extract_features builds for the model.
chunk1['in_out_ratio'] = chunk1.inbound_bytes_count / chunk1.outbound_bytes_count
# Last expression of the cell: displays the correlation matrix.
chunk1.corr()

In [None]:
import seaborn as sns

# Plot the pairwise feature correlations of chunk1 as a heatmap, labelling
# both axes with the feature names.
corr = chunk1.corr()
tick_labels = corr.columns.values
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)

In [None]:
# Remove the two byte counters (dropped as linearly correlated features).
chunk1 = chunk1.drop(columns=['inbound_bytes_count', 'outbound_bytes_count'])

# Re-plot the correlation heatmap for the reduced feature set.
corr = chunk1.corr()
tick_labels = corr.columns.values
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)

# Turn the two group-by index levels back into ordinary columns.
chunk1 = chunk1.reset_index(level=[0, 1])

In [None]:
def calc_normalized_decision(decision_function_result):
    """Min-max normalize decision-function scores into 0-1 confidences.

    The scores are negated first, so the lowest decision value maps to 1.0
    and the highest maps to 0.0: a higher confidence means the device is
    probably more anomalous.

    Parameters
    ----------
    decision_function_result : array-like with ``min``/``max``/``size``
        Raw scores, e.g. a NumPy array returned by ``decision_function``.

    Returns
    -------
    Same container type as the input, holding values in ``[0, 1]``.
    An empty input is returned empty; if all scores are equal every
    confidence is ``0.0`` (instead of NaN from a 0/0 division).
    """
    flipped = -1 * decision_function_result
    if flipped.size == 0:
        # Nothing to normalize; min()/max() would raise on an empty array.
        return flipped
    shifted = flipped - flipped.min()
    # max(x - min) equals max(x) - min, i.e. the usual min-max denominator.
    spread = shifted.max()
    if spread == 0:
        # All scores identical: no device stands out, so report all zeros.
        return shifted * 1.0
    return shifted / spread

## extract features
- Replace 'inbound_bytes_count' and 'outbound_bytes_count', which are highly correlated with 'packets_count', with a single 'in_out_ratio' feature.
- This feature can indicate whether a device is sending requests only for attack purposes.

In [None]:
def extract_features(df_in):
    """Aggregate raw traffic rows into one feature row per device.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Raw rows containing ``network_id``, ``device_id`` and the columns
        named in the aggregation map below. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(network_id, device_id)``, with distinct counts for
        ``host``, ``port_dst`` and ``service_device_id``, sums for the traffic
        counters, and ``in_out_ratio`` (inbound bytes / outbound bytes) in
        place of the two byte-count columns.
    """
    # Each key is listed once (the original literal repeated 'packets_count',
    # which a dict silently collapses to a single entry anyway).
    aggregations = {
        'host': 'nunique',
        'packets_count': 'sum',
        'inbound_bytes_count': 'sum',
        'outbound_bytes_count': 'sum',
        'port_dst': 'nunique',
        'service_device_id': 'nunique',
        'packet_loss': 'sum',
        'retransmit_count': 'sum',
        'latency': 'sum',
        'session_count': 'sum',
    }
    # groupby/aggregate builds a new frame, so no defensive copy is needed.
    df = df_in.groupby(['network_id', 'device_id']).aggregate(aggregations)

    ratio = df['inbound_bytes_count'] / df['outbound_bytes_count']
    # A device with zero outbound bytes yields inf (or NaN for 0/0), which
    # scikit-learn estimators reject at fit time. Map those undefined ratios
    # to a neutral 0.0; finite ratios are left untouched.
    df['in_out_ratio'] = ratio.where(np.isfinite(ratio), 0.0)

    # drop linear correlated features
    return df.drop(columns=['inbound_bytes_count', 'outbound_bytes_count'])

def detect_anomaly(df,esitmator):
    """Fit the estimator on the feature frame and attach a confidence column.

    The estimator is fitted on ``df``'s values and then scores the same
    values with ``decision_function``; the scores are passed through
    ``calc_normalized_decision`` and stored as ``confidence``.

    Returns a new frame; the input frame is left unchanged.
    """
    feature_matrix = df.values
    esitmator.fit(feature_matrix)
    raw_scores = esitmator.decision_function(feature_matrix)
    return df.assign(confidence=calc_normalized_decision(raw_scores))


In [None]:
# Iterate over every chunk file, score each device within its chunk, and
# average the per-chunk confidences into the final `submission` table.

# The chunk files are chunk-1.csv ... chunk-23.csv (see CHUNK_PATH), so the
# loop must include 23; the previous range(1, 23) stopped at chunk 22.
N_CHUNKS = 23
DEVICE_KEYS = ['network_id', 'device_id']

# NOTE: the `behaviour` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. It only shifted the
# decision scores by a constant offset, which the min-max normalization in
# calc_normalized_decision cancels out, so dropping it leaves the
# confidences unchanged.
iso = IsolationForest(random_state=rng, max_samples=0.25, contamination=0.15,
                      n_estimators=250, n_jobs=-1)

# Only the first file carries a header row; read just that header (nrows=0,
# no reader left open) and reuse the column names for the other files.
cols = pd.read_csv(CHUNK_PATH + '1.csv', index_col=[0], nrows=0).columns

per_chunk_scores = []   # one (network_id, device_id, confidence) frame per chunk
seen_devices = set()    # distinct (network_id, device_id) pairs so far, for progress output

for i in range(1, N_CHUNKS + 1):
    chunk_path = CHUNK_PATH + str(i) + '.csv'
    if i == 1:
        mini_chunk_iso = pd.read_csv(chunk_path, index_col=[0])
    else:
        # index_col=[0] to use 1st col as index; names= because only the 1st file has a header
        mini_chunk_iso = pd.read_csv(chunk_path, index_col=[0], names=cols, header=None)

    # extract_features yields one row per device, so the scored frame already
    # has a unique (network_id, device_id) index; keep only the confidence.
    scored = detect_anomaly(extract_features(mini_chunk_iso), iso)
    per_chunk_scores.append(scored[['confidence']].reset_index())

    seen_devices.update(scored.index)
    total = len(seen_devices)
    print("at chunk-" + str(i) + f' {total:,}')

# Average once over all chunks. Re-averaging the running result after every
# chunk (mean of means) would weight later chunks more heavily for devices
# that appear in several chunks. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
submission = (
    pd.concat(per_chunk_scores, ignore_index=True)
    .groupby(DEVICE_KEYS, as_index=False)
    .aggregate({"confidence": "mean"})
)
        

In [None]:


# Quick sanity checks on the assembled submission table.
print(submission.device_id.unique())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
submission.info()
print(submission.describe())
print(submission.head())

# Submissions

In [None]:
from urllib import request
import json

leaderboard_name = "armis"
host = "leaderboard.datahack.org.il"

# Name of the user
submitter = "Data Sniffers"

# Serialize the rows as [network_id, device_id, confidence] triples. The
# columns are selected explicitly so the payload does not depend on whatever
# column order `submission` happens to have.
arr_to_submit = submission[["network_id", "device_id", "confidence"]].to_json(orient='values')
predictions = json.loads(arr_to_submit)

jsonStr = json.dumps({'submitter': submitter, 'predictions': predictions})
data = jsonStr.encode('utf-8')
# Supplying `data` makes urllib send this request as a POST.
req = request.Request(f"https://{host}/{leaderboard_name}/api/",
                      headers={'Content-Type': 'application/json'},
                      data=data)
# Use the response as a context manager so the HTTP connection is closed
# even if decoding the reply fails.
with request.urlopen(req) as resp:
    print(json.load(resp))