# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

# Read data

In [2]:
af = 'vv'

# df__train = pd.read_csv(f'data/tr.{af}1.csv')
# df__test = pd.read_csv(f'./data/t.{af}1.csv')
df = pd.read_csv(f'./data/v.{af}1.csv', index_col=[0])

In [3]:
df.loc[df['Label'] != 5, 'LabelBin'] = 0
df.loc[df['Label'] == 5, 'LabelBin'] = 1

In [4]:
df.columns

Index(['StreamID', 'StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir',
       'DstAddr', 'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes',
       'SrcBytes', 'Label', 'LabelStr', 'PktsPerSec', 'BytesPerSec',
       'SrcBytesPerSec', 'BytesPerPkt', 'DstBytes', 'DstBytesPerSec', 'Dir_0',
       'Dir_1', 'Dir_2', 'Dir_3', 'Dir_4', 'Dir_5', 'Dir_6', 'sTos_0',
       'sTos_2', 'sTos_3', 'sTos_1', 'sTos_-1', 'dTos_0', 'dTos_-1', 'dTos_3',
       'dTos_2', 'dTos_1', 'State_orig', 'State_CON', 'State_alltcp',
       'State_INT', 'State_S_', 'State_URP', 'State_ECO', 'State_RED',
       'State_REQ', 'State_ECR', 'State_URH', 'State_TXD', 'State_URFIL',
       'State_R_', 'State_URN', 'State_RSP', 'State_URHPRO', 'State_A_',
       'State_other', 'Flag_nan', 'Flag_S', 'Flag_A', 'Flag_P', 'Flag_R',
       'Flag_F', 'Proto_udp', 'Proto_tcp', 'Proto_icmp', 'Proto_rtp',
       'Proto_rtcp', 'Proto_igmp', 'Proto_arp', 'Proto_other', 'Service',
       'Service_80', 'Service_443', 'Service_2

In [9]:
def mean_encoding(df_train, df_test, categorical_vars):    
    #? temporary copy of the original dataframes
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()
    
    #? iterate over each variable
    for col in categorical_vars:
        
        #? make a dictionary of categories, target-mean pairs
        target_mean_dict = df_train.groupby([col])['LabelBin'].mean().to_dict()
        
        #? replace the categories by the mean of the target
        df_train_temp[col] = df_train[col].map(target_mean_dict)
        df_test_temp[col] = df_test[col].map(target_mean_dict)
    
    #? drop the target from the daatset
    df_train_temp.drop(['LabelBin'], axis=1, inplace=True)
    df_test_temp.drop(['LabelBin'], axis=1, inplace=True)
    
    return df_train_temp, df_test_temp

# On categorical features

In [14]:
cat_fts = [
    'Proto', 'Service', 'State', 'Dir', 'sTos', 'dTos'
]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df[cat_fts + ['LabelBin']],
    df['LabelBin'],
    test_size=0.3,
    random_state=0
)

X_train.shape, X_test.shape

((535242, 7), (229390, 7))

In [20]:
X_train_enc, X_test_enc = mean_encoding(X_train, X_test, cat_fts)

In [21]:
X_train_enc.head()

Unnamed: 0,Proto,Service,State,Dir,sTos,dTos
87202,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
358990,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
583705,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
600690,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
80990,0.000494,0.003,0.000512,0.000512,0.015214,0.004661


In [17]:
X_test_enc.head()

Unnamed: 0,Proto,Service,State,Dir,sTos,dTos
312240,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
15746,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
104911,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
360555,0.000494,0.003,0.000512,0.000512,0.015214,0.004661
263250,0.000494,0.003,0.000512,0.000512,0.015214,0.004661


### Determine the roc-auc using the variable values as input

In [18]:
from sklearn.metrics import roc_auc_score

roc_values = []

for ft in cat_fts:
    roc_values.append(roc_auc_score(y_test, X_test_enc[ft])) 

In [19]:
m1 = pd.Series(roc_values)
m1.index = cat_fts
m1.sort_values(ascending=False)

State      0.962036
Proto      0.917941
Dir        0.898655
Service    0.869783
dTos       0.822295
sTos       0.501622
dtype: float64

# On numerical features

In [5]:
num_fts = [
    'Dur', 
    'Sport', 'Dport', 
    'sTos', 'dTos', 
    'TotPkts', 'TotBytes', 'SrcBytes', 
    'PktsPerSec', 'BytesPerSec', 'SrcBytesPerSec', 'BytesPerPkt', 
    'DstBytes', 'DstBytesPerSec', 

    'State_CON', 'State_alltcp',
    'State_INT', 'State_S_', 'State_URP', 'State_ECO', 'State_RED',
    'State_REQ', 'State_ECR', 'State_URH', 'State_TXD', 'State_URFIL',
    'State_R_', 'State_URN', 'State_RSP', 'State_URHPRO', 'State_A_',
    'State_other', 
    'Flag_nan', 'Flag_S', 'Flag_A', 'Flag_P', 'Flag_R', 'Flag_F', 
    'Proto_udp', 'Proto_tcp', 'Proto_icmp', 'Proto_rtp', 'Proto_rtcp', 'Proto_igmp', 'Proto_arp', 'Proto_other', 
    'Service_80', 'Service_443', 'Service_21', 'Service_22', 'Service_25',
    'Service_6667', 'Service_other'
]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df[num_fts + ['LabelBin']],
    df['LabelBin'],
    test_size=0.3,
    random_state=0
)

X_train.shape, X_test.shape

((535242, 54), (229390, 54))

In [7]:
binned_fts = []

for ft in num_fts:
    # print('\n-----------')
    # print(ft)
    X_train[f'{ft}_binned'], intervals = pd.qcut(
        X_train['Dur'],
        q = 5,
        labels=False,
        retbins=True,
        precision=3,
        duplicates='drop',
    )

    # display(X_train[[f'{ft}_binned', ft]].head(10))

    #? count the number of distinct bins
    # print(X_train[f'{ft}_binned'].nunique())
    # print(X_train[f'{ft}_binned'].unique())

    #? Use the interval limits calculated in the previous cell to bin the testing set
    X_test[f'{ft}_binned'] = pd.cut(x = X_test[ft], bins=intervals, labels=False)

    # display(X_test[[f'{ft}_binned', ft]].head(10))

    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    binned_fts.append(f'{ft}_binned')

### Replace bins with target mean

In [10]:
"""
now we use our previous function to encode the variables with the target mean
"""

X_train_enc, X_test_enc = mean_encoding(X_train[binned_fts + ['LabelBin']], X_test[binned_fts + ['LabelBin']], binned_fts)

X_train_enc.head()

Unnamed: 0,Dur_binned,Sport_binned,Dport_binned,sTos_binned,dTos_binned,TotPkts_binned,TotBytes_binned,SrcBytes_binned,PktsPerSec_binned,BytesPerSec_binned,...,Proto_igmp_binned,Proto_arp_binned,Proto_other_binned,Service_80_binned,Service_443_binned,Service_21_binned,Service_22_binned,Service_25_binned,Service_6667_binned,Service_other_binned
87202,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,...,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392
358990,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,...,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719,0.001719
583705,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,...,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392,0.000392
600690,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,...,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127,0.071127
80990,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,...,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102,0.002102


### Determine the roc-auc using the variable values as input

In [11]:
from sklearn.metrics import roc_auc_score

roc_values = []

for ft in binned_fts:
    roc_values.append(roc_auc_score(y_test, X_test_enc[ft])) 

In [12]:
m1 = pd.Series(roc_values)
m1.index = binned_fts
m1.sort_values(ascending=False)

Flag_S_binned            0.920108
Proto_tcp_binned         0.917719
Dur_binned               0.883219
State_S__binned          0.849325
Sport_binned             0.822717
Service_80_binned        0.812319
SrcBytesPerSec_binned    0.811048
PktsPerSec_binned        0.805036
BytesPerSec_binned       0.771623
Dport_binned             0.737811
State_alltcp_binned      0.568498
Flag_A_binned            0.568337
Flag_R_binned            0.556462
Flag_P_binned            0.551479
Flag_F_binned            0.548486
DstBytesPerSec_binned    0.539700
Service_25_binned        0.538156
TotBytes_binned          0.536377
SrcBytes_binned          0.535586
Service_443_binned       0.506440
BytesPerPkt_binned       0.505620
TotPkts_binned           0.505019
Service_6667_binned      0.500735
Flag_nan_binned          0.500000
State_other_binned       0.499996
Proto_other_binned       0.499991
Service_21_binned        0.499985
State_RSP_binned         0.499967
State_A__binned          0.499962
State_URHPRO_b

In [14]:
m1.index

Index(['Dur_binned', 'Sport_binned', 'Dport_binned', 'sTos_binned',
       'dTos_binned', 'TotPkts_binned', 'TotBytes_binned', 'SrcBytes_binned',
       'PktsPerSec_binned', 'BytesPerSec_binned', 'SrcBytesPerSec_binned',
       'BytesPerPkt_binned', 'DstBytes_binned', 'DstBytesPerSec_binned',
       'State_CON_binned', 'State_alltcp_binned', 'State_INT_binned',
       'State_S__binned', 'State_URP_binned', 'State_ECO_binned',
       'State_RED_binned', 'State_REQ_binned', 'State_ECR_binned',
       'State_URH_binned', 'State_TXD_binned', 'State_URFIL_binned',
       'State_R__binned', 'State_URN_binned', 'State_RSP_binned',
       'State_URHPRO_binned', 'State_A__binned', 'State_other_binned',
       'Flag_nan_binned', 'Flag_S_binned', 'Flag_A_binned', 'Flag_P_binned',
       'Flag_R_binned', 'Flag_F_binned', 'Proto_udp_binned',
       'Proto_tcp_binned', 'Proto_icmp_binned', 'Proto_rtp_binned',
       'Proto_rtcp_binned', 'Proto_igmp_binned', 'Proto_arp_binned',
       'Proto_other_bi