In [8]:
import numpy as np
import pandas as pd
from os import path
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.callbacks import EarlyStopping
from collections import Counter
import sklearn
from tensorflow.keras.callbacks import LearningRateScheduler

from sklearn import preprocessing, metrics
from sklearn.metrics import (roc_curve, auc, accuracy_score, precision_score, 
                             recall_score, f1_score, balanced_accuracy_score, 
                             matthews_corrcoef)
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, LabelBinarizer
import shap
from imblearn.over_sampling import RandomOverSampler
import innvestigate

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
np.random.seed(0)

tf.compat.v1.disable_eager_execution()

output_file_name = 'SML_XAI.txt'

In [9]:
def confusion_metrics (name_model,predictions,true_labels):

    name = name_model
    pred_label = predictions
    y_test_01 = true_labels 

    with open(output_file_name, "a") as f: print('--------------------------------------------------------------------------', file = f)

    with open(output_file_name, "a") as f: print(name, file = f)


    print('---------------------------------------------------------------------------------')
    print('CONFUSION MATRIX')
    print('---------------------------------------------------------------------------------')


    # pred_label = label[ypred]

    confusion_matrix = pd.crosstab(y_test_01, pred_label,rownames=['Actual ALERT'],colnames = ['Predicted ALERT'], dropna=False).sort_index(axis=0).sort_index(axis=1)
    all_unique_values = sorted(set(pred_label) | set(y_test_01))
    z = np.zeros((len(all_unique_values), len(all_unique_values)))
    rows, cols = confusion_matrix.shape
    z[:rows, :cols] = confusion_matrix
    confusion_matrix  = pd.DataFrame(z, columns=all_unique_values, index=all_unique_values)
    # confusion_matrix.to_csv('Ensemble_conf_matrix.csv')
    # with open(output_file_name, "a") as f:print(confusion_matrix,file=f)
    print(confusion_matrix)
    with open(output_file_name, "a") as f: print('Confusion Matrix', file = f)

    with open(output_file_name, "a") as f: print(confusion_matrix, file = f)


    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.values.sum() - (FP + FN + TP)
    TP_total = sum(TP)
    TN_total = sum(TN)
    FP_total = sum(FP)
    FN_total = sum(FN)

    TP_total = np.array(TP_total,dtype=np.float64)
    TN_total = np.array(TN_total,dtype=np.float64)
    FP_total = np.array(FP_total,dtype=np.float64)
    FN_total = np.array(FN_total,dtype=np.float64)



    #----------------------------------------------------------------#----------------------------------------------------------------

    print('---------------------------------------------------------------------------------')
    print('METRICS')
    print('---------------------------------------------------------------------------------')


    Acc = accuracy_score(y_test_01, pred_label)
    Precision = precision_score(y_test_01, pred_label, average='macro')
    Recall = recall_score(y_test_01, pred_label, average='macro')
    F1 =  f1_score(y_test_01, pred_label, average='macro')
    BACC = balanced_accuracy_score(y_test_01, pred_label)
    MCC = matthews_corrcoef(y_test_01, pred_label)


    # voting_acc_01 = Acc
    # voting_pre_01 = Precision
    # weighed_avg_rec_01 = Recall
    # weighed_avg_f1_01 = F1
    # weighed_avg_bacc_01 = BACC
    # weighed_avg_mcc_01 = MCC
    # with open(output_file_name, "a") as f:print('Accuracy total: ', Acc,file=f)
    print('Accuracy total: ', Acc)
    print('Precision total: ', Precision )
    print('Recall total: ', Recall )
    print('F1 total: ', F1 )
    print('BACC total: ', BACC)
    print('MCC total: ', MCC)

    with open(output_file_name, "a") as f: print('Accuracy total: ', Acc, file = f)
    with open(output_file_name, "a") as f: print('Precision total: ', Precision, file = f)
    with open(output_file_name, "a") as f: print('Recall total: ', Recall , file = f)
    with open(output_file_name, "a") as f: print('F1 total: ', F1, file = f)
    with open(output_file_name, "a") as f: print('BACC total: ', BACC , file = f)
    with open(output_file_name, "a") as f: print('MCC total: ', MCC, file = f)

    return Acc, Precision, Recall, F1, BACC, MCC

In [10]:

#---------------------------------------------------------------------
# Defining features of interest
print('---------------------------------------------------------------------------------')
print('Defining features of interest')
print('---------------------------------------------------------------------------------')
print('')

req_cols = ['FLOW_DURATION_MILLISECONDS','FIRST_SWITCHED',
            'TOTAL_FLOWS_EXP','TCP_WIN_MSS_IN','LAST_SWITCHED',
            'TCP_WIN_MAX_IN','TCP_WIN_MIN_IN','TCP_WIN_MIN_OUT',
           'PROTOCOL','TCP_WIN_MAX_OUT','TCP_FLAGS',
            'TCP_WIN_SCALE_OUT','TCP_WIN_SCALE_IN','SRC_TOS',
            'DST_TOS','FLOW_ID','L4_SRC_PORT','L4_DST_PORT',
           'MIN_IP_PKT_LEN','MAX_IP_PKT_LEN','TOTAL_PKTS_EXP',
           'TOTAL_BYTES_EXP','IN_BYTES','IN_PKTS','OUT_BYTES','OUT_PKTS',
            'ALERT']
#---------------------------------------------------------------------
#Load Databases from csv file
address = '/home/oarreche@ads.iu.edu/HITL/sensor/sensor_db'
print('Loading Database')
print('--------------------------------------------------')

fraction = 0.1
fraction2 = 0.01

#Denial of Service
df0 = pd.read_csv (address + '/dos-03-15-2022-15-44-32.csv', usecols=req_cols).sample(frac = fraction)
df1 = pd.read_csv (address + '/dos-03-16-2022-13-45-18.csv', usecols=req_cols).sample(frac = fraction)
df2 = pd.read_csv (address + '/dos-03-17-2022-16-22-53.csv', usecols=req_cols).sample(frac = fraction)
df3 = pd.read_csv (address + '/dos-03-18-2022-19-27-05.csv', usecols=req_cols).sample(frac = fraction)
df4 = pd.read_csv (address + '/dos-03-19-2022-20-01-53.csv', usecols=req_cols).sample(frac = fraction)
df5 = pd.read_csv (address + '/dos-03-20-2022-14-27-54.csv', usecols=req_cols).sample(frac = fraction)


#Malware
#df6 = pd.read_csv ('sensor_db/malware-03-25-2022-17-57-07.csv', usecols=req_cols)

#Normal
df7 = pd.read_csv  (address + '/normal-03-15-2022-15-43-44.csv', usecols=req_cols).sample(frac = fraction2)
df8 = pd.read_csv  (address + '/normal-03-16-2022-13-44-27.csv', usecols=req_cols).sample(frac = fraction2)
df9 = pd.read_csv  (address + '/normal-03-17-2022-16-21-30.csv', usecols=req_cols).sample(frac = fraction2)
df10 = pd.read_csv (address + '/normal-03-18-2022-19-17-31.csv', usecols=req_cols).sample(frac = fraction2)
df11 = pd.read_csv (address + '/normal-03-18-2022-19-25-48.csv', usecols=req_cols).sample(frac = fraction2)
df12 = pd.read_csv (address + '/normal-03-19-2022-20-01-16.csv', usecols=req_cols).sample(frac = fraction2)
df13 = pd.read_csv (address + '/normal-03-20-2022-14-27-30.csv', usecols=req_cols).sample(frac = fraction2)


#PortScanning
df14 = pd.read_csv  (address + '/portscanning-03-15-2022-15-44-06.csv', usecols=req_cols).sample(frac = fraction)
df15 = pd.read_csv  (address + '/portscanning-03-16-2022-13-44-50.csv', usecols=req_cols).sample(frac = fraction)
df16 = pd.read_csv  (address + '/portscanning-03-17-2022-16-22-53.csv', usecols=req_cols).sample(frac = fraction)
df17 = pd.read_csv  (address + '/portscanning-03-18-2022-19-27-05.csv', usecols=req_cols).sample(frac = fraction)
df18 = pd.read_csv  (address + '/portscanning-03-19-2022-20-01-45.csv', usecols=req_cols).sample(frac = fraction)
df19 = pd.read_csv  (address + '/portscanning-03-20-2022-14-27-49.csv', usecols=req_cols).sample(frac = fraction)


frames = [df0, df1, df2, df3, df4, df5, df7, df8, df9, df10, df11, df12, df13, df14, df15, df16, df17, df18, df19]


df = pd.concat(frames,ignore_index=True)

y = df.pop('ALERT')
X = df

df_max_scaled = X
for col in df_max_scaled.columns:
    t = abs(df_max_scaled[col].max())
    df_max_scaled[col] = df_max_scaled[col]/t
df_max_scaled
df = df_max_scaled.assign( Label = y)
#df
df = df.fillna(0)



# IG

# df.pop('TOTAL_FLOWS_EXP')
# df.pop('LAST_SWITCHED')
# df.pop('TCP_WIN_MAX_IN')
# df.pop('TCP_WIN_MIN_IN')
# df.pop('FIRST_SWITCHED')

# df.pop('L4_SRC_PORT')
# df.pop('TCP_WIN_SCALE_IN')
# df.pop('FLOW_ID')
# df.pop('L4_DST_PORT')
# df.pop('PROTOCOL')

# df.pop('TCP_WIN_MSS_IN')
# df.pop('TCP_WIN_MIN_OUT')
# df.pop('DST_TOS')
# df.pop('FLOW_DURATION_MILLISECONDS')
# df.pop('TCP_FLAGS')
# df.pop('TCP_WIN_MAX_OUT')
# df.pop('TCP_WIN_SCALE_OUT')
# df.pop('SRC_TOS')
# df.pop('IN_PKTS')
# df.pop('OUT_PKTS')
# df.pop('OUT_BYTES')

# df.pop('IN_BYTES')
# df.pop('MIN_IP_PKT_LEN')
# df.pop('MAX_IP_PKT_LEN')
# df.pop('TOTAL_PKTS_EXP')
# df.pop('TOTAL_BYTES_EXP')


# LRP 
# df.pop('TOTAL_FLOWS_EXP')
# df.pop('LAST_SWITCHED')
# df.pop('TCP_WIN_MIN_IN')
# df.pop('TCP_WIN_MAX_IN')
# df.pop('FIRST_SWITCHED')

# df.pop('TCP_WIN_SCALE_IN')
# df.pop('L4_SRC_PORT')
# df.pop('L4_DST_PORT')
# df.pop('FLOW_ID')
# df.pop('PROTOCOL')

# df.pop('TCP_WIN_MSS_IN')
# df.pop('TCP_FLAGS')
# df.pop('FLOW_DURATION_MILLISECONDS')
# df.pop('DST_TOS')
# df.pop('TCP_WIN_MIN_OUT')
# df.pop('TCP_WIN_MAX_OUT')
# df.pop('TCP_WIN_SCALE_OUT')
# df.pop('SRC_TOS')
# df.pop('IN_PKTS')
# df.pop('OUT_PKTS')
# df.pop('OUT_BYTES')

# df.pop('IN_BYTES')
# df.pop('MIN_IP_PKT_LEN')
# df.pop('MAX_IP_PKT_LEN')
# df.pop('TOTAL_PKTS_EXP')
# df.pop('TOTAL_BYTES_EXP')


# SHAP
# df.pop('TCP_WIN_MIN_IN')
# df.pop('TCP_WIN_MIN_OUT')
# df.pop('TCP_WIN_SCALE_OUT')
# df.pop('FLOW_DURATION_MILLISECONDS')
# df.pop('TCP_WIN_MAX_IN')

# df.pop('TCP_WIN_SCALE_IN')
# df.pop('TCP_WIN_MAX_OUT')
# df.pop('FLOW_ID')
# df.pop('L4_SRC_PORT')
# df.pop('TOTAL_FLOWS_EXP')

# df.pop('TCP_FLAGS')
# df.pop('TCP_WIN_MSS_IN')
# df.pop('LAST_SWITCHED')
# df.pop('FIRST_SWITCHED')
# df.pop('IN_PKTS')
# df.pop('OUT_PKTS')
# df.pop('IN_BYTES')
# df.pop('OUT_BYTES')
# df.pop('L4_DST_PORT')
# df.pop('SRC_TOS')
# df.pop('DST_TOS')

# df.pop('PROTOCOL')
# df.pop('MIN_IP_PKT_LEN')
# df.pop('MAX_IP_PKT_LEN')
# df.pop('TOTAL_PKTS_EXP')
# df.pop('TOTAL_BYTES_EXP')
#---------------------------------------------------------------------

# Separate features and labels 
print('---------------------------------------------------------------------------------')
print('Separating features and labels')
print('---------------------------------------------------------------------------------')
print('')

y = df.pop('Label')
X = df
# summarize class distribution
counter = Counter(y)
print(counter)
# transform the dataset
print('---------------------------------------------------------------------------------')
result_list = [counter['None'],counter['Denial of Service'], counter['Port Scanning']]
print('number of Labels  ',result_list)
print('---------------------------------------------------------------------------------')

df = X.assign( Label = y)

#---------------------------------------------------------------------

# Separate Training and Testing db
print('---------------------------------------------------------------------------------')
print('Separating Training and Testing db')
print('---------------------------------------------------------------------------------')
print('')

df['is_train'] = np.random.uniform(0, 1, len(df)) <= .70
#print(df.head())

train, test = df[df['is_train']==True], df[df['is_train']==False]
print('Number of the training data:', len(train))
print('Number of the testing data:', len(test))

features = df.columns[:len(req_cols)-1]

y_train, label = pd.factorize(train['Label'])
y_test, label = pd.factorize(test['Label'])

#---------------------------------------------------------------------
# Defining the DNN model

df_y, df_label = pd.factorize(df['Label'])


df2 = df.drop(columns=['Label','is_train'])
df2['Label'] = df_y
df2
train = train.drop(columns=['Label','is_train'])

test.pop('is_train')
test.pop('Label')
y_test
y_train

X_train = train
X_test = test


---------------------------------------------------------------------------------
Defining features of interest
---------------------------------------------------------------------------------

Loading Database
--------------------------------------------------
---------------------------------------------------------------------------------
Separating features and labels
---------------------------------------------------------------------------------

Counter({'Denial of Service': 642515, 'Port Scanning': 417040, 'None': 195521})
---------------------------------------------------------------------------------
number of Labels   [195521, 642515, 417040]
---------------------------------------------------------------------------------
---------------------------------------------------------------------------------
Separating Training and Testing db
---------------------------------------------------------------------------------

Number of the training data: 878476
Number of the tes

In [11]:

print('---------------------------------------------------------------------------------')
print('Defining the DNN model')
print('---------------------------------------------------------------------------------')
print('')

# Define the number of nodes per layer
nodes_first_layer = 128
nodes_second_layer = 64
nodes_third_layer = 32

model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(len(X_train.columns,))))

# First dense layer
model.add(tf.keras.layers.Dense(nodes_first_layer, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))  # Dropout layer follows the first dense layer

# Second dense layer
model.add(tf.keras.layers.Dense(nodes_second_layer, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))  # Dropout layer follows the second dense layer

# Third dense layer
model.add(tf.keras.layers.Dense(nodes_third_layer, activation='relu'))

# Output layer
model.add(tf.keras.layers.Dense(3))  # Cannot use softmax, it is imconpatible with innvestigate

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


---------------------------------------------------------------------------------
Defining the DNN model
---------------------------------------------------------------------------------



In [12]:


print('---------------------------------------------------------------------------------')
print('Training the model')
print('---------------------------------------------------------------------------------')
print('')

# Define a learning rate scheduler
def lr_schedule(epoch, lr):
    if epoch > 10:
        return lr * 0.1
    return lr

lr_scheduler = LearningRateScheduler(lr_schedule, verbose=1)

start = time.time()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='accuracy', patience=10, restore_best_weights=True)

# Modify model.fit to include the EarlyStopping callback
model.fit(X_train, y_train, epochs=1000, batch_size=len(X_train), callbacks=[early_stopping,lr_scheduler])

end = time.time()
print('---------------------------------------------------------------------------------')
print('ELAPSE TIME TRAINING MODEL: ',(end - start)/60, 'min')
print('---------------------------------------------------------------------------------')
print('')


---------------------------------------------------------------------------------
Training the model
---------------------------------------------------------------------------------

Train on 878476 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/1000

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/1000

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/1000

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 4/1000

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 5/1000

Epoch 00006: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 6/1000

Epoch 00007: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 7/1000

Epoch 00008: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 8/1000

Epoch 0

In [13]:

print('---------------------------------------------------------------------------------')
print('Model Prediction')
print('---------------------------------------------------------------------------------')
print('')
print('---------------------------------------------------------------------------------')
start = time.time()
y_pred = model.predict(X_test)
end = time.time()
print('ELAPSE TIME MODEL PREDICTION: ',(end - start)/60, 'min')
print('---------------------------------------------------------------------------------')
print('')

#print(y_pred)
ynew = np.argmax(y_pred,axis = 1)
#print(ynew)
score = model.evaluate(X_test, y_test,verbose=1)
#print(score)
pred_label = label[ynew]
#print(score)

#---------------------------------------------------------------------

accuracy =accuracy_score(y_test, ynew)*100
print(accuracy)

label_counts = Counter(y_test)
print(label_counts)

label_counts = Counter(ynew)
print(label_counts)

accuracy =accuracy_score(y_test, ynew)*100
print(accuracy)

---------------------------------------------------------------------------------
Model Prediction
---------------------------------------------------------------------------------

---------------------------------------------------------------------------------


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.


ELAPSE TIME MODEL PREDICTION:  0.20019287665685018 min
---------------------------------------------------------------------------------

84.82819968135954
Counter({0: 192329, 2: 125302, 1: 58969})
Counter({0: 201598, 2: 172875, 1: 2127})
84.82819968135954


In [14]:
confusion_metrics('dnn', ynew, y_test)


---------------------------------------------------------------------------------
CONFUSION MATRIX
---------------------------------------------------------------------------------
          0       1         2
0  192034.0     0.0     295.0
1    9564.0  2127.0   47278.0
2       0.0     0.0  125302.0
---------------------------------------------------------------------------------
METRICS
---------------------------------------------------------------------------------
Accuracy total:  0.8482819968135953
Precision total:  0.8924572597077574
Recall total:  0.6781786564448292
F1 total:  0.6283514903077259
BACC total:  0.6781786564448292
MCC total:  0.7645696095568848


(0.8482819968135953,
 0.8924572597077574,
 0.6781786564448292,
 0.6283514903077259,
 0.6781786564448292,
 0.7645696095568848)

In [15]:
single_sample_df = X_test.iloc[[0]]

## Integrated Gradients


In [16]:
%%time
# Create an analyzer for the model
analyzer = innvestigate.create_analyzer("integrated_gradients", model)

# Perform LRP analysis on the input data
analysis = analyzer.analyze(X_test)
#uncomment for single sample
# analysis = analyzer.analyze(single_sample_df)

# Perform LRP analysis on a certain number of samples
# analysis = analyzer.analyze(X_test.sample(100))

# Print or use the analysis results as needed
print(analysis)

print(len(X_test))
print(len(X_test.columns))
names = X_test.columns
print(analysis.shape)
print(type(analysis))
scores = pd.DataFrame(analysis)
print(analysis)
scores_abs = scores.abs()

# Calculate the sum of each column
sum_of_columns = scores_abs.sum(axis=0)

names = list(names)

sum_of_columns = list(sum_of_columns)

# Zip the two lists together
combined = list(zip(names, sum_of_columns))

# Sort the combined list in descending order based on the values from sum_of_columns
sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)

# Unzip the sorted_combined list to separate names and sum_of_columns
sorted_names, sorted_sum_of_columns = zip(*sorted_combined)

print(sorted_names)
print(sorted_sum_of_columns)


sorted_names


`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.


[[ 0.064 -0.04  -0.    ...  0.     0.    -0.   ]
 [ 0.05  -0.035 -0.    ...  0.     0.    -0.   ]
 [ 0.039 -0.084 -0.    ...  0.     0.    -0.   ]
 ...
 [ 0.032  0.096  0.08  ... -0.     0.    -0.   ]
 [ 0.049  0.091  0.021 ... -0.    -0.    -0.   ]
 [ 0.039  0.094  0.022 ... -0.    -0.    -0.   ]]
376600
26
(376600, 26)
<class 'numpy.ndarray'>
[[ 0.064 -0.04  -0.    ...  0.     0.    -0.   ]
 [ 0.05  -0.035 -0.    ...  0.     0.    -0.   ]
 [ 0.039 -0.084 -0.    ...  0.     0.    -0.   ]
 ...
 [ 0.032  0.096  0.08  ... -0.     0.    -0.   ]
 [ 0.049  0.091  0.021 ... -0.    -0.    -0.   ]
 [ 0.039  0.094  0.022 ... -0.    -0.    -0.   ]]
('LAST_SWITCHED', 'FIRST_SWITCHED', 'FLOW_ID', 'L4_SRC_PORT', 'TCP_WIN_MIN_IN', 'TCP_WIN_MAX_IN', 'TOTAL_FLOWS_EXP', 'L4_DST_PORT', 'PROTOCOL', 'TCP_WIN_SCALE_IN', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'TCP_WIN_MIN_OUT', 'TCP_WIN_MAX_OUT', 'TCP_WIN_SCALE_OUT', 'TCP_WIN_MSS_IN', 'DST_TOS', 'SRC_TOS', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES',

('LAST_SWITCHED',
 'FIRST_SWITCHED',
 'FLOW_ID',
 'L4_SRC_PORT',
 'TCP_WIN_MIN_IN',
 'TCP_WIN_MAX_IN',
 'TOTAL_FLOWS_EXP',
 'L4_DST_PORT',
 'PROTOCOL',
 'TCP_WIN_SCALE_IN',
 'TCP_FLAGS',
 'FLOW_DURATION_MILLISECONDS',
 'TCP_WIN_MIN_OUT',
 'TCP_WIN_MAX_OUT',
 'TCP_WIN_SCALE_OUT',
 'TCP_WIN_MSS_IN',
 'DST_TOS',
 'SRC_TOS',
 'OUT_PKTS',
 'OUT_BYTES',
 'IN_PKTS',
 'IN_BYTES',
 'MIN_IP_PKT_LEN',
 'MAX_IP_PKT_LEN',
 'TOTAL_PKTS_EXP',
 'TOTAL_BYTES_EXP')

## LRP


In [17]:
%%time

# Create an analyzer for the model
analyzer = innvestigate.create_analyzer("lrp.z", model)

# Perform LRP analysis on the input data
analysis = analyzer.analyze(X_test)

#uncomment for single sample
# analysis = analyzer.analyze(single_sample_df)

# Perform LRP analysis on a certain number of samples
# analysis = analyzer.analyze(X_test.sample(2500))


# Print or use the analysis results as needed
print(analysis)
print(len(X_test))
print(len(X_test.columns))
names = X_test.columns
print(analysis.shape)
print(type(analysis))
scores = pd.DataFrame(analysis)
print(analysis)
scores_abs = scores.abs()

# Calculate the sum of each column
sum_of_columns = scores_abs.sum(axis=0)

names = list(names)

sum_of_columns = list(sum_of_columns)

# Zip the two lists together
combined = list(zip(names, sum_of_columns))

# Sort the combined list in descending order based on the values from sum_of_columns
sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)

# Unzip the sorted_combined list to separate names and sum_of_columns
sorted_names, sorted_sum_of_columns = zip(*sorted_combined)

print(sorted_names)
print(sorted_sum_of_columns)

sorted_names


[[ 0.077 -0.034 -0.    ...  0.     0.    -0.   ]
 [ 0.052 -0.025 -0.    ...  0.     0.    -0.   ]
 [ 0.036 -0.085 -0.    ... -0.     0.    -0.   ]
 ...
 [ 0.034  0.102  0.088 ...  0.     0.    -0.   ]
 [ 0.076  0.076  0.024 ...  0.    -0.    -0.   ]
 [ 0.078  0.08   0.025 ...  0.    -0.    -0.   ]]
376600
26
(376600, 26)
<class 'numpy.ndarray'>
[[ 0.077 -0.034 -0.    ...  0.     0.    -0.   ]
 [ 0.052 -0.025 -0.    ...  0.     0.    -0.   ]
 [ 0.036 -0.085 -0.    ... -0.     0.    -0.   ]
 ...
 [ 0.034  0.102  0.088 ...  0.     0.    -0.   ]
 [ 0.076  0.076  0.024 ...  0.    -0.    -0.   ]
 [ 0.078  0.08   0.025 ...  0.    -0.    -0.   ]]
('FIRST_SWITCHED', 'LAST_SWITCHED', 'FLOW_ID', 'TCP_WIN_MAX_IN', 'TOTAL_FLOWS_EXP', 'L4_SRC_PORT', 'TCP_WIN_MIN_IN', 'L4_DST_PORT', 'PROTOCOL', 'TCP_WIN_SCALE_IN', 'FLOW_DURATION_MILLISECONDS', 'TCP_WIN_MSS_IN', 'TCP_WIN_MAX_OUT', 'TCP_FLAGS', 'TCP_WIN_MIN_OUT', 'TCP_WIN_SCALE_OUT', 'DST_TOS', 'SRC_TOS', 'OUT_PKTS', 'OUT_BYTES', 'IN_PKTS', 'IN_BYTES',

('FIRST_SWITCHED',
 'LAST_SWITCHED',
 'FLOW_ID',
 'TCP_WIN_MAX_IN',
 'TOTAL_FLOWS_EXP',
 'L4_SRC_PORT',
 'TCP_WIN_MIN_IN',
 'L4_DST_PORT',
 'PROTOCOL',
 'TCP_WIN_SCALE_IN',
 'FLOW_DURATION_MILLISECONDS',
 'TCP_WIN_MSS_IN',
 'TCP_WIN_MAX_OUT',
 'TCP_FLAGS',
 'TCP_WIN_MIN_OUT',
 'TCP_WIN_SCALE_OUT',
 'DST_TOS',
 'SRC_TOS',
 'OUT_PKTS',
 'OUT_BYTES',
 'IN_PKTS',
 'IN_BYTES',
 'MIN_IP_PKT_LEN',
 'MAX_IP_PKT_LEN',
 'TOTAL_PKTS_EXP',
 'TOTAL_BYTES_EXP')

## DeepLift


In [18]:
%%time
output_file_name = 'SHAPCIC.txt'
with open(output_file_name, "w") as f:print('',file = f)

print('---------------------------------------------------------------------------------')
print('Generating Explainer')
print('---------------------------------------------------------------------------------')

samples = 2500

#uncomment for single sample
# samples = 1

Label = label

test = X_test
train = X_train

start_index = 0
end_index = samples

explainer = shap.DeepExplainer(model,train[start_index:end_index].values.astype('float'))
shap_values = explainer.shap_values(test[start_index:end_index].values.astype('float'))
# shap_values = explainer.shap_values(test[start_index:len(test)].values.astype('float'))

vals= np.abs(shap_values).mean(1)
feature_importance = pd.DataFrame(list(zip(train.columns, sum(vals))), columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'], ascending=False,inplace=True)
feature_importance.head()
print(feature_importance.to_string())



print('---------------------------------------------------------------------------------')
# feature_importance_vals = 'feature_importance_vals'  # Replace with the name of the column you want to extract
feature_val = feature_importance['feature_importance_vals'].tolist()

# col_name = 'col_name'  # Replace with the name of the column you want to extract
feature_name = feature_importance['col_name'].tolist()

# Use zip to combine the two lists, sort based on list1, and then unzip them
zipped_lists = list(zip(feature_name, feature_val))
zipped_lists.sort(key=lambda x: x[1],reverse=True)

# Convert the sorted result back into separate lists
sorted_list1, sorted_list2 = [list(x) for x in zip(*zipped_lists)]

for k in sorted_list1:
  with open(output_file_name, "a") as f:print("df.pop('",k,"')", sep='',file = f)
with open(output_file_name, "a") as f:print("Trial_ =[", file = f)
for k in sorted_list1:
  with open(output_file_name, "a") as f:print("'",k,"',", sep='', file = f)
  print("'",k,"',", sep='')
with open(output_file_name, "a") as f:print("]", file = f)
print('---------------------------------------------------------------------------------')



---------------------------------------------------------------------------------
Generating Explainer
---------------------------------------------------------------------------------



Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.


                      col_name  feature_importance_vals
4   FLOW_DURATION_MILLISECONDS             7.109965e-02
10              TCP_WIN_MIN_IN             6.256452e-02
9              TCP_WIN_MAX_OUT             4.563568e-02
8               TCP_WIN_MAX_IN             2.987764e-02
14           TCP_WIN_SCALE_OUT             2.458372e-02
13            TCP_WIN_SCALE_IN             2.019242e-02
11             TCP_WIN_MIN_OUT             1.477787e-02
1                  L4_SRC_PORT             1.316693e-02
7                    TCP_FLAGS             9.503863e-03
0                      FLOW_ID             2.083951e-03
17             TOTAL_FLOWS_EXP             1.337469e-03
12              TCP_WIN_MSS_IN             4.277693e-04
25                    OUT_PKTS             5.095696e-06
5                LAST_SWITCHED             3.419828e-06
3               FIRST_SWITCHED             3.293029e-06
23                     IN_PKTS             2.812239e-06
22                    IN_BYTES             1.931