In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import explained_variance_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

In [5]:
# Load the labelled network-flow dataset from the Kaggle input directory.
DATASET_CSV = '/kaggle/input/ip-network-traffic-flows-labeled-with-87-apps/Dataset-Unicauca-Version2-87Atts.csv'

df_main = pd.read_csv(DATASET_CSV)

In [6]:
# Preview the first rows of the freshly loaded frame.
df_main.head()

Unnamed: 0,Flow.ID,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Timestamp,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,...,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,Label,L7Protocol,ProtocolName
0,172.19.1.46-10.200.7.7-52422-3128-6,172.19.1.46,52422,10.200.7.7,3128,6,26/04/201711:11:17,45523,22,55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY
1,172.19.1.46-10.200.7.7-52422-3128-6,10.200.7.7,3128,172.19.1.46,52422,6,26/04/201711:11:17,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY
2,10.200.7.217-50.31.185.39-38848-80-6,50.31.185.39,80,10.200.7.217,38848,6,26/04/201711:11:17,1,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,7,HTTP
3,10.200.7.217-50.31.185.39-38848-80-6,50.31.185.39,80,10.200.7.217,38848,6,26/04/201711:11:17,217,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,7,HTTP
4,192.168.72.43-10.200.7.7-55961-3128-6,192.168.72.43,55961,10.200.7.7,3128,6,26/04/201711:11:17,78068,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY


In [7]:
# (rows, columns) of the loaded frame.
df_main.shape

(3577296, 87)

In [8]:
# List the column names available in the loaded frame.
df_main.columns

Index(['Flow.ID', 'Source.IP', 'Source.Port', 'Destination.IP',
       'Destination.Port', 'Protocol', 'Timestamp', 'Flow.Duration',
       'Total.Fwd.Packets', 'Total.Backward.Packets',
       'Total.Length.of.Fwd.Packets', 'Total.Length.of.Bwd.Packets',
       'Fwd.Packet.Length.Max', 'Fwd.Packet.Length.Min',
       'Fwd.Packet.Length.Mean', 'Fwd.Packet.Length.Std',
       'Bwd.Packet.Length.Max', 'Bwd.Packet.Length.Min',
       'Bwd.Packet.Length.Mean', 'Bwd.Packet.Length.Std', 'Flow.Bytes.s',
       'Flow.Packets.s', 'Flow.IAT.Mean', 'Flow.IAT.Std', 'Flow.IAT.Max',
       'Flow.IAT.Min', 'Fwd.IAT.Total', 'Fwd.IAT.Mean', 'Fwd.IAT.Std',
       'Fwd.IAT.Max', 'Fwd.IAT.Min', 'Bwd.IAT.Total', 'Bwd.IAT.Mean',
       'Bwd.IAT.Std', 'Bwd.IAT.Max', 'Bwd.IAT.Min', 'Fwd.PSH.Flags',
       'Bwd.PSH.Flags', 'Fwd.URG.Flags', 'Bwd.URG.Flags', 'Fwd.Header.Length',
       'Bwd.Header.Length', 'Fwd.Packets.s', 'Bwd.Packets.s',
       'Min.Packet.Length', 'Max.Packet.Length', 'Packet.Length.Mean',
  

In [9]:
# Collect every column that holds exactly one distinct (non-null) value.
single_unique_cols = [
    name for name, n_distinct in df_main.nunique().items() if n_distinct == 1
]
df_main.drop(columns=single_unique_cols, inplace=True)

# Also remove the timestamp and the flow identifier columns.
df_main.drop(columns=['Timestamp', 'Flow.ID'], inplace=True)

In [10]:
# Column names that remain in df_main at this point.
df_main.columns

Index(['Source.IP', 'Source.Port', 'Destination.IP', 'Destination.Port',
       'Protocol', 'Flow.Duration', 'Total.Fwd.Packets',
       'Total.Backward.Packets', 'Total.Length.of.Fwd.Packets',
       'Total.Length.of.Bwd.Packets', 'Fwd.Packet.Length.Max',
       'Fwd.Packet.Length.Min', 'Fwd.Packet.Length.Mean',
       'Fwd.Packet.Length.Std', 'Bwd.Packet.Length.Max',
       'Bwd.Packet.Length.Min', 'Bwd.Packet.Length.Mean',
       'Bwd.Packet.Length.Std', 'Flow.Bytes.s', 'Flow.Packets.s',
       'Flow.IAT.Mean', 'Flow.IAT.Std', 'Flow.IAT.Max', 'Flow.IAT.Min',
       'Fwd.IAT.Total', 'Fwd.IAT.Mean', 'Fwd.IAT.Std', 'Fwd.IAT.Max',
       'Fwd.IAT.Min', 'Bwd.IAT.Total', 'Bwd.IAT.Mean', 'Bwd.IAT.Std',
       'Bwd.IAT.Max', 'Bwd.IAT.Min', 'Fwd.PSH.Flags', 'Fwd.Header.Length',
       'Bwd.Header.Length', 'Fwd.Packets.s', 'Bwd.Packets.s',
       'Min.Packet.Length', 'Max.Packet.Length', 'Packet.Length.Mean',
       'Packet.Length.Std', 'Packet.Length.Variance', 'FIN.Flag.Count',
       'SYN.

In [11]:
# Print the number of distinct values in each of the two label columns.
for label_column in ('L7Protocol', 'ProtocolName'):
    print(df_main[label_column].nunique())

78
78


In [12]:
# Remove pandas' row/column display limits (a global setting that stays in effect).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Number of rows for every (L7Protocol, ProtocolName) pair.
flows_per_protocol = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(flows_per_protocol)

L7Protocol  ProtocolName     
1           FTP_CONTROL              25
5           DNS                    1695
7           HTTP                 683734
9           NTP                     135
11          NFS                       1
13          BGP                      11
14          SNMP                      4
36          EDONKEY                  95
37          BITTORRENT               10
40          CONTENT_FLASH          8589
48          QQ                        2
51          MAIL_IMAPS                9
60          HTTP_DOWNLOAD           516
64          SSL_NO_CERT             856
67          UNENCRYPED_JABBER        45
68          MSN                   14478
69          OSCAR                     7
70          YAHOO                 21268
81          IP_ICMP                1631
85          IP_OSPF                   5
91          SSL                  404883
92          SSH                     102
114         MSSQL                    21
119         FACEBOOK              29033
120       

In [13]:
# Keep only the rows whose Destination.IP, read as text, does not begin with '10.'.
destination_text = df_main['Destination.IP'].astype(str)
df_main = df_main[~destination_text.str.startswith('10.')]
del destination_text

In [14]:
# Remove pandas' row/column display limits (a global setting that stays in effect).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Number of rows for every (L7Protocol, ProtocolName) pair in the current df_main.
flows_per_protocol = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(flows_per_protocol)

L7Protocol  ProtocolName     
5           DNS                    1593
7           HTTP                 605068
11          NFS                       1
13          BGP                       5
36          EDONKEY                  55
37          BITTORRENT                8
40          CONTENT_FLASH          8371
48          QQ                        2
51          MAIL_IMAPS                9
60          HTTP_DOWNLOAD           291
64          SSL_NO_CERT             812
67          UNENCRYPED_JABBER        39
68          MSN                    8395
69          OSCAR                     7
70          YAHOO                 10794
85          IP_OSPF                   3
91          SSL                  359968
92          SSH                      86
114         MSSQL                     2
119         FACEBOOK              15117
120         TWITTER                9495
121         DROPBOX                5126
122         GMAIL                 19151
123         GOOGLE_MAPS             401
124       

In [15]:
# Remove every protocol that occurs in fewer than 100000 rows.
protocol_sizes = df_main['ProtocolName'].value_counts()
rare_protocols = protocol_sizes.loc[lambda sizes: sizes < 100000].index
df_main = df_main[~df_main['ProtocolName'].isin(rare_protocols)]

# The helper objects are not needed afterwards.
del protocol_sizes, rare_protocols

In [16]:
# grouped = df_main.groupby('ProtocolName')

# def filter_group(group):
#     if len(group) > 100000:
#         return group.sample(n=90000, random_state=1)  # Randomly select 90,000 rows
#     else:
#         return group  # Keep all rows if less than or equal to 100,000

# # Apply the filter function to each group and concatenate the results
# filtered_df = pd.concat([filter_group(group) for _, group in grouped])

# # Reset the index of the resulting DataFrame
# filtered_df.reset_index(drop=True, inplace=True)

# df_main = filtered_df

# del grouped
# del filter_group
# del filtered_df

In [17]:
# Remove pandas' row/column display limits (a global setting that stays in effect).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Number of rows for every (L7Protocol, ProtocolName) pair in the current df_main.
flows_per_protocol = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(flows_per_protocol)

L7Protocol  ProtocolName
7           HTTP            605068
91          SSL             359968
126         GOOGLE          437261
131         HTTP_PROXY      185384
dtype: int64


In [18]:
# grouped = df_main.groupby('ProtocolName')

# def filter_group(group):
#     if len(group) > 150000:
#         return group.sample(n=150000, random_state=1)  # Randomly select 150,000 rows
#     else:
#         return group  # Keep all rows if less than or equal to 150,000

# # Apply the filter function to each group and concatenate the results
# filtered_df = pd.concat([filter_group(group) for _, group in grouped])

# # Reset the index of the resulting DataFrame
# filtered_df.reset_index(drop=True, inplace=True)

# df_main = filtered_df

# del grouped
# del filter_group
# del filtered_df

In [19]:
# df_main['Source.IP'] = df_main['Source.IP'].apply(lambda x: int(x.replace('.', '')))
# df_main['Destination.IP'] = df_main['Destination.IP'].apply(lambda x: int(x.replace('.', '')))

import ipaddress


def ip_to_integer(ip):
    """Return the integer value of an IPv4 address.

    Parameters
    ----------
    ip : value accepted by ``ipaddress.IPv4Address`` (for example a dotted-quad string)

    Returns
    -------
    int or None
        The address as an integer, or ``None`` when ``ipaddress`` rejects the
        value with ``AddressValueError``.
    """
    try:
        parsed = ipaddress.IPv4Address(ip)
    except ipaddress.AddressValueError:
        # Invalid addresses are mapped to None rather than raising.
        return None
    return int(parsed)
    
# Replace both address columns with their integer form (None where the address is invalid).
# df_main was produced by boolean filtering in earlier cells, and writing a column
# directly into such a frame triggers SettingWithCopyWarning on pandas versions
# without copy-on-write. assign() builds a new frame instead, keeping the column order.
df_main = df_main.assign(**{
    'Destination.IP': df_main['Destination.IP'].apply(ip_to_integer),
    'Source.IP': df_main['Source.IP'].apply(ip_to_integer),
})

In [20]:
# Preview the first rows of df_main in its current state.
df_main.head()

Unnamed: 0,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,Total.Length.of.Fwd.Packets,Total.Length.of.Bwd.Packets,Fwd.Packet.Length.Max,Fwd.Packet.Length.Min,Fwd.Packet.Length.Mean,Fwd.Packet.Length.Std,Bwd.Packet.Length.Max,Bwd.Packet.Length.Min,Bwd.Packet.Length.Mean,Bwd.Packet.Length.Std,Flow.Bytes.s,Flow.Packets.s,Flow.IAT.Mean,Flow.IAT.Std,Flow.IAT.Max,Flow.IAT.Min,Fwd.IAT.Total,Fwd.IAT.Mean,Fwd.IAT.Std,Fwd.IAT.Max,Fwd.IAT.Min,Bwd.IAT.Total,Bwd.IAT.Mean,Bwd.IAT.Std,Bwd.IAT.Max,Bwd.IAT.Min,Fwd.PSH.Flags,Fwd.Header.Length,Bwd.Header.Length,Fwd.Packets.s,Bwd.Packets.s,Min.Packet.Length,Max.Packet.Length,Packet.Length.Mean,Packet.Length.Std,Packet.Length.Variance,FIN.Flag.Count,SYN.Flag.Count,RST.Flag.Count,PSH.Flag.Count,ACK.Flag.Count,URG.Flag.Count,ECE.Flag.Count,Down.Up.Ratio,Average.Packet.Size,Avg.Fwd.Segment.Size,Avg.Bwd.Segment.Size,Fwd.Header.Length.1,Subflow.Fwd.Packets,Subflow.Fwd.Bytes,Subflow.Bwd.Packets,Subflow.Bwd.Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active.Mean,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,L7Protocol,ProtocolName
1,180881159,3128,2886926638,52422,6,1,2,0,12,0.0,6,6,6.0,0.0,0,0,0.0,0.0,12000000.0,2000000.0,1.0,0.0,1.0,1,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,40,0,2000000.0,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,9.0,6.0,0.0,40,2,12,0,0,490,-1,1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,HTTP_PROXY
5,180881158,3128,2886926648,50004,6,105069,136,0,313554,0.0,5840,6,2305.544118,1220.822406,0,0,0.0,0.0,2984267.0,1294.387,778.288889,4540.424367,51842.0,0,105069.0,778.288889,4540.424367,51842.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2720,0,1294.387,0.0,6,5840,2299.372263,1218.46914,1484667.0,0,0,0,0,1,0,0,0,2316.279412,2305.544118,0.0,2720,136,313554,0,0,254,-1,135,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,HTTP_PROXY
10,180881156,3128,3232281651,57740,6,205118,32,4,6494,3118.0,1460,6,202.9375,464.035311,1460,51,779.5,735.079361,46860.83,175.5087,5860.514286,23425.849096,103196.0,0,205118.0,6616.709677,24836.180077,103196.0,0.0,99727.0,33242.333333,55644.750393,97490.0,400.0,1,640,80,156.0078,19.50097,6,1460,260.837838,514.477864,264687.5,0,1,0,0,1,0,0,0,268.083333,202.9375,779.5,640,32,6494,4,3118,245,255,31,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,HTTP_PROXY
11,180881156,3128,3232281651,57740,6,3,5,0,9991,0.0,2920,6,1998.2,1267.837214,0,0,0.0,0.0,3330333000.0,1666667.0,0.75,0.5,1.0,0,3.0,0.75,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,100,0,1666667.0,0.0,6,2920,1908.5,1155.078136,1334206.0,0,0,0,0,1,0,0,0,2290.2,1998.2,0.0,100,5,9991,0,0,353,-1,4,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,HTTP_PROXY
12,180881156,3128,3232281651,57740,6,131,3,0,5611,0.0,4145,6,1870.333333,2099.788164,0,0,0.0,0.0,42832060.0,22900.76,65.5,91.216775,130.0,1,131.0,65.5,91.216775,130.0,1.0,0.0,0.0,0.0,0.0,0.0,0,60,0,22900.76,0.0,6,4145,1767.75,1726.702131,2981500.0,0,0,0,0,1,0,0,0,2357.0,1870.333333,0.0,60,3,5611,0,0,353,-1,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131,HTTP_PROXY


In [21]:
# Columns left out of the feature matrix: the two label columns plus a set of
# flow statistics that are not used as model inputs.
excluded_columns = [
    'ProtocolName',
    'Fwd.Packet.Length.Std', 'Bwd.Packet.Length.Std',
    'Fwd.IAT.Std', 'Bwd.IAT.Std',
    'Fwd.Header.Length', 'Bwd.Header.Length',
    'Packet.Length.Std', 'Packet.Length.Variance',
    'Avg.Fwd.Segment.Size', 'Avg.Bwd.Segment.Size',
    'Fwd.Header.Length.1',
    'Subflow.Fwd.Packets', 'Subflow.Fwd.Bytes',
    'Subflow.Bwd.Packets', 'Subflow.Bwd.Bytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
    'act_data_pkt_fwd', 'min_seg_size_forward',
    'L7Protocol',
    'Flow.IAT.Std', 'Min.Packet.Length', 'Max.Packet.Length',
    'Active.Std', 'Active.Max', 'Active.Min',
    'Idle.Std', 'Idle.Max', 'Idle.Min',
]
x = df_main.drop(columns=excluded_columns)
y = df_main['L7Protocol']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/validation/test
# split happens, so held-out rows contribute to the mean/variance — confirm this is intended.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

# Encode the protocol ids as consecutive integer class indices.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Original protocol id -> encoded class index.
encoded_classes = label_encoder.transform(label_encoder.classes_)
label_mapping = {
    original: encoded
    for original, encoded in zip(label_encoder.classes_, encoded_classes)
}

# Show the mapping, one class per line.
print("Label Mapping:")
for original_id, class_index in label_mapping.items():
    print(f"{original_id} -> {class_index}")

# Number of distinct target classes; used later to size the output layer.
ncategories = len(pd.unique(df_main['L7Protocol']))
print(ncategories)

Label Mapping:
7 -> 0
91 -> 1
126 -> 2
131 -> 3
4


In [22]:
# Shape of the encoded label array.
y.shape

(1587681,)

In [23]:
# Split the data into training, validation, and test sets
# Split the data: 70% training, then the remaining 30% is halved into
# validation (15%) and test (15%). Fixed random_state keeps the split repeatable.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# 'balanced' class weights computed from the training labels only.
class_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
print(class_weights)

# Pair each weight with the label it was computed for. Relying on the position
# (enumerate) would silently mis-assign weights if a class were missing from y_train.
class_weight_by_label = {
    int(label): float(weight) for label, weight in zip(class_labels, class_weights)
}

# The intermediate 30% split is no longer needed.
del x_temp
del y_temp

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Without an explicit device list MirroredStrategy uses every GPU visible to
# TensorFlow (and falls back to CPU when there is none), so the notebook no
# longer fails on machines that do not have exactly two GPUs.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Fully connected classifier; every hidden Dense layer is followed by batch normalization.
    model = Sequential([
        Input(shape=(x.shape[1],)),     # One input per feature column
        Dense(64, activation='relu'),   # 64 units, ReLU
        BatchNormalization(),
        Dense(128, activation='relu'),  # 128 units, ReLU
        BatchNormalization(),
        Dense(256, activation='relu'),  # 256 units, ReLU
        BatchNormalization(),
        Dense(64, activation='relu'),   # 64 units, ReLU
        BatchNormalization(),
        Dense(128, activation='relu'),  # 128 units, ReLU
        BatchNormalization(),
        Dense(256, activation='relu'),  # 256 units, ReLU
        BatchNormalization(),
        Dense(16, activation='relu'),   # 16 units, ReLU
        BatchNormalization(),
        Dense(ncategories, activation='softmax')  # One softmax probability per class
    ])

    # Targets are one-hot encoded below, hence categorical cross-entropy.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy', 'categorical_accuracy'])

# Train with early stopping; epochs=1000 is only an upper bound.
history = model.fit(
    x_train, tf.keras.utils.to_categorical(y_train, num_classes=ncategories),
    epochs=1000,
    batch_size=64,
    validation_data=(x_val, tf.keras.utils.to_categorical(y_val, num_classes=ncategories)),
    callbacks=[early_stopping],
    class_weight=class_weight_by_label  # Per-class loss weights
)

[0.65602584 1.10505069 0.90713019 2.13512641]
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000


In [24]:
# The model's last layer is a softmax, so predict() already returns class
# probabilities; applying softmax a second time would distort them.
train_probabilities = model.predict(x_train)
train_predictions = np.argmax(train_probabilities, axis=1)

# Fraction of training rows whose predicted class equals the true class.
train_accuracy = np.mean(train_predictions == y_train)

# Print training accuracy
print(f'Training Accuracy: {train_accuracy:.4f}')

# Release the large intermediate arrays.
del train_probabilities
del train_predictions
del train_accuracy

Training Accuracy: 0.9089


In [25]:
# The model's last layer is a softmax, so predict() already returns class
# probabilities; applying softmax a second time would distort them.
validation_probabilities = model.predict(x_val)
validation_predictions = np.argmax(validation_probabilities, axis=1)

# Fraction of validation rows whose predicted class equals the true class.
validation_accuracy = np.mean(validation_predictions == y_val)

# Print validation accuracy
print(f'Validation Accuracy: {validation_accuracy:.4f}')

# Release the intermediate arrays.
del validation_probabilities
del validation_predictions
del validation_accuracy

Validation Accuracy: 0.9087


In [26]:
# Evaluate the model on the test data and apply softmax for probabilities
# The model's last layer is a softmax, so predict() already returns class
# probabilities; applying softmax a second time would distort them.
test_probabilities = model.predict(x_test)

# Predicted class per row, computed once and reused for accuracy and the report.
test_predictions = np.argmax(test_probabilities, axis=1)

# Fraction of test rows whose predicted class equals the true class.
test_accuracy = np.mean(test_predictions == y_test)

# Print test accuracy
print(f'Test Accuracy: {test_accuracy:.4f}')

# Per-class precision / recall / F1 on the test set.
test_report = classification_report(y_test, test_predictions)

# Print the classification report
print("Classification Report for Test Data:")
print(test_report)

# Release the intermediate objects.
del test_probabilities
del test_predictions
del test_accuracy
del test_report

Test Accuracy: 0.9088
Classification Report for Test Data:
              precision    recall  f1-score   support

           0       0.91      0.94      0.92     90653
           1       0.94      0.92      0.93     54517
           2       0.93      0.94      0.93     65331
           3       0.80      0.72      0.76     27652

    accuracy                           0.91    238153
   macro avg       0.89      0.88      0.89    238153
weighted avg       0.91      0.91      0.91    238153

