In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import explained_variance_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, BatchNormalization
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset CSV into the working DataFrame `df_main`.
dataset_csv = '/kaggle/input/ip-network-traffic-flows-labeled-with-87-apps/Dataset-Unicauca-Version2-87Atts.csv'
df_main = pd.read_csv(dataset_csv)

In [None]:
# Preview the first rows of the freshly loaded frame.
df_main.head()

In [None]:
# (rows, columns) of the frame as loaded, before any column is dropped.
df_main.shape

In [None]:
# Column labels of the frame as loaded.
df_main.columns

In [None]:
# Identify the columns that contain exactly one distinct value
# (nunique() ignores NaN by default) and remove them in place.
distinct_counts = df_main.nunique()
single_unique_cols = distinct_counts[distinct_counts == 1].index.tolist()
df_main.drop(columns=single_unique_cols, inplace=True)

# Remove the timestamp and flow-identifier columns as well.
df_main.drop(columns=['Timestamp', 'Flow.ID'], inplace=True)

In [None]:
# Column labels that remain after the drops performed in the previous cell.
df_main.columns

In [None]:
# Print the number of distinct values in each of the two label columns.
for label_column in ('L7Protocol', 'ProtocolName'):
    print(df_main[label_column].nunique())

In [None]:
# Remove pandas' display limits so the full table below is printed untruncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of rows for every (L7Protocol, ProtocolName) combination.
protocol_pair_counts = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(protocol_pair_counts)

In [None]:
# df_main.groupby(['L7Protocol', 'ProtocolName']).size()
# print(df_main.groupby(['L7Protocol', 'ProtocolName']).size().shape)
# df_main['Destination.IP'].value_counts()
# df_main = df_main[~df_main['Destination.IP'].astype(str).str.startswith('10.')]
# df_main['Destination.IP'].value_counts()
# df_main.groupby(['L7Protocol', 'ProtocolName']).size()
# df_main.shape

In [None]:
# Remove pandas' display limits so the full table below is printed untruncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of rows for every (L7Protocol, ProtocolName) combination at this point.
protocol_pair_counts = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(protocol_pair_counts)

In [None]:
# value_counts = df_main['ProtocolName'].value_counts()
# to_remove = value_counts[value_counts < 100000].index
# df_main = df_main[~df_main.ProtocolName.isin(to_remove)]

# del value_counts
# del to_remove

In [None]:
# grouped = df_main.groupby('ProtocolName')

# def filter_group(group):
#     if len(group) > 100000:
#         return group.sample(n=90000, random_state=1)  # Randomly select 90,000 rows
#     else:
#         return group  # Keep all rows if less than or equal to 100,000

# # Apply the filter function to each group and concatenate the results
# filtered_df = pd.concat([filter_group(group) for _, group in grouped])

# # Reset the index of the resulting DataFrame
# filtered_df.reset_index(drop=True, inplace=True)

# df_main = filtered_df

# del grouped
# del filter_group
# del filtered_df

In [None]:
# Remove pandas' display limits so the full table below is printed untruncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Number of rows for every (L7Protocol, ProtocolName) combination at this point.
protocol_pair_counts = df_main.groupby(['L7Protocol', 'ProtocolName']).size()
print(protocol_pair_counts)

In [None]:
# df_main['Source.IP'] = df_main['Source.IP'].apply(lambda x: int(x.replace('.', '')))
# df_main['Destination.IP'] = df_main['Destination.IP'].apply(lambda x: int(x.replace('.', '')))

import ipaddress

def ip_to_integer(ip):
    """Return the integer form of an IPv4 address.

    Args:
        ip: Value accepted by ``ipaddress.IPv4Address`` (for example a
            dotted-quad string such as ``'10.0.0.1'``).

    Returns:
        The address as an ``int``, or ``None`` when ``ipaddress`` rejects
        the value with ``AddressValueError``.
    """
    try:
        return int(ipaddress.IPv4Address(ip))
    except ipaddress.AddressValueError:
        # Not a valid IPv4 address: signal it with None instead of raising.
        return None
    
# Replace both address columns with the values returned by ip_to_integer.
for ip_column in ('Destination.IP', 'Source.IP'):
    df_main[ip_column] = df_main[ip_column].apply(ip_to_integer)

In [None]:
# Preview the frame after the two IP columns were converted in the previous cell.
df_main.head()

In [None]:
# Columns left out of the feature matrix: the two label columns
# ('ProtocolName', 'L7Protocol') and the flow statistics listed below.
excluded_columns = [
    'ProtocolName',
    'Fwd.Packet.Length.Std',
    'Bwd.Packet.Length.Std',
    'Fwd.IAT.Std',
    'Bwd.IAT.Std',
    'Fwd.Header.Length',
    'Bwd.Header.Length',
    'Packet.Length.Std',
    'Packet.Length.Variance',
    'Avg.Fwd.Segment.Size',
    'Avg.Bwd.Segment.Size',
    'Fwd.Header.Length.1',
    'Subflow.Fwd.Packets',
    'Subflow.Fwd.Bytes',
    'Subflow.Bwd.Packets',
    'Subflow.Bwd.Bytes',
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
    'act_data_pkt_fwd',
    'min_seg_size_forward',
    'L7Protocol',
    'Flow.IAT.Std',
    'Min.Packet.Length',
    'Max.Packet.Length',
    'Active.Std',
    'Active.Max',
    'Active.Min',
    'Idle.Std',
    'Idle.Max',
    'Idle.Min',
]

x = df_main.drop(columns=excluded_columns)
y = df_main['L7Protocol']

# NOTE(review): the scaler is fitted on every row here, before the
# train/validation/test split performed later in the notebook, so the
# scaling statistics also include the held-out rows. Fitting on the
# training split only would remove that leak.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Encode the L7Protocol values as consecutive integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# classes_[i] is the original value that was encoded as i, so the mapping
# can be read straight off the fitted encoder without another transform.
label_mapping = {original: code for code, original in enumerate(label_encoder.classes_)}

# Print the mapping
print("Label Mapping:")
for value, label in label_mapping.items():
    print(f"{value} -> {label}")

# Take the class count from the encoder so the width of the model's output
# layer always matches the label space that `y` was encoded into.
ncategories = len(label_encoder.classes_)
print(ncategories)

In [None]:
# Shape of the encoded label array.
y.shape

In [None]:
# Split the data: 70% training, then the remaining 30% halved into
# validation (15%) and test (15%). Both splits use a fixed random_state.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

del x_temp
del y_temp

# Seed the Python, NumPy and backend random generators before the model is
# built, so weight initialisation and batch shuffling start from a fixed
# state, in line with the fixed random_state used for the split above.
keras.utils.set_random_seed(42)

# Stop once val_loss has not improved for 10 epochs and roll the model back
# to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Widths of the hidden Dense layers, in order. Each Dense layer uses ReLU
# and is followed by a BatchNormalization layer.
# NOTE(review): 1028 / 2056 look like they may have been meant as
# 1024 / 2048; kept unchanged so the architecture stays the same.
hidden_units = [64, 128, 256, 1028, 2056, 2056, 1028, 128]

model_layers = [keras.layers.Input(shape=(x.shape[1],))]
for units in hidden_units:
    model_layers.append(keras.layers.Dense(units, activation='relu'))
    model_layers.append(keras.layers.BatchNormalization())
# Output layer has no activation: it emits raw logits, matching
# from_logits=True in the loss below.
model_layers.append(keras.layers.Dense(ncategories))

model = keras.Sequential(model_layers)

# Integer class ids in y_* pair with the sparse categorical loss.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# epochs=1000 is only an upper bound; early stopping ends training sooner
# when val_loss stops improving.
history = model.fit(
    x_train, y_train,
    epochs=1000,
    batch_size=64,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping]
)

In [None]:
# The predicted class is the arg-max of the logits. Softmax does not change
# the arg-max, so the probabilities are not computed.
train_predictions = np.argmax(model.predict(x_train), axis=1)

# Fraction of training rows whose predicted class equals the true class
# (vectorised mean instead of Python's element-by-element sum()).
train_accuracy = np.mean(train_predictions == y_train)

# Print training accuracy
print(f'Training Accuracy: {train_accuracy:.4f}')

# Release the temporaries so they do not stay in the kernel namespace.
del train_predictions
del train_accuracy

In [None]:
# The predicted class is the arg-max of the logits. Softmax does not change
# the arg-max, so the probabilities are not computed.
validation_predictions = np.argmax(model.predict(x_val), axis=1)

# Fraction of validation rows whose predicted class equals the true class
# (vectorised mean instead of Python's element-by-element sum()).
validation_accuracy = np.mean(validation_predictions == y_val)

# Print validation accuracy
print(f'Validation Accuracy: {validation_accuracy:.4f}')

# Release the temporaries so they do not stay in the kernel namespace.
del validation_predictions
del validation_accuracy

In [None]:
# Predict the test set once and reuse the predicted class ids for both the
# accuracy and the classification report. The predicted class is the
# arg-max of the logits; softmax does not change it, so no probabilities
# are computed.
test_predictions = np.argmax(model.predict(x_test), axis=1)

# Fraction of test rows whose predicted class equals the true class
# (vectorised mean instead of Python's element-by-element sum()).
test_accuracy = np.mean(test_predictions == y_test)

# Print test accuracy
print(f'Test Accuracy: {test_accuracy:.4f}')

# Per-class precision / recall / F1. The row labels are the encoded class
# ids, not the original L7Protocol values.
test_report = classification_report(y_test, test_predictions)

# Print the classification report
print("Classification Report for Test Data:")
print(test_report)

# Release the temporaries so they do not stay in the kernel namespace.
del test_predictions
del test_accuracy
del test_report