# Exploration of data and Random Forest model - One day only

In [8]:
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px

# statistics
from statsmodels.graphics.mosaicplot import mosaic

# H2O machine learning
import h2o
from h2o.estimators import H2ORandomForestEstimator

import os
# Root folder of the dataset.
# NOTE(review): absolute, user-specific Windows path - must be adapted before running on another machine.
dataset_path = r"C:\Users\nithi\OneDrive\Desktop\TEAM\dataset"
# show the entries found directly below the dataset root
os.listdir(dataset_path)

['2020', '2020.06.19.csv', '2021', '2022', 'test.csv', 'train.csv']

In [9]:
import os

# Rebind dataset_path to the 2020 sub-folder and list its entries.
# NOTE(review): this overwrites the dataset root assigned in the previous cell.
dataset_path = r"C:\Users\nithi\OneDrive\Desktop\TEAM\dataset\2020"
os.listdir(dataset_path)


['06', '07', '08', '09', '10', '11', '12']

Ok, we have data from June to December 2020.

In [10]:
import os

# entries of the 2021 folder, printed as a plain Python list
directory_path = r'C:\Users\nithi\OneDrive\Desktop\TEAM\dataset\2021'
entries_2021 = os.listdir(directory_path)
print(entries_2021)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\nithi\\OneDrive\\Desktop\\dhurva\\dataset\\2021'

For each day of the month there is a separate folder

In [None]:
import pandas as pd

# Flow records of a single day (2020-10-01).
# The root folder matches the dataset location listed in the first cells of
# this notebook (".../TEAM/dataset"); the previous ".../dhurva/dataset" root
# is not the folder those listings were taken from.
# NOTE(review): absolute, user-specific Windows path - adapt before running elsewhere.
file_path = r"C:\Users\nithi\OneDrive\Desktop\TEAM\dataset\2020\10\2020.10.01\2020.10.01.csv"

try:
    df_example = pd.read_csv(file_path)
except PermissionError as e:
    print(f"PermissionError: {e}. Unable to read the file due to insufficient permissions.")
    # Re-raise: without df_example every following cell would fail with a
    # confusing NameError instead of pointing at the real problem.
    raise
except FileNotFoundError as e:
    print(f"FileNotFoundError: {e}. Check that file_path points to the downloaded dataset.")
    raise

# first rows of the loaded frame (rich notebook display)
df_example.head()


# Explorations

In [None]:
# dimensions of data: (number of rows, number of columns)
df_example.shape

### For this one day alone we already have ca. 1 million rows! Let's stay with this subset for the following analysis.

In [None]:
# missing port numbers are replaced by the sentinel -1, then both columns are stored as 64-bit integers
for port_col in ['dest_port', 'src_port']:
    df_example[port_col] = df_example[port_col].fillna(-1).astype('int64')

In [None]:
# summary statistics of the numerical features
df_example.describe()

In [None]:
# default figure size (width, height) for all following matplotlib plots
plt.rc('figure', figsize=(7, 4))

In [None]:
# class balance: number of flows per label
ax = df_example['label'].value_counts().plot.bar()
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [None]:
# histogram of the entropy (bits per byte of the data fields within the flow; ranges from 0 to 8)
ax = df_example['entropy'].plot.hist(bins=100)
ax.set_title('Entropy')
ax.grid()
plt.show()

In [None]:
# histogram of the total entropy (in bytes, over all bytes in the data fields of the flow)
ax = df_example['total_entropy'].plot.hist(bins=100)
ax.set_title('Total Entropy')
ax.grid()
plt.show()

This is not really helpful. Let's remove the zeroes and plot in log scale:

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
total_entropy_pos = df_example.loc[df_example['total_entropy'] > 0, 'total_entropy']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(total_entropy_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(Total Entropy) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# histogram of the flow duration (microsecond precision)
ax = df_example['duration'].plot.hist(bins=100)
ax.set_title('duration')
ax.grid()
plt.show()

Again, a log plot of the non-zero values provides much more detail:

In [None]:
# keep strictly positive durations only (log10 is undefined at zero)
duration_pos = df_example.loc[df_example['duration'] > 0, 'duration']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(duration_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(duration) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# histogram of the flow start time (seconds since the epoch)
ax = df_example['time_start'].plot.hist(bins=100)
ax.set_title('time_start')
ax.grid()
plt.show()

In [None]:
# histogram of the flow end time (seconds since the epoch)
ax = df_example['time_end'].plot.hist(bins=100)
ax.set_title('time_end')
ax.grid()
plt.show()

In [None]:
# end time against start time, one semi-transparent point per flow
fig, ax = plt.subplots()
ax.scatter(df_example['time_start'], df_example['time_end'], alpha=0.1)
ax.set_xlabel('time_start')
ax.set_ylabel('time_end')
ax.set_title('time_end vs time_start')
ax.grid()
plt.show()

In [None]:
# histogram of the mean inter-packet arrival time of the flow ("mean_ipt" in the data description)
ax = df_example['avg_ipt'].plot.hist(bins=100)
ax.set_title('avg_ipt')
ax.grid()
plt.show()

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
avg_ipt_pos = df_example.loc[df_example['avg_ipt'] > 0, 'avg_ipt']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(avg_ipt_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(avg_ipt) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# histogram of the number of bytes transmitted from source to destination
ax = df_example['bytes_in'].plot.hist(bins=100)
ax.set_title('bytes_in')
ax.grid()
plt.show()

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
bytes_in_pos = df_example.loc[df_example['bytes_in'] > 0, 'bytes_in']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(bytes_in_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(bytes_in) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# histogram of the number of bytes transmitted from destination to source
ax = df_example['bytes_out'].plot.hist(bins=100)
ax.set_title('bytes_out')
ax.grid()
plt.show()

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
bytes_out_pos = df_example.loc[df_example['bytes_out'] > 0, 'bytes_out']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(bytes_out_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(bytes_out) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# bytes_out against bytes_in on a square canvas, one nearly transparent point per flow
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_example['bytes_in'], df_example['bytes_out'], alpha=0.02)
ax.set_xlabel('bytes_in')
ax.set_ylabel('bytes_out')
ax.set_title('bytes_out vs bytes_in')
ax.grid()
plt.show()

In [None]:
# histogram of the packet count from source to destination
ax = df_example['num_pkts_in'].plot.hist(bins=100)
ax.set_title('num_pkts_in')
ax.grid()
plt.show()

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
num_pkts_in_pos = df_example.loc[df_example['num_pkts_in'] > 0, 'num_pkts_in']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(num_pkts_in_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(num_pkts_in) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# histogram of the packet count from destination to source
ax = df_example['num_pkts_out'].plot.hist(bins=100)
ax.set_title('num_pkts_out')
ax.grid()
plt.show()

In [None]:
# keep strictly positive values only (log10 is undefined at zero)
num_pkts_out_pos = df_example.loc[df_example['num_pkts_out'] > 0, 'num_pkts_out']
# histogram on a log10 scale
fig, ax = plt.subplots()
ax.hist(np.log10(num_pkts_out_pos), bins=100)
ax.set_ylabel('Frequency')
ax.set_title('log10(num_pkts_out) - zeroes excluded')
ax.grid()
plt.show()

In [None]:
# num_pkts_out against num_pkts_in on a square canvas, one semi-transparent point per flow
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_example['num_pkts_in'], df_example['num_pkts_out'], alpha=0.05)
ax.set_xlabel('num_pkts_in')
ax.set_ylabel('num_pkts_out')
ax.set_title('num_pkts_out vs num_pkts_in')
ax.grid()
plt.show()

In [None]:
# number of flows per protocol number (e.g. TCP is 6)
ax = df_example['proto'].value_counts().plot.bar()
ax.set_title('proto')
ax.grid()
plt.show()

In [None]:
# contingency table: protocol (rows) against label (columns)
pd.crosstab(index=df_example['proto'], columns=df_example['label'])

In [None]:
# graphical version: mosaic plot of protocol vs label
# rc_context applies the wider figure size only inside the block and restores
# the previous settings afterwards, even if plotting raises an exception.
with plt.rc_context({'figure.figsize': (14, 6)}):
    mosaic(df_example, ['proto','label'])
    plt.show()

In [None]:
# number of flows per (anonymized) source IP
df_example['src_ip'].value_counts()

In [None]:
# source IP plot: the ten most frequent source addresses.
# value_counts() sorts by frequency; .iloc makes the positional "first ten"
# selection explicit, as the port plots in this notebook already do.
df_example.src_ip.value_counts().iloc[0:10].plot(kind='bar')
plt.ylabel('Frequency')
plt.title('src_ip - Top 10')
plt.grid()
plt.show()

In [None]:
# number of flows per (anonymized) destination IP
df_example['dest_ip'].value_counts()

In [None]:
# destination IP plot: the ten most frequent destination addresses.
# value_counts() sorts by frequency; .iloc makes the positional "first ten"
# selection explicit, as the port plots in this notebook already do.
df_example.dest_ip.value_counts().iloc[0:10].plot(kind='bar')
plt.ylabel('Frequency')
plt.title('dest_ip - Top 10')
plt.grid()
plt.show()

In [None]:
# destination IP against source IP on a square canvas
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_example['src_ip'], df_example['dest_ip'], alpha=0.1)
ax.set_xlabel('src_ip')
ax.set_ylabel('dest_ip')
ax.set_title('dest_ip vs src_ip')
ax.grid()
plt.show()

In [None]:
# same scatter plot, restricted to the range 0..5000 on both axes
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_example['src_ip'], df_example['dest_ip'], alpha=0.1)
ax.set_xlim(0, 5000)
ax.set_ylim(0, 5000)
ax.set_xlabel('src_ip')
ax.set_ylabel('dest_ip')
ax.set_title('dest_ip vs src_ip')
ax.grid()
plt.show()

In [None]:
# most frequent IP pairs: build a "src >> dest" key per flow and count it
# NOTE(review): this adds the helper column 'IP_pair' to df_example, so it is visible to all later cells.
df_example['IP_pair'] = df_example.src_ip.astype(str) + ' >> ' + df_example.dest_ip.astype(str)
# top 20 by frequency; .iloc makes the positional selection explicit
df_example.IP_pair.value_counts().iloc[0:20]

In [None]:
# interactive treemap: source IPs on the first level, destination IPs nested below
fig = px.treemap(
    df_example,
    path=['src_ip', 'dest_ip'],
    title='Source IP => Destination IP',
    width=1000,
    height=800,
)
fig.show()

In [None]:
# number of flows per source port
df_example['src_port'].value_counts()

In [None]:
# bar chart of the ten most frequent source ports
ax = df_example['src_port'].value_counts().iloc[0:10].plot.bar()
ax.set_ylabel('Frequency')
ax.set_title('src_port - Top 10')
ax.grid()
plt.show()

In [None]:
# number of flows per destination port
df_example['dest_port'].value_counts()

In [None]:
# bar chart of the ten most frequent destination ports
ax = df_example['dest_port'].value_counts().iloc[0:10].plot.bar()
ax.set_ylabel('Frequency')
ax.set_title('dest_port - Top 10')
ax.grid()
plt.show()

In [None]:
# destination port against source port on a square canvas
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_example['src_port'], df_example['dest_port'], alpha=0.05)
ax.set_xlabel('src_port')
ax.set_ylabel('dest_port')
ax.set_title('dest_port vs src_port')
ax.grid()
plt.show()

In [None]:
# most frequent port pairs: build a "src >> dest" key per flow and count it
# NOTE(review): this adds the helper column 'port_pair' to df_example, so it is visible to all later cells.
df_example['port_pair'] = df_example.src_port.astype(str) + ' >> ' + df_example.dest_port.astype(str)
# top 20 by frequency; .iloc makes the positional selection explicit
df_example.port_pair.value_counts().iloc[0:20]

## PCA (Principal Component Analysis)

In [None]:
# Import PCA and the scaler used to standardise its input
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Select the numeric features for PCA.
# Rows with missing values are dropped because PCA cannot be fitted on NaNs.
pca_features = df_example.select_dtypes(include=['float64', 'int64']).dropna()

# Standardise every feature to zero mean and unit variance. PCA is driven by
# variance, so without this step the columns with the largest numeric scale
# would dominate both components.
pca_features_scaled = StandardScaler().fit_transform(pca_features)

# Initialize PCA with desired number of components
pca = PCA(n_components=2)  # You can adjust the number of components as needed

# Fit PCA and project the data in a single step
pca_result = pca.fit_transform(pca_features_scaled)

# Create a DataFrame to visualize PCA results; keep the row index of the
# input so each projected point can be traced back to its row in df_example
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'], index=pca_features.index)

# Plot PCA results
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.5)
plt.title('Principal Component Analysis (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid()
plt.show()


NameError: name 'df_example' is not defined

# Fit a predictive model

In [None]:
# model inputs: every column of df_example except the target 'label'
# NOTE(review): this also picks up helper columns added to df_example during
# exploration (e.g. 'IP_pair', 'port_pair') - confirm that this is intended.
features = df_example.columns.drop('label').tolist()
print(features)

In [None]:
# start H2O, requesting a 12G memory limit and 4 threads
# NOTE(review): both values are hard-coded - adjust to the machine the notebook runs on.
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# convert the pandas frame into an H2OFrame and report how long the conversion took
t1 = time.time()
df_hex = h2o.H2OFrame(df_example)
t2 = time.time()
elapsed = t2 - t1
print('Elapsed time [s]: ', np.round(elapsed, 2))

In [None]:
# define target: name of the column the model is trained to predict
target = 'label'
# explicitly convert target to categorical => classification problem
df_hex[target] = df_hex[target].asfactor()

In [None]:
# train / test split (80/20); the fixed seed keeps the split reproducible
train_hex, test_hex = df_hex.split_frame(ratios=[0.8], seed=999)

In [None]:
# hyper-parameters of the (distributed) random forest, collected in one place
drf_params = {
    'ntrees': 10,
    'max_depth': 5,
    'min_rows': 10,
    'nfolds': 5,
    'seed': 999,
}
# model definition only - training happens in the next cell
fit_DRF = H2ORandomForestEstimator(**drf_params)

In [None]:
# fit the random forest on the training frame and report the wall-clock training time
t1 = time.time()
fit_DRF.train(x=features, y=target, training_frame=train_hex)
t2 = time.time()
elapsed = t2 - t1
print('Elapsed time [s]: ', np.round(elapsed, 2))

In [None]:
# show training scoring history of the fitted random forest
fit_DRF.plot()

In [None]:
# variable importance plot of the fitted random forest
fit_DRF.varimp_plot()

# > Evaluate performance

In [None]:
# performance on training data; the metrics object is displayed as the cell output
perf_train = fit_DRF.model_performance(train=True)
perf_train

In [None]:
# cross validation metrics summary (the model was defined with nfolds=5)
fit_DRF.cross_validation_metrics_summary()

# Predict on test set

In [None]:
# calc predictions of the fitted model on the held-out test frame
pred_test = fit_DRF.predict(test_hex)

In [None]:
# add actual target (the 'label' column of the test frame) next to the predictions
pred_test['target'] = test_hex['label']
# and convert to pandas data frame
# NOTE(review): pred_test is rebound from the H2O prediction frame to a pandas frame here,
# so this cell is presumably not safe to re-run without re-running the prediction cell first.
pred_test = pred_test.as_data_frame()

In [None]:
# show the last 100 rows; the numeric values are the predicted probabilities for the 3 classes
pred_test.tail(100)

In [None]:
# confusion matrix: predicted class (rows) against actual class (columns)
pd.crosstab(index=pred_test['predict'], columns=pred_test['target'])

In [None]:
# Save the trained model into the relative folder "saved_model"
# (force=True is passed so the save is not blocked by an existing file - presumably overwrite; confirm in the H2O docs)
model_path = "saved_model"
h2o.save_model(model=fit_DRF, path=model_path, force=True)