Preprocessing file for data cleaning and transformation before analysis.
This notebook includes code for:
1. Data loading and cleaning
2. Graph Construction
3. Complex Networks Measures Computation
4. Adding Complex Networks Features to the dataframe

Importing Libraries

In [1]:
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os

# datasets is a list of available datasets descriptions containing: path, key columns names, and suitable complex network features
from src.data.dataset_info import datasets

## 1. Data loading and cleaning

In [2]:
folder_path = 'datasets/'

# List of file paths
clients_paths = [
    folder_path + "client_0.parquet",
    folder_path + "client_1.parquet",
    folder_path + "client_2.parquet",
    folder_path + "client_3.parquet",
    folder_path + "client_4.parquet",
    folder_path + "client_5.parquet",
    folder_path + "client_6.parquet",
    folder_path + "client_7.parquet",
    folder_path + "test.parquet"
]

In [3]:
dataset = datasets[0]
#name = dataset.name
name = "test"
path = clients_paths[8]
#name = dataset.name
print("dataset: {}".format(name))
new_path = "datasets\\preprocessed\\{}.parquet".format(name)
graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)
df = pd.read_parquet(path)

dataset: test


In [4]:
print(dataset.cn_measures)

['betweenness', 'global_betweenness', 'degree', 'global_degree', 'eigenvector', 'closeness', 'pagerank', 'global_pagerank', 'k_core', 'k_truss', 'mv']


### Dropping infinity values, Nan values, and duplicates

In [5]:
# converting all infinity values into nan then dropping all records containing nan values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)

df.drop_duplicates(subset=list(set(df.columns) - set([dataset.timestamp_col, dataset.flow_id_col])), keep="first", inplace=True)

#df.to_pickle(new_path)
#print(f"DataFrame written to {pickle_filename}")



### Dataset Properties

calculating main dataset properties and saving them in a json file

In [6]:
total_count = len(df)
print(name)
properties = {
    "name": name,
    "length": total_count,
}

num_benign = len(df[df['Label'] == 0])
num_attack = len(df[df['Label'] == 1])

properties["num_benign"] = num_benign
properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df["Attack"].unique())  # .to_list()

filename = ('datasets\\datasets_properties\\{}.json'.format(name))
if not os.path.exists(filename):
    with open(filename, 'w') as file:
        json.dump({}, file, indent=4)  
    print(f"File {filename} created.")
else:
    print(f"File {filename} already exists.")

outfile = open(filename, 'w')
outfile.writelines(json.dumps(properties))
outfile.close()

test
File datasets\datasets_properties\test.json already exists.


In [7]:
null_count = df['Label'].isnull().sum()

print(f"Number of null values in 'Label' column: {null_count}")

Number of null values in 'Label' column: 0


## 2. Graph Construction

Graph construction from the records in the dataset.<br>
Nodes are specified by IP addresses. <br>
If there exists atleast one network flow between two different IP addresses, an edge will be created. <br>
Another way can be considered is to use MultiDiGraph class. However, some centralities will not work in addition to transitivity.

In [8]:
G = nx.from_pandas_edgelist(
        df,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        create_using=nx.DiGraph()
    )

In [9]:
G.remove_nodes_from(list(nx.isolates(G)))

In [10]:
for node in G.nodes():
    G.nodes[node]['label'] = node

In [11]:
import igraph as ig
G1 = ig.Graph.from_networkx(G)


In [12]:
labels = [G.nodes[node]['label'] for node in G.nodes()]
G1.vs['label'] = labels


In [13]:
labels = [G.nodes[node]['label'] for node in G.nodes()]
G1.vs['label'] = labels
print("igraph graph vertex labels:")
for v in G1.vs:
    print(v.index, v['label'])

igraph graph vertex labels:
0 192.168.1.152
1 192.168.1.39
2 192.168.1.190
3 192.168.1.35
4 192.168.1.195
5 176.28.50.165
6 192.168.1.1
7 192.168.1.32
8 192.168.1.79
9 192.168.1.193
10 192.168.1.33
11 232.71.192.168
12 1.169.74.125
13 141.65.192.168
14 1.30.192.168
15 52.28.231.150
16 196.117.10.246
17 69.151.192.168
18 18.194.169.124
19 114.237.10.246
20 192.168.1.36
21 38.205.192.168
22 1.31.192.168
23 210.113.192.168
24 1.38.192.168
25 192.113.192.168
26 196.244.192.168
27 1.169.216.58
28 98.244.192.168
29 1.1.192.168
30 47.114.192.168
31 1.184.192.168
32 44.33.192.168
33 173.203.192.168
34 236.212.192.168
35 192.168.1.37
36 55.246.192.168
37 179.88.10.246
38 206.63.216.58
39 196.142.192.168
40 80.60.216.58
41 137.152.10.246
42 22.178.192.168
43 69.214.192.168
44 1.152.192.168
45 204.72.192.168
46 1.32.192.168
47 113.249.192.168
48 214.178.192.168
49 183.30.192.168
50 38.81.192.168
51 1.43.192.168
52 104.81.192.168
53 226.7.192.168
54 192.168.1.133
55 224.0.0.251
56 3.62.192.168
57 

Specifying the communities in the graph using the methods get_communities. <br>
Since communities can be calculated using different methods, and we want to use get communites at different stages of the code, we implemented it in a separate file, so a change will be done one time.

In [14]:
#import igraph as ig
#G1 = ig.Graph.from_networkx(G)
part = G1.community_infomap()

print("Startttt")
communities = []
for com in part:
    communities.append([G1.vs[node_index]['label'] for node_index in com])

print(f"==>> number of communities: {len(communities)}")
for com in communities:
    print(f"==>> com: {len(com)}")

Startttt
==>> number of communities: 133
==>> com: 31
==>> com: 303
==>> com: 49
==>> com: 46
==>> com: 210
==>> com: 4919
==>> com: 1341
==>> com: 4066
==>> com: 3425
==>> com: 541
==>> com: 2438
==>> com: 541
==>> com: 968
==>> com: 150
==>> com: 1623
==>> com: 2
==>> com: 6
==>> com: 5
==>> com: 662
==>> com: 31
==>> com: 10
==>> com: 7
==>> com: 41
==>> com: 211
==>> com: 199
==>> com: 19
==>> com: 37
==>> com: 173
==>> com: 222
==>> com: 5
==>> com: 16
==>> com: 2
==>> com: 2
==>> com: 52
==>> com: 5
==>> com: 10
==>> com: 2
==>> com: 2
==>> com: 28
==>> com: 75
==>> com: 2
==>> com: 8
==>> com: 98
==>> com: 85
==>> com: 10
==>> com: 26
==>> com: 2
==>> com: 36
==>> com: 11
==>> com: 2
==>> com: 3
==>> com: 5
==>> com: 24
==>> com: 3
==>> com: 11
==>> com: 57
==>> com: 22
==>> com: 28
==>> com: 34
==>> com: 3
==>> com: 4
==>> com: 2
==>> com: 10
==>> com: 7
==>> com: 10
==>> com: 2
==>> com: 2
==>> com: 20
==>> com: 4
==>> com: 2
==>> com: 14
==>> com: 3
==>> com: 15
==>> com: 2
=

In [15]:
for node in G.nodes():
    G.nodes[node]['label'] = f"{node}"
print("NetworkX graph node labels:")
for node, data in G.nodes(data=True):
    print(node, data)

NetworkX graph node labels:
192.168.1.152 {'label': '192.168.1.152'}
192.168.1.39 {'label': '192.168.1.39'}
192.168.1.190 {'label': '192.168.1.190'}
192.168.1.35 {'label': '192.168.1.35'}
192.168.1.195 {'label': '192.168.1.195'}
176.28.50.165 {'label': '176.28.50.165'}
192.168.1.1 {'label': '192.168.1.1'}
192.168.1.32 {'label': '192.168.1.32'}
192.168.1.79 {'label': '192.168.1.79'}
192.168.1.193 {'label': '192.168.1.193'}
192.168.1.33 {'label': '192.168.1.33'}
232.71.192.168 {'label': '232.71.192.168'}
1.169.74.125 {'label': '1.169.74.125'}
141.65.192.168 {'label': '141.65.192.168'}
1.30.192.168 {'label': '1.30.192.168'}
52.28.231.150 {'label': '52.28.231.150'}
196.117.10.246 {'label': '196.117.10.246'}
69.151.192.168 {'label': '69.151.192.168'}
18.194.169.124 {'label': '18.194.169.124'}
114.237.10.246 {'label': '114.237.10.246'}
192.168.1.36 {'label': '192.168.1.36'}
38.205.192.168 {'label': '38.205.192.168'}
1.31.192.168 {'label': '1.31.192.168'}
210.113.192.168 {'label': '210.113.19

172.219.192.168 {'label': '172.219.192.168'}
246.188.192.168 {'label': '246.188.192.168'}
246.31.192.168 {'label': '246.31.192.168'}
193.90.192.168 {'label': '193.90.192.168'}
8.107.192.168 {'label': '8.107.192.168'}
33.95.192.168 {'label': '33.95.192.168'}
199.85.192.168 {'label': '199.85.192.168'}
127.126.192.168 {'label': '127.126.192.168'}
211.159.192.168 {'label': '211.159.192.168'}
45.10.74.125 {'label': '45.10.74.125'}
170.51.10.246 {'label': '170.51.10.246'}
128.1.192.168 {'label': '128.1.192.168'}
60.59.192.168 {'label': '60.59.192.168'}
81.146.10.246 {'label': '81.146.10.246'}
191.94.192.168 {'label': '191.94.192.168'}
212.7.192.168 {'label': '212.7.192.168'}
118.59.192.168 {'label': '118.59.192.168'}
0.67.74.125 {'label': '0.67.74.125'}
89.37.192.168 {'label': '89.37.192.168'}
48.66.192.168 {'label': '48.66.192.168'}
124.246.10.246 {'label': '124.246.10.246'}
141.86.192.168 {'label': '141.86.192.168'}
129.140.10.246 {'label': '129.140.10.246'}
68.119.192.168 {'label': '68.11

52.84.40.166 {'label': '52.84.40.166'}
104.88.43.91 {'label': '104.88.43.91'}
23.210.84.153 {'label': '23.210.84.153'}
151.80.24.232 {'label': '151.80.24.232'}
69.31.33.120 {'label': '69.31.33.120'}
104.94.252.68 {'label': '104.94.252.68'}
151.101.64.68 {'label': '151.101.64.68'}
104.25.215.101 {'label': '104.25.215.101'}
52.84.145.200 {'label': '52.84.145.200'}
54.167.238.223 {'label': '54.167.238.223'}
74.119.118.85 {'label': '74.119.118.85'}
74.217.63.60 {'label': '74.217.63.60'}
52.84.26.189 {'label': '52.84.26.189'}
200.147.4.55 {'label': '200.147.4.55'}
40.77.226.220 {'label': '40.77.226.220'}
52.6.174.16 {'label': '52.6.174.16'}
23.203.29.206 {'label': '23.203.29.206'}
46.105.202.39 {'label': '46.105.202.39'}
209.86.62.44 {'label': '209.86.62.44'}
139.162.41.250 {'label': '139.162.41.250'}
162.248.16.31 {'label': '162.248.16.31'}
54.192.48.6 {'label': '54.192.48.6'}
96.16.195.81 {'label': '96.16.195.81'}
31.13.69.202 {'label': '31.13.69.202'}
85.14.248.91 {'label': '85.14.248.91

## 3. Complex Networks Measures Computation

### 3.1. Computing Graph-level Measures

In [16]:
properties = {}

properties["number_of_nodes"] = G.number_of_nodes()
properties["number_of_edges"] = G.number_of_edges()

degrees = [degree for _, degree in G.degree()]
properties["max_degree"] = max(degrees)
properties["avg_degree"] = sum(degrees) / len(degrees)

In [17]:
properties["transitivity"] = nx.transitivity(G)

In [18]:
properties["density"] =  nx.density(G)

In [19]:
# Assuming G is your graph and communities is a list of sets, where each set contains the nodes in a community

# Step 1: Map each node to its community
node_to_community = {}
for community_index, community in enumerate(communities):
    for node in community:
        node_to_community[node] = community_index

# Step 2: Count inter-cluster edges efficiently
inter_cluster_edges = 0
for u, v in G.edges():
    # Directly check if u and v belong to different communities
    if node_to_community[u] != node_to_community[v]:
        inter_cluster_edges += 1


properties["mixing_parameter"] = inter_cluster_edges / G.number_of_edges()

In [20]:
properties["modularity"] = nx.community.modularity(G, communities)

In [21]:
filename = ('datasets\\datasets_properties\\{}.json'.format("graph_" + name))
outfile = open(filename, 'w')
outfile.writelines(json.dumps(properties))
outfile.close()

properties

{'number_of_nodes': 33432,
 'number_of_edges': 53002,
 'max_degree': 5593,
 'avg_degree': 3.1707346255084947,
 'transitivity': 0.0010088632917112656,
 'density': 4.742207270958833e-05,
 'mixing_parameter': 0.3169503037621222,
 'modularity': 0.6311852677815911}

Using the graph-level metrics, suitable complex networks should be specified and added to the corresponding dataset in the list in src.data.dataset_info

### 3.2. Computing Node-level Measures

In [22]:
community_labels = {}
for i, community in enumerate(communities):
    for node in community:
        community_labels[node] = i

nx.set_node_attributes(G, community_labels, "new_community")

In [23]:
# getting inter and itra graph, to calculate the local and global variations of each centrality
from src.network.network_features import separate_graph

intra_graph, inter_graph = separate_graph(G, communities)

In [24]:
from src.network.network_features import cal_betweenness_centrality

if "betweenness" in dataset.cn_measures:
    nx.set_node_attributes(G, cal_betweenness_centrality(G), "betweenness")
    print("calculated")

calculated


In [25]:
print(dataset.cn_measures)

['betweenness', 'global_betweenness', 'degree', 'global_degree', 'eigenvector', 'closeness', 'pagerank', 'global_pagerank', 'k_core', 'k_truss', 'mv']


In [26]:
if "local_betweenness" in dataset.cn_measures:
    nx.set_node_attributes(G, cal_betweenness_centrality(intra_graph), "local_betweenness")
    print("calculated")

In [27]:
if "global_betweenness" in dataset.cn_measures:
    nx.set_node_attributes(G, cal_betweenness_centrality(inter_graph), "global_betweenness")
    print("calculated")

calculated


In [28]:
if "degree" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.degree_centrality(G), "degree")
    print("calculated")

calculated


In [29]:
if "local_degree" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.degree_centrality(intra_graph), "local_degree")
    print("calculated")

In [30]:
if "global_degree" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.degree_centrality(inter_graph), "global_degree")
    print("calculated")

calculated


In [31]:
if "eigenvector" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.eigenvector_centrality(G, max_iter=600), "eigenvector")
    print("calculated")


calculated


In [32]:
if "local_eigenvector" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.eigenvector_centrality(intra_graph), "local_eigenvector")
    print("calculated")

In [33]:
if "global_eigenvector" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.eigenvector_centrality(inter_graph), "global_eigenvector")
    print("calculated")

In [34]:
if "closeness" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.closeness_centrality(G), "closeness")
    print("calculated")

calculated


In [35]:
if "local_closeness" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.closeness_centrality(intra_graph), "local_closeness")
    print("calculated")

In [36]:
if "global_closeness" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.closeness_centrality(inter_graph), "global_closeness")
    print("calculated")

In [37]:
if "pagerank" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.pagerank(G, alpha=0.85), "pagerank")
    print("calculated")

calculated


In [38]:
if "local_pagerank" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.pagerank(intra_graph, alpha=0.85), "local_pagerank")
    print("calculated")

In [39]:
if "global_pagerank" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.pagerank(inter_graph, alpha=0.85), "global_pagerank")
    print("calculated")

calculated


In [40]:
from src.network.network_features import cal_k_core

if "k_core" in dataset.cn_measures:
    nx.set_node_attributes(G, cal_k_core(G), "k_core")
    print("calculated")

calculated


In [41]:
from src.network.network_features import cal_k_truss
if "k_truss" in dataset.cn_measures:
    nx.set_node_attributes(G, cal_k_truss(G), "k_truss")
    print("calculated")

calculated


In [42]:
from src.network.CommCentralityCode import comm_centreality

if "Comm" in dataset.cn_measures:
    nx.set_node_attributes(G, comm_centreality(G, community_labels), "Comm")
    print("calculated")

In [43]:
from src.network.modularity_vitality import modularity_vitality

if "mv" in dataset.cn_measures:
    nx.set_node_attributes(G, modularity_vitality(G1, part), "mv")
    print("calculated")

calculated


In [44]:
nx.write_gexf(G, graph_path)

## 4. Adding Complex Networks Features to the dataframe

In [45]:
features_dicts = {}
for measure in dataset.cn_measures:
    features_dicts[measure] = nx.get_node_attributes(G, measure)
    print(f"==>> features_dicts: {measure , len(features_dicts[measure])}")
    
for feature in dataset.network_features:
        if feature[:3] == "src":
            df[feature] = df.apply(
                lambda row: features_dicts[feature[4:]].get(row[dataset.src_ip_col], -1), axis=1)
        if feature[:3] == "dst":
            df[feature] = df.apply(
                lambda row: features_dicts[feature[4:]].get(row[dataset.dst_ip_col], -1), axis=1)

==>> features_dicts: ('betweenness', 33432)
==>> features_dicts: ('global_betweenness', 33432)
==>> features_dicts: ('degree', 33432)
==>> features_dicts: ('global_degree', 33432)
==>> features_dicts: ('eigenvector', 33432)
==>> features_dicts: ('closeness', 33432)
==>> features_dicts: ('pagerank', 33432)
==>> features_dicts: ('global_pagerank', 33432)
==>> features_dicts: ('k_core', 33432)
==>> features_dicts: ('k_truss', 33432)
==>> features_dicts: ('mv', 33432)


In [46]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,src_closeness,dst_closeness,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
117388,192.168.1.152-192.168.1.39-80-53084-6,192.168.1.152,80.0,192.168.1.39,53084.0,6.0,27/04/2019 07:53:48 pm,24711.0,1.0,1.0,...,0.031511,0.031493,0.000112,7.3e-05,0.409091,0.409091,0.011594,0.007246,0.000163,0.000243
488576,192.168.1.190-192.168.1.35-80-41424-6,192.168.1.190,80.0,192.168.1.35,41424.0,6.0,27/04/2019 05:27:04 pm,2544.0,1.0,1.0,...,0.031543,0.031501,0.000365,0.000102,0.409091,0.409091,0.007246,0.008696,0.002384,0.000264
1497977,192.168.1.35-192.168.1.195-45286-80-6,192.168.1.35,45286.0,192.168.1.195,80.0,6.0,27/04/2019 09:10:57 pm,84266.0,12.0,5.0,...,0.031501,0.038396,0.000102,0.000353,0.409091,0.409091,0.008696,0.011594,0.000264,0.000318
70721,192.168.1.39-176.28.50.165-54388-80-6,192.168.1.39,54388.0,176.28.50.165,80.0,6.0,27/04/2019 10:30:53 pm,763666.0,6.0,4.0,...,0.031493,0.026637,7.3e-05,1.4e-05,0.409091,0.363636,0.007246,0.002899,0.000243,0.000236
2497727,192.168.1.195-192.168.1.1-51530-41952-6,192.168.1.195,51530.0,192.168.1.1,41952.0,6.0,29/04/2019 12:40:00 am,186.0,2.0,0.0,...,0.038396,0.031504,0.000353,0.000232,0.409091,0.409091,0.011594,0.011594,0.000318,0.000127


In [47]:
pd.DataFrame.to_parquet(df, new_path)

In [48]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,src_closeness,dst_closeness,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
117388,192.168.1.152-192.168.1.39-80-53084-6,192.168.1.152,80.0,192.168.1.39,53084.0,6.0,27/04/2019 07:53:48 pm,24711.0,1.0,1.0,...,0.031511,0.031493,0.000112,7.3e-05,0.409091,0.409091,0.011594,0.007246,0.000163,0.000243
488576,192.168.1.190-192.168.1.35-80-41424-6,192.168.1.190,80.0,192.168.1.35,41424.0,6.0,27/04/2019 05:27:04 pm,2544.0,1.0,1.0,...,0.031543,0.031501,0.000365,0.000102,0.409091,0.409091,0.007246,0.008696,0.002384,0.000264
1497977,192.168.1.35-192.168.1.195-45286-80-6,192.168.1.35,45286.0,192.168.1.195,80.0,6.0,27/04/2019 09:10:57 pm,84266.0,12.0,5.0,...,0.031501,0.038396,0.000102,0.000353,0.409091,0.409091,0.008696,0.011594,0.000264,0.000318
70721,192.168.1.39-176.28.50.165-54388-80-6,192.168.1.39,54388.0,176.28.50.165,80.0,6.0,27/04/2019 10:30:53 pm,763666.0,6.0,4.0,...,0.031493,0.026637,7.3e-05,1.4e-05,0.409091,0.363636,0.007246,0.002899,0.000243,0.000236
2497727,192.168.1.195-192.168.1.1-51530-41952-6,192.168.1.195,51530.0,192.168.1.1,41952.0,6.0,29/04/2019 12:40:00 am,186.0,2.0,0.0,...,0.038396,0.031504,0.000353,0.000232,0.409091,0.409091,0.011594,0.011594,0.000318,0.000127


In [49]:
df.head(1)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,src_closeness,dst_closeness,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
117388,192.168.1.152-192.168.1.39-80-53084-6,192.168.1.152,80.0,192.168.1.39,53084.0,6.0,27/04/2019 07:53:48 pm,24711.0,1.0,1.0,...,0.031511,0.031493,0.000112,7.3e-05,0.409091,0.409091,0.011594,0.007246,0.000163,0.000243


In [50]:
df.shape[:2]

(470076, 113)

In [51]:
df.describe()

Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,src_closeness,dst_closeness,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
count,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,...,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0,470076.0
mean,34115.775783,11814.633991,7.661057,8727362.0,7.908951,7.591823,696.4152,11214.74,331.442892,10.36078,...,0.042234,0.050573,0.003028,0.001692,0.503603,0.515815,0.127218,0.046436,-0.001195,0.000601
std,23487.866649,20633.457119,4.619705,145192600.0,736.919906,988.093106,7709.144,2159832.0,619.577049,50.461952,...,0.021199,0.026799,0.006726,0.005347,0.271702,0.302933,0.296336,0.162729,0.003728,0.0044
min,0.0,0.0,0.0,-45625400000.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3e-05,1.4e-05,1.4e-05,0.045455,0.045455,0.002899,0.002899,-0.013078,-0.013078
25%,123.0,80.0,6.0,271.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.031493,0.031501,7.3e-05,7.5e-05,0.409091,0.409091,0.007246,0.007246,0.000107,0.000126
50%,42380.0,80.0,6.0,30311.0,2.0,2.0,70.0,96.0,37.0,0.0,...,0.031501,0.038396,0.000102,0.000225,0.409091,0.409091,0.008696,0.011594,0.000236,0.00018
75%,53422.0,9197.0,6.0,707042.2,5.0,4.0,401.0,1171.0,377.0,2.0,...,0.058163,0.07644,0.000353,0.000365,0.409091,0.954545,0.011594,0.017391,0.000264,0.00027
max,65534.0,65531.0,17.0,120000000.0,218658.0,291260.0,2796955.0,641001400.0,23360.0,2293.0,...,0.085582,0.167305,0.022533,0.030692,1.0,1.0,1.0,1.0,0.004587,0.028604


In [52]:
null_counts = df.isnull().sum()
print(null_counts)

Flow ID        0
Src IP         0
Src Port       0
Dst IP         0
Dst Port       0
              ..
dst_k_core     0
src_k_truss    0
dst_k_truss    0
src_mv         0
dst_mv         0
Length: 113, dtype: int64


In [53]:
pd.set_option('display.max_columns', None)
pd.option_context('display.max_rows', None, 'display.max_columns', None)

<pandas._config.config.option_context at 0x20985420790>

In [54]:
df.info

<bound method DataFrame.info of                                            Flow ID         Src IP  Src Port  \
117388       192.168.1.152-192.168.1.39-80-53084-6  192.168.1.152      80.0   
488576       192.168.1.190-192.168.1.35-80-41424-6  192.168.1.190      80.0   
1497977      192.168.1.35-192.168.1.195-45286-80-6   192.168.1.35   45286.0   
70721        192.168.1.39-176.28.50.165-54388-80-6   192.168.1.39   54388.0   
2497727    192.168.1.195-192.168.1.1-51530-41952-6  192.168.1.195   51530.0   
...                                            ...            ...       ...   
1610657  173.241.242.220-192.168.10.12-443-36564-6  192.168.10.12   36564.0   
1454376        172.16.0.1-192.168.10.50-59926-80-6     172.16.0.1   59926.0   
878714     172.217.12.142-192.168.10.15-80-50204-6  192.168.10.15   50204.0   
1095213        172.16.0.1-192.168.10.50-36996-80-6     172.16.0.1   36996.0   
722237         172.16.0.1-192.168.10.50-55724-80-6     172.16.0.1   55724.0   

                  D