In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
!unzip drive/MyDrive/CP2/archive.zip -d data/

Archive:  drive/MyDrive/CP2/archive.zip
  inflating: data/Dataset-Unicauca-Version2-87Atts.csv  


In [23]:
# Read the extracted flow records; the file is comma-separated text.
df = pd.read_csv(
    'data/Dataset-Unicauca-Version2-87Atts.csv',
    sep=',',
)

In [8]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(3577296, 87)

In [9]:
# Preview the first rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,Flow.ID,Source.IP,Source.Port,Destination.IP,Destination.Port,Protocol,Timestamp,Flow.Duration,Total.Fwd.Packets,Total.Backward.Packets,...,Active.Std,Active.Max,Active.Min,Idle.Mean,Idle.Std,Idle.Max,Idle.Min,Label,L7Protocol,ProtocolName
0,172.19.1.46-10.200.7.7-52422-3128-6,172.19.1.46,52422,10.200.7.7,3128,6,26/04/201711:11:17,45523,22,55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY
1,172.19.1.46-10.200.7.7-52422-3128-6,10.200.7.7,3128,172.19.1.46,52422,6,26/04/201711:11:17,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY
2,10.200.7.217-50.31.185.39-38848-80-6,50.31.185.39,80,10.200.7.217,38848,6,26/04/201711:11:17,1,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,7,HTTP
3,10.200.7.217-50.31.185.39-38848-80-6,50.31.185.39,80,10.200.7.217,38848,6,26/04/201711:11:17,217,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,7,HTTP
4,192.168.72.43-10.200.7.7-55961-3128-6,192.168.72.43,55961,10.200.7.7,3128,6,26/04/201711:11:17,78068,5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN,131,HTTP_PROXY


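There is a one-to-one mapping between L7Protocol and ProtocolName (the check below shows exactly one L7Protocol code per protocol name). Hence L7Protocol must be dropped before classifying ProtocolName; otherwise it would leak the target.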

In [10]:
# For every protocol name, count how many distinct L7Protocol codes occur with it.
l7_codes_by_name = df.groupby("ProtocolName")["L7Protocol"]
unique_L7Protocol_counts = l7_codes_by_name.nunique()
unique_L7Protocol_counts

ProtocolName
99TAXI            1
AMAZON            1
APPLE             1
APPLE_ICLOUD      1
APPLE_ITUNES      1
                 ..
WHOIS_DAS         1
WIKIPEDIA         1
WINDOWS_UPDATE    1
YAHOO             1
YOUTUBE           1
Name: L7Protocol, Length: 78, dtype: int64

Dropping Timestamp and Flow.ID, as they aren't used for our classification.

In [24]:
# Discard the two identifier-like columns by rebinding df to the reduced frame.
df = df.drop(columns=['Timestamp', 'Flow.ID'])

Dropping columns which have only a single value.

In [5]:
# Count distinct values per column, then collect the columns that hold exactly one.
distinct_per_column = df.nunique()
single_unique_cols = distinct_per_column[distinct_per_column == 1].index.tolist()
df = df.drop(columns=single_unique_cols)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map protocol names to integer class ids. fit_transform learns the mapping and
# applies it in one pass, so a separate .fit() beforehand is not needed.
# `encoder` is kept so the ids can be translated back via encoder.classes_ or
# encoder.inverse_transform.
encoder = LabelEncoder()
df['ProtocolName'] = encoder.fit_transform(df['ProtocolName'])

Removing rows whose Destination IP is a private address in the 10.0.0.0/8 range (addresses starting with `10.`)

In [7]:
# Keep only the flows whose destination address does not begin with "10.".
dest_in_ten_net = df['Destination.IP'].str.startswith("10.")
df = df[~dest_in_ten_net]

In [8]:
import ipaddress

# Define a function to convert IP addresses to integers
def ip_to_integer(ip):
    """Return the integer value of an IPv4 address.

    Parameters
    ----------
    ip : value accepted by ``ipaddress.IPv4Address``
        Typically a dotted-quad string such as ``"10.200.7.7"``.

    Returns
    -------
    int or None
        The address as an integer, or ``None`` when ``ip`` is not a valid
        IPv4 address.
    """
    try:
        parsed = ipaddress.IPv4Address(ip)
    except ipaddress.AddressValueError:
        # Invalid addresses are mapped to None rather than raising.
        return None
    return int(parsed)

Converting IPs to integers

In [9]:
# Replace both address columns with their integer form, destination first.
for ip_column in ('Destination.IP', 'Source.IP'):
    df[ip_column] = df[ip_column].apply(ip_to_integer)

In [17]:
# Tally how many rows carry each (label-encoded) ProtocolName value and
# display the tally.
value_counts = df['ProtocolName'].value_counts()
value_counts

25    605068
22    437261
58    359968
28    185384
77     84623
       ...  
60         1
52         1
55         1
0          1
41         1
Name: ProtocolName, Length: 69, dtype: int64

Dropping columns that are not used as features, including L7Protocol (which maps one-to-one onto the target ProtocolName)

In [10]:
# Columns excluded from the feature set, listed one per line for easier review.
columns_to_drop = [
    'Fwd.Packet.Length.Std',
    'Bwd.Packet.Length.Std',
    'Fwd.IAT.Std',
    'Bwd.IAT.Std',
    'Fwd.Header.Length',
    'Bwd.Header.Length',
    'Packet.Length.Std',
    'Packet.Length.Variance',
    'Avg.Fwd.Segment.Size',
    'Avg.Bwd.Segment.Size',
    'Fwd.Header.Length.1',
    'Subflow.Fwd.Packets',
    'Subflow.Fwd.Bytes',
    'Subflow.Bwd.Packets',
    'Subflow.Bwd.Bytes',
    'Init_Win_bytes_forward',
    'Init_Win_bytes_backward',
    'act_data_pkt_fwd',
    'min_seg_size_forward',
    'L7Protocol',
    'Flow.IAT.Std',
    'Min.Packet.Length',
    'Max.Packet.Length',
    'Active.Std',
    'Active.Max',
    'Active.Min',
    'Idle.Std',
    'Idle.Max',
    'Idle.Min',
]
df = df.drop(columns=columns_to_drop)

Splitting the dataset in two parts

1.   One containing the protocol names that appear 100,000 times or more
2.   Another containing the protocol names that appear at least 10,000 but fewer than 100,000 times (protocols with fewer than 10,000 occurrences are left out of both parts)



In [11]:
# Number of rows per (encoded) protocol label.
protocol_counts = df['ProtocolName'].value_counts()

# Labels occurring at least 100000 times, and the rows that carry them.
frequent_labels = protocol_counts[protocol_counts >= 100000].index
protocol_greater_equal_100000 = df[df['ProtocolName'].isin(frequent_labels)]

# Labels occurring from 10000 (inclusive) up to 100000 (exclusive) times.
in_mid_band = (protocol_counts >= 10000) & (protocol_counts < 100000)
mid_band_labels = protocol_counts[in_mid_band].index
protocol_less_than_100000 = df[df['ProtocolName'].isin(mid_band_labels)]

In [12]:
# Pull out the target column of each subset while it is still present in the frames.
y_ge = protocol_greater_equal_100000['ProtocolName']
y_lt = protocol_less_than_100000['ProtocolName']

In [13]:
# Strip the target from both subsets so only feature columns remain.
protocol_greater_equal_100000 = protocol_greater_equal_100000.drop('ProtocolName', axis=1)
protocol_less_than_100000 = protocol_less_than_100000.drop('ProtocolName', axis=1)

Splitting each part into train and test sets (80% / 20%)

In [14]:
from sklearn.metrics import explained_variance_score, accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the high-frequency subset; the fixed seed keeps the split repeatable.
x_ge_train, x_ge_test, y_ge_train, y_ge_test = train_test_split(
    protocol_greater_equal_100000,
    y_ge,
    test_size=0.2,
    random_state=3,
)

# Same 80/20 split and seed for the mid-frequency subset.
x_lt_train, x_lt_test, y_lt_train, y_lt_test = train_test_split(
    protocol_less_than_100000,
    y_lt,
    test_size=0.2,
    random_state=3,
)

# Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate tree depths for the subset of protocols with >= 100000 rows.
param_grid = {
    'max_depth': [14, 18, 22]
}

# Seed the tree so that tie-breaking between equally good splits, and
# therefore the reported scores, are repeatable across runs.
tree_classify = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over the depths, ranked by accuracy.
grid_search = GridSearchCV(tree_classify, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_ge_train, y_ge_train)

# Depth that achieved the best mean cross-validation accuracy.
best_max_depth = grid_search.best_params_['max_depth']
print("Best Hyperparameter (max_depth):", best_max_depth)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so best_estimator_ is ready to use without another fit.
best_model = grid_search.best_estimator_

# Predictions on the held-out and training rows.
ge_pred_test = best_model.predict(x_ge_test)
ge_pred_train = best_model.predict(x_ge_train)

# Accuracy on the training rows and on the held-out rows.
train_accuracy = best_model.score(x_ge_train, y_ge_train)
test_accuracy = best_model.score(x_ge_test, y_ge_test)

print("Train Accuracy for best model:", train_accuracy)
print("Test Accuracy for best model:", test_accuracy)

Best Hyperparameter (max_depth): 18
Train Accuracy for best model: 0.9333792073969566
Test Accuracy for best model: 0.9215430012880389


In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate tree depths for the subset of protocols with 10000-100000 rows.
param_grid = {
    'max_depth': [20, 25, 30]
}

# Seed the tree so that tie-breaking between equally good splits, and
# therefore the reported scores, are repeatable across runs.
tree_classify = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated search over the depths, ranked by accuracy.
grid_search = GridSearchCV(tree_classify, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_lt_train, y_lt_train)

# Depth that achieved the best mean cross-validation accuracy.
best_max_depth = grid_search.best_params_['max_depth']
print("Best Hyperparameter (max_depth):", best_max_depth)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so best_estimator_ is ready to use without another fit.
best_model = grid_search.best_estimator_

# Predictions for this subset are stored under lt_* names so they do not
# overwrite the ge_* predictions of the high-frequency subset.
lt_pred_test = best_model.predict(x_lt_test)
lt_pred_train = best_model.predict(x_lt_train)

# Accuracy on the training rows and on the held-out rows.
train_accuracy = best_model.score(x_lt_train, y_lt_train)
test_accuracy = best_model.score(x_lt_test, y_lt_test)

print("Train Accuracy for best model:", train_accuracy)
print("Test Accuracy for best model:", test_accuracy)

Best Hyperparameter (max_depth): 20
Train Accuracy for best model: 0.9939242828074399
Test Accuracy for best model: 0.9724302636606802


# Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space: two tree depths crossed with two ensemble sizes.
param_grid = dict(
    max_depth=[20, 25],
    n_estimators=[10, 20],
)

# Seeded forest that serves as the base estimator of the search.
random_forest = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search on the >= 100000 subset, ranked by accuracy.
grid_search = GridSearchCV(random_forest, param_grid, cv=3, scoring='accuracy')
grid_search.fit(x_ge_train, y_ge_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator the search produced for that combination.
best_model = grid_search.best_estimator_

# Score it on the training rows and on the held-out rows.
best_train_accuracy = best_model.score(x_ge_train, y_ge_train)
best_test_accuracy = best_model.score(x_ge_test, y_ge_test)

print("Train Accuracy for Best Model:", best_train_accuracy)
print("Test Accuracy for Best Model:", best_test_accuracy)


Best Hyperparameters: {'max_depth': 25, 'n_estimators': 20}
Train Accuracy for Best Model: 0.94958366925325
Test Accuracy for Best Model: 0.9185134330802395


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space: two tree depths crossed with two ensemble sizes.
param_grid = dict(
    max_depth=[10, 15],
    n_estimators=[10, 20],
)

# Seeded forest that serves as the base estimator of the search.
random_forest = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search on the 10000-100000 subset, ranked by accuracy.
grid_search = GridSearchCV(random_forest, param_grid, cv=3, scoring='accuracy')
grid_search.fit(x_lt_train, y_lt_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator the search produced for that combination.
best_model = grid_search.best_estimator_

# Score it on the training rows and on the held-out rows.
best_train_accuracy = best_model.score(x_lt_train, y_lt_train)
best_test_accuracy = best_model.score(x_lt_test, y_lt_test)

print("Train Accuracy for Best Model:", best_train_accuracy)
print("Test Accuracy for Best Model:", best_test_accuracy)

Best Hyperparameters: {'max_depth': 15, 'n_estimators': 20}
Train Accuracy for Best Model: 0.9386696472071763
Test Accuracy for Best Model: 0.9124379059992358


# XGBoost

In [19]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Re-encode the labels of this subset as consecutive integers starting at 0.
# The mapping is learned from the training labels only and then reused for the
# test labels, so both splits share the same class ids. A test label that never
# occurs in training raises a ValueError instead of being silently renumbered.
label_encoder = LabelEncoder()
y_ge_train = label_encoder.fit_transform(y_ge_train)
y_ge_test = label_encoder.transform(y_ge_test)

# XGBoost classifier with its default hyperparameters.
xgb_classifier = XGBClassifier()

# Fit the model on the training data
xgb_classifier.fit(x_ge_train, y_ge_train)

# Predict on training and test sets
ge_pred_train_xgb = xgb_classifier.predict(x_ge_train)
ge_pred_test_xgb = xgb_classifier.predict(x_ge_test)

# Calculate accuracy for the model
train_accuracy_xgb = xgb_classifier.score(x_ge_train, y_ge_train)
test_accuracy_xgb = xgb_classifier.score(x_ge_test, y_ge_test)

print("XGBoost Train Accuracy:", train_accuracy_xgb)
print("XGBoost Test Accuracy:", test_accuracy_xgb)


XGBoost Train Accuracy: 0.9224418648594175
XGBoost Test Accuracy: 0.9198802029369806


In [20]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Re-encode the labels of this subset as consecutive integers starting at 0.
# The mapping is learned from the training labels only and then reused for the
# test labels, so both splits share the same class ids. A test label that never
# occurs in training raises a ValueError instead of being silently renumbered.
label_encoder = LabelEncoder()
y_lt_train = label_encoder.fit_transform(y_lt_train)
y_lt_test = label_encoder.transform(y_lt_test)

# Create an XGBoost Classifier model with default hyperparameters
xgb_classifier = XGBClassifier()

# Fit the model on the training data
xgb_classifier.fit(x_lt_train, y_lt_train)

# Predict on training and test sets
lt_pred_train_xgb = xgb_classifier.predict(x_lt_train)
lt_pred_test_xgb = xgb_classifier.predict(x_lt_test)

# Calculate accuracy for the model
train_accuracy_xgb = xgb_classifier.score(x_lt_train, y_lt_train)
test_accuracy_xgb = xgb_classifier.score(x_lt_test, y_lt_test)

print("XGBoost Train Accuracy:", train_accuracy_xgb)
print("XGBoost Test Accuracy:", test_accuracy_xgb)

XGBoost Train Accuracy: 0.9831054939386124
XGBoost Test Accuracy: 0.9723156285823462
