# ECS 171 Project: Detecting Cyber Intrusion on IoT Traffic Data
#### Completed by Matthew Armstrong, Karthik Palanisamy, Rahul Prabhu, Matthew Tom, and Kyle Tsuji
##### *Professor Solares, ECS 171: Machine Learning - UC Davis Summer 2023*


## Imports and Data Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# get dataset from github archive
!wget https://github.com/matthewarmstr/ECS-171-Project/raw/main/IoT_Intrusion.zip
!unzip IoT_Intrusion.zip

In [None]:
# load data file
# intru_table keeps every column of the CSV, including the 'label' column
intru_table = pd.read_csv("IoT_Intrusion.csv")
# same table without the 'label' column
intru_labels_removed = intru_table.drop('label', axis=1)

## Data Preprocessing

In [None]:
# complete one-hot encoding on intrusion labels
# the label column is reshaped to a single-column 2-D array for the encoder and
# the encoder's sparse output is converted to a dense array
ohe = OneHotEncoder()
ohe_intru_labels = ohe.fit_transform(intru_table['label'].values.reshape(-1,1)).toarray()

# normalize traffic attribute data
# column names and index are taken from the frame being scaled instead of a
# hand-maintained list, so they cannot drift out of sync with the CSV header
scaler = MinMaxScaler()
norm_intru_labels_removed = pd.DataFrame(
    scaler.fit_transform(intru_labels_removed),
    columns=intru_labels_removed.columns,
    index=intru_labels_removed.index,
)

# attach the class labels to a COPY of the normalized features; a plain
# assignment would only alias the same DataFrame, and adding 'label' would then
# also add it to norm_intru_labels_removed
norm_intru_with_classes = norm_intru_labels_removed.copy()
norm_intru_with_classes['label'] = intru_table['label']

In [None]:
# raw (un-normalized) flow_duration on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="flow_duration", y="label")

In [None]:
# raw (un-normalized) Header_Length on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="Header_Length", y="label")

In [None]:
# raw (un-normalized) 'Protocol Type' on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="Protocol Type", y="label")

In [None]:
# raw (un-normalized) Duration on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="Duration", y="label")

In [None]:
# raw (un-normalized) Rate on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="Rate", y="label")

In [None]:
# raw (un-normalized) Srate on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="Srate", y="label")

In [None]:
# raw (un-normalized) UDP on x, one row of points per intrusion label
sns.catplot(data=intru_table, x="UDP", y="label")

In [None]:
# count plot: number of rows per HTTPS value (y axis), split by intrusion label (hue)
sns.catplot(
    data=intru_table,
    kind="count",
    y="HTTPS",
    hue="label",
    palette="pastel",
    edgecolor=".6",
)

In [None]:
# Duration against label again, this time read from the min-max normalized table
sns.catplot(data=norm_intru_with_classes, x="Duration", y="label")

In [None]:
# add a boolean 'set' column to intru_table: True where the HTTPS value is 0
intru_table["set"] = intru_table["HTTPS"].isin([0])

# bar plot of HTTPS per intrusion label, colored by the 'set' flag;
# dodge=False keeps both hue levels on the same bar position
sns.catplot(
    data=intru_table,
    kind="bar",
    x="HTTPS",
    y="label",
    hue="set",
    dodge=False,
)