# <center> libraries </center>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from numpy import mean, absolute
from sklearn.impute import KNNImputer

# <center> reading ton dataframe </center>

In [None]:
# Load the TON-IoT CSV (csv-3 file) into a DataFrame.
# on_bad_lines="error" makes malformed rows raise instead of being skipped;
# low_memory=False parses each column in one pass rather than chunk-wise.
# The path is a plain string: the f-prefix was dropped because it has no placeholders.
input_path = 'your-path\\TON-IoT\\normal-attack\\csv-3\\ton.csv'
ton_df = pd.read_csv(input_path, on_bad_lines="error", low_memory=False)

# <center> numeric features </center>

## <center> duration </center>

In [None]:
# change value: replace the '-' placeholder in `duration` with 0

# Count placeholders with a boolean comparison; unlike value_counts()['-'] this
# prints 0 instead of raising KeyError when no '-' is present (e.g. on re-run).
placeholder_count = (ton_df['duration'] == '-').sum()
print(placeholder_count)

ton_df['duration'] = ton_df['duration'].replace('-', 0)

In [189]:
# change datatype: store `duration` as 64-bit floats

duration_as_float = ton_df['duration'].astype('float64')
ton_df['duration'] = duration_as_float

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['duration'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['duration']])
ton_df['duration'] = transformed

skew_after = skew(ton_df['duration'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `duration` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

duration = ton_df['duration']

median_duration = np.median(duration)
mad_duration = np.median(np.abs(duration - median_duration))

print("Original Median:", median_duration)
print("Original MAD:", mad_duration)

if mad_duration == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['duration'] = duration - median_duration
else:
    ton_df['duration'] = (duration - median_duration) / mad_duration

new_median = np.median(ton_df['duration'])
new_mad = np.median(np.abs(ton_df['duration'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)


In [None]:
# MinMaxScaler: map `duration` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['duration'].min(), ton_df['duration'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['duration']])
ton_df['duration'] = scaled

print("new min: ", ton_df['duration'].min())
print("new max: ", ton_df['duration'].max())

## <center> src_bytes </center>

In [None]:
# change value: replace the '-' placeholder in `src_bytes` with 0

# Count placeholders with a boolean comparison; unlike value_counts()['-'] this
# prints 0 instead of raising KeyError when no '-' is present (e.g. on re-run).
placeholder_count = (ton_df['src_bytes'] == '-').sum()
print(placeholder_count)

ton_df['src_bytes'] = ton_df['src_bytes'].replace('-', 0)

In [194]:
# change datatype: store `src_bytes` as 64-bit floats

src_bytes_as_float = ton_df['src_bytes'].astype('float64')
ton_df['src_bytes'] = src_bytes_as_float


In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['src_bytes'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['src_bytes']])
ton_df['src_bytes'] = transformed

skew_after = skew(ton_df['src_bytes'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `src_bytes` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

src_bytes = ton_df['src_bytes']

median_src_bytes = np.median(src_bytes)
mad_src_bytes = np.median(np.abs(src_bytes - median_src_bytes))

print("Original Median:", median_src_bytes)
print("Original MAD:", mad_src_bytes)

if mad_src_bytes == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['src_bytes'] = src_bytes - median_src_bytes
else:
    ton_df['src_bytes'] = (src_bytes - median_src_bytes) / mad_src_bytes

new_median = np.median(ton_df['src_bytes'])
new_mad = np.median(np.abs(ton_df['src_bytes'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `src_bytes` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['src_bytes'].min(), ton_df['src_bytes'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['src_bytes']])
ton_df['src_bytes'] = scaled

print("new min: ", ton_df['src_bytes'].min())
print("new max: ", ton_df['src_bytes'].max())

## <center> dst_bytes </center>

In [None]:
# change value: replace the '-' placeholder in `dst_bytes` with 0

# Count placeholders with a boolean comparison; unlike value_counts()['-'] this
# prints 0 instead of raising KeyError when no '-' is present (e.g. on re-run).
placeholder_count = (ton_df['dst_bytes'] == '-').sum()
print(placeholder_count)

ton_df['dst_bytes'] = ton_df['dst_bytes'].replace('-', 0)

In [199]:
# change datatype: store `dst_bytes` as 64-bit floats

dst_bytes_as_float = ton_df['dst_bytes'].astype('float64')
ton_df['dst_bytes'] = dst_bytes_as_float

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['dst_bytes'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['dst_bytes']])
ton_df['dst_bytes'] = transformed

skew_after = skew(ton_df['dst_bytes'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `dst_bytes` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

dst_bytes = ton_df['dst_bytes']

median_dst_bytes = np.median(dst_bytes)
mad_dst_bytes = np.median(np.abs(dst_bytes - median_dst_bytes))

print("Original Median:", median_dst_bytes)
print("Original MAD:", mad_dst_bytes)

if mad_dst_bytes == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['dst_bytes'] = dst_bytes - median_dst_bytes
else:
    ton_df['dst_bytes'] = (dst_bytes - median_dst_bytes) / mad_dst_bytes

new_median = np.median(ton_df['dst_bytes'])
new_mad = np.median(np.abs(ton_df['dst_bytes'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `dst_bytes` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['dst_bytes'].min(), ton_df['dst_bytes'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['dst_bytes']])
ton_df['dst_bytes'] = scaled

print("new min: ", ton_df['dst_bytes'].min())
print("new max: ", ton_df['dst_bytes'].max())

## <center> missed_bytes </center>

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['missed_bytes'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['missed_bytes']])
ton_df['missed_bytes'] = transformed

skew_after = skew(ton_df['missed_bytes'])
print("new skew is: ", skew_after)

In [None]:
# MinMaxScaler: map `missed_bytes` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['missed_bytes'].min(), ton_df['missed_bytes'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['missed_bytes']])
ton_df['missed_bytes'] = scaled

print("new min: ", ton_df['missed_bytes'].min())
print("new max: ", ton_df['missed_bytes'].max())

## <center> src_pkts </center>

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['src_pkts'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['src_pkts']])
ton_df['src_pkts'] = transformed

skew_after = skew(ton_df['src_pkts'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `src_pkts` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

src_pkts = ton_df['src_pkts']

median_src_pkts = np.median(src_pkts)
mad_src_pkts = np.median(np.abs(src_pkts - median_src_pkts))

print("Original Median:", median_src_pkts)
print("Original MAD:", mad_src_pkts)

if mad_src_pkts == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['src_pkts'] = src_pkts - median_src_pkts
else:
    ton_df['src_pkts'] = (src_pkts - median_src_pkts) / mad_src_pkts

new_median = np.median(ton_df['src_pkts'])
new_mad = np.median(np.abs(ton_df['src_pkts'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `src_pkts` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['src_pkts'].min(), ton_df['src_pkts'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['src_pkts']])
ton_df['src_pkts'] = scaled

print("new min: ", ton_df['src_pkts'].min())
print("new max: ", ton_df['src_pkts'].max())

## <center> src_ip_bytes </center>

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['src_ip_bytes'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['src_ip_bytes']])
ton_df['src_ip_bytes'] = transformed

skew_after = skew(ton_df['src_ip_bytes'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `src_ip_bytes` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

src_ip_bytes = ton_df['src_ip_bytes']

median_src_ip_bytes = np.median(src_ip_bytes)
mad_src_ip_bytes = np.median(np.abs(src_ip_bytes - median_src_ip_bytes))

print("Original Median:", median_src_ip_bytes)
print("Original MAD:", mad_src_ip_bytes)

if mad_src_ip_bytes == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['src_ip_bytes'] = src_ip_bytes - median_src_ip_bytes
else:
    ton_df['src_ip_bytes'] = (src_ip_bytes - median_src_ip_bytes) / mad_src_ip_bytes

new_median = np.median(ton_df['src_ip_bytes'])
new_mad = np.median(np.abs(ton_df['src_ip_bytes'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `src_ip_bytes` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['src_ip_bytes'].min(), ton_df['src_ip_bytes'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['src_ip_bytes']])
ton_df['src_ip_bytes'] = scaled

print("new min: ", ton_df['src_ip_bytes'].min())
print("new max: ", ton_df['src_ip_bytes'].max())

## <center> dst_pkts </center>

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['dst_pkts'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['dst_pkts']])
ton_df['dst_pkts'] = transformed

skew_after = skew(ton_df['dst_pkts'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `dst_pkts` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

dst_pkts = ton_df['dst_pkts']

median_dst_pkts = np.median(dst_pkts)
mad_dst_pkts = np.median(np.abs(dst_pkts - median_dst_pkts))

print("Original Median:", median_dst_pkts)
print("Original MAD:", mad_dst_pkts)

if mad_dst_pkts == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['dst_pkts'] = dst_pkts - median_dst_pkts
else:
    ton_df['dst_pkts'] = (dst_pkts - median_dst_pkts) / mad_dst_pkts

new_median = np.median(ton_df['dst_pkts'])
new_mad = np.median(np.abs(ton_df['dst_pkts'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `dst_pkts` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['dst_pkts'].min(), ton_df['dst_pkts'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['dst_pkts']])
ton_df['dst_pkts'] = scaled

print("new min: ", ton_df['dst_pkts'].min())
print("new max: ", ton_df['dst_pkts'].max())

## <center> dst_ip_bytes </center>

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['dst_ip_bytes'])
print("old skew is: ", skew_before)

pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['dst_ip_bytes']])
ton_df['dst_ip_bytes'] = transformed

skew_after = skew(ton_df['dst_ip_bytes'])
print("new skew is: ", skew_after)

In [None]:
# Median/MAD scaling: centre `dst_ip_bytes` on its median and divide by the
# median absolute deviation (MAD); statistics are printed before and after.

dst_ip_bytes = ton_df['dst_ip_bytes']

median_dst_ip_bytes = np.median(dst_ip_bytes)
mad_dst_ip_bytes = np.median(np.abs(dst_ip_bytes - median_dst_ip_bytes))

print("Original Median:", median_dst_ip_bytes)
print("Original MAD:", mad_dst_ip_bytes)

if mad_dst_ip_bytes == 0:
    # A zero MAD would turn the division into inf/NaN values, which the
    # later MinMaxScaler step rejects; centre only and keep the spread.
    print("MAD is zero; column centred on the median without rescaling")
    ton_df['dst_ip_bytes'] = dst_ip_bytes - median_dst_ip_bytes
else:
    ton_df['dst_ip_bytes'] = (dst_ip_bytes - median_dst_ip_bytes) / mad_dst_ip_bytes

new_median = np.median(ton_df['dst_ip_bytes'])
new_mad = np.median(np.abs(ton_df['dst_ip_bytes'] - new_median))

print("New Median after scaling:", new_median)
print("New MAD after scaling:", new_mad)

In [None]:
# MinMaxScaler: map `dst_ip_bytes` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['dst_ip_bytes'].min(), ton_df['dst_ip_bytes'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['dst_ip_bytes']])
ton_df['dst_ip_bytes'] = scaled

print("new min: ", ton_df['dst_ip_bytes'].min())
print("new max: ", ton_df['dst_ip_bytes'].max())

## <center> http_trans_depth </center>

In [217]:
# imputation: missing `http_trans_depth` values become 0

ton_df['http_trans_depth'] = ton_df['http_trans_depth'].fillna(0)

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['http_trans_depth'])
print("old skew is: ", skew_before)
pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['http_trans_depth']])
ton_df['http_trans_depth'] = transformed
skew_after = skew(ton_df['http_trans_depth'])
print("new skew is: ", skew_after)

## <center> http_request_body_len </center>

In [219]:
# imputation: missing `http_request_body_len` values become 0

ton_df['http_request_body_len'] = ton_df['http_request_body_len'].fillna(0)

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['http_request_body_len'])
print("old skew is: ", skew_before)
pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['http_request_body_len']])
ton_df['http_request_body_len'] = transformed
skew_after = skew(ton_df['http_request_body_len'])
print("new skew is: ", skew_after)

## <center> http_response_body_len </center>

In [221]:
# imputation: missing `http_response_body_len` values become 0

ton_df['http_response_body_len'] = ton_df['http_response_body_len'].fillna(0)

In [None]:
# transformation: Yeo-Johnson power transform (no standardisation);
# the sample skewness is printed before and after

skew_before = skew(ton_df['http_response_body_len'])
print("old skew is: ", skew_before)
pt = PowerTransformer(method='yeo-johnson', standardize=False)
transformed = pt.fit_transform(ton_df[['http_response_body_len']])
ton_df['http_response_body_len'] = transformed
skew_after = skew(ton_df['http_response_body_len'])
print("new skew is: ", skew_after)

In [None]:
# MinMaxScaler: map `http_response_body_len` linearly onto [-1, 1]; range printed before and after

range_before = (ton_df['http_response_body_len'].min(), ton_df['http_response_body_len'].max())
print("original min: ", range_before[0])
print("original max: ", range_before[1])

scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(ton_df[['http_response_body_len']])
ton_df['http_response_body_len'] = scaled

print("new min: ", ton_df['http_response_body_len'].min())
print("new max: ", ton_df['http_response_body_len'].max())

# <center> string features </center>

## <center> proto </center>

In [224]:
# Disabled alternative: one-hot encode `proto` with pd.get_dummies and cast the
# tcp/icmp/udp indicator columns to int. Left commented out; the following cell
# label-encodes `proto` instead.
#ton_df = pd.get_dummies(ton_df, columns=['proto'])
#
#ton_df['proto_tcp'] = ton_df['proto_tcp'].astype(int)
#ton_df['proto_icmp'] = ton_df['proto_icmp'].astype(int)
#ton_df['proto_udp'] = ton_df['proto_udp'].astype(int)

In [None]:
# encoding

le = LabelEncoder()
ton_df['proto'] = le.fit_transform(ton_df['proto'])

proto_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

file_path = r'your-path\\TON-IoT\\normal-attack\\csv-3\\proto_mapping.txt'

with open(file_path, 'w') as f:
    for proto, encoded_value in proto_mapping.items():
        f.write(f"{proto}: {encoded_value}\n")

## <center> service </center>

In [226]:
# change - value: the '-' placeholder in `service` becomes the label 'unknown'

service_labels = ton_df['service'].replace({'-': 'unknown'})
ton_df['service'] = service_labels

In [None]:
# label encoding

le = LabelEncoder()
ton_df['service'] = le.fit_transform(ton_df['service'])

service_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

file_path = r'your-path\\TON-IoT\\normal-attack\\csv-3\\service_mapping.txt'

with open(file_path, 'w') as f:
    for service, encoded_value in service_mapping.items():
        f.write(f"{service}: {encoded_value}\n")

## <center> conn_state </center>

In [None]:
# label encoding

le = LabelEncoder()
ton_df['conn_state'] = le.fit_transform(ton_df['conn_state'])

conn_state_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

file_path = r'your-path\\TON-IoT\\normal-attack\\csv-3\\conn_state_mapping.txt'

with open(file_path, 'w') as f:
    for conn_state, encoded_value in conn_state_mapping.items():
        f.write(f"{conn_state}: {encoded_value}\n")

## <center> dns_query </center>

In [None]:
# Sentinels for `dns_query`: 0 = service is not dns, 1 = query is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_query'] = 0 # 0 means the service is not dns

null_count = ton_df['dns_query'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_query'] = ton_df['dns_query'].fillna(1)       # 1 means query is not available

In [230]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_query'][(ton_df['dns_query'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['dns_query'] != 0), 'dns_query'] = encoded_values
ton_df['dns_query'] = ton_df['dns_query'].astype(int)

## <center> dns_qclass </center>

In [None]:
# Sentinels for `dns_qclass`: 0 = service is not dns, 1 = qclass is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_qclass'] = 0 # 0 means the service is not dns

null_count = ton_df['dns_qclass'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_qclass'] = ton_df['dns_qclass'].fillna(1)       # 1 means qclass is not available

In [232]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_qclass'][(ton_df['dns_qclass'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['dns_qclass'] != 0), 'dns_qclass'] = encoded_values
ton_df['dns_qclass'] = ton_df['dns_qclass'].astype(int)

## <center> dns_qtype </center>

In [None]:
# Sentinels for `dns_qtype`: 0 = service is not DNS, 1 = query type is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_qtype'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_qtype'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_qtype'] = ton_df['dns_qtype'].fillna(1)       # 1 means query type is not available

In [234]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_qtype'][(ton_df['dns_qtype'] != 0) & (ton_df['dns_qtype'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_qtype'] != 0) & (ton_df['dns_qtype'] != 1), 'dns_qtype'] = encoded_values
ton_df['dns_qtype'] = ton_df['dns_qtype'].astype(int)

## <center> dns_rcode </center>

In [None]:
# Sentinels for `dns_rcode`: 0 = service is not DNS, 1 = dns_rcode is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_rcode'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_rcode'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_rcode'] = ton_df['dns_rcode'].fillna(1)       # 1 means dns_rcode is not available

In [236]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_rcode'][(ton_df['dns_rcode'] != 0) & (ton_df['dns_rcode'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_rcode'] != 0) & (ton_df['dns_rcode'] != 1), 'dns_rcode'] = encoded_values
ton_df['dns_rcode'] = ton_df['dns_rcode'].astype(int)

## <center> dns_AA </center>

In [None]:
# Sentinels for `dns_AA`: 0 = service is not DNS, 1 = dns_AA is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_AA'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_AA'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_AA'] = ton_df['dns_AA'].fillna(1)       # 1 means dns_AA is not available

In [238]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_AA'][(ton_df['dns_AA'] != 0) & (ton_df['dns_AA'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_AA'] != 0) & (ton_df['dns_AA'] != 1), 'dns_AA'] = encoded_values
ton_df['dns_AA'] = ton_df['dns_AA'].astype(int)

## <center> dns_RD </center>

In [None]:
# Sentinels for `dns_RD`: 0 = service is not DNS, 1 = dns_RD is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_RD'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_RD'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_RD'] = ton_df['dns_RD'].fillna(1)       # 1 means dns_RD is not available

In [240]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_RD'][(ton_df['dns_RD'] != 0) & (ton_df['dns_RD'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_RD'] != 0) & (ton_df['dns_RD'] != 1), 'dns_RD'] = encoded_values
ton_df['dns_RD'] = ton_df['dns_RD'].astype(int)

## <center> dns_RA </center>

In [None]:
# Sentinels for `dns_RA`: 0 = service is not DNS, 1 = dns_RA is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_RA'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_RA'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_RA'] = ton_df['dns_RA'].fillna(1)       # 1 means dns_RA is not available

In [242]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_RA'][(ton_df['dns_RA'] != 0) & (ton_df['dns_RA'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_RA'] != 0) & (ton_df['dns_RA'] != 1), 'dns_RA'] = encoded_values
ton_df['dns_RA'] = ton_df['dns_RA'].astype(int)

## <center> dns_rejected </center>

In [None]:
# Sentinels for `dns_rejected`: 0 = service is not DNS, 1 = dns_rejected is not available.
# NOTE(review): 2 is presumably the label-encoded code of the 'dns' service — confirm.

is_not_dns = ton_df['service'] != 2
ton_df.loc[is_not_dns, 'dns_rejected'] = 0 # 0 means the service is not DNS

null_count = ton_df['dns_rejected'].isnull().sum()
print("number of null values is: ", null_count)

ton_df['dns_rejected'] = ton_df['dns_rejected'].fillna(1)       # 1 means dns_rejected is not available

In [244]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['dns_rejected'][(ton_df['dns_rejected'] != 0) & (ton_df['dns_rejected'] != 1)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 2
ton_df.loc[(ton_df['dns_rejected'] != 0) & (ton_df['dns_rejected'] != 1), 'dns_rejected'] = encoded_values
ton_df['dns_rejected'] = ton_df['dns_rejected'].astype(int)

## <center> http_method </center>

In [245]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_method'] = 0 # 0 means the service is not HTTP

In [246]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_method'][(ton_df['http_method'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_method'] != 0), 'http_method'] = encoded_values
ton_df['http_method'] = ton_df['http_method'].astype(int)

## <center> http_uri </center>

In [247]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_uri'] = 0 # 0 means the service is not HTTP

In [248]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_uri'][(ton_df['http_uri'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_uri'] != 0), 'http_uri'] = encoded_values
ton_df['http_uri'] = ton_df['http_uri'].astype(int)

## <center> http_referrer </center>

In [249]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_referrer'] = 0 # 0 means the service is not HTTP

In [250]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_referrer'][(ton_df['http_referrer'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_referrer'] != 0), 'http_referrer'] = encoded_values
ton_df['http_referrer'] = ton_df['http_referrer'].astype(int)

## <center> http_version </center>

In [251]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_version'] = 0 # 0 means the service is not HTTP

In [252]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_version'][(ton_df['http_version'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_version'] != 0), 'http_version'] = encoded_values
ton_df['http_version'] = ton_df['http_version'].astype(int)

## <center> http_status_code </center>

In [253]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_status_code'] = 0 # 0 means the service is not HTTP

In [254]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_status_code'][(ton_df['http_status_code'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_status_code'] != 0), 'http_status_code'] = encoded_values
ton_df['http_status_code'] = ton_df['http_status_code'].astype(int)

## <center> http_user_agent </center>

In [255]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_user_agent'] = 0 # 0 means the service is not HTTP

In [256]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_user_agent'][(ton_df['http_user_agent'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_user_agent'] != 0), 'http_user_agent'] = encoded_values
ton_df['http_user_agent'] = ton_df['http_user_agent'].astype(int)

## <center> http_orig_mime_types </center>

In [257]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_orig_mime_types'] = 0 # 0 means the service is not HTTP

In [258]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_orig_mime_types'][(ton_df['http_orig_mime_types'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_orig_mime_types'] != 0), 'http_orig_mime_types'] = encoded_values
ton_df['http_orig_mime_types'] = ton_df['http_orig_mime_types'].astype(int)

## <center> http_resp_mime_types </center>

In [259]:
# it is not http: rows whose encoded service is not 9 get the sentinel 0
# NOTE(review): 9 is presumably the label-encoded code of the 'http' service — confirm.

is_not_http = ton_df['service'] != 9
ton_df.loc[is_not_http, 'http_resp_mime_types'] = 0 # 0 means the service is not HTTP

In [260]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['http_resp_mime_types'][(ton_df['http_resp_mime_types'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['http_resp_mime_types'] != 0), 'http_resp_mime_types'] = encoded_values
ton_df['http_resp_mime_types'] = ton_df['http_resp_mime_types'].astype(int)

## <center> ssl_version </center>

In [261]:
# it is not ssl: missing `ssl_version` values become the sentinel 0

ton_df['ssl_version'] = ton_df['ssl_version'].fillna(0)

In [262]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_version'][(ton_df['ssl_version'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_version'] != 0) & (ton_df['ssl_version'] != 1), 'ssl_version'] = encoded_values
ton_df['ssl_version'] = ton_df['ssl_version'].astype(int)

## <center> ssl_cipher </center>

In [263]:
# it is not ssl: missing `ssl_cipher` values become the sentinel 0

ton_df['ssl_cipher'] = ton_df['ssl_cipher'].fillna(0)

In [264]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_cipher'][(ton_df['ssl_cipher'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_cipher'] != 0) & (ton_df['ssl_cipher'] != 1), 'ssl_cipher'] = encoded_values
ton_df['ssl_cipher'] = ton_df['ssl_cipher'].astype(int)

## <center> ssl_resumed </center>

In [265]:
# it is not ssl: missing `ssl_resumed` values become the sentinel 0

ton_df['ssl_resumed'] = ton_df['ssl_resumed'].fillna(0)

In [266]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_resumed'][(ton_df['ssl_resumed'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_resumed'] != 0) & (ton_df['ssl_resumed'] != 1), 'ssl_resumed'] = encoded_values
ton_df['ssl_resumed'] = ton_df['ssl_resumed'].astype(int)

## <center> ssl_established </center>

In [267]:
# it is not ssl: missing `ssl_established` values become the sentinel 0

ton_df['ssl_established'] = ton_df['ssl_established'].fillna(0)

In [268]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_established'][(ton_df['ssl_established'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_established'] != 0) & (ton_df['ssl_established'] != 1), 'ssl_established'] = encoded_values
ton_df['ssl_established'] = ton_df['ssl_established'].astype(int)

## <center> ssl_subject </center>

In [269]:
# it is not ssl: missing `ssl_subject` values become the sentinel 0

ton_df['ssl_subject'] = ton_df['ssl_subject'].fillna(0)

In [270]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_subject'][(ton_df['ssl_subject'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_subject'] != 0) & (ton_df['ssl_subject'] != 1), 'ssl_subject'] = encoded_values
ton_df['ssl_subject'] = ton_df['ssl_subject'].astype(int)

## <center> ssl_issuer </center>

In [271]:
# it is not ssl: missing `ssl_issuer` values become the sentinel 0

ton_df['ssl_issuer'] = ton_df['ssl_issuer'].fillna(0)

In [272]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['ssl_issuer'][(ton_df['ssl_issuer'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['ssl_issuer'] != 0) & (ton_df['ssl_issuer'] != 1), 'ssl_issuer'] = encoded_values
ton_df['ssl_issuer'] = ton_df['ssl_issuer'].astype(int)

## <center> weird_name </center>

In [273]:
# missing `weird_name` values become the sentinel 0
ton_df['weird_name'] = ton_df['weird_name'].fillna(0)

In [274]:
# label encoding

le = LabelEncoder()
non_zero_values = ton_df['weird_name'][(ton_df['weird_name'] != 0)]
non_zero_values = non_zero_values.astype(str)
encoded_values = le.fit_transform(non_zero_values)
encoded_values = encoded_values + 1
ton_df.loc[(ton_df['weird_name'] != 0) & (ton_df['weird_name'] != 1), 'weird_name'] = encoded_values
ton_df['weird_name'] = ton_df['weird_name'].astype(int)

## <center> weird_addl </center>

In [275]:
# missing `weird_addl` values become the sentinel 0
ton_df['weird_addl'] = ton_df['weird_addl'].fillna(0)

In [276]:
# label encoding: '-' is first renamed to 'nothing', then every value other than
# the sentinel 0 gets a code starting at 1
ton_df['weird_addl'] = ton_df['weird_addl'].replace('-', 'nothing')
le = LabelEncoder()
# Select the labels and write the codes back with the SAME mask so the encoded
# array always matches the target rows. A narrower write-back mask (also
# excluding 1) fails with a length mismatch if the column holds 1 or True.
has_value = ton_df['weird_addl'] != 0
labels = ton_df.loc[has_value, 'weird_addl'].astype(str)
ton_df.loc[has_value, 'weird_addl'] = le.fit_transform(labels) + 1
ton_df['weird_addl'] = ton_df['weird_addl'].astype(int)

## <center> weird_notice </center>

In [277]:
# missing `weird_notice` values become 0
ton_df['weird_notice'] = ton_df['weird_notice'].fillna(0)

In [None]:
# Encode the 'F' flag of `weird_notice` as 1.
# NOTE(review): any other values are left untouched and the column is not cast to int — confirm this is intended.
ton_df['weird_notice'] = ton_df['weird_notice'].replace({'F': 1})

# <center> save dataframe </center>

In [None]:
# Write the processed frame to the csv-4 location without the index column.
output_path = "your-path\\TON-IoT\\normal-attack\\csv-4\\ton.csv"
ton_df.to_csv(output_path, index=False)