In [25]:
# Importing necessary libraries...
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define a function to read data from a CSV file
def read_data(file_path):
  return pd.read_csv(file_path)

# Read data from the three datasets
dataset1 = read_data('k7aditya.csv')
dataset2 = read_data('APA-DDoS-Dataset.csv')
dataset3 = read_data('tonyStark-Jr.csv')
print(dataset1.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP',
       'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',


In [26]:
# Convert all column names to lowercase
dataset1.columns = dataset1.columns.str.lower()
dataset2.columns = dataset2.columns.str.lower()
dataset3.columns = dataset3.columns.str.lower()

# Standardize labels in each dataset
dataset1['label'] = dataset1['label'].apply(lambda x: 1 if x == 'ddos' else 0)
dataset2['label'] = dataset2['label'].apply(lambda x: 1 if x == 'ddos' else 0)
dataset3['label'] = dataset3['label'].apply(lambda x: 1 if x == 'ddos' else 0)

# Select only numeric columns from each dataset
numeric_features1 = dataset1.select_dtypes(include=['float64', 'int64'])
numeric_features2 = dataset2.select_dtypes(include=['float64', 'int64'])
numeric_features3 = dataset3.select_dtypes(include=['float64', 'int64'])

# Initialize a StandardScaler
scaler = StandardScaler()

# Apply the scaler to the numeric columns of each dataset
dataset1[numeric_features1.columns] = scaler.fit_transform(numeric_features1)
dataset2[numeric_features2.columns] = scaler.fit_transform(numeric_features2)
dataset3[numeric_features3.columns] = scaler.fit_transform(numeric_features3)

# Print the standardized data
print(dataset1[numeric_features1.columns])
print(dataset2[numeric_features2.columns])
print(dataset3[numeric_features3.columns])


        unnamed: 0.1  unnamed: 0  src port  dst port  protocol  flow duration  \
0          -1.732039   -1.732039  1.660090       0.0       0.0      -0.481011   
1          -1.732016   -1.732016  1.659866       0.0       0.0      -0.483635   
2          -1.731993   -1.731993  1.659642       0.0       0.0      -0.489148   
3          -1.731970   -1.731970  1.660314       0.0       0.0      -0.494445   
4          -1.731947   -1.731947 -1.564743       0.0       0.0      -0.469008   
...              ...         ...       ...       ...       ...            ...   
149995      1.731947    1.731947 -1.774386       0.0       0.0      -0.486475   
149996      1.731970    1.731970 -1.774610       0.0       0.0      -0.489124   
149997      1.731993    1.731993 -1.774834       0.0       0.0      -0.490007   
149998      1.732016    1.732016  1.433200       0.0       0.0      -0.500506   
149999      1.732039    1.732039  1.432976       0.0       0.0      -0.498692   

        tot fwd pkts  tot b

In [33]:

# Identify common features in all datasets
common_features = list(set(dataset1.columns).intersection(set(dataset2.columns), set(dataset3.columns)))
print(common_features)

# Rename features in dataset2 to match those in dataset1 and dataset3
dataset2 = dataset2.rename(columns={'Bytes': 'Tot Bwd Pkts', 'Packets': 'Tot Fwd Pkts', 'Rx Bytes': 'Fwd Byts/s', 'Rx Packets': 'Fwd Pkts/s', 'Tx Bytes': 'Bwd Byts/s', 'Tx Packets': 'Bwd Pkts/s'})

# Rename features in dataset3 to match those in dataset1 and dataset2
dataset3 = dataset3.rename(columns={'dt': 'Timestamp', 'switch': 'Src IP', 'src': 'Src Port', 'dst': 'Dst IP', 'dst_port': 'Dst Port', 'pktcount': 'Tot Fwd Pkts', 'bytecount': 'Tot Bwd Pkts', 'dur': 'Flow Duration', 'dur_nsec': 'Flow IAT Mean', 'tot_dur': 'Flow IAT Std', 'flows': 'Flow IAT Max', 'packetins': 'Flow IAT Min', 'pktperflow': 'Fwd IAT Tot', 'byteperflow': 'Fwd IAT Mean', 'pktrate': 'Fwd IAT Std', 'Pairflow': 'Fwd IAT Max', 'Protocol': 'Protocol', 'port_no': 'Port', 'tx_bytes': 'Flow Byts/s', 'rx_bytes': 'Flow Pkts/s', 'tx_kbps': 'Flow IAT Mean', 'rx_kbps': 'Flow IAT Std', 'tot_kbps': 'Flow IAT Max', 'Label': 'Label'})

# Drop columns that have no entries
dataset1 = dataset1.dropna(axis=1, how='all')
dataset2 = dataset2.dropna(axis=1, how='all')
dataset3 = dataset3.dropna(axis=1, how='all')

# Drop uncommon features from each dataset
dataset1 = dataset1.drop(columns=[col for col in dataset1.columns if col not in common_features])
dataset2 = dataset2.drop(columns=[col for col in dataset2.columns if col not in common_features])
dataset3 = dataset3.drop(columns=[col for col in dataset3.columns if col not in common_features])

# Concatenate the three datasets
dataset = pd.concat([dataset1, dataset2, dataset3])

# Handle missing values by filling them with the mean of each column
dataset = dataset.fillna(dataset.mean())

# Handle outliers by removing any rows where any column has a value less than Q1 - 1.5 * IQR or greater than Q3 + 1.5 * IQR
Q1 = dataset.quantile(0.25)
Q3 = dataset.quantile(0.75)
IQR = Q3 - Q1
dataset = dataset[~((dataset < (Q1 - 1.5 * IQR)) |(dataset > (Q3 + 1.5 * IQR))).any(axis=1)]

# Assign features to variable X and labels to variable Y
X = dataset.drop('label', axis=1)
Y = dataset['label']

print(X)
print(Y)

['label']
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[405545 rows x 0 columns]
0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
104340    0.0
104341    0.0
104342    0.0
104343    0.0
104344    0.0
Name: label, Length: 405545, dtype: float64
