In [3]:
# Importing necessary libraries...
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define a function to read data from a CSV file
def read_data(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    loaded_frame = pd.read_csv(file_path)
    return loaded_frame

# Load the three source CSVs (paths are relative to the notebook's working directory)
dataset1 = read_data('k7aditya.csv')
dataset2 = read_data('APA-DDoS-Dataset.csv')
dataset3 = read_data('tonyStark-Jr.csv')

# Show each dataset's raw column names so the differing schemas can be compared
for loaded_frame in (dataset1, dataset2, dataset3):
    print(loaded_frame.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP',
       'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',


In [4]:

# Convert all column names to lowercase so every dataset exposes a 'label' column
dataset1.columns = dataset1.columns.str.lower()
dataset2.columns = dataset2.columns.str.lower()
dataset3.columns = dataset3.columns.str.lower()


def _binarize_label(labels):
    """Return a 0/1 int64 Series: 1 for DDoS rows, 0 for everything else.

    Text labels are matched case-insensitively on the substring 'ddos'.

    Numeric labels are treated as already encoded, with any non-zero value
    becoming 1. Without this branch a numeric column is stringified to
    '0'/'1', never contains 'ddos', and silently collapses to all zeros;
    that is also what happens when this cell is executed a second time.
    NOTE(review): assumes non-zero marks an attack in numerically encoded
    labels -- confirm against each dataset's label convention.
    """
    if pd.api.types.is_numeric_dtype(labels):
        # Missing numeric labels are counted as non-attack, matching the text
        # branch where a missing value becomes the string 'nan' -> 0.
        return labels.fillna(0).ne(0).astype('int64')
    is_ddos = labels.astype(str).str.lower().str.contains('ddos', regex=False)
    return is_ddos.astype('int64')


# Standardize labels in each dataset
dataset1['label'] = _binarize_label(dataset1['label'])
dataset2['label'] = _binarize_label(dataset2['label'])
dataset3['label'] = _binarize_label(dataset3['label'])



# Feature-name harmonisation: each mapping translates one dataset's (lowercased)
# column names onto the names used when the datasets are later intersected.

# DATASET1: packet-length / flow-rate / inter-arrival-time statistics -> snake_case
rename_mapping_dataset1 = {
    'fwd pkt len max': 'max_fwd_pkt_len',
    'fwd pkt len min': 'min_fwd_pkt_len',
    'fwd pkt len mean': 'mean_fwd_pkt_len',
    'fwd pkt len std': 'std_fwd_pkt_len',
    'bwd pkt len max': 'max_bwd_pkt_len',
    'bwd pkt len min': 'min_bwd_pkt_len',
    'bwd pkt len mean': 'mean_bwd_pkt_len',
    'bwd pkt len std': 'std_bwd_pkt_len',
    'flow byts/s': 'flow_bytes_per_sec',
    'flow pkts/s': 'flow_packets_per_sec',
    'flow iat mean': 'flow_iat_mean',
    'flow iat std': 'flow_iat_std',
    'flow iat max': 'flow_iat_max',
    'flow iat min': 'flow_iat_min',
}

# DATASET2: dotted capture-field names -> DATASET1's address/port/byte-count names
rename_mapping_dataset2 = {
    'ip.src': 'src ip',
    'tcp.srcport': 'src port',
    'ip.dst': 'dst ip',
    'tcp.dstport': 'dst port',
    'ip.proto': 'protocol',
    'frame.time': 'timestamp',
    'bytes': 'totlen fwd pkts',
    'rx packets': 'tot bwd pkts',
    'rx bytes': 'totlen bwd pkts',
}

# DATASET3
# NOTE(review): 'protocol' -> 'ip.proto' runs in the opposite direction to
# DATASET2's 'ip.proto' -> 'protocol', so 'protocol' cannot end up shared by
# all three datasets -- confirm whether that is intended.
# NOTE(review): 'bytecount' becomes 'bytes', a name DATASET2 renames away to
# 'totlen fwd pkts' -- confirm the intended target.
# NOTE(review): 'rx_kbps' -> 'flow_packets_per_sec' looks like a unit mismatch
# judging by the names alone -- verify against the dataset description.
rename_mapping_dataset3 = {
    'src': 'src ip',
    'port_no': 'src port',
    'dst': 'dst ip',
    'protocol': 'ip.proto',
    'dt': 'timestamp',
    'packetins': 'packets',
    'bytecount': 'bytes',
    'byteperflow': 'totlen bwd pkts',
    'packetperflow': 'tot bwd pkts',
    'tx_kbps': 'flow_bytes_per_sec',
    'rx_kbps': 'flow_packets_per_sec',
}


def _lowercase_and_rename(frame, mapping):
    """Lowercase ``frame``'s column names, then rename them through ``mapping``.

    Mapping keys that match no column are ignored (``DataFrame.rename`` default).
    """
    frame.columns = frame.columns.str.lower()
    return frame.rename(columns=mapping)


dataset1 = _lowercase_and_rename(dataset1, rename_mapping_dataset1)
dataset2 = _lowercase_and_rename(dataset2, rename_mapping_dataset2)
dataset3 = _lowercase_and_rename(dataset3, rename_mapping_dataset3)


# Drop columns that have no entries
dataset1 = dataset1.dropna(axis=1, how='all')
dataset2 = dataset2.dropna(axis=1, how='all')
dataset3 = dataset3.dropna(axis=1, how='all')


def _numeric_feature_columns(frame, target='label'):
    """Return the float64/int64 columns of ``frame`` without the target column.

    The target has to be left out: once the labels are encoded as integers
    they are picked up by the dtype filter, and standardising them would turn
    the 0/1 classes into arbitrary floats that differ from dataset to dataset.
    """
    numeric = frame.select_dtypes(include=['float64', 'int64'])
    return numeric.drop(columns=[target], errors='ignore')


# Select only numeric feature columns from each dataset (label excluded)
numeric_features1 = _numeric_feature_columns(dataset1)
numeric_features2 = _numeric_feature_columns(dataset2)
numeric_features3 = _numeric_feature_columns(dataset3)

# Initialize a StandardScaler
scaler = StandardScaler()

# Apply the scaler to the numeric columns of each dataset.
# fit_transform re-fits on every call, so each dataset is standardised with its
# own mean/std and `scaler` ends up holding the statistics of dataset3 only.
dataset1[numeric_features1.columns] = scaler.fit_transform(numeric_features1)
dataset2[numeric_features2.columns] = scaler.fit_transform(numeric_features2)
dataset3[numeric_features3.columns] = scaler.fit_transform(numeric_features3)

# Print the standardized data
print(dataset1[numeric_features1.columns])
print(dataset2[numeric_features2.columns])
print(dataset3[numeric_features3.columns])



        unnamed: 0.1  unnamed: 0  src port  dst port  protocol  flow duration  \
0          -1.732039   -1.732039  1.660090       0.0       0.0      -0.481011   
1          -1.732016   -1.732016  1.659866       0.0       0.0      -0.483635   
2          -1.731993   -1.731993  1.659642       0.0       0.0      -0.489148   
3          -1.731970   -1.731970  1.660314       0.0       0.0      -0.494445   
4          -1.731947   -1.731947 -1.564743       0.0       0.0      -0.469008   
...              ...         ...       ...       ...       ...            ...   
149995      1.731947    1.731947 -1.774386       0.0       0.0      -0.486475   
149996      1.731970    1.731970 -1.774610       0.0       0.0      -0.489124   
149997      1.731993    1.731993 -1.774834       0.0       0.0      -0.490007   
149998      1.732016    1.732016  1.433200       0.0       0.0      -0.500506   
149999      1.732039    1.732039  1.432976       0.0       0.0      -0.498692   

        tot fwd pkts  tot b

In [5]:
# Identify the features present in all three datasets.
# Sorted so the list (and its printed form) is identical on every run; iterating
# a set of strings directly gives an order that can change between kernel sessions.
common_features = sorted(
    set(dataset1.columns) & set(dataset2.columns) & set(dataset3.columns)
)
print(common_features)

['src ip', 'dst ip', 'totlen bwd pkts', 'label', 'src port', 'timestamp']


In [6]:

# Keep only the features shared by all three datasets
dataset1 = dataset1.loc[:, dataset1.columns.isin(common_features)]
dataset2 = dataset2.loc[:, dataset2.columns.isin(common_features)]
dataset3 = dataset3.loc[:, dataset3.columns.isin(common_features)]

# Concatenate the three datasets. ignore_index gives every row a unique index
# label instead of repeating each source frame's own row numbers.
dataset = pd.concat([dataset1, dataset2, dataset3], ignore_index=True)

# Imputation and outlier detection only make sense for numeric features:
#  * string columns cannot be averaged or ranked (doing so raises TypeError
#    on pandas >= 2.0);
#  * the target is excluded because IQR-filtering a binary label removes every
#    row of a class that makes up less than a quarter of the data (IQR == 0).
feature_columns = dataset.select_dtypes(include='number').columns.drop(
    'label', errors='ignore'
)

# Handle missing values by filling them with the mean of each numeric feature
dataset = dataset.fillna(dataset[feature_columns].mean())

# Handle outliers by removing any row where a numeric feature is below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR
Q1 = dataset[feature_columns].quantile(0.25)
Q3 = dataset[feature_columns].quantile(0.75)
IQR = Q3 - Q1
feature_values = dataset[feature_columns]
is_outlier = (
    (feature_values < (Q1 - 1.5 * IQR)) | (feature_values > (Q3 + 1.5 * IQR))
).any(axis=1)
dataset = dataset[~is_outlier]

# Assign features to variable X and labels to variable Y
X = dataset.drop('label', axis=1)
Y = dataset['label']

print(X)
print(Y)

: 