In [1]:
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Columns read from every CSV file: 15 numeric flow features plus the class label.
# The names are kept verbatim (some start with a space, some do not) because the
# list is passed to read_csv(usecols=...) as-is.
req_cols = [
    ' Packet Length Std',
    ' Total Length of Bwd Packets',
    ' Subflow Bwd Bytes',
    ' Destination Port',
    ' Packet Length Variance',
    ' Bwd Packet Length Mean',
    ' Avg Bwd Segment Size',
    'Bwd Packet Length Max',
    ' Init_Win_bytes_backward',
    'Total Length of Fwd Packets',
    ' Subflow Fwd Bytes',
    'Init_Win_bytes_forward',
    ' Average Packet Size',
    ' Packet Length Mean',
    ' Max Packet Length',
    ' Label',
]

# Load the data from the CSV files under DATA_DIR, keeping only the columns in
# req_cols, and stack them into a single frame.
DATA_DIR = 'cicids_db'
SAMPLE_SEED = 42  # fixed seed so .sample() yields the same row order on every run
fraction = 1      # share of rows kept from each file (1 keeps every row, shuffled)

CSV_FILES = [
    'Wednesday-workingHours.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Monday-WorkingHours.pcap_ISCX.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
]

# One sampled frame per file. Without random_state the row order would differ
# between runs, and the seeded train/test split further down would then pick
# different rows each time.
frames = [
    pd.read_csv(f'{DATA_DIR}/{filename}', usecols=req_cols)
      .sample(frac=fraction, random_state=SAMPLE_SEED)
    for filename in CSV_FILES
]

# ignore_index=True discards the per-file row numbers and renumbers 0..n-1.
df = pd.concat(frames, ignore_index=True)



# Group the fine-grained attack labels into broader families and max-scale the
# feature columns. Produces:
#   y             - grouped labels
#   df_max_scaled - scaled feature columns only
#   df            - scaled features plus a 'Label' column (no leading space)

# Labels that do not appear as a key here are kept unchanged by .replace().
# NOTE(review): confirm these keys match the label strings in the CSV files
# character for character (in particular the hyphen in 'Web Attack - ...');
# a key that does not match is silently ignored.
LABEL_GROUPS = {
    'DoS GoldenEye': 'Dos/Ddos',
    'DoS Hulk': 'Dos/Ddos',
    'DoS Slowhttptest': 'Dos/Ddos',
    'DoS slowloris': 'Dos/Ddos',
    'Heartbleed': 'Dos/Ddos',
    'DDoS': 'Dos/Ddos',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Web Attack - Brute Force': 'Web Attack',
    'Web Attack - Sql Injection': 'Web Attack',
    'Web Attack - XSS': 'Web Attack',
}

# Columns that receive a +1 offset.
# presumably these use -1 as a "not set" marker, so +1 makes them non-negative
# (chi2 rejects negative feature values) - TODO confirm against the dataset
SHIFTED_COLS = ['Init_Win_bytes_forward', ' Init_Win_bytes_backward']

y = df[' Label'].replace(LABEL_GROUPS)
df_max_scaled = df.drop(columns=[' Label'])  # new frame; the loaded data is not modified

# Apply the offset BEFORE scaling. Adding 1 to an already max-scaled column
# would not map -1 to 0; it would squeeze the whole column into about [0.5, 1].
df_max_scaled[SHIFTED_COLS] = df_max_scaled[SHIFTED_COLS] + 1

# Divide every column by the absolute value of its maximum. A column whose
# maximum is 0 is divided by 1 instead, avoiding a division by zero.
col_max = df_max_scaled.max().abs()
col_max = col_max.where(col_max != 0, 1)
df_max_scaled = df_max_scaled.div(col_max, axis='columns')

# Missing feature values become 0. Only feature columns are filled, so a
# missing label is never turned into the number 0.
df_max_scaled = df_max_scaled.fillna(0)

df = df_max_scaled.assign(Label=y)




# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# repeatable for a given row order.
feature_frame = df.drop(columns=['Label'])
label_series = df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    label_series,
    test_size=0.2,
    random_state=42,
)

# Chi-square feature selection keeping the 10 best-scoring features. The
# selector is fitted on the training rows only and the same column choice is
# then applied to both splits.
chi_selector = SelectKBest(score_func=chi2, k=10)
chi_selector.fit(X_train, y_train)
X_train_chi = chi_selector.transform(X_train)
X_test_chi = chi_selector.transform(X_test)

# The names of the selected features are printed in the next cell


In [2]:

# Positions (within X_train's columns) of the features the selector kept
selected_indices = chi_selector.get_support(indices=True)

# Translate each position into its column name, preserving column order
selected_features = [X_train.columns[position] for position in selected_indices]

# Show the chosen feature names
print('Selected Features:', selected_features)

Selected Features: [' Destination Port', 'Bwd Packet Length Max', ' Bwd Packet Length Mean', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' Average Packet Size', ' Avg Bwd Segment Size', 'Init_Win_bytes_forward']
