<a href="https://colab.research.google.com/github/mariumnour/Project-1/blob/main/finalThesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so the CSV files stored there can be read.
# When the drive is already mounted the call only prints a notice (see the
# cell output below).
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Define the path of the folder containing the CSV files on Google Drive
folder_path = '/content/drive/MyDrive/DataSets'

# Define the list of CSV files to load
file_names = ['02-14-2018.csv',
              '02-15-2018.csv',
              '02-16-2018.csv',
              '02-20-2018.csv',
              '02-21-2018.csv',
              '02-22-2018.csv',
              '02-23-2018.csv',
              '02-28-2018.csv',
              '03-01-2018.csv',
              '03-02-2018.csv']

# Name of the target column. It is deliberately left as the original label
# strings: encoding it file by file would give the same integer a different
# meaning in every file, and the string-keyed label mapping applied in a later
# cell could no longer match anything.
label_col = 'Label'

# Load each CSV file into a dataframe, clean it, and split it into train and test sets
train_dfs = []
test_dfs = []
for file_name in file_names:
    file_path = f'{folder_path}/{file_name}'
    df = pd.read_csv(file_path, low_memory=False)

    # Remove rows with missing or infinite values
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Drop fully duplicated rows
    df = df.drop_duplicates()

    # Keep a reproducible random half of the rows to limit memory use
    df = df.sample(frac=0.5, random_state=42)

    # Integer-encode the remaining text columns (never the target).
    # NOTE: these codes are assigned per file, so they are not comparable
    # across files; such columns should not be used as model features.
    cat_cols = [col for col in df.columns
                if df[col].dtype == 'object' and col != label_col]
    for col in cat_cols:
        df[col] = pd.factorize(df[col])[0]

    # Split this file's rows into train and test parts
    train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)

    train_dfs.append(train_part)
    test_dfs.append(test_part)

# Concatenate all the train and test dataframes. join='inner' keeps only the
# columns shared by every file, so a column missing from one file cannot
# reappear as a NaN-filled column in the combined frames.
train_df = pd.concat(train_dfs, axis=0, join='inner')
test_df = pd.concat(test_dfs, axis=0, join='inner')

# Drop constant columns once, judged on the combined training data. Doing this
# per file would remove different columns from different files and could even
# remove the label column of a file that contains a single class.
constant_cols = [col for col in train_df.columns
                 if col != label_col and train_df[col].nunique() <= 1]
train_df = train_df.drop(columns=constant_cols)
test_df = test_df.drop(columns=constant_cols, errors='ignore')

# Save the new train and test dataframes to CSV files in your Google Drive
#train_df.to_csv('/content/drive/MyDrive/path/to/new_train_file.csv', index=False)
#test_df.to_csv('/content/drive/MyDrive/path/to/new_test_file.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Create a dictionary to map each attack type to its category
# (every known attack label -> 'attack', 'Benign' -> 'benign')
attack_category = {
    'DDoS': 'attack',
    'DoS': 'attack',
    'Web Attack - Brute Force': 'attack',
    'Bot': 'attack',
    'Infiltration': 'attack',
    'Web Attack - XSS': 'attack',
    'Web Attack - Sql Injection': 'attack',
    'PortScan': 'attack',
    'Benign': 'benign'
}

# Create a new column in the dataframe to store the attack category
train_df['attack_category'] = train_df['Label'].map(attack_category)
test_df['attack_category'] = test_df['Label'].map(attack_category)

# Series.map leaves NaN for every label that is not a key of the dictionary
# (a differently spelled class name, or labels that are no longer strings).
# Report such labels instead of letting the NaNs pass unnoticed.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    unmapped = split_df.loc[split_df['attack_category'].isna(), 'Label'].unique()
    if len(unmapped) > 0:
        print(f'WARNING: {len(unmapped)} distinct {split_name} label value(s) '
              f'have no attack_category, e.g. {list(unmapped)[:10]}')


In [7]:
# Drop the Timestamp column from the train and test dataframes.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and the call is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=["Timestamp"], errors="ignore")
test_df = test_df.drop(columns=["Timestamp"], errors="ignore")


In [4]:
# Feature columns fed to the model, listed once so the train and test matrices
# cannot drift apart. Every name appears exactly once: selecting a repeated
# name with .loc would produce duplicated columns in X.
feature_cols = [
    'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
    'Fwd Pkt Len Max', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
    'Bwd Pkt Len Max', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
    'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
    'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
    'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
    'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
    'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt',
    'Down/Up Ratio', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
    'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
    'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
    'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
    'Subflow Fwd Pkts', 'Subflow Fwd Byts',
    'Subflow Bwd Pkts', 'Subflow Bwd Byts',
    'Fwd Act Data Pkts', 'Fwd Seg Size Min',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
]

# The loading step removes constant columns, so a listed feature may be absent
# from the data. Use only the features present in both splits and say which
# ones were skipped, rather than failing on the first missing name.
missing_cols = [col for col in feature_cols
                if col not in train_df.columns or col not in test_df.columns]
if missing_cols:
    print(f'Skipping {len(missing_cols)} feature(s) not present in the data: {missing_cols}')
    feature_cols = [col for col in feature_cols if col not in missing_cols]

X_train = train_df.loc[:, feature_cols]
y_train = train_df['Label']
X_test = test_df.loc[:, feature_cols]
y_test = test_df['Label']

In [7]:
# NOTE(review): `df` is not one of the combined train/test frames. The last
# assignment to `df` visible in this notebook is the file-loading loop's
# variable, so after a top-to-bottom run it holds only the rows of the final
# entry in `file_names`, as that loop left them. The Timestamp drop above was
# applied to train_df/test_df only, not to `df`. Confirm that building
# `new_df` from this single file is intended.
new_df = df.dropna()


In [9]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the input and target variables used by the cross-validation cells.
# Timestamp is excluded so that X agrees with train_df/test_df, from which the
# column was dropped; errors='ignore' covers frames that never had it.
X = new_df.drop(columns=['Label', 'Timestamp'], errors='ignore')
y = new_df['Label']

# No new split is made here: X_train, X_test, y_train and y_test are the
# hold-out split prepared in the feature-selection cell.

# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf.fit(X_train, y_train)

# Test the classifier on the testing data
y_pred = rf.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 1.00


In [10]:
from sklearn.model_selection import cross_val_score

# Unfitted random forest; cross_val_score fits its own copy on every fold
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Score the model with 10-fold cross-validation on X / y
scores = cross_val_score(estimator=rf, X=X, y=y, cv=10)

# Report the average fold score and how much the folds vary
for summary_line in (
    f'Mean cross-validation score: {scores.mean():.2f}',
    f'Standard deviation of cross-validation scores: {scores.std():.2f}',
):
    print(summary_line)


Mean cross-validation score: 1.00
Standard deviation of cross-validation scores: 0.00


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the input and target variables.
# Timestamp is excluded so that X agrees with train_df/test_df, from which the
# column was dropped; errors='ignore' covers frames that never had it.
X = new_df.drop(columns=['Label', 'Timestamp'], errors='ignore')
y = new_df['Label']

# Create a stratified 10-fold object (shuffled, reproducible)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Create a random forest classifier; fit() retrains it from scratch each fold
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize an empty list to store the accuracy scores
scores = []

# Loop through the folds and fit the model. Fold-local names are used so the
# notebook-level X_train / X_test / y_train / y_test / y_pred from the
# hold-out evaluation are not overwritten by the last fold.
for train_idx, test_idx in skf.split(X, y):
    X_fold_train, y_fold_train = X.iloc[train_idx], y.iloc[train_idx]
    X_fold_test, y_fold_test = X.iloc[test_idx], y.iloc[test_idx]
    rf.fit(X_fold_train, y_fold_train)
    fold_pred = rf.predict(X_fold_test)
    scores.append(accuracy_score(y_fold_test, fold_pred))

# Calculate and print the mean and standard deviation of the scores
mean_score = np.mean(scores)
std_score = np.std(scores)
print(f'Mean cross-validation score: {mean_score:.2f}')
print(f'Standard deviation of cross-validation scores: {std_score:.2f}')
