In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# folder read further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

import numpy as np
import pandas as pd

# Folder on the mounted Drive that holds the CSV files to load.
data_folder = '/content/drive/My Drive/Colab Notebooks/Independent Study/Data/TrafficLabelling'

# Encodings to try, strictest first. 'latin-1' maps every possible byte to a
# character and therefore never raises UnicodeDecodeError, so it must come
# last: anything listed after it would never be attempted. ('iso-8859-1' is
# just another name for 'latin-1', so it is not listed separately.)
encodings = ['utf-8', 'windows-1252', 'latin-1']

# os.listdir() returns entries in arbitrary order; sort the names so the row
# order of the concatenated dataframe is reproducible between runs.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

# One dataframe per CSV file that could be decoded.
dataframes = []

# Process each CSV file
for file in csv_files:
    file_path = os.path.join(data_folder, file)
    print(f"Processing file: {file}")

    for encoding in encodings:
        # Keep the try body down to the single call that can raise the
        # decode error, so unrelated failures are not misreported.
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            print(f"Failed to read {file} with {encoding} encoding")
            continue

        print(f"Successfully read {file} with {encoding} encoding")
        # Strip spaces from column headers
        df.columns = df.columns.str.strip()
        dataframes.append(df)
        break
    else:
        # Only reached when every encoding in the list raised; kept as a
        # safety net in case the never-failing 'latin-1' entry is removed.
        print(f"Failed to read {file} with any encoding")



# Combine all successfully read dataframes.
if not dataframes:
    # Stop here with an explicit error: everything below needs combined_df,
    # and continuing would only fail later with an unrelated NameError.
    raise RuntimeError("No files were successfully read")

combined_df = pd.concat(dataframes, ignore_index=True)
print(f"Combined dataframe shape: {combined_df.shape}")

# Remove any rows with NA or null values in the entire dataset, then
# renumber the index so it is contiguous again.
combined_df = combined_df.dropna().reset_index(drop=True)
print(f"Dataframe shape after removing NA/null values: {combined_df.shape}")

# Binary target: 0 for 'BENIGN', 1 for every other label (i.e. any attack).
# Missing labels were already removed by dropna() above, so a single
# vectorized comparison covers every remaining row.
combined_df['Label'] = (combined_df['Label'] != 'BENIGN').astype('int64')

# `df` is an alias of combined_df (same object, not a copy).
df = combined_df

# Handle infinite values in specific features: replace +/-inf with the
# sentinel value -1.
for rate_column in ('Flow Bytes/s', 'Flow Packets/s'):
    df[rate_column] = df[rate_column].replace([np.inf, -np.inf], -1)