In [1]:
# ================================================
# 📌 Train-Test Split for ICS Attack Classification
# Author: Saif ul islam | Date: 2025-07-10
# ================================================

# 1. Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# 2. Load your processed dataset
data_path = r"C:\Users\Administrator\Desktop\Saif\CIC_Modbus_Research\Final_Clean_Merged\combined_attack_benign_encoded_scaled.csv"
df = pd.read_csv(data_path)

# 3. Confirm label distribution before splitting
print("🔎 Label distribution before split:")
print(df['Label'].value_counts())

# 4. Define features (X) and target (y)
# Drop label from features
X = df.drop(columns=['Label'])
y = df['Label']

# 5. Perform Train-Test Split (80:20) with stratification to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 6. Print split summaries
print("✅ Train shape:", X_train.shape)
print("✅ Test shape:", X_test.shape)
print("🔎 Train label distribution:")
print(y_train.value_counts(normalize=True))
print("🔎 Test label distribution:")
print(y_test.value_counts(normalize=True))

# 7. Optionally save the splits for model training laterS
X_train.to_csv("train_features.csv", index=False)
X_test.to_csv("test_features.csv", index=False)
y_train.to_csv("train_labels.csv", index=False)
y_test.to_csv("test_labels.csv", index=False)

print("✅ Train-Test split files saved successfully.")


  df = pd.read_csv(data_path)


🔎 Label distribution before split:
Label
Benign    100000
Attack     43787
Name: count, dtype: int64
✅ Train shape: (115029, 89)
✅ Test shape: (28758, 89)
🔎 Train label distribution:
Label
Benign    0.695477
Attack    0.304523
Name: proportion, dtype: float64
🔎 Test label distribution:
Label
Benign    0.695459
Attack    0.304541
Name: proportion, dtype: float64
✅ Train-Test split files saved successfully.


In [2]:
# Check specific columns’ unique types to validate
print(df.iloc[:, 6].apply(type).unique())
print(df.iloc[:, 84].apply(type).unique())


[<class 'str'> <class 'float'>]
[<class 'str'> <class 'float'>]


In [3]:
# Identify the columns’ names for clarity         
print(df.columns[6])
print(df.columns[84])


Timestamp
Attack


In [4]:
# Convert Timestamp to datetime (coerce errors to NaT)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Ensure Attack column is string type
df['Attack'] = df['Attack'].astype(str)


In [5]:
print(df['Timestamp'].apply(type).unique())
print(df['Attack'].apply(type).unique())


[<class 'pandas._libs.tslibs.timestamps.Timestamp'>
 <class 'pandas._libs.tslibs.nattype.NaTType'>]
[<class 'str'>]


In [6]:
# Confirm final missing value status
print("✅ Final missing values check:", df.isnull().sum().sum())


✅ Final missing values check: 300000


In [7]:
# Detailed missing values per column
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])


Timestamp    100000
Hour         100000
DayOfWeek    100000
dtype: int64


In [8]:
# Recommended final cleaning code
# Drop fully missing columns
df = df.drop(columns=['Timestamp', 'Hour', 'DayOfWeek'])

# Final missing value check
print("✅ Final missing values check after drop:", df.isnull().sum().sum())


✅ Final missing values check after drop: 0


In [10]:
# 🔖 Save cleaned train-test sets for modelling

# Save full cleaned dataset
df.to_csv('C:/Users/Administrator/Desktop/Saif/CIC_Modbus_Research/Final_Clean_Merged/combined_attack_benign_final_cleaned.csv', index=False)
print("✅ Full cleaned dataset saved.")

# If you have separate train/test splits already:
# Save train and test sets to your main project folder
X_train.to_csv('C:/Users/Administrator/Desktop/Saif/CIC_Modbus_Research/Final_Clean_Merged/train_data.csv', index=False)
X_test.to_csv('C:/Users/Administrator/Desktop/Saif/CIC_Modbus_Research/Final_Clean_Merged/test_data.csv', index=False)
print("✅ Train and test datasets saved.")



✅ Full cleaned dataset saved.
✅ Train and test datasets saved.


In [11]:
# Check train-test split label distributions match expected stratification
print("🔎 Train Label Distribution:\n", y_train.value_counts(normalize=True))
print("🔎 Test Label Distribution:\n", y_test.value_counts(normalize=True))


🔎 Train Label Distribution:
 Label
Benign    0.695477
Attack    0.304523
Name: proportion, dtype: float64
🔎 Test Label Distribution:
 Label
Benign    0.695459
Attack    0.304541
Name: proportion, dtype: float64
