In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [32]:
# Load datasets
data_1 = pd.read_csv('first-night-time.csv')
data_2 = pd.read_csv('midnight.csv')
data_3 = pd.read_csv('saturday-aft-1.csv')

# Display the first few rows of each dataset
print("1st-night-time Dataset:")
print(data_1.head())

print("\nmidnight Dataset:")
print(data_2.head())

print("\nsaturday aft 1 Dataset:")
print(data_3.head())

# Display the columns of each dataset
print("\n1st-night-time Dataset Columns:")
print(data_1.columns)

print("\nmidnight Dataset Columns:")
print(data_2.columns)

print("\nsaturday aft 1 Dataset Columns:")
print(data_3.columns)


1st-night-time Dataset:
   No.      Time          Source     Destination Protocol  Length  \
0    1  0.000000  103.12.198.162      39.62.1.38      UDP    1274   
1    2 -0.005975      39.62.1.38  142.250.181.54      TCP      78   
2    3 -0.005898      39.62.1.38  103.12.198.146      TCP      66   
3    4 -0.005898      39.62.1.38  103.12.198.146      TCP      66   
4    5 -0.005889      39.62.1.38  103.12.198.146      TCP      66   

                                                Info  
0                             443  >  45841 Len=1232  
1  2655  >  443 [ACK] Seq=1 Ack=1 Win=3708 Len=0 ...  
2  44538  >  443 [ACK] Seq=1 Ack=1 Win=1736 Len=0...  
3  44540  >  443 [ACK] Seq=1 Ack=1 Win=1734 Len=0...  
4  44540  >  443 [ACK] Seq=1 Ack=2761 Win=1736 Le...  

midnight Dataset:
   No.      Time          Source Destination Protocol  Length  \
0    1  0.000000    23.63.110.66  39.62.1.38      SSL    1498   
1    2  0.000000    23.63.110.66  39.62.1.38      TCP    1494   
2    3  0.000013 

In [3]:
# Combine datasets
data = pd.concat([data_1, data_2, data_3], ignore_index=True)

In [4]:
# Drop unnecessary columns
data = data.drop(columns=['No.', 'Info'])

In [5]:
# Feature matrix and target variable
X = data.drop(columns=['Length'])
y = data['Length']

In [6]:
# Preprocessing
# Identify categorical and numerical columns
categorical_features = ['Source', 'Destination', 'Protocol']
numerical_features = ['Time']

In [7]:
# Preprocessing for numerical data
numerical_transformer = StandardScaler()

In [8]:
# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [9]:
from sklearn.compose import ColumnTransformer

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [23]:
# Apply preprocessing and split data
X_preprocessed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

In [26]:
# Check class distribution after split
class_counts = np.bincount(y_train)
print("Class distribution in training data:", class_counts)

Class distribution in training data: [0 0 0 ... 0 0 1]


In [27]:
# Handle class imbalance
smote = SMOTE(random_state=42)

In [28]:
# Check if SMOTE is applicable
if len(np.unique(y_train)) > 1 and np.min(class_counts) > 1:
    k_neighbors = min(5, np.min(class_counts) - 1)
    if k_neighbors <= 1:
        print(f"Not enough samples for SMOTE with k_neighbors={k_neighbors}")
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        try:
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
            print("Training data shape after SMOTE:", X_train_resampled.shape)
        except ValueError as e:
            print("Error applying SMOTE:", e)
            X_train_resampled, y_train_resampled = X_train, y_train
else:
    print("SMOTE cannot be applied. Check data for class imbalance or size issues.")
    X_train_resampled, y_train_resampled = X_train, y_train

SMOTE cannot be applied. Check data for class imbalance or size issues.


In [33]:
# # Initialize and train the model with class weights balanced
# model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
# model.fit(X_train_resampled, y_train_resampled)

In [34]:
# Initialize and train the model with optimized parameters
model = RandomForestClassifier(
    n_estimators=50,  # Reduced number of trees
    max_depth=10,     # Limit depth of trees
    max_samples=0.8,  # Use 80% of samples for each tree
    random_state=42,
    class_weight='balanced'
)

In [35]:
try:
    model.fit(X_train_resampled, y_train_resampled)
except MemoryError as e:
    print("MemoryError while fitting model:", e)
    # Handle error appropriately
    model = None

In [45]:
def evaluate_model(model, X_test, y_test, X_train_resampled):
    # Use a subset of the test data if the dataset is very large
    sample_size = min(10000, X_test.shape[0])  
    X_test_subset = X_test[:sample_size]
    y_test_subset = y_test[:sample_size]

    try:
        # Predict on the test set
        y_pred = model.predict(X_test_subset)

        # Evaluate the model
        accuracy = accuracy_score(y_test_subset, y_pred)
        precision = precision_score(y_test_subset, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test_subset, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test_subset, y_pred, average='weighted', zero_division=0)

        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")

        # Anomaly Detection
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        iso_forest.fit(X_train_resampled)

        # Predict anomalies
        y_anomaly_pred = iso_forest.predict(X_test_subset)
        y_anomaly_pred = np.where(y_anomaly_pred == 1, 0, 1)  # Convert -1 to 1 for anomalies

        # Evaluate anomaly detection
        anomaly_accuracy = accuracy_score(y_test_subset, y_anomaly_pred)
        anomaly_precision = precision_score(y_test_subset, y_anomaly_pred, average='weighted', zero_division=0)
        anomaly_recall = recall_score(y_test_subset, y_anomaly_pred, average='weighted', zero_division=0)
        anomaly_f1 = f1_score(y_test_subset, y_anomaly_pred, average='weighted', zero_division=0)

        print(f"\nAnomaly Detection Accuracy: {anomaly_accuracy:.2f}")
        print(f"Anomaly Detection Precision: {anomaly_precision:.2f}")
        print(f"Anomaly Detection Recall: {anomaly_recall:.2f}")
        print(f"Anomaly Detection F1 Score: {anomaly_f1:.2f}")

    except MemoryError as e:
        print("MemoryError during prediction:", e)


In [46]:
if model:
    evaluate_model(model, X_test, y_test, X_train_resampled)

Accuracy: 0.03
Precision: 0.23
Recall: 0.03
F1 Score: 0.04

Anomaly Detection Accuracy: 0.00
Anomaly Detection Precision: 0.00
Anomaly Detection Recall: 0.00
Anomaly Detection F1 Score: 0.00
