In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from minisom import MiniSom
import numpy as np
import time
from joblib import Parallel, delayed
import warnings

warnings.filterwarnings('ignore')

# Function to process a single file
def process_file(file, test_size=0.2, random_state=None):
    """Load one pre-processed CSV and return a scaled train/test split.

    The 'Label' column is turned into a binary target (0 for 'Benign',
    1 for anything else); every other column is used as a feature.

    Args:
        file: File name, appended to the module-level ``DATA_PATH``.
        test_size: Fraction of rows held out for testing.
        random_state: Optional seed forwarded to ``train_test_split`` so
            the split can be reproduced.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the feature
        matrices are standardised arrays, the targets are 0/1 arrays.
    """
    print(f"Processing file: {file}")  # Debug statement
    df = pd.read_csv(DATA_PATH + file)

    # Binary target: 0 for 'Benign', 1 for every other label value.
    target = (df['Label'] != 'Benign').astype(int).to_numpy()

    # Select the label by name rather than by position so it can never
    # end up inside the feature matrix if the column order changes.
    features = df.drop(columns='Label')

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only; fitting on the full
    # dataset would leak test-set statistics into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

# Function to classify using SOM
def classify(som, data, X_train, y_train):
    """Predict a label for each sample from the labels mapped to its winning neuron.

    Each neuron is assigned the most frequent training label among the
    training samples it wins. A sample whose winning neuron received no
    training samples gets the most frequent training label overall.

    Args:
        som: Object exposing ``labels_map(data, labels)`` and ``winner(sample)``.
        data: Iterable of samples to classify.
        X_train: Training samples used to build the neuron -> labels map.
        y_train: Labels matching ``X_train``.

    Returns:
        List with one predicted label per element of ``data``.

    Raises:
        ValueError: If the labels map is empty, so no default class exists.
    """
    from collections import Counter

    winmap = som.labels_map(X_train, y_train)
    if not winmap:
        raise ValueError("Cannot classify: the SOM labels map is empty (no training samples).")

    # Merge the per-neuron label counts explicitly instead of relying on
    # NumPy summing an object array of Counters.
    totals = Counter()
    for counts in winmap.values():
        totals.update(counts)
    default_class = totals.most_common(1)[0][0]

    # Resolve each neuron's majority label once, not once per sample.
    neuron_label = {
        position: counts.most_common(1)[0][0]
        for position, counts in winmap.items()
        if counts
    }

    return [neuron_label.get(som.winner(sample), default_class) for sample in data]

# Function to train SOM and generate classification report
def train_som(X_train, X_test, y_train, y_test, n_neurons=20, m_neurons=20,
              sigma=1.5, learning_rate=1.2, iterations=10000, random_seed=None):
    """Train a SOM and report its quality before and after training.

    Args:
        X_train: Training feature matrix (2-D; ``shape[1]`` is the input length).
        X_test: Test feature matrix.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.
        n_neurons: Map size passed to ``MiniSom`` as ``x``.
        m_neurons: Map size passed to ``MiniSom`` as ``y``.
        sigma: Neighbourhood spread passed to ``MiniSom``.
        learning_rate: Initial learning rate passed to ``MiniSom``.
        iterations: Number of training iterations.
        random_seed: Optional seed passed to ``MiniSom`` for reproducible runs.

    Returns:
        Dict with the classification report, topographic error and
        quantization error on the test split before and after training
        (``*_before`` / ``*_after`` keys) and ``train_time`` in seconds.
    """
    print(f"Training SOM with data shape: {X_train.shape}")  # Debug statement

    som = MiniSom(
        x=n_neurons,
        y=m_neurons,
        input_len=X_train.shape[1],
        sigma=sigma,
        learning_rate=learning_rate,
        random_seed=random_seed,
    )

    # Baseline with the freshly initialised (untrained) map.
    report_before, topo_before, quant_before = _evaluate_som(
        som, X_train, X_test, y_train, y_test
    )

    # perf_counter is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()
    som.train(X_train, iterations, verbose=True)
    train_time = time.perf_counter() - start_time

    report_after, topo_after, quant_after = _evaluate_som(
        som, X_train, X_test, y_train, y_test
    )

    return {
        "report_before": report_before,
        "topographic_error_before": topo_before,
        "quantization_error_before": quant_before,
        "report_after": report_after,
        "topographic_error_after": topo_after,
        "quantization_error_after": quant_after,
        "train_time": train_time,
    }


def _evaluate_som(som, X_train, X_test, y_train, y_test):
    """Return (classification report, topographic error, quantization error) on the test split."""
    y_pred = classify(som, X_test, X_train, y_train)
    return (
        classification_report(y_test, y_pred, digits=4),
        som.topographic_error(X_test),
        som.quantization_error(X_test),
    )

# Paths and files
DATA_PATH = "/users/mik2002/som/dataset-processed CIC-IDS-2018/"
FILES = [
    "processed_bot_dataset.csv",
    "processed_bruteforce_dataset.csv",
    "processed_Ddos_dataset.csv",
    "processed_dos_dataset.csv",
    "processed_infiltration_dataset.csv",
]

# Load, label, scale and split every dataset, one worker per file.
print("Starting file processing...")  # Debug statement
pool = Parallel(n_jobs=len(FILES))
results = pool(delayed(process_file)(file) for file in FILES)

# Train one SOM per dataset, keeping the metrics in the same order as FILES.
metrics = []
for X_train, X_test, y_train, y_test in results:
    metrics.append(train_som(X_train, X_test, y_train, y_test))

# Print the before/after report for each dataset.
for file, metric in zip(FILES, metrics):
    print(f"Metrics for {file}:")
    print("Before Training:")
    print(metric["report_before"])
    print(f"Topographic Error: {metric['topographic_error_before']:.4f}")
    print(f"Quantization Error: {metric['quantization_error_before']:.4f}")
    print("\nAfter Training:")
    print(metric["report_after"])
    print(f"Topographic Error: {metric['topographic_error_after']:.4f}")
    print(f"Quantization Error: {metric['quantization_error_after']:.4f}")
    print(f"Training Time: {metric['train_time']:.2f} seconds")
    print("-" * 50)


Starting file processing...


Processing file: processed_bot_dataset.csv
Processing file: processed_bruteforce_dataset.csv
Processing file: processed_Ddos_dataset.csv
Processing file: processed_dos_dataset.csv
Processing file: processed_infiltration_dataset.csv
Training SOM with data shape: (832446, 65)
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.5070144033587417
Training SOM with data shape: (656536, 65)
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.5350986097894594
Training SOM with data shape: (6329584, 65)
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.660924529155772
Training SOM with data shape: (1555168, 65)
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.6181821829702675
Training SOM with data shape: (747068, 65)
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.5740077118795391
Metrics for processed_bot_dataset.csv:
Before Training:
              precision    recall  f1-score   support

           0     0.9992    0.7388    0.84