In [84]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time
import psutil
import warnings
from joblib import Parallel, delayed


In [85]:
# Function to convert memory usage to appropriate units
def format_memory_usage(mem_usage):
    """Render a byte count as a human-readable string with two decimals.

    Values of at least 1024**3 bytes are shown in "GB"; everything smaller
    (including zero and negative values) is shown in "MB".
    """
    bytes_per_gb = 1024 ** 3
    bytes_per_mb = 1024 ** 2
    divisor, unit = (bytes_per_gb, "GB") if mem_usage >= bytes_per_gb else (bytes_per_mb, "MB")
    return f"{mem_usage / divisor:.2f} {unit}"


In [86]:

# Function to monitor memory usage during a function
def monitor_memory_usage(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and report how far the process RSS rose.

    The resident set size of the current process is read before the call,
    sampled periodically from a background thread while ``func`` runs, and
    read once more afterwards. Sampling during the call is what lets
    memory that is allocated and released again inside ``func`` show up;
    comparing only the before/after readings would miss it.

    Returns:
        tuple: ``(result, increase)`` where ``result`` is whatever ``func``
        returned and ``increase`` is the highest RSS reading observed minus
        the reading taken before the call. It is never negative.

    Any exception raised by ``func`` propagates after the sampler thread
    has been stopped.
    """
    import threading  # local import: only this helper needs it

    sample_interval = 0.01  # seconds between RSS samples
    process = psutil.Process()
    mem_usage_before = process.memory_info().rss
    peak = [mem_usage_before]  # one-element list so the sampler thread can update it
    stop = threading.Event()

    def _sample_rss():
        # Sample first, then wait, so even a very short call gets at least one reading.
        while True:
            peak[0] = max(peak[0], process.memory_info().rss)
            if stop.wait(sample_interval):
                return

    sampler = threading.Thread(target=_sample_rss, daemon=True)
    sampler.start()
    try:
        result = func(*args, **kwargs)
    finally:
        # Always stop the sampler, even when func raises.
        stop.set()
        sampler.join()

    mem_usage_after = process.memory_info().rss
    peak_mem_usage = max(peak[0], mem_usage_after)
    return result, peak_mem_usage - mem_usage_before

# Function for training and evaluation with memory monitoring
def train_and_evaluate(train_idx, test_idx, X, y, n_estimators):
    """Fit a random forest on one fold and evaluate it on the held-out rows.

    Args:
        train_idx: Index used to select the training rows from ``X`` and ``y``.
        test_idx: Index used to select the evaluation rows from ``X`` and ``y``.
        X: Feature container indexed as ``X[train_idx]`` / ``X[test_idx]``.
        y: Target container indexed the same way as ``X``.
        n_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.

    Returns:
        tuple: ``(report, training_time, training_mem_usage, evaluating_time,
        eval_mem_usage, total_mem_usage)``. ``report`` is the dict produced by
        ``classification_report(..., output_dict=True)``, the times are elapsed
        seconds, and ``total_mem_usage`` is the sum of the two memory figures
        returned by ``monitor_memory_usage``.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)

    # perf_counter() is monotonic and high-resolution, so the measured durations
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_train = time.perf_counter()
    _, training_mem_usage = monitor_memory_usage(model.fit, X_train, y_train)
    end_train = time.perf_counter()

    start_eval = time.perf_counter()
    y_pred, eval_mem_usage = monitor_memory_usage(model.predict, X_test)
    end_eval = time.perf_counter()

    # zero_division=0 reports 0 (instead of warning) for classes with no predictions.
    report = classification_report(y_test, y_pred, digits=4, output_dict=True, zero_division=0)

    training_time = end_train - start_train
    evaluating_time = end_eval - start_eval
    total_mem_usage = training_mem_usage + eval_mem_usage

    return report, training_time, training_mem_usage, evaluating_time, eval_mem_usage, total_mem_usage


In [87]:
# Import the required libraries
import pandas as pd

# Location of the processed dataset
DATA_PATH = "/users/mik2002/som/dataset-processed UNSW-NB15/"
DATA_FILE = "dataset-processed-UNSW-NB15.csv"
# Single source of truth for the CSV path, used for both reading and writing.
DATASET_CSV = DATA_PATH + DATA_FILE

# Load the combined, pre-processed dataset
df_dataset = pd.read_csv(DATASET_CSV)

# Drop the 'attack_cat' column if it is still present (it may already have been
# removed from the processed dataset)
dropped_attack_cat = 'attack_cat' in df_dataset.columns
if dropped_attack_cat:
    df_dataset = df_dataset.drop(columns=['attack_cat'])

# Display label distribution
print("Distribution:")
display(df_dataset["label"].value_counts())


# Save the final encoded dataset back to the file it was loaded from, but only
# when a column was actually removed: rewriting an unchanged input on every run
# is unnecessary and risks corrupting the dataset if the write is interrupted.
if dropped_attack_cat:
    df_dataset.to_csv(DATASET_CSV, index=False)

Distribution:


label
1    164673
0     93000
Name: count, dtype: int64

In [92]:
# Parameters for Random Forest
n_estimators = 100  # forwarded as n_estimators to RandomForestClassifier for every fold
sampling_fractions = [1]  # per-class fractions given to DataFrame.sample(frac=...); 1 = use every row

In [93]:
results = []


def parallel_train_and_evaluate(train_idx, test_idx, data, target, n_estimators):
    """Entry point handed to joblib: run one fold through train_and_evaluate.

    Returns the same six values, in the same order, as train_and_evaluate:
    report, training time, training memory, evaluation time, evaluation memory
    and total memory.
    """
    fold_outcome = train_and_evaluate(train_idx, test_idx, data, target, n_estimators)
    report, fit_seconds, fit_memory, predict_seconds, predict_memory, combined_memory = fold_outcome
    return report, fit_seconds, fit_memory, predict_seconds, predict_memory, combined_memory

def display_report_and_times(all_reports, all_training_times, all_training_mem_usages, all_evaluating_times, all_eval_mem_usages, all_total_mem_usages):
    """Print each fold's classification report with its timing and memory figures.

    The six arguments are parallel sequences with one entry per fold; they are
    zipped together, so output stops at the shortest one.

    Args:
        all_reports: Per-fold report dicts. Values that are dicts are printed as
            a "Label <key>" section; any other value is printed as a single
            number (for example the overall accuracy).
        all_training_times: Per-fold training durations in seconds.
        all_training_mem_usages: Per-fold training memory figures, rendered
            with ``format_memory_usage``.
        all_evaluating_times: Per-fold evaluation durations in seconds.
        all_eval_mem_usages: Per-fold evaluation memory figures.
        all_total_mem_usages: Per-fold total memory figures.
    """
    print("\nAll Reports and Times for this Model:")

    per_fold = zip(all_reports, all_training_times, all_training_mem_usages, all_evaluating_times, all_eval_mem_usages, all_total_mem_usages)
    for i, (report, train_time, train_mem_usage, eval_time, eval_mem_usage, total_mem_usage) in enumerate(per_fold):
        print(f"\nReport for Fold {i+1}:")
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                print(f"\nLabel {label}:")
                for metric_name, metric_value in metrics.items():
                    if metric_name == "support" and float(metric_value).is_integer():
                        # Support is a sample count: show it as an integer rather
                        # than with four meaningless decimals ("18499.0000").
                        print(f"  {metric_name.capitalize()}: {int(metric_value)}")
                    else:
                        print(f"  {metric_name.capitalize()}: {metric_value:.4f}")
            else:
                print(f"\n{label.capitalize()}: {metrics:.4f}")
        print(f"\nTraining Time: {train_time:.4f} seconds")
        print(f"Training Memory Usage: {format_memory_usage(train_mem_usage)}")
        print(f"Evaluating Time: {eval_time:.4f} seconds")
        print(f"Evaluating Memory Usage: {format_memory_usage(eval_mem_usage)}")
        print(f"Total Memory Usage: {format_memory_usage(total_mem_usage)}")

# KFold initialization
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Loop over different sampling fractions
for frac in sampling_fractions:
    print(f"Evaluating for sampling fraction: {frac}")

    # Stratified random sampling: draw the same fraction from every class so the
    # class proportions are maintained. Iterating over the groups and
    # concatenating keeps the "label" column in the result without relying on
    # the deprecated groupby.apply behaviour of operating on the grouping
    # column, so no warning has to be silenced.
    sampled_df = pd.concat(
        [group.sample(frac=frac, random_state=42) for _, group in df_dataset.groupby("label")]
    )

    # Select the target by name rather than by position so the split stays
    # correct regardless of where "label" sits among the columns.
    # NOTE(review): the scaler is fitted on all sampled rows before the folds
    # are split, so held-out rows contribute to the min/max used for scaling.
    scaler = MinMaxScaler()
    data = scaler.fit_transform(sampled_df.drop(columns="label"))
    target = sampled_df["label"].values

    # Split dataset with K-Fold and parallelism (one job per fold)
    parallel_results = Parallel(n_jobs=-1)(
        delayed(parallel_train_and_evaluate)(train_idx, test_idx, data, target, n_estimators)
        for train_idx, test_idx in kf.split(data)
    )

    # Transpose the list of per-fold tuples into one tuple per measured quantity
    reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages = zip(*parallel_results)

    # Aggregate metrics from reports: collect, per class / average row, the list
    # of per-fold values for every metric except the sample count ("support").
    # Scalar entries such as "accuracy" are skipped by the key filter.
    class_metrics = {}
    for report in reports:
        for key, metrics in report.items():
            if key.isdigit() or key in ['macro avg', 'weighted avg']:
                per_key = class_metrics.setdefault(key, {})
                for metric, value in metrics.items():
                    if metric != 'support':
                        per_key.setdefault(metric, []).append(value)

    # Calculate averages across folds
    averages = {key: {metric: np.mean(values) for metric, values in metrics.items()} for key, metrics in class_metrics.items()}

    # Calculate average training and evaluation times and memory usage
    avg_training_time = np.mean(training_times)
    avg_training_mem_usage = np.mean(training_mem_usages)
    avg_evaluating_time = np.mean(evaluating_times)
    avg_eval_mem_usage = np.mean(eval_mem_usages)
    avg_total_mem_usage = np.mean(total_mem_usages)

    # Keep both the fold averages and the raw per-fold values for later display
    results.append((frac, averages, avg_training_time, avg_training_mem_usage, avg_evaluating_time, avg_eval_mem_usage, avg_total_mem_usage, reports, training_times, training_mem_usages, evaluating_times, eval_mem_usages, total_mem_usages))

# Print, for every sampling fraction, the fold-averaged metrics followed by the
# per-fold detail.
print("\nAverage Metrics Across All Folds:")
for (
    frac,
    metrics,
    avg_training_time,
    avg_training_mem_usage,
    avg_evaluating_time,
    avg_eval_mem_usage,
    avg_total_mem_usage,
    all_reports,
    all_training_times,
    all_training_mem_usages,
    all_evaluating_times,
    all_eval_mem_usages,
    all_total_mem_usages,
) in results:
    print("****************************")
    print(f"\nSampling Fraction: {frac}")

    # Averaged precision / recall / f1 per class and per average row
    for class_id, averaged_scores in metrics.items():
        print(f"\nClass {class_id}:")
        for metric, value in averaged_scores.items():
            print(f"{metric}: {value:.4f}")

    # Average durations are split into whole minutes and remaining seconds
    train_minutes, train_seconds = divmod(avg_training_time, 60)
    eval_minutes, eval_seconds = divmod(avg_evaluating_time, 60)

    print(f"Average Training Time: {int(train_minutes)} minutes and {train_seconds:.2f} seconds")
    print(f"Average Training Memory Usage: {format_memory_usage(avg_training_mem_usage)}")
    print(f"Average Evaluating Time: {int(eval_minutes)} minutes and {eval_seconds:.2f} seconds")
    print(f"Average Evaluating Memory Usage: {format_memory_usage(avg_eval_mem_usage)}")
    print(f"Average Total Memory Usage: {format_memory_usage(avg_total_mem_usage)}")
    print("---------------------------------------")

    # Per-fold breakdown for this sampling fraction
    display_report_and_times(
        all_reports,
        all_training_times,
        all_training_mem_usages,
        all_evaluating_times,
        all_eval_mem_usages,
        all_total_mem_usages,
    )


Evaluating for sampling fraction: 1

Average Metrics Across All Folds:
****************************

Sampling Fraction: 1

Class 0:
precision: 0.6965
recall: 0.8042
f1-score: 0.7465

Class 1:
precision: 0.8788
recall: 0.8021
f1-score: 0.8387

Class macro avg:
precision: 0.7877
recall: 0.8031
f1-score: 0.7926

Class weighted avg:
precision: 0.8130
recall: 0.8028
f1-score: 0.8054
Average Training Time: 0 minutes and 6.10 seconds
Average Training Memory Usage: 12.54 MB
Average Evaluating Time: 0 minutes and 0.16 seconds
Average Evaluating Memory Usage: 1.81 MB
Average Total Memory Usage: 14.35 MB
---------------------------------------

All Reports and Times for this Model:

Report for Fold 1:

Label 0:
  Precision: 0.6945
  Recall: 0.8047
  F1-score: 0.7455
  Support: 18499.0000

Label 1:
  Precision: 0.8800
  Recall: 0.8017
  F1-score: 0.8390
  Support: 33036.0000

Accuracy: 0.8028

Label macro avg:
  Precision: 0.7872
  Recall: 0.8032
  F1-score: 0.7923
  Support: 51535.0000

Label wei