In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# ZIP ingestion (dataset compresso)
import zipfile

from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_score
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    accuracy_score,
    balanced_accuracy_score
)


In [4]:
# Base folder containing the CSV files.
# NOTE(review): this is a machine-specific absolute path; point it at the local
# copy of the dataset (ideally through a configurable data directory) before
# running the notebook on another machine.
base_dir = Path(
    r"C:\Users\nicde\OneDrive\Desktop\Università\Ancona\DS\Data-Science\datasets"
) / "MachineLearningCSV" / "MachineLearningCVE"

# Sorted list of every CSV file in the folder (sorting keeps the load order,
# and therefore the row order of the concatenated frame, deterministic).
csv_files = sorted(base_dir.glob("*.csv"))

print(f"Numero di file CSV trovati: {len(csv_files)}")

# Fail early with an explicit message: with no input files pd.concat would
# otherwise raise a generic "No objects to concatenate" ValueError that does
# not say which directory was searched.
if not csv_files:
    raise FileNotFoundError(f"Nessun file CSV trovato in: {base_dir}")

dfs = []

# Read the CSV files one at a time, reporting progress.
for i, file in enumerate(csv_files, 1):
    print(f"Caricamento file {i}/{len(csv_files)} → {file.name}")
    # low_memory=False makes pandas parse each file in a single pass instead
    # of in chunks, avoiding mixed-dtype columns.
    df = pd.read_csv(file, low_memory=False)
    dfs.append(df)

# Row-wise concatenation into a single frame with a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

# Quick sanity check: preview of the first rows (displayed as the cell output).
data.head()

Numero di file CSV trovati: 8
Caricamento file 1/8 → Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Caricamento file 2/8 → Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Caricamento file 3/8 → Friday-WorkingHours-Morning.pcap_ISCX.csv
Caricamento file 4/8 → Monday-WorkingHours.pcap_ISCX.csv
Caricamento file 5/8 → Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Caricamento file 6/8 → Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Caricamento file 7/8 → Tuesday-WorkingHours.pcap_ISCX.csv
Caricamento file 8/8 → Wednesday-workingHours.pcap_ISCX.csv


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
# Strip accidental whitespace from the column names.
data.columns = data.columns.str.strip()

# Syntactic cleanup of the target variable (Label), if present.
# Missing labels are kept as NaN: a plain astype(str) can turn them into the
# literal string "nan", which the dropna() further down would not remove and
# which would then show up as a spurious class.
if "Label" in data.columns:
    data["Label"] = (
        data["Label"].astype(str).str.strip().where(data["Label"].notna())
    )

# Identifier / temporal columns that are not useful for EDA and ML.
# Several spellings are listed for the same field; only those actually present
# are dropped below.
cols_to_drop = [
    "Flow ID",
    "Timestamp",
    "Src IP",
    "Dst IP",
    "Source IP",
    "Destination IP",
    "Src Port",
    "Dst Port",
    "Source Port",
    "Destination Port"
]

# Safe drop: remove only the columns that exist in the frame, so the cell can
# be re-run without raising a KeyError.
data.drop(columns=[c for c in cols_to_drop if c in data.columns],
          inplace=True)

# Every column except the target is treated as a feature.
feature_cols = [c for c in data.columns if c != "Label"]

# Convert the features to numeric: values that cannot be parsed become NaN.
data[feature_cols] = data[feature_cols].apply(
    pd.to_numeric, errors="coerce"
)

# Replace infinite values with NaN so they are handled together with the
# other missing values.
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# General information about the dataset.
data.info()

# Columns with the most NaN values. Printed explicitly: Jupyter only renders
# the last expression of a cell, so a bare expression here would be discarded.
print(data.isna().sum(axis=0).sort_values(ascending=False).head(20))

# Drop every observation that contains at least one NaN.
data = data.dropna(axis=0)

# Final NaN check (all counts are expected to be 0 after the dropna above).
print(data.isna().sum(axis=0).sort_values(ascending=False).head(10))

# Class distribution of the multi-class target (printed for the same reason
# as the NaN summaries above).
if "Label" in data.columns:
    print(data["Label"].value_counts())

# Preview of the cleaned dataset (displayed as the cell output).
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830743 entries, 0 to 2830742
Data columns (total 78 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Flow Duration                int64  
 1   Total Fwd Packets            int64  
 2   Total Backward Packets       int64  
 3   Total Length of Fwd Packets  int64  
 4   Total Length of Bwd Packets  int64  
 5   Fwd Packet Length Max        int64  
 6   Fwd Packet Length Min        int64  
 7   Fwd Packet Length Mean       float64
 8   Fwd Packet Length Std        float64
 9   Bwd Packet Length Max        int64  
 10  Bwd Packet Length Min        int64  
 11  Bwd Packet Length Mean       float64
 12  Bwd Packet Length Std        float64
 13  Flow Bytes/s                 float64
 14  Flow Packets/s               float64
 15  Flow IAT Mean                float64
 16  Flow IAT Std                 float64
 17  Flow IAT Max                 int64  
 18  Flow IAT Min                 int64  
 19  

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,109,1,1,6,6,6,6,6.0,0.0,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,52,1,1,6,6,6,6,6.0,0.0,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,34,1,1,6,6,6,6,6.0,0.0,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,3,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
