In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

## Funciones auxiliares

In [4]:
# Construcción de una función que realice el particionado completo
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

## Lectura del conjunto de datos

In [11]:
df = pd.read_csv('datasets/TotalFeatures-ISCXFlowMeter.csv')

In [13]:
df.head()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.431138,...,0.0,-1,0.0,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.0,...,0.0,-1,0.0,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.333333,...,0.0,-1,0.0,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.888889,...,0.0,-1,0.0,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.0,...,0.0,-1,0.0,2,155136,31232,5,4,32,benign


In [15]:
df.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,...,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0
mean,21952450.0,6.728514,10.431934,954.0172,12060.42,141.475727,44.357688,263.675901,183.248084,174.959706,...,19973270.0,20312280.0,20752380.0,466387.5,2.360896,962079.6,310451.9,9.733144,6.72471,19.965713
std,190057800.0,174.161354,349.424019,82350.4,482471.6,157.68088,89.099554,289.644383,371.863224,162.024811,...,189798600.0,189790200.0,189972100.0,6199704.0,3.04181,1705655.0,664795.6,347.877923,174.13813,14.914261
min,-18.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,2.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,69.0,0.0,52.0,-1.0,52.0,-1.0,52.0,...,-1.0,0.0,-1.0,0.0,2.0,0.0,-1.0,0.0,1.0,0.0
50%,24450.0,1.0,0.0,184.0,0.0,52.0,-1.0,83.0,-1.0,83.0,...,-1.0,0.0,-1.0,0.0,2.0,87616.0,-1.0,0.0,1.0,32.0
75%,1759751.0,3.0,1.0,427.0,167.0,108.0,52.0,421.0,115.0,356.0,...,1013498.0,1291379.0,1306116.0,0.0,2.0,304640.0,90496.0,1.0,3.0,32.0
max,44310760000.0,48255.0,74768.0,40496440.0,103922200.0,1390.0,1390.0,1500.0,1390.0,1390.0,...,44310720000.0,44300000000.0,44310720000.0,847000000.0,2269.0,4194240.0,4194240.0,74524.0,48255.0,44.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631955 entries, 0 to 631954
Data columns (total 80 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration                 631955 non-null  int64  
 1   total_fpackets           631955 non-null  int64  
 2   total_bpackets           631955 non-null  int64  
 3   total_fpktl              631955 non-null  int64  
 4   total_bpktl              631955 non-null  int64  
 5   min_fpktl                631955 non-null  int64  
 6   min_bpktl                631955 non-null  int64  
 7   max_fpktl                631955 non-null  int64  
 8   max_bpktl                631955 non-null  int64  
 9   mean_fpktl               631955 non-null  float64
 10  mean_bpktl               631955 non-null  float64
 11  std_fpktl                631955 non-null  float64
 12  std_bpktl                631955 non-null  float64
 13  total_fiat               631955 non-null  int64  
 14  tota

In [21]:
df["calss"].value_counts()

calss
benign            471597
asware            155613
GeneralMalware      4745
Name: count, dtype: int64

## División conjunto de datos

In [24]:
train_set, val_set, test_set = train_val_test_split(df)

In [26]:
X_train, y_train = remove_labels(train_set, "calss")
X_val, y_val = remove_labels(val_set, "calss")
X_test, y_test = remove_labels(test_set, "calss")

## Random forests

In [31]:
clf_rnd = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1)
clf_rnd.fit(X_train, y_train)

In [33]:
y_pred = clf_rnd.predict(X_val)
print("f1 score:", f1_score(y_pred, y_val, average="weighted"))

f1 score: 0.9329474731171657


## Selección del modelo

In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {"n_estimators": [100, 500, 1000], "max_leaf_nodes": [16, 24, 36]},
    {"bootstrap": [False], "n_estimators": [100, 500], "max_features": [2, 3, 4]},
]

rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42)

grid_search = GridSearchCV(rnd_clf, param_grid, cv=5,
                          scoring="f1_weighted", return_train_score=True)
grid_search.fit(X_train, y_train)

In [40]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 4, 'n_estimators': 500}

In [42]:
grid_search.best_estimator_

In [46]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Parámetros:", params)

F1 score: 0.7923799683849088 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 100}
F1 score: 0.7925780047375502 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 500}
F1 score: 0.7926827876796985 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 1000}
F1 score: 0.8055197663651619 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 100}
F1 score: 0.805532754533511 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 500}
F1 score: 0.806128393074076 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 1000}
F1 score: 0.8162156111668564 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 100}
F1 score: 0.8168965814969431 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 500}
F1 score: 0.8167781554292424 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 1000}
F1 score: 0.9209855440608206 - Parámetros: {'bootstrap': False, 'max_features': 2, 'n_estimators': 100}
F1 score: 0.9213535726827832 - Parámetros: {'bootstrap': False, 'max_features': 2, 'n_estimators': 500}
F1 score

In [56]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    "n_estimators": randint(low=1, high=200),
    "max_depth": randint(low=9, high=50),
}

rnd_clf = RandomForestClassifier(n_jobs=-1)

rnd_search = RandomizedSearchCV(rnd_clf, param_distributions=param_distribs,
                             n_iter=5, cv=2, scoring="f1_weighted")
rnd_search.fit(X_train, y_train)

In [58]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Parámetros:", params)

F1 score: 0.927334870171654 - Parámetros: {'max_depth': 28, 'n_estimators': 162}
F1 score: 0.890798471969239 - Parámetros: {'max_depth': 11, 'n_estimators': 160}
F1 score: 0.9225038850193543 - Parámetros: {'max_depth': 44, 'n_estimators': 12}
F1 score: 0.9116927804101504 - Parámetros: {'max_depth': 14, 'n_estimators': 129}
F1 score: 0.9205354450024394 - Parámetros: {'max_depth': 17, 'n_estimators': 102}


## Modelo final

In [61]:
rnd_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 162,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [63]:
# seleccionar el mejor modelo
clf_rnd = rnd_search.best_estimator_

In [65]:
# predecimos
y_train_pred = clf_rnd.predict(X_train)

In [69]:
print("F1 score conjunto de entrenamiento:", f1_score(y_train_pred, y_train, average="weighted"))

F1 score conjunto de entrenamiento: 0.9781345201519323


In [71]:
y_val_pred = clf_rnd.predict(X_val)
print("F1 score conjunto de entrenamiento:", f1_score(y_val_pred, y_val, average="weighted"))

F1 score conjunto de entrenamiento: 0.9342060834233193
