In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score

## Funciones auxiliares

In [4]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [6]:
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

In [8]:
def evaluate_result(y_pred, y, y_prep_pred, y_prep, metric):
    print(metric.__name__, "WITHOUT preparation:", metric(y_pred, y, average='weighted'))
    print(metric.__name__, "WITH preparation:", metric(y_prep_pred, y_prep, average='weighted'))

## Lectura de los datos

In [11]:
df = pd.read_csv('datasets/TotalFeatures-ISCXFlowMeter.csv')

## Visualización

In [14]:
df

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.431138,...,0.0,-1,0.000000e+00,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.333333,...,0.0,-1,0.000000e+00,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.888889,...,0.0,-1,0.000000e+00,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.000000,...,0.0,-1,0.000000e+00,2,155136,31232,5,4,32,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631950,530,1,1,74,334,74,334,74,334,74.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
631951,50240627,23,24,4767,6107,52,52,533,855,207.260870,...,9842879.0,9964749,1.196806e+05,2,317952,107008,11,23,32,GeneralMalware
631952,35471450,1,2,52,104,52,52,52,52,52.000000,...,35300000.0,35290631,0.000000e+00,2,3904,88704,1,1,32,asware
631953,41713629,12,26,1821,18643,40,40,489,1390,151.750000,...,20200000.0,32711382,1.770000e+07,2,227456,2432,23,12,20,benign


In [16]:
df.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,...,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0
mean,21952450.0,6.728514,10.431934,954.0172,12060.42,141.475727,44.357688,263.675901,183.248084,174.959706,...,19973270.0,20312280.0,20752380.0,466387.5,2.360896,962079.6,310451.9,9.733144,6.72471,19.965713
std,190057800.0,174.161354,349.424019,82350.4,482471.6,157.68088,89.099554,289.644383,371.863224,162.024811,...,189798600.0,189790200.0,189972100.0,6199704.0,3.04181,1705655.0,664795.6,347.877923,174.13813,14.914261
min,-18.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,2.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,69.0,0.0,52.0,-1.0,52.0,-1.0,52.0,...,-1.0,0.0,-1.0,0.0,2.0,0.0,-1.0,0.0,1.0,0.0
50%,24450.0,1.0,0.0,184.0,0.0,52.0,-1.0,83.0,-1.0,83.0,...,-1.0,0.0,-1.0,0.0,2.0,87616.0,-1.0,0.0,1.0,32.0
75%,1759751.0,3.0,1.0,427.0,167.0,108.0,52.0,421.0,115.0,356.0,...,1013498.0,1291379.0,1306116.0,0.0,2.0,304640.0,90496.0,1.0,3.0,32.0
max,44310760000.0,48255.0,74768.0,40496440.0,103922200.0,1390.0,1390.0,1500.0,1390.0,1390.0,...,44310720000.0,44300000000.0,44310720000.0,847000000.0,2269.0,4194240.0,4194240.0,74524.0,48255.0,44.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631955 entries, 0 to 631954
Data columns (total 80 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration                 631955 non-null  int64  
 1   total_fpackets           631955 non-null  int64  
 2   total_bpackets           631955 non-null  int64  
 3   total_fpktl              631955 non-null  int64  
 4   total_bpktl              631955 non-null  int64  
 5   min_fpktl                631955 non-null  int64  
 6   min_bpktl                631955 non-null  int64  
 7   max_fpktl                631955 non-null  int64  
 8   max_bpktl                631955 non-null  int64  
 9   mean_fpktl               631955 non-null  float64
 10  mean_bpktl               631955 non-null  float64
 11  std_fpktl                631955 non-null  float64
 12  std_bpktl                631955 non-null  float64
 13  total_fiat               631955 non-null  int64  
 14  tota

In [22]:
print("Longitud del conjunto de datos:", len(df))
print("Numero de caracteristicas:", len(df.columns))

Longitud del conjunto de datos: 631955
Numero de caracteristicas: 80


In [24]:
df["calss"].value_counts()

calss
benign            471597
asware            155613
GeneralMalware      4745
Name: count, dtype: int64

### Correlaciones

In [27]:
X = df.copy()
X["calss"] = X["calss"].factorize()[0]

In [29]:
corr_matrix = X.corr()
corr_matrix["calss"].sort_values(ascending=False)

calss                     1.000000
flow_fin                  0.286175
min_seg_size_forward      0.258352
Init_Win_bytes_forward    0.129425
std_fpktl                 0.123758
                            ...   
furg_cnt                       NaN
burg_cnt                       NaN
flow_urg                       NaN
flow_cwr                       NaN
flow_ece                       NaN
Name: calss, Length: 80, dtype: float64

In [31]:
X.corr()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
duration,1.000000,0.004837,0.004011,0.001673,0.003518,-0.064100,-0.027231,0.008761,0.042925,-0.043746,...,0.998901,0.999458,0.047582,0.016532,0.027610,0.029712,0.003785,0.004838,0.082955,0.067066
total_fpackets,0.004837,1.000000,0.924622,0.425756,0.904007,-0.018958,0.005252,0.024685,0.086255,-0.007910,...,0.001614,0.002267,0.017229,0.016089,0.050201,0.059224,0.902713,0.999866,0.018198,0.018377
total_bpackets,0.004011,0.924622,1.000000,0.156780,0.997268,-0.017667,0.006912,0.018170,0.086886,-0.016104,...,0.000922,0.001617,0.016230,-0.000493,0.048190,0.058435,0.997580,0.924746,0.015124,0.019430
total_fpktl,0.001673,0.425756,0.156780,1.000000,0.090082,-0.003099,0.000803,0.021278,0.022088,0.022409,...,0.000335,0.000609,0.009896,0.001657,0.013283,0.015991,0.088422,0.425789,0.005477,0.000679
total_bpktl,0.003518,0.904007,0.997268,0.090082,1.000000,-0.014926,0.005966,0.012560,0.079905,-0.017328,...,0.000812,0.001452,0.014336,-0.000293,0.043571,0.053134,0.999616,0.904129,0.012139,0.019838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Init_Win_bytes_backward,0.029712,0.059224,0.058435,0.015991,0.053134,-0.268444,0.038319,0.429893,0.593143,-0.030004,...,0.026959,0.029512,0.097316,-0.052507,0.811204,1.000000,0.056761,0.059242,0.333701,0.069405
RRT_samples_clnt,0.003785,0.902713,0.997580,0.088422,0.999616,-0.016659,0.006156,0.015727,0.084280,-0.017595,...,0.000893,0.001560,0.015200,-0.000437,0.046784,0.056761,1.000000,0.902834,0.014299,0.019679
Act_data_pkt_forward,0.004838,0.999866,0.924746,0.425789,0.904129,-0.018947,0.005264,0.024705,0.086278,-0.007893,...,0.001617,0.002269,0.017233,0.000734,0.050220,0.059242,0.902834,1.000000,0.018229,0.018391
min_seg_size_forward,0.082955,0.018198,0.015124,0.005477,0.012139,-0.686154,-0.189824,-0.074763,0.217989,-0.524024,...,0.077943,0.079324,0.048803,0.052177,0.394743,0.333701,0.014299,0.018229,1.000000,0.258352


## Division conjunto de datos

In [34]:
train_set, val_set, test_set = train_val_test_split(df)

In [36]:
X_train, y_train = remove_labels(train_set, 'calss')
X_val, y_val = remove_labels(val_set, 'calss')
X_test, y_test = remove_labels(test_set, 'calss')

## Escalado del conjunto 

In [39]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [41]:
X_test_scaled = scaler.transform(X_test)

In [43]:
X_val_scaled = scaler.transform(X_val)

In [47]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_train_scaled.head(10)

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
508881,-0.013646,0.0,1.0,-0.310056,1.556886,0.375,4.924528,-0.0271,2.25,-0.032895,...,0.0,0.0,0.0,0.0,0.0,-0.286342,1.1e-05,0.0,0.0,-1.0
208326,-0.013926,0.0,0.0,0.664804,0.0,6.607143,0.0,0.918699,0.0,1.115132,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0
107213,-0.013926,0.0,0.0,0.703911,0.0,6.857143,0.0,0.95664,0.0,1.161184,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0
466726,-0.000273,0.0,1.0,-0.363128,2.724551,0.035714,8.603774,-0.078591,3.931034,-0.095395,...,0.0,0.0,0.0,0.0,0.0,-0.286342,1.1e-05,0.0,0.0,-1.0
230085,-0.013926,0.0,0.0,0.664804,0.0,6.607143,0.0,0.918699,0.0,1.115132,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0
472961,34.421927,1.5,4.0,1.558659,3.868263,0.0,1.0,1.341463,4.163793,0.337171,...,59.657102,48.116772,47.580946,0.0,0.0,13.421042,12.65758,3.0,1.5,0.0
482372,-0.013805,0.5,0.0,-0.136872,0.0,0.0,0.0,0.0,0.0,-0.050987,...,0.0,0.0,0.0,0.0,1.0,-0.080736,0.0,0.0,0.5,0.0
619993,17.578734,0.5,0.0,-0.050279,0.0,0.553571,0.0,0.0,0.0,0.0,...,30.549478,24.617883,24.365465,0.0,1.0,0.063376,0.0,0.0,0.5,0.0
65344,-0.013926,0.0,0.0,0.703911,0.0,6.857143,0.0,0.95664,0.0,1.161184,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0
46666,-0.013926,0.0,0.0,0.505587,0.0,5.589286,0.0,0.764228,0.0,0.927632,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0


In [49]:
X_train_scaled.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,...,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0,379173.0
mean,12.543409,2.927643,10.762678,1.947184,75.104138,1.600682,0.855241,0.491724,1.591823,0.303639,...,19.838596,16.279309,16.441899,468661.8,0.362658,2.861506,3.435883,10.087709,2.924548,-0.376798
std,116.165117,92.319402,370.875546,185.549801,3071.968461,2.817301,1.676098,0.7865,3.210186,0.533388,...,201.477623,162.562856,160.822696,6188642.0,3.836035,5.577212,7.354164,369.660712,92.301146,0.466297
min,-0.013936,-0.5,0.0,-0.513966,0.0,-0.946429,0.0,-0.227642,0.0,-0.273026,...,0.0,0.0,0.0,0.0,0.0,-0.286345,0.0,0.0,-0.5,-1.0
25%,-0.013926,0.0,0.0,-0.321229,0.0,0.0,0.0,-0.084011,0.0,-0.101974,...,0.0,0.0,0.0,0.0,0.0,-0.286342,0.0,0.0,0.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.986074,1.0,1.0,0.678771,1.0,1.0,1.0,0.915989,1.0,0.898026,...,1.0,1.0,1.0,0.0,0.0,0.713658,1.0,1.0,1.0,0.0
max,25277.128872,24127.0,74768.0,113118.041899,622288.473054,23.892857,26.245283,3.840108,11.991379,4.299342,...,43893.471528,35408.189283,35008.286082,567000000.0,2267.0,13.421042,46.346741,74524.0,24127.0,0.375


## Decision forest

In [52]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train, y_train)

In [54]:
y_train_pred = clf_tree.predict(X_train)

In [56]:
print("F1 score para el entrenamiento:", f1_score(y_train_pred, y_train, average="weighted"))

F1 score para el entrenamiento: 0.9812432382517775


In [58]:
# ahora para el conjunto de validacion
y_val_pred = clf_tree.predict(X_val)

In [60]:
print("F1 score para la validación:", f1_score(y_val_pred, y_val, average="weighted"))

F1 score para la validación: 0.9304198818652374


## Random forest

In [63]:
from sklearn.ensemble import RandomForestClassifier

clf_rnd = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train, y_train)

In [65]:
# entrenado en el conjunto escalado
clf_rnd_scaled = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_scaled.fit(X_train_scaled, y_train)

In [69]:
# predecimos para el entrenamiento
y_train_pred = clf_rnd.predict(X_train)
y_train_pred_scaled = clf_rnd_scaled.predict(X_train_scaled)

In [73]:
# resultados escalados y sin escalar
evaluate_result(y_train_pred, y_train, y_train_pred_scaled, y_train, f1_score)

f1_score WITHOUT preparation: 0.9812539406589775
f1_score WITH preparation: 0.9811864245262409


In [75]:
# ahora para la validacion
y_val_pred = clf_rnd.predict(X_val)
y_val_pred_scaled = clf_rnd_scaled.predict(X_val_scaled)



In [77]:
evaluate_result(y_val_pred, y_val, y_val_pred_scaled, y_val, f1_score)

f1_score WITHOUT preparation: 0.9329474731171657
f1_score WITH preparation: 0.9324559680173685


## Regression forest

In [80]:
# se aplica el mismo proceso pero se usa 
from sklearn.ensemble import RandomForestRegressor