# Selección de Características

Escoger las características mas relevantes de conjunto de datos a través de **Random Forests** para entrenar un modelo de manera eficiente y óptima.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score

## Funciones Auxiliares

In [2]:
# Construcción de una función que realice el particionado completo
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [3]:
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

## 1. Leer el conjunto

In [4]:
df = pd.read_csv('../datasets/TotalFeatures-ISCXFlowMeter.csv')

In [5]:
df

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.431138,...,0.0,-1,0.000000e+00,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.333333,...,0.0,-1,0.000000e+00,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.888889,...,0.0,-1,0.000000e+00,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.000000,...,0.0,-1,0.000000e+00,2,155136,31232,5,4,32,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631950,530,1,1,74,334,74,334,74,334,74.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
631951,50240627,23,24,4767,6107,52,52,533,855,207.260870,...,9842879.0,9964749,1.196806e+05,2,317952,107008,11,23,32,GeneralMalware
631952,35471450,1,2,52,104,52,52,52,52,52.000000,...,35300000.0,35290631,0.000000e+00,2,3904,88704,1,1,32,asware
631953,41713629,12,26,1821,18643,40,40,489,1390,151.750000,...,20200000.0,32711382,1.770000e+07,2,227456,2432,23,12,20,benign


## 2. Dividir el conjunto

In [6]:
train_set, val_set, test_set = train_val_test_split(df)

In [7]:
x_train, y_train = remove_labels(train_set, 'calss')
x_val, y_val = remove_labels(val_set, 'calss')
x_test, y_test = remove_labels(test_set, 'calss')

## 3. Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

model_rforest = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
model_rforest.fit(x_train, y_train)

In [9]:
y_pred = model_rforest.predict(x_val)

In [11]:
f_score = f1_score(y_pred, y_val, average='weighted')
print('Score: ', f_score)

Score:  0.9324043007314987


## 4. Relevancia de las caracteristicas 

In [12]:
model_rforest.feature_importances_

array([0.03096656, 0.00303719, 0.00440737, 0.02318232, 0.01184895,
       0.01721388, 0.00881173, 0.02199267, 0.01122589, 0.01910279,
       0.01229994, 0.00912599, 0.0049411 , 0.01864105, 0.00468261,
       0.01359503, 0.0060695 , 0.01755146, 0.00504174, 0.01740915,
       0.00478204, 0.00668029, 0.00337915, 0.00937514, 0.00572423,
       0.        , 0.        , 0.00268121, 0.00471322, 0.02948284,
       0.0175912 , 0.02737585, 0.0276842 , 0.02610625, 0.0159516 ,
       0.0247063 , 0.01454405, 0.02000791, 0.03888253, 0.03004006,
       0.00794144, 0.03300505, 0.00432689, 0.0041829 , 0.01156361,
       0.00794625, 0.        , 0.        , 0.        , 0.01207349,
       0.02251504, 0.01938611, 0.00347552, 0.00116829, 0.00072676,
       0.00094549, 0.00527031, 0.0106541 , 0.00290367, 0.00144508,
       0.00254706, 0.00234171, 0.00912673, 0.00249816, 0.00228634,
       0.00765582, 0.00907677, 0.01158292, 0.00196904, 0.0121832 ,
       0.00783499, 0.00994449, 0.00162465, 0.00188881, 0.14141

In [13]:
# Extraer las caracteristicas mas importantes
important_features = { name: score for name, score in zip(list(df), model_rforest.feature_importances_) }

In [14]:
important_features_sorted = pd.Series(important_features).sort_values(ascending=False)
important_features_sorted.head(20)

Init_Win_bytes_forward     0.141411
max_flowiat                0.038883
flow_fin                   0.033005
Init_Win_bytes_backward    0.031345
duration                   0.030967
mean_flowiat               0.030040
fPktsPerSecond             0.029483
flowBytesPerSecond         0.027684
flowPktsPerSecond          0.027376
min_flowpktl               0.026106
mean_flowpktl              0.024706
total_fpktl                0.023182
avgPacketSize              0.022515
max_fpktl                  0.021993
min_flowiat                0.020008
fAvgSegmentSize            0.019386
mean_fpktl                 0.019103
total_fiat                 0.018641
min_seg_size_forward       0.017701
bPktsPerSecond             0.017591
dtype: float64

## 5. Seleccionar las 10 mejores caracteristicas

In [15]:
columns = list(important_features_sorted.head(10).index)

In [16]:
x_train_reduced = x_train[columns].copy()
x_val_reduced = x_val[columns].copy()

In [17]:
x_train_reduced.head(10)

Unnamed: 0,Init_Win_bytes_forward,max_flowiat,flow_fin,Init_Win_bytes_backward,duration,mean_flowiat,fPktsPerSecond,flowBytesPerSecond,flowPktsPerSecond,min_flowpktl
508881,0,490,0,0,490,490.0,2040.816327,679591.8367,4081.632653,73
208326,0,-1,0,-1,0,0.0,0.0,0.0,0.0,422
107213,0,-1,0,-1,0,0.0,0.0,0.0,0.0,436
466726,0,23933,0,0,23933,23933.0,41.783312,21267.70568,83.566623,54
230085,0,-1,0,-1,0,0.0,0.0,0.0,0.0,422
472961,4194240,60224201,1,1145472,60365946,8623707.0,0.066263,22.993096,0.132525,52
482372,62912,212,1,-1,212,212.0,9433.962264,636792.4528,9433.962264,52
619993,107008,30839880,2,-1,30839880,30800000.0,0.064851,5.382641,0.064851,83
65344,0,-1,0,-1,0,0.0,0.0,0.0,0.0,436
46666,0,-1,0,-1,0,0.0,0.0,0.0,0.0,365


In [18]:
model_rforest = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
model_rforest.fit(x_train_reduced, y_train)

In [19]:
y_pred = model_rforest.predict(x_val_reduced)

In [20]:
f_score = f1_score(y_pred, y_val, average='weighted')
print('Score: ', f_score)

Score:  0.926788599012114


El rendimiento del modelo mejora significativamente entrenandose mucho mas rápido que antes y la puntución del mismo a penas se redujo un 1% aproximadamente