# Selección de características mediante filtrado

## 0. Imports y carga de datos

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

In [2]:
file_path = "../datasets/"
# Both CSV files are read with the same options: ';' as field separator,
# '.' as decimal mark and the first column as the row index.
csv_opts = dict(sep=';', decimal='.', index_col=0)

# Feature table.
file_name = "setX_wdbc.csv"
X_init = pd.read_csv(file_path + file_name, **csv_opts)

# Target table.
file_name = "setY_wdbc.csv"
Y_init = pd.read_csv(file_path + file_name, **csv_opts)

#------------------------------------
# Preview the first rows of the feature table.
X_init.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dim1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dim3
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 1. Filtrado por la varianza

### Escalado de las características al intervalo unidad

In [3]:
# Rescale every feature to the unit interval [0, 1]. set_output(transform="pandas")
# makes the transformer return a DataFrame instead of a NumPy array.
unit_scaler = MinMaxScaler().set_output(transform="pandas")
X_scl = unit_scaler.fit_transform(X_init)
#------------------------------------------
# Preview the scaled features (a cell only renders its last expression).
X_scl.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dim1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dim3
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


### Aplicar umbral

Podemos aplicar el filtrado por umbral de la varianza a dos conjuntos de datos: <code>X_init</code> y <code>X_scl</code>.

La diferencia es que en X_init hay columnas de diferentes escalas, mientras que en X_scl todas las columnas están limitadas al intervalo $[0,1]$.

Usamos el método <code>sklearn.preprocessing.MinMaxScaler</code> para escalar las características y <code>sklearn.feature_selection.VarianceThreshold</code> para seleccionar las características.

En concreto, <code>VarianceThreshold</code> eliminará del dataframe todas aquellas columnas que no superen <code>var_th</code>, el umbral determinado para la varianza.

In [4]:
var_th = 0.01
choice = 'scl' # 'scl' or 'init'
feat_selector = VarianceThreshold(var_th).set_output(transform='pandas')

# Map each accepted value of `choice` to the frame it designates.
candidates = {'init': X_init, 'scl': X_scl}
X_chosen = candidates.get(choice)

# An unknown choice is reported but does not stop the notebook.
flag_error = X_chosen is None

if flag_error:
    print('--- error: Incorrect choice ! ---')
else:
    # Fit the selector on the chosen frame and keep only the surviving columns.
    X_sel = feat_selector.fit(X_chosen).transform(X_chosen)

    #----------------------
    # Report how many columns were dropped and which ones remain.
    kept_features = X_sel.columns.to_list()
    n_removed = X_init.columns.shape[0] - len(kept_features)
    print('%d features have been removed'  % n_removed)
    print('%d features have been selected:'% len(kept_features))
    print(kept_features)

4 features have been removed
26 features have been selected:
['radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dim1', 'radius2', 'texture2', 'smoothness2', 'compactness2', 'concave_points2', 'symmetry2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dim3']


## 2. Filtrado por correlación

In [5]:
# Encode the class labels as numbers ('M' -> -1, 'B' -> 1) so the target can
# take part in the correlation matrix.
# Series.map is used instead of DataFrame.replace: replace() on an object
# column relies on implicit downcasting to int, which pandas >= 2.2 deprecates
# (FutureWarning). With map(), any label other than 'M'/'B' becomes NaN
# instead of silently staying a string.
label_codes = {'M': -1, 'B': 1}
Y_num = Y_init.apply(lambda labels: labels.map(label_codes))

# Append the encoded target as the last column of the feature table.
X = pd.concat((X_init, Y_num), axis=1)
X.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dim1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dim3,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,-1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,-1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,-1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,-1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,-1


In [6]:
# Pairwise Pearson correlation between every pair of columns of X.
mat_R = X.corr(method='pearson')

In [7]:
# Absolute-correlation threshold at or above which a column is flagged.
R_th = .97

# Keep only the strict upper triangle of |R| (k=1 zeroes the diagonal and
# everything below it), so each pair is considered once and self-correlations
# are ignored. This works for any matrix size, not just 31 columns.
aux = np.abs(np.triu(mat_R.values, k=1))

# For every column, the row holding its largest |correlation| with any
# earlier column.
ind_row = np.argmax(aux, axis=0)

aboveTh_list = []   # (row, column) index pairs whose |correlation| >= R_th
removed_list = []   # column indices flagged for removal
# NOTE(review): mat_R is computed from X, which also carries the target
# column; if the target were ever flagged here, dropping it from X_init
# later would fail - confirm whether the target should be excluded.
for ind_col, ir in enumerate(ind_row):
    if aux[ir, ind_col] >= R_th:
        aboveTh_list.append((ir, ind_col))
        removed_list.append(ind_col)

In [8]:
# List every (row, column) pair that reached the threshold, by column name.
print('Pairs with correlation above %0.4f are:'%R_th)
# Plain loop instead of a list comprehension: the prints are side effects and
# no list of results is needed.
for i_row, i_col in aboveTh_list:
    print(' ', X.columns[i_row], '-', X.columns[i_col])

# Names of the columns flagged for removal.
print('so, features removed are:')
drop_list = X.columns[removed_list].to_list()
print(' ',drop_list)

Pairs with correlation above 0.9700 are:
  radius1 - perimeter1
  radius1 - area1
  radius2 - perimeter2
  radius3 - perimeter3
  radius3 - area3
so, features removed are:
  ['perimeter1', 'area1', 'perimeter2', 'perimeter3', 'area3']


In [9]:
# Remove the flagged columns from the original (unscaled) feature table.
X_sel = X_init.drop(labels=drop_list, axis='columns')

# Preview the reduced table.
X_sel.head()

Unnamed: 0,radius1,texture1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dim1,radius2,texture2,...,symmetry2,fractal_dim2,radius3,texture3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dim3
0,17.99,10.38,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,...,0.03003,0.006193,25.38,17.33,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,...,0.01389,0.003532,24.99,23.41,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,...,0.0225,0.004571,23.57,25.53,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,...,0.05963,0.009208,14.91,26.5,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,...,0.01756,0.005115,22.54,16.67,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 3. Filtrado por información mutua

In [10]:
from sklearn.feature_selection import mutual_info_classif

In [11]:
# Estimated mutual information between each feature and the class label;
# random_state is fixed so the estimate is reproducible across runs.
MI = mutual_info_classif(
    X_init.values,
    Y_init.values.ravel(),   # flatten the target table into a 1-D label vector
    n_neighbors=4,
    random_state=1234,
)

In [12]:
# Position of the largest mutual-information score and the matching column name.
imax = np.argmax(MI)
best_feature = X_init.columns[imax]
print('The feature with highest mutual information w.r.t. the target is "%s":'%best_feature)
print('The mutual information is %0.3f'%MI[imax])

The feature with highest mutual information w.r.t. the target is "perimeter3":
The mutual information is 0.469
