# Lectura de datos

In [61]:
import json
from typing import Callable

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression,mutual_info_classif
# Directory prefix used to build the path of the input CSV file.
FILE = 'data'

Leamos los datos para la métrica de información mutua, tratando la variable clase como una variable numérica.

In [62]:
# Load the semicolon-separated white-wine CSV found under FILE and show its schema.
csv_path = '%s/winequality-white.csv' % FILE
df_numeric_class = pd.read_csv(csv_path, sep=";")
df_numeric_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [63]:
def metric_CFS(columns, df, class_attr='quality'):
    """Return the CFS (correlation-based feature selection) merit of a subset.

    The merit is ``k * r_cf / sqrt(k + k * (k - 1) * r_ff)`` where ``k`` is the
    number of attributes, ``r_cf`` the mean absolute correlation between each
    attribute and the class, and ``r_ff`` the mean absolute correlation between
    every ordered pair of distinct attributes.

    Parameters
    ----------
    columns : list
        Names of the attributes that form the subset (at least one).
    df : pandas.DataFrame
        Data holding both ``columns`` and ``class_attr``.
    class_attr : str, default 'quality'
        Name of the class column.

    Returns
    -------
    float
        The merit of the subset.

    Raises
    ------
    ValueError
        If ``columns`` is empty (the merit is undefined for an empty subset).
    """
    columns = list(columns)
    amount_attr = len(columns)
    if amount_attr == 0:
        raise ValueError("metric_CFS needs at least one column")

    # Correlate only the columns this subset needs: cheaper than the full
    # matrix and immune to unrelated non-numeric columns in ``df``.
    needed = list(dict.fromkeys(columns + [class_attr]))
    correlation = df[needed].corr().abs()

    avg_ca = np.mean(correlation.loc[columns, class_attr].to_numpy())

    if amount_attr == 1:
        # No attribute pairs exist; the value is irrelevant because it is
        # multiplied by (amount_attr - 1) == 0 below.
        avg_aa = 1
    else:
        avg_aa = np.mean([
            correlation.at[k, j]
            for k in columns
            for j in columns
            if k != j
        ])
    return (amount_attr*avg_ca)/np.sqrt(amount_attr+amount_attr*(amount_attr-1)*avg_aa)
# Quick check: CFS merit of the single-attribute subset ['alcohol'].
print(metric_CFS(columns=['alcohol'], df=df_numeric_class))

0.4355747154613688


In [75]:
def metric_MIFS(columns, df, class_attr  = 'quality', beta = 1.):
    """Placeholder for the MIFS subset metric; not implemented yet.

    The signature mirrors ``metric_CFS`` plus a ``beta`` weight.
    NOTE(review): presumably this should score ``columns`` with a
    mutual-information criterion where ``beta`` penalises redundancy between
    attributes -- confirm the intended formula before implementing.

    Raises
    ------
    NotImplementedError
        Always, so an accidental use fails loudly instead of yielding ``None``
        scores.
    """
    raise NotImplementedError("metric_MIFS is not implemented yet")

En el caso de la métrica CFS, transformamos los datos para convertir el atributo clase en uno nominal.

In [64]:
# Build a copy of the numeric frame whose `quality` column is stored as strings.
df_nominal_class = df_numeric_class.assign(
    quality=df_numeric_class['quality'].astype(str)
)
df_nominal_class.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   object 
dtypes: float64(11), object(1)
memory usage: 459.3+ KB


La clase `Node` almacenará los estados de las combinaciones posibles junto a su valor de correlación.

In [65]:
class Node:
    """A search state: a list of selected columns plus its metric value.

    Nodes compare by ``value`` only, so ``max(nodes)`` returns the
    best-scoring state. Comparing against anything that is not a ``Node``
    returns ``NotImplemented`` so Python applies its normal fallback
    (``==`` is False, ordering raises ``TypeError``).

    Because ``__eq__`` is defined without ``__hash__``, instances are
    unhashable.
    """

    def __init__(self, columns: list, value: float):
        self.columns = columns  # column names selected in this state
        self.value   = value    # metric score of that selection

    def __eq__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value == other.value

    def __ne__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value != other.value

    def __lt__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value < other.value

    def __le__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value <= other.value

    def __gt__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value > other.value

    def __ge__(self, other):
        if not isinstance(other, Node):
            return NotImplemented
        return self.value >= other.value

    def __repr__(self):
        return "%s = %s" % (self.columns, self.value)

El método *get_children* se encarga de crear, a partir de un estado, los N estados siguientes de esa rama del árbol de búsqueda, añadiendo cada uno de los atributos que aún faltan. Se escoge como valor que guía la búsqueda la media de todas las correlaciones.

In [69]:
def get_children(father: "Node | None", columns: list, metric: Callable[[list, pd.DataFrame], float], df: pd.DataFrame) -> list:
    """Expand a search state into its successor states.

    Parameters
    ----------
    father : Node or None
        State to expand. ``None`` means the root: every column becomes a
        one-element state.
    columns : list
        All candidate column names.
    metric : callable
        Called as ``metric(selected_columns, df)``; its result is stored as
        the child's value.
    df : pandas.DataFrame
        Data passed through to ``metric``.

    Returns
    -------
    list
        One ``Node`` per column not already in ``father``, each holding the
        father's columns followed by the added column. The father's own list
        is never modified.
    """
    already_selected = [] if father is None else father.columns

    children = []
    for column in columns:
        if column in already_selected:
            continue
        # Build a fresh list so sibling states never share the same object.
        candidate = [*already_selected, column]
        children.append(Node(candidate, metric(candidate, df)))

    return children

*select_from_variables* se encarga de la exploración del árbol de estados mediante SFS y la métrica correspondiente.

In [70]:
def select_from_variables(df: pd.DataFrame, metric: Callable[[list, pd.DataFrame], float], tree_deep: int = 4) -> "Node":
    """Greedy forward selection of ``tree_deep`` columns guided by ``metric``.

    Candidate columns are all columns of ``df`` except the last one. At each
    level the best-scoring child of the current state (per ``max`` over the
    nodes returned by ``get_children``) becomes the new state.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns, minus the last, are the candidates.
    metric : callable
        Scoring function forwarded to ``get_children``.
    tree_deep : int, default 4
        Number of columns to select.

    Returns
    -------
    Node
        The state reached after ``tree_deep`` expansions (``None`` when
        ``tree_deep`` is 0, since no expansion happens).

    Raises
    ------
    RuntimeError
        If ``tree_deep`` exceeds the number of candidate columns.
    """
    columns = list(df.columns[:-1])
    # A depth equal to the number of candidates is valid (it selects them
    # all); only a strictly larger depth would run out of children.
    if len(columns) < tree_deep:
        raise RuntimeError("the deep of the search tree is higher than the number of columns")

    selection = None
    for _ in range(tree_deep):
        selection = max(get_children(selection, columns, metric, df))

    return selection
     

Para comparar las métricas se varía el número de variables seleccionadas de 2 en 2 hasta 8.

## Búsqueda mediante regresión mutua

In [71]:
# Best state found for depths 2, 4, 6 and 8.
# NOTE(review): this passes metric_CFS, so the result is the same as reportCFS;
# presumably metric_MIFS was intended once it is implemented -- confirm.
reportMIFS = {
    'Tree Deep %s' % deep: select_from_variables(df_numeric_class, metric_CFS, tree_deep=deep)
    for deep in range(2, 10, 2)
}

## Búsqueda mediante clasificación mutua

In [73]:
# Best state found with the CFS metric for depths 2, 4, 6 and 8.
reportCFS = {
    'Tree Deep %s' % deep: select_from_variables(df_numeric_class, metric_CFS, tree_deep=deep)
    for deep in range(2, 10, 2)
}

# Conclusiones

In [74]:
# Print, for each depth, the columns chosen in both reports side by side.
for depth_label, mifs_node in reportMIFS.items():
    print("#############", depth_label, "#############")
    mifs_columns = sorted(mifs_node.columns)
    print("MIFS metric: ", mifs_columns)
    cfs_columns = sorted(reportCFS[depth_label].columns)
    print("CFS metric: ", cfs_columns)
    print("\n")

############# Tree Deep 2 #############
MIFS metric:  ['alcohol', 'volatile acidity']
CFS metric:  ['alcohol', 'volatile acidity']


############# Tree Deep 4 #############
MIFS metric:  ['alcohol', 'chlorides', 'density', 'volatile acidity']
CFS metric:  ['alcohol', 'chlorides', 'density', 'volatile acidity']


############# Tree Deep 6 #############
MIFS metric:  ['alcohol', 'chlorides', 'density', 'fixed acidity', 'sulphates', 'volatile acidity']
CFS metric:  ['alcohol', 'chlorides', 'density', 'fixed acidity', 'sulphates', 'volatile acidity']


############# Tree Deep 8 #############
MIFS metric:  ['alcohol', 'chlorides', 'density', 'fixed acidity', 'pH', 'sulphates', 'total sulfur dioxide', 'volatile acidity']
CFS metric:  ['alcohol', 'chlorides', 'density', 'fixed acidity', 'pH', 'sulphates', 'total sulfur dioxide', 'volatile acidity']




En suma, se puede apreciar que para un número pequeño de atributos las dos métricas bajo un mismo método escogen los mismos atributos, que son los que mayor correlación tienen con la variable clase; sin embargo, cuando aumentamos el número de los mismos observamos que escogen atributos distintos. Mediante la métrica MIFS se escogen las columnas **pH** o **chlorides**; en cambio, para la CFS se prefiere escoger las columnas **citric acid** o **sulphates**. Solo faltaría probar los diferentes conjuntos en un modelo para comparar su calidad con respecto al problema.