## Predicción: aprendizaje supervisado con selección de atributos FOCI

In [1]:
# Script configuration parameters

# Name of the dataset
dataset_name = 'prostateCancer'

# Columns that do not hold gene measurements
not_genes_columns = ['group']

# Folder with the preprocessing files
dataset_files_folder_preprocessing = "P1_ficheros_preprocesamiento_" + dataset_name + "/"

# Folder with the files of the supervised comparison with/without feature selection
dataset_files_folder_comparative = "P2_ficheros_comparativaSupervisado_" + dataset_name + "/"

# CSV file with the train-validation data
file_source_trainval = dataset_files_folder_comparative + dataset_name + '_7_trainval.csv'

# Number of folds for cross-validation
n_splits = 3

# Number of cores FOCI uses for parallelisation
n_cores = 7

#### Importaciones

In [2]:
# Python imports
# ----------------------------------------

# Variable export
import pickle

# Data structure
import pandas as pd
import numpy as np
from itertools import product
from sklearn.base import clone

# Sklearn preprocess, split, pipeline, GridSearch
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import statistics

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Sklearn classification models
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn import svm
from sklearn.svm import NuSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import RandomForestClassifier

# Deep learning
# import tensorflow as tf
# from tensorflow import keras

# Make the experiments reproducible: one shared seed, applied here to numpy's
# global random generator
SEED = 42
np.random.seed(seed=SEED)

# tf.keras.utils.set_random_seed(SEED)

In [3]:
# R imports
# ---------------------------------------------

from rpy2.robjects.packages import importr
import rpy2.robjects.packages as rpackages
import rpy2.robjects.packages as roptions
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

# import R's "base" package
base = importr('base')

# Handle to R's "utils" package
utils = importr('utils')

# Select CRAN mirror number 1 (per the original author's note, the Cloud mirror)
utils.chooseCRANmirror(ind=1)

# Package installation is left disabled; uncomment when FOCI is not yet installed in R
# utils.install_packages('FOCI')
foci = rpackages.importr('FOCI')

# Seed R's random number generator with the same SEED used on the Python side
ro.r('set.seed({})'.format(SEED))

<rpy2.rinterface_lib.sexp.NULLType object at 0x000001CAA6F73680> [RTYPES.NILSXP]

In [4]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Lectura del dataset

In [5]:
# Split the original dataset into a train-validation set and a test set by
# running the external ../split_dataset.py script.
# Arguments are presumably: input CSV, train-validation output CSV, test output
# CSV and label column -- split_dataset.py is not visible here, confirm there.
import subprocess
import sys

# check_call returns 0 on success (same cell output as before) but raises
# CalledProcessError on a non-zero exit status, so a failed split cannot go
# unnoticed and leave stale or missing CSVs for the following cells.
# sys.executable runs the script with the interpreter of this kernel instead of
# whichever "python" happens to be first on PATH.
subprocess.check_call([sys.executable or 'python', '../split_dataset.py',
                dataset_files_folder_preprocessing+dataset_name+'_6_diffexp.csv',
                dataset_files_folder_comparative+dataset_name+'_7_trainval.csv',
                dataset_files_folder_comparative+dataset_name+'_7_test.csv',
                'group'])

0

In [6]:
# Load the train-validation CSV; its first column becomes the row index
# (comma separator and header on the first row are the pandas defaults)
dataset = pd.read_csv(file_source_trainval, index_col=0)
dataset

Unnamed: 0,group,AAK1,AAMP,AANAT,AASDHPPT,AATF,AATK,ABAT,ABCA1,ABCA2,...,ZNF92,ZNHIT1,ZNRD1ASP,ZNRD2,ZP2,ZPR1,ZSCAN12,ZSCAN26,ZSCAN9,ZSWIM8
67,0,-17.150245,39.816176,877.416912,24.822059,132.980147,1802.222059,45.801838,179.816912,5.400735,...,-102.549265,50.452941,39.816176,12.741912,108.791912,96.073529,506.251471,168.285294,64.431618,793.502941
134,0,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,...,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853
12,1,25.135294,79.930147,76.789706,137.674265,100.394853,177.502574,33.021324,2.381495,-15.910784,...,-97.505515,140.274265,10.325000,63.831250,11.557353,54.566912,92.991544,22.476471,29.462500,40.244118
29,1,39.574020,63.068382,118.509926,129.107353,103.919118,163.919118,11.243750,26.236765,-23.431250,...,-47.862132,106.019853,11.904412,44.525000,15.952941,65.604412,123.169118,38.389706,18.432353,66.128676
9,1,25.267402,90.422059,75.060294,116.741176,107.738603,164.284559,118.533456,10.532230,-40.509559,...,-44.150000,181.593382,2.542647,54.169118,19.483088,63.510294,72.112868,9.051471,22.849265,12.133088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,0,3.070588,5.379412,444.318382,27.744853,132.272794,839.206618,32.542279,87.457598,7.591176,...,-85.967647,74.186765,63.672794,13.180882,39.462500,30.752206,377.258456,110.898529,43.410294,9.900735
86,0,64.329412,35.511029,267.215441,58.744853,161.503676,497.538235,21.019118,63.220833,18.035049,...,-81.195588,75.772059,22.723529,6.840441,51.691176,65.397794,219.042463,32.278676,12.974265,157.344118
18,1,22.640931,91.280147,145.489706,89.536765,153.436765,278.596324,43.356618,26.516667,-21.569363,...,-110.662500,257.409559,13.494853,14.434559,9.977941,77.666912,128.450368,17.613235,11.830882,-19.511765
75,0,106.710417,9.221324,616.604044,50.600000,134.906985,807.845588,9.819485,83.646814,7.640686,...,-113.861029,75.400000,29.001471,42.709559,72.830147,75.400000,267.358824,77.969118,40.033824,47.842647


In [7]:
# Feature matrix: every column of the dataset except 'group'
X = dataset.drop(columns=['group'])
X

Unnamed: 0,AAK1,AAMP,AANAT,AASDHPPT,AATF,AATK,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF92,ZNHIT1,ZNRD1ASP,ZNRD2,ZP2,ZPR1,ZSCAN12,ZSCAN26,ZSCAN9,ZSWIM8
67,-17.150245,39.816176,877.416912,24.822059,132.980147,1802.222059,45.801838,179.816912,5.400735,64.431618,...,-102.549265,50.452941,39.816176,12.741912,108.791912,96.073529,506.251471,168.285294,64.431618,793.502941
134,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,...,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853
12,25.135294,79.930147,76.789706,137.674265,100.394853,177.502574,33.021324,2.381495,-15.910784,67.462500,...,-97.505515,140.274265,10.325000,63.831250,11.557353,54.566912,92.991544,22.476471,29.462500,40.244118
29,39.574020,63.068382,118.509926,129.107353,103.919118,163.919118,11.243750,26.236765,-23.431250,50.740441,...,-47.862132,106.019853,11.904412,44.525000,15.952941,65.604412,123.169118,38.389706,18.432353,66.128676
9,25.267402,90.422059,75.060294,116.741176,107.738603,164.284559,118.533456,10.532230,-40.509559,34.427941,...,-44.150000,181.593382,2.542647,54.169118,19.483088,63.510294,72.112868,9.051471,22.849265,12.133088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,3.070588,5.379412,444.318382,27.744853,132.272794,839.206618,32.542279,87.457598,7.591176,79.657353,...,-85.967647,74.186765,63.672794,13.180882,39.462500,30.752206,377.258456,110.898529,43.410294,9.900735
86,64.329412,35.511029,267.215441,58.744853,161.503676,497.538235,21.019118,63.220833,18.035049,65.397794,...,-81.195588,75.772059,22.723529,6.840441,51.691176,65.397794,219.042463,32.278676,12.974265,157.344118
18,22.640931,91.280147,145.489706,89.536765,153.436765,278.596324,43.356618,26.516667,-21.569363,62.832353,...,-110.662500,257.409559,13.494853,14.434559,9.977941,77.666912,128.450368,17.613235,11.830882,-19.511765
75,106.710417,9.221324,616.604044,50.600000,134.906985,807.845588,9.819485,83.646814,7.640686,80.763235,...,-113.861029,75.400000,29.001471,42.709559,72.830147,75.400000,267.358824,77.969118,40.033824,47.842647


In [8]:
# Target vector: the 'group' column of the dataset
y = dataset.loc[:, 'group']
y

67     0
134    0
12     1
29     1
9      1
      ..
52     0
86     0
18     1
75     0
105    1
Name: group, Length: 115, dtype: int64

#### Creación de un wrapper de FOCI para Python

Para utilizar FOCI desde sklearn (Python) debemos crear un wrapper estimador con interfaz compatible con sklearn.

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class FociSelection(BaseEstimator, TransformerMixin):
    """sklearn-compatible feature selector backed by the R ``FOCI`` package.

    Parameters
    ----------
    n_features : int
        Forwarded to R's ``foci`` as ``num_features``.
    n_cores : int
        Forwarded to R's ``foci`` as ``numCores``.

    Attributes
    ----------
    caracteristicas : list
        Names of the variables selected by the last ``fit`` call. The name has
        no trailing underscore (unlike the usual sklearn convention) and is
        kept as-is so existing code that reads it keeps working.
    """

    def __init__(self, n_features, n_cores):
        self.n_features = n_features
        self.n_cores = n_cores

    def fit(self, X, y=None):
        """Run FOCI on ``(X, y)`` and remember the selected column names.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features; converted to an R object with ``pandas2ri``.
        y : pandas.Series or similar
            Training target. Mandatory: it is passed to ``foci`` as the response.

        Returns
        -------
        FociSelection
            ``self``, with ``caracteristicas`` set.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            # Without a target an empty frame would be sent to R and fail there
            # with an obscure error; fail early with a clear message instead.
            raise ValueError("FociSelection.fit requires a target vector y")

        print("Fit FOCI", X.shape)

        with (ro.default_converter + pandas2ri.converter).context():
            X_r = ro.conversion.get_conversion().py2rpy(X)
            y_r = ro.conversion.get_conversion().py2rpy(pd.DataFrame(y))

        # Re-seed R before every fit so repeated fits give the same selection
        ro.r('set.seed({})'.format(SEED))
        res_train = foci.foci(y_r, X_r, stop=False, numCores=self.n_cores,
                              num_features=self.n_features)

        # 'selectedVar' holds the selected variables; keep only their names
        self.caracteristicas = list(res_train.rx2('selectedVar').rx2('names'))
        return self

    def transform(self, X, y=None):
        """Return only the columns of ``X`` selected during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet (this error is also an
            ``AttributeError``, which is what an unfitted call raised before).
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'caracteristicas')
        return X[self.caracteristicas]

    def get_features(self):
        """Return the list of feature names selected during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'caracteristicas')
        return self.caracteristicas

#### Pipeline y ajuste de parámetros

En este caso, definir un pipeline directamente y realizar ajuste de hiperparámetros con GridSearch y validación cruzada conllevaría una pérdida de eficiencia.  

Para cada iteración de la validación cruzada se utilizan k-1 conjuntos para train y 1 conjunto para validation.   
Debemos aplicar FOCI sobre estos conjuntos (fit sobre train y transform sobre train y validation), pero con realizarlo una vez por cada par (train, validation) es suficiente.  
En cambio, si definimos un pipeline y aplicamos seguidamente GridSearch, la selección de FOCI se recalcularía no solo en cada iteración de la validación cruzada, sino también para cada combinación de hiperparámetros. Esto no es necesario, ya que el resultado de FOCI depende únicamente de los datos de la iteración y no de los hiperparámetros del clasificador.

Por tanto, se propone una optimización que consiste en aplicar selección FOCI sobre las "bolsas" (adecuadamente) y luego aplicar gridSearch sobre dichas bolsas de validación ya transformadas. Esto acelera muchísimo las pruebas y experimentación. No obstante, requiere de la implementación de esta transformación de las bolsas de train-validation y del gridSearch de forma manual.

##### Implementación de la transformación de las bolsas de validación cruzada

In [13]:
# Stratified K-fold splitter; shuffled with the shared SEED so the folds are reproducible
skfold = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=SEED,
)

In [14]:
def preprocess_skfold(skfold, X, y, n_features, n_cores):
    """Apply FOCI selection and min-max scaling once per cross-validation fold.

    For every (train, validation) split produced by ``skfold.split(X, y)`` the
    FOCI selector and the ``MinMaxScaler`` are fitted on the training rows only
    and then applied to both the training and the validation rows.

    Parameters
    ----------
    skfold : splitter exposing ``split(X, y)``
    X : pandas.DataFrame
        Features, indexed positionally with ``.iloc``.
    y : pandas.Series
        Target, indexed positionally with ``.iloc``.
    n_features : int
        Passed to ``FociSelection``; also part of the output file name.
    n_cores : int
        Passed to ``FociSelection``.

    Returns
    -------
    tuple
        ``(selected_features, folds)``: the feature names selected in each
        fold, and one ``(X_train_scaled, y_train, X_val_scaled, y_val)`` tuple
        per fold. The same tuple is pickled to
        ``foci_skf_<n_features>.pkl`` in ``dataset_files_folder_comparative``.
    """
    fold_features = []
    fold_data = []

    for train_rows, val_rows in skfold.split(X, y):
        X_train, X_val = X.iloc[train_rows], X.iloc[val_rows]
        y_train, y_val = y.iloc[train_rows], y.iloc[val_rows]

        # Selector: FOCI, fitted on the training rows only
        foci_selector = FociSelection(n_features=n_features, n_cores=n_cores)
        foci_selector.fit(X_train, y_train)
        X_train_sel = foci_selector.transform(X_train)
        X_val_sel = foci_selector.transform(X_val)

        # Scaler: MinMaxScaler, fitted on the selected training columns only
        minmax = MinMaxScaler()
        minmax.fit(X_train_sel)

        fold_data.append(
            (minmax.transform(X_train_sel), y_train,
             minmax.transform(X_val_sel), y_val)
        )
        fold_features.append(foci_selector.get_features())

    # Save the run to a file so it can be reused without recomputing FOCI
    with open(dataset_files_folder_comparative+'foci_skf_'+str(n_features)+'.pkl', 'wb') as out_file:
        pickle.dump((fold_features, fold_data), out_file)

    return fold_features, fold_data

In [None]:
# Transform the (train, val) bags of every cross-validation iteration (only once)
# FOCI = 50
skf_preprocess50 = preprocess_skfold(skfold, X, y, n_features=50, n_cores=n_cores)

In [None]:
# Transform the (train, val) bags of every cross-validation iteration (only once)
# FOCI = 100
skf_preprocess100 = preprocess_skfold(skfold, X, y, n_features=100, n_cores=n_cores)

In [None]:
# Transform the (train, val) bags of every cross-validation iteration (only once)
# FOCI = 150
skf_preprocess150 = preprocess_skfold(skfold, X, y, n_features=150, n_cores=n_cores)

##### Implementación del gridSearch sobre las bolsas ya transformadas

In [22]:
def grid_algorithm(model, param_grid, skf_preprocess):
    """Exhaustive grid search over cross-validation folds that are already transformed.

    Every combination of the values in ``param_grid`` is scored by fitting a
    fresh clone of ``model`` on each fold's training data and averaging the
    validation accuracy over the folds.

    Parameters
    ----------
    model : estimator
        Unfitted estimator supporting ``clone``, ``set_params``, ``fit`` and
        ``predict``. It is not modified.
    param_grid : dict
        Maps a parameter name to the list of values to try.
    skf_preprocess : sequence of tuples
        One ``(X_train, y_train, X_val, y_val)`` tuple per fold. It is iterated
        once per parameter combination, so it must not be a one-shot iterator.

    Returns
    -------
    dict
        ``'model_'`` (class name of ``model``), ``'best_params_'`` (best
        combination as a dict, or None if the grid yields no combination) and
        ``'best_score_'`` (its mean validation accuracy; 0 if no combination).
    """
    param_names = list(param_grid.keys())

    best_score = 0
    best_params = None

    # Find out which parameter combination scores best
    for values in product(*param_grid.values()):
        candidate_params = dict(zip(param_names, values))

        # Configure a clone rather than `model` itself so the caller's
        # estimator is left untouched
        candidate = clone(model).set_params(**candidate_params)

        # Cross-validation score for this combination
        fold_scores = []
        for X_train, y_train, X_val, y_val in skf_preprocess:
            fold_model = clone(candidate)
            fold_model.fit(X_train, y_train)
            fold_scores.append(accuracy_score(y_val, fold_model.predict(X_val)))

        mean_score = statistics.mean(fold_scores)

        # Always accept the first combination, so an all-zero grid still
        # reports parameters instead of None; afterwards require a strict
        # improvement (ties keep the earliest combination, as before)
        if best_params is None or mean_score > best_score:
            best_score = mean_score
            best_params = candidate_params

    return {'model_': model.__class__.__name__,
            'best_params_': best_params,
            'best_score_': best_score
           }

##### Definición de pipelines y aplicación de GridSearch

In [26]:
def execute_experiments(skf_preprocess, X, y, n_features, n_cores):
    """Tune six classifiers on the preprocessed folds and store the results.

    For each classifier the hyperparameter grid is searched with
    ``grid_algorithm``, the result is printed and pickled to
    ``foci_grid_<label>_<n_features>.pkl``. A summary table with the best
    parameters and best score of every classifier is then printed, pickled and
    written as CSV to ``<dataset_name>_7_class_foci_<n_features>.{pkl,csv}``.
    All files go to ``dataset_files_folder_comparative``.

    Parameters
    ----------
    skf_preprocess : tuple
        Pair as returned by ``preprocess_skfold``; only element ``[1]`` (the
        list of per-fold tuples) is used.
    X, y, n_cores :
        Not used by this function; kept so existing calls keep working.
    n_features : int
        Only used to build the output file names.

    Returns
    -------
    None
    """
    # (label, estimator, hyperparameter grid), in the order they are run and reported
    experiments = [
        # Logistic Regression
        ('LR', LogisticRegression(random_state=SEED), {
            'penalty': ['l1', 'l2'],
            'C': [0.1, 1, 10, 20],
            'solver': ['liblinear']}),
        # K-Nearest Neighbors
        ('KNN', KNeighborsClassifier(), {
            'n_neighbors': [6, 10, 20, 50],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']}),
        # SVC
        ('SVC', SVC(random_state=SEED), {
            'kernel': ['linear', 'rbf', 'sigmoid'],
            'C': [1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.001, 0.0001]}),
        # NuSVC
        ('NuSVC', NuSVC(random_state=SEED), {
            'kernel': ['linear', 'rbf', 'sigmoid'],
            'gamma': [1, 0.1, 0.001, 0.0001],
            'nu': [0.1, 0.5]}),
        # Random Forest
        ('RF', RandomForestClassifier(random_state=SEED), {
            'n_estimators': [5, 10, 20, 100],
            'max_depth': [5, 10, 20, 50]}),
        # XGBoost
        ('XGB', XGBClassifier(random_state=SEED), {
            'n_estimators': [5, 10, 20],
            'max_depth': [5, 10, 20],
            'learning_rate': [0.01, 0.1, 0.3, 1]}),
    ]

    # Hyperparameter tuning: search, report and persist one classifier at a time
    grids = {}
    for label, estimator, param_grid in experiments:
        grid = grid_algorithm(estimator, param_grid, skf_preprocess[1])
        print(grid)

        with open(dataset_files_folder_comparative+'foci_grid_'+label+'_'+str(n_features)+'.pkl', 'wb') as f:
            pickle.dump(grid, f)

        grids[label] = grid

    # Results: built column by column so 'Best score' keeps a numeric dtype
    # instead of being coerced to object together with the parameter dicts
    df_res = pd.DataFrame(
        {'Best params': [grid['best_params_'] for grid in grids.values()],
         'Best score': [grid['best_score_'] for grid in grids.values()]},
        index=list(grids.keys())
    )
    print(df_res)

    # Store the results in an external pickle file
    with open(dataset_files_folder_comparative+dataset_name+'_7_class_foci_'+str(n_features)+'.pkl', 'wb') as f:
        pickle.dump(df_res, f)

    # Save the results to a CSV
    df_res.to_csv(dataset_files_folder_comparative+dataset_name+'_7_class_foci_'+str(n_features)+'.csv')

#### FOCI=50

In [25]:
# Reload the cached FOCI=50 fold transformation (the file preprocess_skfold writes).
# NOTE: pickle.load executes arbitrary code; only load locally generated, trusted files.
with open(dataset_files_folder_comparative+'foci_skf_50.pkl', 'rb') as cache_file:
    skf_preprocess50 = pickle.load(cache_file)

In [26]:
# Use the n_cores configuration parameter instead of repeating its value
execute_experiments(skf_preprocess50, X=X, y=y, n_features=50, n_cores=n_cores)

{'model_': 'LogisticRegression', 'best_params_': {'penalty': 'l2', 'C': 1, 'solver': 'liblinear'}, 'best_score_': 0.8263607737291948}
{'model_': 'KNeighborsClassifier', 'best_params_': {'n_neighbors': 6, 'weights': 'uniform', 'metric': 'manhattan'}, 'best_score_': 0.8697705802968961}
{'model_': 'SVC', 'best_params_': {'kernel': 'rbf', 'C': 1, 'gamma': 1}, 'best_score_': 0.8434547908232118}
{'model_': 'NuSVC', 'best_params_': {'kernel': 'rbf', 'gamma': 1, 'nu': 0.5}, 'best_score_': 0.8434547908232118}
{'model_': 'RandomForestClassifier', 'best_params_': {'n_estimators': 20, 'max_depth': 5}, 'best_score_': 0.8349077822762033}
{'model_': 'XGBClassifier', 'best_params_': {'n_estimators': 10, 'max_depth': 5, 'learning_rate': 1}, 'best_score_': 0.8434547908232118}
                                             Best params Best score
LR      {'penalty': 'l2', 'C': 1, 'solver': 'liblinear'}   0.826361
KNN    {'n_neighbors': 6, 'weights': 'uniform', 'metr...   0.869771
SVC                {'kernel

#### FOCI=100

In [18]:
# Reload the cached FOCI=100 fold transformation (the file preprocess_skfold writes).
# NOTE: pickle.load executes arbitrary code; only load locally generated, trusted files.
with open(dataset_files_folder_comparative+'foci_skf_100.pkl', 'rb') as cache_file:
    skf_preprocess100 = pickle.load(cache_file)

In [19]:
# Use the n_cores configuration parameter instead of repeating its value
execute_experiments(skf_preprocess100, X=X, y=y, n_features=100, n_cores=n_cores)

{'model_': 'LogisticRegression', 'best_params_': {'penalty': 'l2', 'C': 1, 'solver': 'liblinear'}, 'best_score_': 0.8690958164642375}
{'model_': 'KNeighborsClassifier', 'best_params_': {'n_neighbors': 10, 'weights': 'distance', 'metric': 'manhattan'}, 'best_score_': 0.8868645973909132}
{'model_': 'SVC', 'best_params_': {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}, 'best_score_': 0.8780926675663518}
{'model_': 'NuSVC', 'best_params_': {'kernel': 'rbf', 'gamma': 0.0001, 'nu': 0.5}, 'best_score_': 0.8866396761133604}
{'model_': 'RandomForestClassifier', 'best_params_': {'n_estimators': 5, 'max_depth': 5}, 'best_score_': 0.8522267206477733}
{'model_': 'XGBClassifier', 'best_params_': {'n_estimators': 10, 'max_depth': 5, 'learning_rate': 1}, 'best_score_': 0.8434547908232118}
                                             Best params Best score
LR      {'penalty': 'l2', 'C': 1, 'solver': 'liblinear'}   0.869096
KNN    {'n_neighbors': 10, 'weights': 'distance', 'me...   0.886865
SVC          {'

#### FOCI=150

In [20]:
# Reload the cached FOCI=150 fold transformation (the file preprocess_skfold writes).
# NOTE: pickle.load executes arbitrary code; only load locally generated, trusted files.
with open(dataset_files_folder_comparative+'foci_skf_150.pkl', 'rb') as cache_file:
    skf_preprocess150 = pickle.load(cache_file)

In [21]:
# Use the n_cores configuration parameter instead of repeating its value
execute_experiments(skf_preprocess150, X=X, y=y, n_features=150, n_cores=n_cores)

{'model_': 'LogisticRegression', 'best_params_': {'penalty': 'l1', 'C': 1, 'solver': 'liblinear'}, 'best_score_': 0.8783175888439047}
{'model_': 'KNeighborsClassifier', 'best_params_': {'n_neighbors': 20, 'weights': 'uniform', 'metric': 'manhattan'}, 'best_score_': 0.8693207377417904}
{'model_': 'SVC', 'best_params_': {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}, 'best_score_': 0.8780926675663518}
{'model_': 'NuSVC', 'best_params_': {'kernel': 'rbf', 'gamma': 0.1, 'nu': 0.5}, 'best_score_': 0.8693207377417904}
{'model_': 'RandomForestClassifier', 'best_params_': {'n_estimators': 5, 'max_depth': 5}, 'best_score_': 0.860548807917229}
{'model_': 'XGBClassifier', 'best_params_': {'n_estimators': 20, 'max_depth': 5, 'learning_rate': 0.3}, 'best_score_': 0.8346828609986505}
                                             Best params Best score
LR      {'penalty': 'l1', 'C': 1, 'solver': 'liblinear'}   0.878318
KNN    {'n_neighbors': 20, 'weights': 'uniform', 'met...   0.869321
SVC          {'ker