## Predicción: aprendizaje supervisado sin selección de atributos

In [1]:
# Parametros de configuracion del script

dataset_name = 'GSE21050'                         # nombre del dataset
not_genes_columns = ['group']                           # columnas que no miden valores genicos
dataset_files_folder_preprocessing ="P1_ficheros_preprocesamiento_"+dataset_name+"/" # directorio con ficheros de preprocesamiento
dataset_files_folder_comparative ="P2_ficheros_comparativaSupervisado_"+dataset_name+"/" # directorio con ficheros de comparativa con/sin seleccion de atributos
file_source_trainval = dataset_files_folder_comparative+dataset_name+'_7_trainval.csv'    # directorio con datos de train-validation
n_splits = 5                                             # numero de particiones para validacion cruzada                         

#### Importaciones

In [2]:
# Python imports
# ----------------------------------------

# Variable export
import pickle

# Data structure
import pandas as pd
import numpy as np

# Sklearn preprocess, split, pipeline, GridSearch
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import statistics

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from skopt import BayesSearchCV

# Sklearn classification models
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn import svm
from sklearn.svm import NuSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import RandomForestClassifier

# Deep learning
# import tensorflow as tf
# from tensorflow import keras

# Hacer reproducibles los experimentos en sklearn y tensorflo
SEED=42
np.random.seed(SEED)

# tf.keras.utils.set_random_seed(SEED)


In [3]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Lectura del dataset

In [4]:
# Division del conjunto original en un conjunto de entrenamiento-validacion y otro de test
import subprocess
subprocess.call(['python', '../split_dataset.py', 
                dataset_files_folder_preprocessing+dataset_name+'_6_diffexp.csv', 
                dataset_files_folder_comparative+dataset_name+'_7_trainval.csv',  
                dataset_files_folder_comparative+dataset_name+'_7_test.csv', 
                'group'])

0

In [5]:
dataset = pd.read_csv(file_source_trainval, sep=',', header=0, index_col=0)
dataset

Unnamed: 0,group,time_survivor,AARSD1,ABCA8,ABHD14A,ABLIM1,ACAA2,ACADSB,ACADVL,ACAN,...,ZNF367,ZNF423,ZNF436,ZNF446,ZNF503,ZNF514,ZNF700,ZNF805,ZNF93,ZWINT
162,1,1.12,6.190022,4.169925,7.065005,6.210038,6.130056,7.400026,9.439997,3.879706,...,7.750003,6.099926,6.570067,5.560104,7.360013,7.499968,6.459923,5.994918,5.715021,9.929998
264,0,2.37,7.840023,8.270015,7.380029,8.780015,6.099943,4.620000,8.830008,4.580145,...,10.569998,6.210038,7.120016,5.580145,7.290019,5.215211,6.950002,5.514903,6.205072,11.340000
187,1,0.51,5.839960,12.070000,6.295071,10.350000,7.049962,6.349967,9.420002,5.870118,...,9.410006,7.020035,8.150016,6.450056,7.829976,5.874957,7.609991,7.200023,5.064949,7.869995
35,1,2.18,7.910013,3.459432,6.169925,7.840023,6.639975,6.759955,10.100005,4.230357,...,10.960002,4.869871,5.089857,5.560104,7.609991,8.354990,9.090007,5.059998,5.829888,9.869995
61,1,2.76,7.180009,8.239980,5.955040,9.840007,5.039956,5.839960,10.110000,9.720005,...,10.499995,3.539779,5.035035,5.169925,8.250014,7.340024,9.059993,6.079957,7.980016,9.199991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,1,0.94,7.900021,7.290019,5.744957,9.140012,6.880089,5.499846,9.539992,6.450056,...,10.469998,7.199967,6.435020,5.599913,7.950002,7.600030,10.170000,6.500010,9.699994,10.100005
232,0,0.61,6.820051,8.429992,6.075019,7.460005,7.525016,7.009997,9.840007,4.189825,...,8.180009,6.380071,5.734898,5.720005,8.439997,5.814972,7.210038,7.460014,3.900062,10.189997
304,0,3.19,6.190022,10.610001,7.129983,10.860001,6.954973,6.680043,9.329998,3.959770,...,5.960002,10.939998,7.540040,5.569856,10.289996,6.755010,6.720005,6.570036,6.860019,8.350011
198,1,0.32,5.200065,2.611172,6.775034,5.809929,7.660077,4.730096,9.420002,4.760221,...,8.669983,6.190022,4.740023,6.380071,8.669983,6.234891,6.729961,6.440009,5.400175,9.639992


In [6]:
X = dataset.drop(['group'], axis=1)
X

Unnamed: 0,time_survivor,AARSD1,ABCA8,ABHD14A,ABLIM1,ACAA2,ACADSB,ACADVL,ACAN,ACAT2,...,ZNF367,ZNF423,ZNF436,ZNF446,ZNF503,ZNF514,ZNF700,ZNF805,ZNF93,ZWINT
162,1.12,6.190022,4.169925,7.065005,6.210038,6.130056,7.400026,9.439997,3.879706,9.790006,...,7.750003,6.099926,6.570067,5.560104,7.360013,7.499968,6.459923,5.994918,5.715021,9.929998
264,2.37,7.840023,8.270015,7.380029,8.780015,6.099943,4.620000,8.830008,4.580145,8.729995,...,10.569998,6.210038,7.120016,5.580145,7.290019,5.215211,6.950002,5.514903,6.205072,11.340000
187,0.51,5.839960,12.070000,6.295071,10.350000,7.049962,6.349967,9.420002,5.870118,7.939991,...,9.410006,7.020035,8.150016,6.450056,7.829976,5.874957,7.609991,7.200023,5.064949,7.869995
35,2.18,7.910013,3.459432,6.169925,7.840023,6.639975,6.759955,10.100005,4.230357,7.969991,...,10.960002,4.869871,5.089857,5.560104,7.609991,8.354990,9.090007,5.059998,5.829888,9.869995
61,2.76,7.180009,8.239980,5.955040,9.840007,5.039956,5.839960,10.110000,9.720005,9.380006,...,10.499995,3.539779,5.035035,5.169925,8.250014,7.340024,9.059993,6.079957,7.980016,9.199991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,0.94,7.900021,7.290019,5.744957,9.140012,6.880089,5.499846,9.539992,6.450056,10.730003,...,10.469998,7.199967,6.435020,5.599913,7.950002,7.600030,10.170000,6.500010,9.699994,10.100005
232,0.61,6.820051,8.429992,6.075019,7.460005,7.525016,7.009997,9.840007,4.189825,7.449974,...,8.180009,6.380071,5.734898,5.720005,8.439997,5.814972,7.210038,7.460014,3.900062,10.189997
304,3.19,6.190022,10.610001,7.129983,10.860001,6.954973,6.680043,9.329998,3.959770,7.069960,...,5.960002,10.939998,7.540040,5.569856,10.289996,6.755010,6.720005,6.570036,6.860019,8.350011
198,0.32,5.200065,2.611172,6.775034,5.809929,7.660077,4.730096,9.420002,4.760221,9.090007,...,8.669983,6.190022,4.740023,6.380071,8.669983,6.234891,6.729961,6.440009,5.400175,9.639992


In [7]:
y = dataset['group']
y

162    1
264    0
187    1
35     1
61     1
      ..
176    1
232    0
304    0
198    1
290    0
Name: group, Length: 262, dtype: int64

#### Definición de pipelines

In [8]:
# Regresión logistica
pipeline_LR = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('LR', LogisticRegression(random_state=SEED))
])

In [9]:
# KNN
pipeline_KNN = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('KNN', KNeighborsClassifier())
])

In [10]:
# SVC
pipeline_SVC = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('SVC', SVC(random_state=SEED))
])

In [11]:
# NuSVC
pipeline_NuSVC = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('NuSVC', NuSVC(random_state=SEED))
])

In [12]:
# Random Forest
pipeline_RF = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('RF', RandomForestClassifier(random_state=SEED))
])

In [13]:
# XGBoost
pipeline_XGB = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('XGB', XGBClassifier(random_state=SEED))
])

#### Ajuste de hiperparámetros: GridSearch y validación cruzada estratificada

In [14]:
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) 

In [15]:
# Regresión logistica
param_grid_LR = [{
    'LR__penalty': ['l1', 'l2'], 
    'LR__C': [0.1, 1, 10, 20], 
    'LR__solver': ['liblinear']}]
grid_search_LR = GridSearchCV(pipeline_LR, param_grid=param_grid_LR, cv=skf, scoring='accuracy')
grid_search_LR.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('LR',
                                        LogisticRegression(random_state=42))]),
             param_grid=[{'LR__C': [0.1, 1, 10, 20],
                          'LR__penalty': ['l1', 'l2'],
                          'LR__solver': ['liblinear']}],
             scoring='accuracy')

In [16]:
print(grid_search_LR.best_params_)
print(grid_search_LR.best_score_)

{'LR__C': 1, 'LR__penalty': 'l1', 'LR__solver': 'liblinear'}
0.7293178519593614


In [17]:
with open(dataset_files_folder_comparative+'without_grid_LR.pkl', 'wb') as f:
    pickle.dump(grid_search_LR, f)

In [18]:
# KNN
param_grid_KNN = [{
    'KNN__n_neighbors': [6,10,20,50], 
    'KNN__weights': ['uniform', 'distance'], 
    'KNN__metric': ['euclidean', 'manhattan']}]
grid_search_KNN = GridSearchCV(pipeline_KNN, param_grid=param_grid_KNN, cv=skf, scoring='accuracy')
grid_search_KNN.fit(X, y)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('KNN', KNeighborsClassifier())]),
             param_grid=[{'KNN__metric': ['euclidean', 'manhattan'],
                          'KNN__n_neighbors': [6, 10, 20, 50],
                          'KNN__weights': ['uniform', 'distance']}],
             scoring='accuracy')

In [19]:
print(grid_search_KNN.best_params_)
print(grid_search_KNN.best_score_)

{'KNN__metric': 'euclidean', 'KNN__n_neighbors': 6, 'KNN__weights': 'uniform'}
0.7253991291727141


In [20]:
with open(dataset_files_folder_comparative+'without_grid_KNN.pkl', 'wb') as f:
    pickle.dump(grid_search_KNN, f)

In [21]:
# SVC
param_grid_SVC = [{'SVC__kernel': ['linear', 'rbf', 'sigmoid'], 
                   'SVC__C': [1, 10, 100, 1000],
                   'SVC__gamma':[1,0.1,0.001,0.0001]
                   }]
grid_search_SVC = GridSearchCV(pipeline_SVC, param_grid=param_grid_SVC, cv=skf, scoring='accuracy')
grid_search_SVC.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('SVC', SVC(random_state=42))]),
             param_grid=[{'SVC__C': [1, 10, 100, 1000],
                          'SVC__gamma': [1, 0.1, 0.001, 0.0001],
                          'SVC__kernel': ['linear', 'rbf', 'sigmoid']}],
             scoring='accuracy')

In [22]:
print(grid_search_SVC.best_params_)
print(grid_search_SVC.best_score_)

{'SVC__C': 1000, 'SVC__gamma': 0.001, 'SVC__kernel': 'sigmoid'}
0.7066037735849057


In [23]:
with open(dataset_files_folder_comparative+'without_grid_SVC.pkl', 'wb') as f:
    pickle.dump(grid_search_SVC, f)

In [24]:
# NuSVC
param_grid_NuSVC = [{'NuSVC__kernel': ['linear', 'rbf', 'sigmoid'], 
                   'NuSVC__gamma':[1,0.1,0.001,0.0001],
                   'NuSVC__nu': [0.1, 0.5]
    }]

grid_search_NuSVC = GridSearchCV(pipeline_NuSVC, param_grid=param_grid_NuSVC, cv=skf, scoring='accuracy')
grid_search_NuSVC.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('NuSVC', NuSVC(random_state=42))]),
             param_grid=[{'NuSVC__gamma': [1, 0.1, 0.001, 0.0001],
                          'NuSVC__kernel': ['linear', 'rbf', 'sigmoid'],
                          'NuSVC__nu': [0.1, 0.5]}],
             scoring='accuracy')

In [25]:
print(grid_search_NuSVC.best_params_)
print(grid_search_NuSVC.best_score_)

{'NuSVC__gamma': 0.001, 'NuSVC__kernel': 'sigmoid', 'NuSVC__nu': 0.1}
0.7141509433962264


In [26]:
with open(dataset_files_folder_comparative+'without_grid_NuSVC.pkl', 'wb') as f:
    pickle.dump(grid_search_NuSVC, f)

In [27]:
# Random Forest
param_grid_RF = [{'RF__n_estimators': [5, 10, 20, 100], 
                  'RF__max_depth': [5, 10, 20]
                  }]

grid_search_RF = GridSearchCV(pipeline_RF, param_grid=param_grid_RF, cv=skf, scoring='accuracy')
grid_search_RF.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('RF',
                                        RandomForestClassifier(random_state=42))]),
             param_grid=[{'RF__max_depth': [5, 10, 20],
                          'RF__n_estimators': [5, 10, 20, 100]}],
             scoring='accuracy')

In [28]:
print(grid_search_RF.best_params_)
print(grid_search_RF.best_score_)

{'RF__max_depth': 20, 'RF__n_estimators': 20}
0.7136429608127721


In [29]:
with open(dataset_files_folder_comparative+'without_grid_RF.pkl', 'wb') as f:
    pickle.dump(grid_search_RF, f)

In [30]:
# XGBoost
param_grid_XGB = [{'XGB__n_estimators': [5,10,20], 
                   'XGB__max_depth': [5, 10, 20], 
                   'XGB__learning_rate': [0.01, 0.1, 0.3, 1]
                   }]

grid_search_XGB = GridSearchCV(pipeline_XGB, param_grid=param_grid_XGB, cv=skf, scoring='f1')
grid_search_XGB.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('XGB',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      feature_types=None,
                                                      gamma..

In [31]:
print(grid_search_XGB.best_params_)
print(grid_search_XGB.best_score_)

{'XGB__learning_rate': 0.3, 'XGB__max_depth': 5, 'XGB__n_estimators': 10}
0.6562341796917799


In [32]:
with open(dataset_files_folder_comparative+'without_grid_XGB.pkl', 'wb') as f:
    pickle.dump(grid_search_XGB, f)

In [33]:
df_res_without = pd.DataFrame(
    np.transpose([
    [grid_search_LR.best_params_, grid_search_KNN.best_params_, grid_search_SVC.best_params_, grid_search_NuSVC.best_params_, grid_search_RF.best_params_, grid_search_XGB.best_params_], 
    [grid_search_LR.best_score_, grid_search_KNN.best_score_, grid_search_SVC.best_score_, grid_search_NuSVC.best_score_, grid_search_RF.best_score_, grid_search_XGB.best_score_]
    ]), 
    index=['LR', 'KNN', 'SVC', 'NuSVC', 'RF', 'XGB'], columns=['Best params', 'Best score']
)

df_res_without

Unnamed: 0,Best params,Best score
LR,"{'LR__C': 1, 'LR__penalty': 'l1', 'LR__solver'...",0.729318
KNN,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",0.725399
SVC,"{'SVC__C': 1000, 'SVC__gamma': 0.001, 'SVC__ke...",0.706604
NuSVC,"{'NuSVC__gamma': 0.001, 'NuSVC__kernel': 'sigm...",0.714151
RF,"{'RF__max_depth': 20, 'RF__n_estimators': 20}",0.713643
XGB,"{'XGB__learning_rate': 0.3, 'XGB__max_depth': ...",0.656234


In [34]:
# Almacenar los resultados
df_res_without.to_csv(dataset_files_folder_comparative+dataset_name+'_7_class_without.csv')
with open(dataset_files_folder_comparative+dataset_name+'_7_class_without.pkl', 'wb') as f:
    pickle.dump(df_res_without, f)