## Predicción: aprendizaje supervisado sin selección de atributos

In [2]:
# Parametros de configuracion del script

dataset_name = 'prostateCancer'                         # nombre del dataset
not_genes_columns = ['group']                           # columnas que no miden valores genicos
dataset_files_folder_preprocessing ="P1_ficheros_preprocesamiento_"+dataset_name+"/" # directorio con ficheros de preprocesamiento
dataset_files_folder_comparative ="P2_ficheros_comparativaSupervisado_"+dataset_name+"/" # directorio con ficheros de comparativa con/sin seleccion de atributos
file_source_trainval = dataset_files_folder_comparative+dataset_name+'_7_trainval.csv'    # directorio con datos de train-validation
n_splits = 3                                             # numero de particiones para validacion cruzada                         

#### Importaciones

In [3]:
# Python imports
# ----------------------------------------

# Variable export
import pickle

# Data structure
import pandas as pd
import numpy as np

# Sklearn preprocess, split, pipeline, GridSearch
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
import statistics

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from skopt import BayesSearchCV

# Sklearn classification models
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn import svm
from sklearn.svm import NuSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import RandomForestClassifier

# Deep learning
# import tensorflow as tf
# from tensorflow import keras

# Hacer reproducibles los experimentos en sklearn y tensorflo
SEED=42
np.random.seed(SEED)

# tf.keras.utils.set_random_seed(SEED)


In [4]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#### Lectura del dataset

In [5]:
# Division del conjunto original en un conjunto de entrenamiento-validacion y otro de test
import subprocess
subprocess.call(['python', '../split_dataset.py', 
                dataset_files_folder_preprocessing+dataset_name+'_6_diffexp.csv', 
                dataset_files_folder_comparative+dataset_name+'_7_trainval.csv',  
                dataset_files_folder_comparative+dataset_name+'_7_test.csv', 
                'group'])

0

In [6]:
dataset = pd.read_csv(file_source_trainval, sep=',', header=0, index_col=0)
dataset

Unnamed: 0,group,AAK1,AAMP,AANAT,AASDHPPT,AATF,AATK,ABAT,ABCA1,ABCA2,...,ZNF92,ZNHIT1,ZNRD1ASP,ZNRD2,ZP2,ZPR1,ZSCAN12,ZSCAN26,ZSCAN9,ZSWIM8
67,0,-17.150245,39.816176,877.416912,24.822059,132.980147,1802.222059,45.801838,179.816912,5.400735,...,-102.549265,50.452941,39.816176,12.741912,108.791912,96.073529,506.251471,168.285294,64.431618,793.502941
134,0,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,...,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853
12,1,25.135294,79.930147,76.789706,137.674265,100.394853,177.502574,33.021324,2.381495,-15.910784,...,-97.505515,140.274265,10.325000,63.831250,11.557353,54.566912,92.991544,22.476471,29.462500,40.244118
29,1,39.574020,63.068382,118.509926,129.107353,103.919118,163.919118,11.243750,26.236765,-23.431250,...,-47.862132,106.019853,11.904412,44.525000,15.952941,65.604412,123.169118,38.389706,18.432353,66.128676
9,1,25.267402,90.422059,75.060294,116.741176,107.738603,164.284559,118.533456,10.532230,-40.509559,...,-44.150000,181.593382,2.542647,54.169118,19.483088,63.510294,72.112868,9.051471,22.849265,12.133088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,0,3.070588,5.379412,444.318382,27.744853,132.272794,839.206618,32.542279,87.457598,7.591176,...,-85.967647,74.186765,63.672794,13.180882,39.462500,30.752206,377.258456,110.898529,43.410294,9.900735
86,0,64.329412,35.511029,267.215441,58.744853,161.503676,497.538235,21.019118,63.220833,18.035049,...,-81.195588,75.772059,22.723529,6.840441,51.691176,65.397794,219.042463,32.278676,12.974265,157.344118
18,1,22.640931,91.280147,145.489706,89.536765,153.436765,278.596324,43.356618,26.516667,-21.569363,...,-110.662500,257.409559,13.494853,14.434559,9.977941,77.666912,128.450368,17.613235,11.830882,-19.511765
75,0,106.710417,9.221324,616.604044,50.600000,134.906985,807.845588,9.819485,83.646814,7.640686,...,-113.861029,75.400000,29.001471,42.709559,72.830147,75.400000,267.358824,77.969118,40.033824,47.842647


In [7]:
X = dataset.drop(['group'], axis=1)
X

Unnamed: 0,AAK1,AAMP,AANAT,AASDHPPT,AATF,AATK,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF92,ZNHIT1,ZNRD1ASP,ZNRD2,ZP2,ZPR1,ZSCAN12,ZSCAN26,ZSCAN9,ZSWIM8
67,-17.150245,39.816176,877.416912,24.822059,132.980147,1802.222059,45.801838,179.816912,5.400735,64.431618,...,-102.549265,50.452941,39.816176,12.741912,108.791912,96.073529,506.251471,168.285294,64.431618,793.502941
134,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,...,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853,30.744853
12,25.135294,79.930147,76.789706,137.674265,100.394853,177.502574,33.021324,2.381495,-15.910784,67.462500,...,-97.505515,140.274265,10.325000,63.831250,11.557353,54.566912,92.991544,22.476471,29.462500,40.244118
29,39.574020,63.068382,118.509926,129.107353,103.919118,163.919118,11.243750,26.236765,-23.431250,50.740441,...,-47.862132,106.019853,11.904412,44.525000,15.952941,65.604412,123.169118,38.389706,18.432353,66.128676
9,25.267402,90.422059,75.060294,116.741176,107.738603,164.284559,118.533456,10.532230,-40.509559,34.427941,...,-44.150000,181.593382,2.542647,54.169118,19.483088,63.510294,72.112868,9.051471,22.849265,12.133088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,3.070588,5.379412,444.318382,27.744853,132.272794,839.206618,32.542279,87.457598,7.591176,79.657353,...,-85.967647,74.186765,63.672794,13.180882,39.462500,30.752206,377.258456,110.898529,43.410294,9.900735
86,64.329412,35.511029,267.215441,58.744853,161.503676,497.538235,21.019118,63.220833,18.035049,65.397794,...,-81.195588,75.772059,22.723529,6.840441,51.691176,65.397794,219.042463,32.278676,12.974265,157.344118
18,22.640931,91.280147,145.489706,89.536765,153.436765,278.596324,43.356618,26.516667,-21.569363,62.832353,...,-110.662500,257.409559,13.494853,14.434559,9.977941,77.666912,128.450368,17.613235,11.830882,-19.511765
75,106.710417,9.221324,616.604044,50.600000,134.906985,807.845588,9.819485,83.646814,7.640686,80.763235,...,-113.861029,75.400000,29.001471,42.709559,72.830147,75.400000,267.358824,77.969118,40.033824,47.842647


In [8]:
y = dataset['group']
y

67     0
134    0
12     1
29     1
9      1
      ..
52     0
86     0
18     1
75     0
105    1
Name: group, Length: 115, dtype: int64

#### Definición de pipelines

In [8]:
# Regresión logistica
pipeline_LR = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('LR', LogisticRegression(random_state=SEED))
])

In [9]:
# KNN
pipeline_KNN = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('KNN', KNeighborsClassifier())
])

In [10]:
# SVC
pipeline_SVC = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('SVC', SVC(random_state=SEED))
])

In [11]:
# NuSVC
pipeline_NuSVC = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('NuSVC', NuSVC(random_state=SEED))
])

In [12]:
# Random Forest
pipeline_RF = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('RF', RandomForestClassifier(random_state=SEED))
])

In [13]:
# XGBoost
pipeline_XGB = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('XGB', XGBClassifier(random_state=SEED))
])

#### Ajuste de hiperparámetros: GridSearch y validación cruzada estratificada

In [14]:
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) 

In [111]:
# Regresión logistica
param_grid_LR = [{
    'LR__penalty': ['l1', 'l2'], 
    'LR__C': [0.1, 1, 10, 20], 
    'LR__solver': ['liblinear']}]
grid_search_LR = GridSearchCV(pipeline_LR, param_grid=param_grid_LR, cv=skf, scoring='accuracy')
grid_search_LR.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('LR',
                                        LogisticRegression(random_state=42))]),
             param_grid=[{'LR__C': [0.1, 1, 10, 50, 100],
                          'LR__penalty': ['l1', 'l2'],
                          'LR__solver': ['liblinear']}],
             scoring='accuracy')

In [112]:
print(grid_search_LR.best_params_)
print(grid_search_LR.best_score_)

{'LR__C': 10, 'LR__penalty': 'l2', 'LR__solver': 'liblinear'}
0.8958614484930275


In [81]:
with open(dataset_files_folder_comparative+'without_grid_LR.pkl', 'wb') as f:
    pickle.dump(grid_search_LR, f)

In [56]:
# KNN
param_grid_KNN = [{
    'KNN__n_neighbors': [6,10,20,50], 
    'KNN__weights': ['uniform', 'distance'], 
    'KNN__metric': ['euclidean', 'manhattan']}]
grid_search_KNN = GridSearchCV(pipeline_KNN, param_grid=param_grid_KNN, cv=skf, scoring='accuracy')
grid_search_KNN.fit(X, y)


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('KNN', KNeighborsClassifier())]),
             param_grid=[{'KNN__metric': ['euclidean', 'manhattan'],
                          'KNN__n_neighbors': [6, 10, 20, 50],
                          'KNN__weights': ['uniform', 'distance']}],
             scoring='accuracy')

In [57]:
print(grid_search_KNN.best_params_)
print(grid_search_KNN.best_score_)

{'KNN__metric': 'euclidean', 'KNN__n_neighbors': 6, 'KNN__weights': 'distance'}
0.8526765632028791


In [80]:
with open(dataset_files_folder_comparative+'without_grid_KNN.pkl', 'wb') as f:
    pickle.dump(grid_search_KNN, f)

In [69]:
# SVC
param_grid_SVC = [{'SVC__kernel': ['linear', 'rbf', 'sigmoid'], 
                   'SVC__C': [1, 10, 100, 1000],
                   'SVC__gamma':[1,0.1,0.001,0.0001]
                   }]
grid_search_SVC = GridSearchCV(pipeline_SVC, param_grid=param_grid_SVC, cv=skf, scoring='accuracy')
grid_search_SVC.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('SVC', SVC(random_state=42))]),
             param_grid=[{'SVC__C': [1, 10, 100, 1000],
                          'SVC__gamma': [1, 0.1, 0.001, 0.0001],
                          'SVC__kernel': ['linear', 'rbf', 'sigmoid']}],
             scoring='accuracy')

In [70]:
print(grid_search_SVC.best_params_)
print(grid_search_SVC.best_score_)

{'SVC__C': 10, 'SVC__gamma': 0.001, 'SVC__kernel': 'rbf'}
0.887314439946019


In [82]:
with open(dataset_files_folder_comparative+'without_grid_SVC.pkl', 'wb') as f:
    pickle.dump(grid_search_SVC, f)

In [78]:
# NuSVC
param_grid_NuSVC = [{'NuSVC__kernel': ['linear', 'rbf', 'sigmoid'], 
                   'NuSVC__gamma':[1,0.1,0.001,0.0001],
                   'NuSVC__nu': [0.1, 0.5]
    }]

grid_search_NuSVC = GridSearchCV(pipeline_NuSVC, param_grid=param_grid_NuSVC, cv=skf, scoring='accuracy')
grid_search_NuSVC.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('NuSVC', NuSVC(random_state=42))]),
             param_grid=[{'NuSVC__gamma': [1, 0.1, 0.001, 0.0001],
                          'NuSVC__kernel': ['linear', 'rbf', 'sigmoid'],
                          'NuSVC__nu': [0.1, 0.5]}],
             scoring='accuracy')

In [79]:
print(grid_search_NuSVC.best_params_)
print(grid_search_NuSVC.best_score_)

{'NuSVC__gamma': 0.001, 'NuSVC__kernel': 'sigmoid', 'NuSVC__nu': 0.1}
0.8783175888439047


In [83]:
with open(dataset_files_folder_comparative+'without_grid_NuSVC.pkl', 'wb') as f:
    pickle.dump(grid_search_NuSVC, f)

In [95]:
# Random Forest
param_grid_RF = [{'RF__n_estimators': [5, 10, 20, 100], 
                  'RF__max_depth': [5, 10, 20]
                  }]

grid_search_RF = GridSearchCV(pipeline_RF, param_grid=param_grid_RF, cv=skf, scoring='accuracy')
grid_search_RF.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('RF',
                                        RandomForestClassifier(random_state=42))]),
             param_grid=[{'RF__max_depth': [5, 10, 20, 50],
                          'RF__n_estimators': [5, 10, 20, 100]}],
             scoring='accuracy')

In [96]:
print(grid_search_RF.best_params_)
print(grid_search_RF.best_score_)

{'RF__max_depth': 5, 'RF__n_estimators': 100}
0.843229869545659


In [92]:
with open(dataset_files_folder_comparative+'without_grid_RF.pkl', 'wb') as f:
    pickle.dump(grid_search_RF, f)

In [15]:
# XGBoost
param_grid_XGB = [{'XGB__n_estimators': [5,10,20], 
                   'XGB__max_depth': [5, 10, 20], 
                   'XGB__learning_rate': [0.01, 0.1, 0.3, 1]
                   }]

grid_search_XGB = GridSearchCV(pipeline_XGB, param_grid=param_grid_XGB, cv=skf, scoring='f1')
grid_search_XGB.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('XGB',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      feature_types=None,
                                                      gamma..

In [17]:
print(grid_search_XGB.best_params_)
print(grid_search_XGB.best_score_)

{'XGB__learning_rate': 1, 'XGB__max_depth': 5, 'XGB__n_estimators': 5}
0.8942526035549291


In [122]:
with open(dataset_files_folder_comparative+'without_grid_XGB.pkl', 'wb') as f:
    pickle.dump(grid_search_XGB, f)

In [123]:
df_res_without = pd.DataFrame(
    np.transpose([
    [grid_search_LR.best_params_, grid_search_KNN.best_params_, grid_search_SVC.best_params_, grid_search_NuSVC.best_params_, grid_search_RF.best_params_, grid_search_XGB.best_params_], 
    [grid_search_LR.best_score_, grid_search_KNN.best_score_, grid_search_SVC.best_score_, grid_search_NuSVC.best_score_, grid_search_RF.best_score_, grid_search_XGB.best_score_]
    ]), 
    index=['LR', 'KNN', 'SVC', 'NuSVC', 'RF', 'XGB'], columns=['Best params', 'Best score']
)

df_res_without

Unnamed: 0,Best params,Best score
LR,"{'LR__C': 10, 'LR__penalty': 'l2', 'LR__solver...",0.895861
KNN,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",0.852677
SVC,"{'SVC__C': 10, 'SVC__gamma': 0.001, 'SVC__kern...",0.887314
NuSVC,"{'NuSVC__gamma': 0.001, 'NuSVC__kernel': 'sigm...",0.878318
RF,"{'RF__max_depth': 5, 'RF__n_estimators': 100}",0.84323
XGB,"{'XGB__learning_rate': 1, 'XGB__max_depth': 5,...",0.894253


In [124]:
# Almacenar los resultados
df_res_without.to_csv(dataset_files_folder_comparative+dataset_name+'_7_class_without.csv')
with open(dataset_files_folder_comparative+dataset_name+'_7_class_without.pkl', 'wb') as f:
    pickle.dump(df_res_without, f)