In [9]:
import pickle
import pandas as pd

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from src.MonoFADLModel import MonoFADLModel
from src.MultiFADLModelOvR import MultiFADLModelOvR
from src.NoSelectionModel import NoSelectionModel

# Seed for neural network executions
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [10]:
comparative_results = pd.DataFrame(columns=['Accuracy', 'Number of selected features', 'Selected Features'])

* Preprocesamiento

In [11]:
# Load dataset and preprocess it
# https://www.kaggle.com/datasets/brunogrisci/brain-cancer-gene-expression-cumida

brain = pd.read_csv('data/Brain_GSE50161.csv')
brain = brain.drop(columns=['samples'])

# Identificar cada categoria con un numero
brain['type'] = brain['type'].map({
    'normal': 0,
    'ependymoma': 1,
    'glioblastoma': 2,
    'medulloblastoma': 3,
    'pilocytic_astrocytoma': 4
})

# Picke store 
with open('data/brain.pkl', 'wb') as f:
    pickle.dump(brain, f)

brain = pickle.load(open('data/brain.pkl', 'rb'))
brain

Unnamed: 0,type,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,1,12.498150,7.604868,6.880934,9.027128,4.176175,7.224920,6.085942,6.835999,5.898355,...,9.979005,9.926470,12.719785,12.777792,5.403657,4.870548,4.047380,3.721936,4.516434,4.749940
1,1,13.067436,7.998090,7.209076,9.723322,4.826126,7.539381,6.250962,8.012549,5.453147,...,11.924749,11.215930,13.605662,13.401342,5.224555,4.895315,3.786437,3.564481,4.430891,4.491416
2,1,13.068179,8.573674,8.647684,9.613002,4.396581,7.813101,6.007746,7.178156,8.400266,...,12.154405,11.532460,13.764593,13.477800,5.303565,5.052184,4.005343,3.595382,4.563494,4.668827
3,1,12.456040,9.098977,6.628784,8.517677,4.154847,8.361843,6.596064,6.347285,4.900380,...,11.969072,11.288801,13.600828,13.379029,4.953429,4.708371,3.892318,3.759429,4.748381,4.521275
4,1,12.699958,8.800721,11.556188,9.166309,4.165891,7.923826,6.212754,6.866387,5.405628,...,11.411701,11.169317,13.751442,13.803646,4.892677,4.773806,3.796856,3.577544,4.504385,4.541450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,4,12.658228,8.843270,7.672655,9.125912,5.495477,8.603892,7.747514,5.828978,6.926720,...,13.170441,12.676080,14.124837,13.996436,4.913579,4.399176,3.878855,3.680103,4.726784,4.564637
126,4,12.812823,8.510550,8.729699,9.104402,3.967228,7.719089,7.092496,6.504812,6.157163,...,13.040267,12.403316,13.978009,13.812916,5.189600,4.912618,3.764800,3.664920,4.628355,4.761351
127,4,12.706991,8.795721,7.772359,8.327273,6.329383,8.550471,6.613332,6.308945,7.494852,...,12.825383,12.439265,14.328373,14.008693,4.931460,4.712895,3.913637,3.700964,4.764693,4.834952
128,4,12.684593,8.293938,7.228186,8.494428,6.049414,8.214729,7.287758,5.732710,6.296021,...,13.116581,12.657967,14.390346,14.194904,4.871092,4.739400,3.782980,3.920363,4.665584,4.613326


In [25]:
Xbrain = brain.drop(['type'], axis=1)
ybrain = brain['type']

Xbrain_trainval, Xbrain_test, ybrain_trainval, ybrain_test = train_test_split(
    Xbrain, 
    ybrain, test_size=0.15, 
    random_state=SEED)

Xbrain_train, Xbrain_val, ybrain_train, ybrain_val = train_test_split(
    Xbrain_trainval, ybrain_trainval, test_size=0.2, 
    random_state=SEED)

In [26]:
ybrain_train.value_counts(), ybrain_val.value_counts(), ybrain_test.value_counts()

(type
 1    33
 2    19
 0    12
 4    12
 3    12
 Name: count, dtype: int64,
 type
 2    7
 1    7
 3    5
 4    3
 Name: count, dtype: int64,
 type
 2    8
 1    6
 3    5
 0    1
 Name: count, dtype: int64)

In [27]:
# Normalize numerical variables
def categorize_variables(df):

    categorical = []
    numerical = []

    for column in df.columns:
        unique_values = df[column].unique()
        n_unique = len(unique_values)

        if n_unique <= 10:
            categorical.append((column, unique_values.tolist()))
        else:
            numerical.append(column)

    return {
        'categorical': categorical,
        'numerical': numerical
    }
variables_numericas = categorize_variables(brain.drop('type', axis=1))['numerical']

scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), variables_numericas)
    ],
    remainder='passthrough'  # No escalar las demás variables
)

# Normalize train set
Xbrain_train_scaled = scaler.fit_transform(Xbrain_train)

# Normalize val and test set
Xbrain_val_scaled = scaler.transform(Xbrain_val)
Xbrain_test_scaled = scaler.transform(Xbrain_test)

Xbrain_train_scaled = pd.DataFrame(Xbrain_train_scaled, columns=Xbrain_train.columns)
Xbrain_val_scaled = pd.DataFrame(Xbrain_val_scaled, columns=Xbrain_val.columns)
Xbrain_test_scaled = pd.DataFrame(Xbrain_test_scaled, columns=Xbrain_test.columns)


* Noselection results

In [28]:
model = NoSelectionModel(
    n_inputs=Xbrain_train_scaled.columns.values.shape[0],
    n_class=ybrain_train.unique().shape[0]
)       

model.fit(
    Xbrain_train_scaled,
    ybrain_train,
    Xbrain_val_scaled,
    ybrain_val,
    epochs=50
)

model.evaluate(Xbrain_test_scaled, ybrain_test)


# Pickle store
with open('results/brain_NoSelection.pkl', 'wb') as f:
    pickle.dump(model, f)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [29]:
verbose1 = model.get_verbose()
verbose1

{'model': <keras.src.engine.functional.Functional at 0x210661e2cd0>,
 'selected_features': array(['1007_s_at', '1053_at', '117_at', ..., 'AFFX-TrpnX-3_at',
        'AFFX-TrpnX-5_at', 'AFFX-TrpnX-M_at'], dtype=object),
 'predictionsproba': array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2776004e-15,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 2.5754859e-28,
         0.0000000e+00],
        [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 1.9797236e-29, 2.6490920e-29, 1.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 1.0284693e-24, 9.7165973e-20, 1.0000000e+00,
         0.0000000e+00],
        [0

In [30]:
comparative_results.loc['NoSelection'] = [verbose1['results'][1], verbose1['selected_features'].shape[0], verbose1['selected_features']]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.8,54675,"[1007_s_at, 1053_at, 117_at, 121_at, 1255_g_at..."


* MonoFADL results

In [31]:
model2 = MonoFADLModel(
    n_inputs=Xbrain_train_scaled.columns.values.shape[0],
    n_class=ybrain_train.unique().shape[0]
)

model2.fit(
    Xbrain_train_scaled,
    ybrain_train,
    Xbrain_val_scaled,
    ybrain_val,
    epochs=50
)

model2.evaluate(Xbrain_test_scaled, ybrain_test)

model2.get_verbose()

# Pickle store
with open('results/brain_MonoFADL.pkl', 'wb') as f:
    pickle.dump(model2, f)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50


In [32]:
verbose2 = model2.get_verbose()
verbose2

{'model': <keras.src.engine.functional.Functional at 0x2100dba9890>,
 'selected_features': array(['1007_s_at', '1053_at', '117_at', ..., 'AFFX-r2-Ec-bioD-3_at',
        'AFFX-r2-P1-cre-3_at', 'AFFX-ThrX-M_at'], dtype=object),
 'predictionsproba': array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.8119153e-35,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 2.4210849e-16,
         2.2182031e-34],
        [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 4.1431732e-32,
         0.0000000e+00],
        [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
        [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
         0.0000000e+00],
  

In [33]:
comparative_results.loc['MonoFADL'] = [verbose2['results'][1], verbose2['selected_features'].shape[0], verbose2['selected_features']]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.8,54675,"[1007_s_at, 1053_at, 117_at, 121_at, 1255_g_at..."
MonoFADL,0.9,17481,"[1007_s_at, 1053_at, 117_at, 1294_at, 1438_at,..."


* MultiFADL One-versus-Rest results

In [34]:
model3 = MultiFADLModelOvR(
)

model3.fit(
    Xbrain_train_scaled,
    ybrain_train,
    Xbrain_val_scaled,
    ybrain_val,
    epochs=50
)

model3.evaluate(Xbrain_test_scaled, ybrain_test)

model3.get_verbose()

# Pickle store
with open('results/brain_MultiFADL.pkl', 'wb') as f:
    pickle.dump(model3, f)

--> Training model class 0 vs rest
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
--> Training model class 1 vs rest
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50


In [35]:
verbose3 = model3.get_verbose()
verbose3

{'models': {0: <src.MonoFADLModel.MonoFADLModel at 0x21064eff850>,
  1: <src.MonoFADLModel.MonoFADLModel at 0x210661d6f50>,
  2: <src.MonoFADLModel.MonoFADLModel at 0x2100d683190>,
  4: <src.MonoFADLModel.MonoFADLModel at 0x21075a75fd0>,
  3: <src.MonoFADLModel.MonoFADLModel at 0x210759d9510>},
 'selected_features_per_class': {0: array(['204366_s_at', '207227_x_at', '208675_s_at', '218651_s_at',
         '219283_at', '219317_at', '234979_at', '239764_at', '244471_x_at',
         '38398_at'], dtype=object),
  1: array(['1553734_at', '204874_x_at', '204932_at', '204933_s_at',
         '205464_at', '205578_at', '206773_at', '210033_s_at', '214147_at',
         '220156_at', '220334_at', '231192_at', '232984_at', '233516_s_at',
         '236085_at', '239942_at', '242162_at', '244364_at', '41660_at'],
        dtype=object),
  2: array(['1557359_at', '1558568_a_at', '1569241_a_at', '202995_s_at',
         '204639_at', '205775_at', '217784_at', '222118_at', '224376_s_at',
         '228235_at',

In [36]:
comparative_results.loc['MultiFADL'] = [verbose3['acc_global'], 
                                        {clas: verbose3['selected_features_per_class'][clas].shape[0] for clas in verbose3['selected_features_per_class']},
                                        verbose3['selected_features_per_class']]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.8,54675,"[1007_s_at, 1053_at, 117_at, 121_at, 1255_g_at..."
MonoFADL,0.9,17481,"[1007_s_at, 1053_at, 117_at, 1294_at, 1438_at,..."
MultiFADL,0.9,"{0: 10, 1: 19, 2: 19, 4: 42, 3: 12}","{0: ['204366_s_at', '207227_x_at', '208675_s_a..."


In [37]:
comparative_results.to_csv('results/brain_ComparativeResults.csv')