# Pruebas MultiFADL One-versus-Rest. Water Pump dataset

In [1]:
import pickle
import pandas as pd

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from src.MonoFADLModel import MonoFADLModel
from src.MultiFADLModelOvR import MultiFADLModelOvR
from src.NoSelectionModel import NoSelectionModel

# Single seed reused by the data splits and by network training.
SEED = 1234
# Seeds Python's `random`, NumPy and TensorFlow in one call so that no RNG
# source used during training is left unseeded.
tf.keras.utils.set_random_seed(SEED)


In [2]:
# Empty summary table; later cells add one row per trained model.
comparative_results = pd.DataFrame(
    columns=[
        'Accuracy',
        'Number of selected features',
        'Selected Features',
    ]
)

* Preprocesamiento

In [3]:
# Lectura y preparacion del dataset

# waterPump = pd.read_csv('data/waterPump.csv')
# display(waterPump)

# # Diccionario para el mapeo
# target_mapping = {
#     (0, 0, 1): 0,  # No funcional
#     (0, 1, 0): 1,  # Funcional necesitando reparación
#     (1, 0, 0): 2,  # Funcional
# }

# # Función para aplicar el mapeo usando el diccionario
# def map_target(row):
#     return target_mapping[(row['target_functional'], row['target_functional needs repair'], row['target_non functional'])]

# waterPump = waterPump.copy()
# waterPump['target'] = waterPump.apply(map_target, axis=1)
# waterPump = waterPump.drop(['target_functional', 'target_functional needs repair', 'target_non functional'], axis=1)

# waterPump = waterPump.drop('recorded_by_GeoData Consultants Ltd', axis=1) # Eliminamos columna innecesaria

# Pickle store
# with open('data/waterPump.pkl', 'wb') as f:
#     pickle.dump(waterPump, f)

# Load the preprocessed dataset from its pickle cache. The context manager
# guarantees the file handle is closed once the frame has been read.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open('data/waterPump.pkl', 'rb') as f:
    waterPump = pickle.load(f)
waterPump

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,funder_0,funder_A/co Germany,funder_Aar,funder_Abas Ka,...,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other,target
0,6000.0,1390,34.938093,-9.856322,0,109,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
1,0.0,1399,34.698766,-2.147466,0,280,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
2,25.0,686,37.460664,-3.821329,0,250,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
3,0.0,263,38.486161,-11.155298,0,58,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0,31.130847,-1.825359,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,10.0,1210,37.169807,-3.253847,0,125,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
59396,4700.0,1212,35.249991,-9.070629,0,56,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
59397,0.0,0,34.017087,-8.750434,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,2
59398,0.0,0,35.861315,-6.378573,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,2


In [4]:
# Separate the label column from the predictors.
ywaterPump = waterPump['target']
XwaterPump = waterPump.drop(columns=['target'])

# First cut: hold out 20% of the rows as the test set.
(XwaterPump_trainval, XwaterPump_test,
 ywaterPump_trainval, ywaterPump_test) = train_test_split(
    XwaterPump, ywaterPump,
    test_size=0.2,
    random_state=SEED,
)

# Second cut: 25% of the remaining rows become the validation set, which
# yields a 60/20/20 train/validation/test partition overall.
(XwaterPump_train, XwaterPump_val,
 ywaterPump_train, ywaterPump_val) = train_test_split(
    XwaterPump_trainval, ywaterPump_trainval,
    test_size=0.25,
    random_state=SEED,
)

In [5]:
# Class distribution of each split, in train / validation / test order.
tuple(
    labels.value_counts()
    for labels in (ywaterPump_train, ywaterPump_val, ywaterPump_test)
)

(target
 2    19433
 0    13625
 1     2582
 Name: count, dtype: int64,
 target
 2    6385
 0    4623
 1     872
 Name: count, dtype: int64,
 target
 2    6441
 0    4576
 1     863
 Name: count, dtype: int64)

In [6]:
# Normalize numerical variables
def categorize_variables(df, max_unique=10):
    """Split the columns of ``df`` into categorical and numerical ones.

    A column counts as categorical when ``df[column].unique()`` yields at
    most ``max_unique`` distinct values; every other column is numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    max_unique : int, default 10
        Largest number of distinct values a column may have and still be
        classified as categorical.

    Returns
    -------
    dict
        ``{'categorical': [(column, distinct_values), ...],
        'numerical': [column, ...]}``, both lists in column order.
    """
    categorical = []
    numerical = []

    for column in df.columns:
        unique_values = df[column].unique()

        if len(unique_values) <= max_unique:
            # Keep the observed values next to the column name.
            categorical.append((column, unique_values.tolist()))
        else:
            numerical.append(column)

    return {
        'categorical': categorical,
        'numerical': numerical
    }
# Columns with more than 10 distinct values are the ones to standardise.
variables_numericas = categorize_variables(waterPump.drop('target', axis=1))['numerical']

scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), variables_numericas)
    ],
    remainder='passthrough'  # leave every other variable unscaled
)

# Fit the scaler on the training split only, then reuse it for validation and test.
XwaterPump_train_scaled = scaler.fit_transform(XwaterPump_train)
XwaterPump_val_scaled = scaler.transform(XwaterPump_val)
XwaterPump_test_scaled = scaler.transform(XwaterPump_test)

# ColumnTransformer emits the scaled columns first and appends the passthrough
# columns after them, so its output order equals the input order only when the
# numerical columns happen to come first. Build the labels in output order so
# that feature names always match the data they describe.
numerical_set = set(variables_numericas)
scaled_columns = list(variables_numericas) + [
    column for column in XwaterPump_train.columns if column not in numerical_set
]

XwaterPump_train_scaled = pd.DataFrame(XwaterPump_train_scaled, columns=scaled_columns)
XwaterPump_val_scaled = pd.DataFrame(XwaterPump_val_scaled, columns=scaled_columns)
XwaterPump_test_scaled = pd.DataFrame(XwaterPump_test_scaled, columns=scaled_columns)


* NoSelection results

In [6]:
# Baseline model: sized from the number of scaled feature columns and the
# number of distinct labels in the training split.
model = NoSelectionModel(
    n_inputs=len(XwaterPump_train_scaled.columns),
    n_class=len(ywaterPump_train.unique()),
)

# Train with the validation split passed alongside, then score on the test split.
model.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=50,
)
model.evaluate(XwaterPump_test_scaled, ywaterPump_test)

# Persist the fitted wrapper so the results can be reloaded without retraining.
with open('results/WaterPump_NoSelection.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


In [14]:
# Summary dict of the baseline run; the output below shows the keys 'model',
# 'selected_features', 'predictionsproba' and 'results'.
verbose1 = model.get_verbose()
verbose1

{'model': <keras.src.engine.functional.Functional at 0x21b2ae05d50>,
 'selected_features': array(['amount_tsh', 'gps_height', 'longitude', ...,
        'waterpoint_type_group_hand pump',
        'waterpoint_type_group_improved spring',
        'waterpoint_type_group_other'], dtype=object),
 'predictionsproba': array([[4.4160137e-01, 6.5702140e-02, 4.9269655e-01],
        [1.3308047e-03, 5.1563361e-04, 9.9815351e-01],
        [6.2661119e-02, 1.8222762e-03, 9.3551660e-01],
        ...,
        [4.9736467e-01, 2.9354095e-02, 4.7328135e-01],
        [1.7838441e-01, 4.1098362e-01, 4.1063195e-01],
        [1.3808557e-03, 1.1429456e-04, 9.9850488e-01]], dtype=float32),
 'results': [0.5283778309822083, 0.7918350100517273]}

In [17]:
# Add the baseline row: accuracy, how many features were used, and their names.
comparative_results.loc['NoSelection'] = [
    verbose1['results'][1],  # second entry is stored as Accuracy (presumably [loss, accuracy])
    len(verbose1['selected_features']),
    verbose1['selected_features'],
]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.791835,9214,"[amount_tsh, gps_height, longitude, latitude, ..."


* MonoFADL results

In [7]:
# MonoFADL model: sized from the number of scaled feature columns and the
# number of distinct labels in the training split.
model2 = MonoFADLModel(
    n_inputs=len(XwaterPump_train_scaled.columns),
    n_class=len(ywaterPump_train.unique()),
)

# Train with the validation split passed alongside, then score on the test split.
model2.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=50,
)
model2.evaluate(XwaterPump_test_scaled, ywaterPump_test)

# Return value is discarded here; a later cell stores and displays the summary.
model2.get_verbose()

# Persist the fitted wrapper so the results can be reloaded without retraining.
with open('results/WaterPump_MonoFADL.pkl', 'wb') as pkl_file:
    pickle.dump(model2, pkl_file)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50


In [18]:
# Summary dict of the MonoFADL run; a later cell reads its 'results' and
# 'selected_features' entries.
verbose2 = model2.get_verbose()
verbose2

{'model': <keras.src.engine.functional.Functional at 0x21b0057ce10>,
 'selected_features': array(['amount_tsh', 'gps_height', 'longitude', 'funder_Danida',
        'funder_Government Of Tanzania', 'installer_DWE',
        'basin_Lake Nyasa', 'basin_Pangani', 'region_code_11',
        'region_code_17', 'district_code_3', 'district_code_4',
        'scheme_management_VWC', 'construction_year_2010',
        'extraction_type_gravity', 'extraction_type_group_other',
        'extraction_type_class_handpump', 'payment_pay per bucket',
        'payment_unknown', 'payment_type_never pay',
        'water_quality_unknown', 'quantity_dry', 'quantity_group_enough',
        'source_type_borehole', 'source_type_shallow well',
        'source_type_spring',
        'waterpoint_type_communal standpipe multiple',
        'waterpoint_type_group_other'], dtype=object),
 'predictionsproba': array([[0.18376283, 0.02048945, 0.79574776],
        [0.22598024, 0.04873891, 0.7252809 ],
        [0.08608519, 0.0224

In [19]:
# Add the MonoFADL row: accuracy, number of selected features, and their names.
comparative_results.loc['MonoFADL'] = [
    verbose2['results'][1],  # second entry is stored as Accuracy (presumably [loss, accuracy])
    len(verbose2['selected_features']),
    verbose2['selected_features'],
]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.791835,9214,"[amount_tsh, gps_height, longitude, latitude, ..."
MonoFADL,0.744529,28,"[amount_tsh, gps_height, longitude, funder_Dan..."


* MultiFADL One-versus-Rest results

In [7]:
# One-versus-Rest MultiFADL model, built with its default constructor arguments.
model3 = MultiFADLModelOvR()

# Train with the validation split passed alongside (the log below shows one
# "class k vs rest" run per label), then score on the test split.
model3.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=50,
)
model3.evaluate(XwaterPump_test_scaled, ywaterPump_test)

# Return value is discarded here; a later cell stores and displays the summary.
model3.get_verbose()

# Persist the fitted wrapper so the results can be reloaded without retraining.
with open('results/WaterPump_MultiFADL.pkl', 'wb') as pkl_file:
    pickle.dump(model3, pkl_file)

--> Training model class 2 vs rest
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
--> Training model class 0 vs rest
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
--> Training model class 1 vs rest
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [8]:
# Summary dict of the One-versus-Rest run; a later cell reads its
# 'acc_global' and 'selected_features_per_class' entries.
verbose3 = model3.get_verbose()
verbose3

{'models': {2: <src.MonoFADLModel.MonoFADLModel at 0x1e2821fd890>,
  0: <src.MonoFADLModel.MonoFADLModel at 0x1e281616950>,
  1: <src.MonoFADLModel.MonoFADLModel at 0x1e281653990>},
 'selected_features_per_class': {2: array(['basin_Lake Nyasa', 'region_code_11', 'extraction_type_other',
         'quantity_enough', 'quantity_group_dry',
         'waterpoint_type_communal standpipe multiple',
         'waterpoint_type_other'], dtype=object),
  0: array(['funder_Government Of Tanzania', 'installer_DWE', 'region_code_11',
         'extraction_type_group_gravity',
         'extraction_type_group_nira/tanira', 'extraction_type_class_other',
         'management_wug', 'quantity_dry', 'quantity_enough',
         'source_spring', 'waterpoint_type_group_other'], dtype=object),
  1: array(['quantity_group_dry'], dtype=object)},
 'predictionsproba_per_model': {2: array([[0.70749605],
         [0.70749605],
         [0.70749605],
         ...,
         [0.70749605],
         [0.70749605],
         

In [13]:
# Add the MultiFADL row. Feature counts and feature names are kept per class,
# so those two cells hold dicts keyed by class label rather than scalars.
comparative_results.loc['MultiFADL'] = [
    verbose3['acc_global'],
    {
        label: len(features)
        for label, features in verbose3['selected_features_per_class'].items()
    },
    verbose3['selected_features_per_class'],
]
comparative_results

Unnamed: 0,Accuracy,Number of selected features,Selected Features
NoSelection,0.791835,9214,['amount_tsh' 'gps_height' 'longitude' ...\n '...
MonoFADL,0.744529,28,['amount_tsh' 'gps_height' 'longitude' 'funder...
MultiFADL,0.703114,"{2: 7, 0: 11, 1: 1}","{2: ['basin_Lake Nyasa', 'region_code_11', 'ex..."


In [None]:
def _as_plain(value):
    """Return ``value`` with NumPy arrays (also inside dicts) turned into lists.

    ``to_csv`` writes object cells through ``str()``, and NumPy abbreviates
    the text form of long arrays with ``...``, which would drop most feature
    names from the file. Plain lists are always written in full.
    """
    if isinstance(value, dict):
        return {key: _as_plain(item) for key, item in value.items()}
    if isinstance(value, np.ndarray):
        return value.tolist()
    return value


# Export a copy so the in-memory table keeps its original array values.
results_to_save = comparative_results.copy()
results_to_save['Selected Features'] = results_to_save['Selected Features'].map(_as_plain)
results_to_save.to_csv('results/WaterPump_ComparativeResults.csv')

In [None]:
# 