In [None]:
import pickle
import pandas as pd

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from src.MonoFADLModel import MonoFADLModel
from src.MultiFADLModelOvR import MultiFADLModelOvR
from src.NoSelectionModel import NoSelectionModel

# Seed for neural network executions
SEED = 1234
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# Seeding only NumPy and TensorFlow leaves the stdlib generator unseeded, so
# anything drawing from `random` would differ between runs.
tf.keras.utils.set_random_seed(SEED)


* Preprocessing

In [5]:
# Dataset loading and preparation
#
# The commented-out code below is not executed. It appears to record how
# data/waterPump.pkl was produced from the raw CSV and is kept for reference.

# waterPump = pd.read_csv('data/waterPump.csv')
# display(waterPump)

# # Mapping dictionary
# target_mapping = {
#     (0, 0, 1): 0,  # Non functional
#     (0, 1, 0): 1,  # Functional, needs repair
#     (1, 0, 0): 2,  # Functional
# }

# # Function that applies the mapping using the dictionary
# def map_target(row):
#     return target_mapping[(row['target_functional'], row['target_functional needs repair'], row['target_non functional'])]

# waterPump = waterPump.copy()
# waterPump['target'] = waterPump.apply(map_target, axis=1)
# waterPump = waterPump.drop(['target_functional', 'target_functional needs repair', 'target_non functional'], axis=1)

# waterPump = waterPump.drop('recorded_by_GeoData Consultants Ltd', axis=1) # Drop unnecessary column

# Pickle store
# with open('data/waterPump.pkl', 'wb') as f:
#     pickle.dump(waterPump, f)

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
# The context manager closes the file handle, which a bare open() call would
# leave open until garbage collection.
with open('data/waterPump.pkl', 'rb') as f:
    waterPump = pickle.load(f)
waterPump

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,funder_0,funder_A/co Germany,funder_Aar,funder_Abas Ka,...,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,waterpoint_type_group_cattle trough,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_other,target
0,6000.0,1390,34.938093,-9.856322,0,109,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
1,0.0,1399,34.698766,-2.147466,0,280,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
2,25.0,686,37.460664,-3.821329,0,250,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
3,0.0,263,38.486161,-11.155298,0,58,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0,31.130847,-1.825359,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,10.0,1210,37.169807,-3.253847,0,125,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
59396,4700.0,1212,35.249991,-9.070629,0,56,0,0,0,0,...,0,0,0,0,1,0,0,0,0,2
59397,0.0,0,34.017087,-8.750434,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,2
59398,0.0,0,35.861315,-6.378573,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,2


In [4]:
# Separate the label column from the predictors.
ywaterPump = waterPump['target']
XwaterPump = waterPump.drop(columns='target')

# First cut: hold out 20% of the rows as the test set.
(XwaterPump_trainval, XwaterPump_test,
 ywaterPump_trainval, ywaterPump_test) = train_test_split(
    XwaterPump, ywaterPump,
    test_size=0.2, random_state=SEED,
)

# Second cut: 25% of the remaining 80% becomes the validation set, which gives
# a 60/20/20 train/val/test partition of the full dataset.
(XwaterPump_train, XwaterPump_val,
 ywaterPump_train, ywaterPump_val) = train_test_split(
    XwaterPump_trainval, ywaterPump_trainval,
    test_size=0.25, random_state=SEED,
)

In [6]:
# Per-class row counts for the train, validation and test labels (in that order).
tuple(
    labels.value_counts()
    for labels in (ywaterPump_train, ywaterPump_val, ywaterPump_test)
)

(target
 2    19433
 0    13625
 1     2582
 Name: count, dtype: int64,
 target
 2    6385
 0    4623
 1     872
 Name: count, dtype: int64,
 target
 2    6441
 0    4576
 1     863
 Name: count, dtype: int64)

In [7]:
# Normalize numerical variables
def categorize_variables(df):
    """Split the columns of `df` into categorical and numerical by cardinality.

    A column with at most 10 distinct values is reported as categorical,
    any other column as numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    dict
        ``'categorical'``: list of ``(column, distinct_values)`` tuples, where
        ``distinct_values`` is a list in order of first appearance.
        ``'numerical'``: list of column names.
        Both lists follow the column order of `df`.
    """
    categorical, numerical = [], []

    for name in df.columns:
        distinct = df[name].unique()

        # High-cardinality columns are treated as numerical.
        if len(distinct) > 10:
            numerical.append(name)
            continue

        categorical.append((name, distinct.tolist()))

    return {'categorical': categorical, 'numerical': numerical}
# Columns that categorize_variables reports as numerical are the ones scaled.
variables_numericas = categorize_variables(waterPump.drop('target', axis=1))['numerical']

scaler = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), variables_numericas)
    ],
    remainder='passthrough'  # Leave the remaining variables unscaled
)

# ColumnTransformer outputs the transformed columns first and appends the
# passthrough columns to their right, so the output array is NOT in the input
# column order. Record the real output order so the columns can be labelled
# correctly rather than with the input order.
_numerical_set = set(variables_numericas)
scaled_column_order = variables_numericas + [
    col for col in XwaterPump_train.columns if col not in _numerical_set
]


def _scaled_frame(values, source):
    """Wrap a ColumnTransformer output array as a DataFrame laid out like `source`.

    Columns are labelled in the transformer's output order and then reordered
    to match `source.columns`. The row index of `source` is kept so the result
    shares row labels with the corresponding target Series.
    """
    frame = pd.DataFrame(values, columns=scaled_column_order, index=source.index)
    return frame[source.columns]


# Fit the scaler on the training set only, then apply it to every split.
XwaterPump_train_scaled = scaler.fit_transform(XwaterPump_train)

# Normalize val and test set with the statistics learned from train
XwaterPump_val_scaled = scaler.transform(XwaterPump_val)
XwaterPump_test_scaled = scaler.transform(XwaterPump_test)

XwaterPump_train_scaled = _scaled_frame(XwaterPump_train_scaled, XwaterPump_train)
XwaterPump_val_scaled = _scaled_frame(XwaterPump_val_scaled, XwaterPump_val)
XwaterPump_test_scaled = _scaled_frame(XwaterPump_test_scaled, XwaterPump_test)


* NoSelection results

In [8]:
# NoSelectionModel run: presumably the baseline without feature selection
# (inferred from the class name; confirm against src/NoSelectionModel).
model = NoSelectionModel(
    n_inputs=len(XwaterPump_train_scaled.columns),  # number of feature columns
    n_class=len(ywaterPump_train.unique()),         # distinct labels in train
)

# Train on the train split, passing the validation split alongside it.
model.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=2,
)

# Score on the held-out test split.
model.evaluate(XwaterPump_test_scaled, ywaterPump_test)

model.get_verbose()

# Persist the fitted wrapper object for later analysis.
with open('results/WaterPump_NoSelection.pkl', 'wb') as results_file:
    pickle.dump(model, results_file)

Epoch 1/2
Epoch 2/2


{'model': <keras.src.engine.functional.Functional at 0x22183674950>,
 'selected_features': array(['amount_tsh', 'gps_height', 'longitude', ...,
        'waterpoint_type_group_hand pump',
        'waterpoint_type_group_improved spring',
        'waterpoint_type_group_other'], dtype=object),
 'predictionsproba': array([[2.9228657e-01, 2.7322786e-02, 6.8039060e-01],
        [2.1733178e-03, 9.4881732e-05, 9.9773186e-01],
        [4.5998096e-02, 1.7143530e-03, 9.5228755e-01],
        ...,
        [2.8229904e-01, 2.6578672e-02, 6.9112223e-01],
        [2.7956101e-01, 4.8144704e-01, 2.3899193e-01],
        [4.2169189e-04, 6.8750349e-04, 9.9889082e-01]], dtype=float32),
 'results': [0.5473944544792175, 0.782575786113739]}

* MonoFADL results

In [10]:
# MonoFADLModel run, using the same data splits and epoch count as the
# NoSelection cell.
model2 = MonoFADLModel(
    n_inputs=len(XwaterPump_train_scaled.columns),  # number of feature columns
    n_class=len(ywaterPump_train.unique()),         # distinct labels in train
)

# Train on the train split, passing the validation split alongside it.
model2.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=2,
)

# Score on the held-out test split.
model2.evaluate(XwaterPump_test_scaled, ywaterPump_test)

model2.get_verbose()

# Persist the fitted wrapper object for later analysis.
with open('results/WaterPump_MonoFADL.pkl', 'wb') as results_file:
    pickle.dump(model2, results_file)

Epoch 1/2
Epoch 2/2


* MultiFADL One-versus-Rest results

In [11]:
# MultiFADLModelOvR run (one-versus-rest variant).
# NOTE(review): unlike the other two models, this one is constructed without
# n_inputs / n_class. Presumably it derives them during fit; confirm against
# src/MultiFADLModelOvR.
model3 = MultiFADLModelOvR()

# Train on the train split, passing the validation split alongside it.
model3.fit(
    XwaterPump_train_scaled, ywaterPump_train,
    XwaterPump_val_scaled, ywaterPump_val,
    epochs=2,
)

# Score on the held-out test split.
model3.evaluate(XwaterPump_test_scaled, ywaterPump_test)

model3.get_verbose()

# Persist the fitted wrapper object for later analysis.
with open('results/WaterPump_MultiFADL.pkl', 'wb') as results_file:
    pickle.dump(model3, results_file)

--> Training model class 2 vs rest
Epoch 1/2
Epoch 2/2