In [79]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LeakyReLU
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.optimizers import adam_v2
from sklearn.model_selection import train_test_split
from datetime import datetime

# Preparar los datos
Preparamos los datos previa separación. Codificamos numéricamente y eliminamos columnas que no vamos a utilizar

In [80]:
df = pd.read_csv('Datasets/Data_Combinada.csv')

  df = pd.read_csv('Datasets/Data_Combinada.csv')


Eliminamos columnas que no utilizaremos, como las coordenadas, o columnas repetidas como distrito con código distrito y lesividad con código lesividad

In [81]:
df.drop(columns=['distrito', 'coordenada_y_utm', 'coordenada_x_utm', 'localizacion', 'numero', 'lesividad'],
        inplace=True)

In [82]:
df['sexo'].replace({'M': 0, 'H': 1}, inplace=True)

In [83]:
df['fugado'].replace({'No': 0, 'Si': 1}, inplace=True)

In [84]:
df['tipo_persona'].replace({'Conductor': 3, 'Pasajero': 2, 'No': 1}, inplace=True)

In [85]:
# El criterio según la fuente de datos es que si es None es nada, de todas formas si fue ingresado debería existir registro
# por lo que es muy raro el caso contrario
df['cod_lesividad'].fillna(0, inplace=True)

In [86]:
def marca_de_clase_rango_edad(edad: str) -> float:
    regex_format = r'\b\d+\b'
    numbers = re.findall(regex_format, edad)
    if len(numbers) >= 2:
        return (int(numbers[0]) + int(numbers[1])) / 2

    if edad == 'Menor de 5 años':
        return 5

    if edad == 'Más de 74 años':
        return 74

    else:
        return -1


df['rango_edad'] = df['rango_edad'].apply(marca_de_clase_rango_edad)
df.rename(columns={'rango_edad': 'marca_de_clase_edad'}, inplace=True)

In [87]:
print(sorted(df['marca_de_clase_edad'].unique()))

[-1.0, 5.0, 7.5, 12.0, 16.0, 19.0, 22.5, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0, 62.0, 67.0, 72.0, 74.0]


In [88]:
def minutes_from_midnight(hour_str: str) -> int:
    clock = hour_str.split(':')
    minutes = 0

    minutes = int(clock[0]) * 60
    minutes = minutes + int(clock[1])

    return minutes


df['hora'] = df['hora'].apply(minutes_from_midnight)
df.pop('hora').head()

0    90
1    90
2    30
3    30
4    30
Name: hora, dtype: int64

In [89]:
def extract_day(date_str: str) -> int:
    date_parts = date_str.split('/')
    if len(date_parts) >= 3:
        return int(date_parts[0])


def extract_month(date_str: str) -> int:
    date_parts = date_str.split('/')
    if len(date_parts) >= 3:
        return int(date_parts[1])


def extract_year(date_str: str) -> int:
    date_parts = date_str.split('/')
    if len(date_parts) >= 3:
        return int(date_parts[2])


df['day'] = df['fecha'].transform(extract_day)
df['month'] = df['fecha'].transform(extract_month)
df['year'] = df['fecha'].transform(extract_year)

In [90]:
def is_work_day(date_str: str) -> int:
    date_format = '%d/%m/%Y'  # Formato para 'dd-MM-yyyy'

    holidays_madrid = ['']  # todo
    try:
        date_obj = datetime.strptime(date_str, date_format)
        day_of_week_int = date_obj.weekday()
        if day_of_week_int == 5 or day_of_week_int == 6 or date_str in holidays_madrid:
            return 0
        return 1

    except ValueError:
        return -1


df['bool_laborable'] = df['fecha'].transform(is_work_day)
df.pop('fecha').head()

0    01/01/2022
1    01/01/2022
2    01/01/2022
3    01/01/2022
4    01/01/2022
Name: fecha, dtype: object

In [91]:
unique_values = df['tipo_accidente'].unique()
dictionary = {type_of: code for code, type_of in enumerate(unique_values)}
df['tipo_accidente'] = df['tipo_accidente'].map(dictionary)

In [92]:
unique_values = df['tipo_vehiculo'].unique()
dictionary = {type_of: code for code, type_of in enumerate(unique_values)}
df['tipo_vehiculo'] = df['tipo_vehiculo'].map(dictionary)

In [93]:
# TODO eliminar esto cuando se rellene con criterio
df['estado_meteorológico'].fillna('Desconocido', inplace=True)

unique_values = df['estado_meteorológico'].unique()
dictionary = {type_of: code for code, type_of in enumerate(unique_values)}
df['estado_meteorológico'] = df['estado_meteorológico'].map(dictionary)

In [94]:
df.isnull().any()

num_expediente          False
cod_distrito            False
tipo_accidente          False
estado_meteorológico    False
tipo_vehiculo           False
tipo_persona            False
marca_de_clase_edad     False
sexo                    False
cod_lesividad           False
positiva_alcohol        False
positiva_droga          False
numero_pasajeros        False
fugado                  False
day                     False
month                   False
year                    False
bool_laborable          False
dtype: bool

Todo listo. Dropeamos número de expediente y procedemos a separar los datos

In [95]:
df.pop('num_expediente').head()

0    2022S000001
1    2022S000001
2    2022S000002
3    2022S000002
4    2022S000002
Name: num_expediente, dtype: object

# Separamos en tres datasets
1. Normalizamos los datos
2. Separamos en:
    * A entrenar: el 80% de los datos
    * A validar: el 20% muestreado aleatoriamente
    * A predecir: Los que marca de edad sean igual a -1

In [96]:
predict_df = df[df['marca_de_clase_edad'] == -1]
df_ex_predict = df.drop(predict_df.index)

In [97]:
print(sorted(df_ex_predict['marca_de_clase_edad'].unique()))

[5.0, 7.5, 12.0, 16.0, 19.0, 22.5, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0, 62.0, 67.0, 72.0, 74.0]


In [98]:
# 'tipo_persona', 'sexo', 'positivo_alcohol', 'positivo_droga', 'fugado' no necesitan ser normalizadas
def custom_normalizer(__df: pd.DataFrame) -> None:
    cols_to_preserve = ['tipo_persona', 'sexo', 'positiva_alcohol', 'positiva_droga', 'fugado', 'marca_de_clase_edad']
    # Normalizar las demás columnas
    for col in __df.columns:
        if col not in cols_to_preserve:
            __df[col] = (__df[col] - __df[col].mean()) / __df[col].std()

In [99]:
custom_normalizer(df_ex_predict)
df_ex_predict.head()

Unnamed: 0,cod_distrito,tipo_accidente,estado_meteorológico,tipo_vehiculo,tipo_persona,marca_de_clase_edad,sexo,cod_lesividad,positiva_alcohol,positiva_droga,numero_pasajeros,fugado,day,month,year,bool_laborable
0,0.561046,-1.034915,-0.469983,-0.443075,3,32.0,0,-0.967438,0.0,0.0,-0.539943,0,-1.673878,-1.450272,-0.813778,-1.777059
1,0.561046,-1.034915,-0.469983,-0.443075,3,47.0,1,-0.967438,0.0,0.0,-0.539943,0,-1.673878,-1.450272,-0.813778,-1.777059
2,-1.194866,-0.595482,0.318022,-0.193187,3,32.0,1,-0.967438,1.0,0.0,0.177063,0,-1.673878,-1.450272,-0.813778,-1.777059
3,-1.194866,-0.595482,0.318022,-0.193187,2,37.0,0,-0.967438,0.0,0.0,0.177063,0,-1.673878,-1.450272,-0.813778,-1.777059
4,-1.194866,-0.595482,0.318022,-0.443075,3,42.0,1,-0.967438,0.0,0.0,0.177063,0,-1.673878,-1.450272,-0.813778,-1.777059


In [100]:
custom_normalizer(predict_df)
predict_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  __df[col] = (__df[col] - __df[col].mean()) / __df[col].std()


Unnamed: 0,cod_distrito,tipo_accidente,estado_meteorológico,tipo_vehiculo,tipo_persona,marca_de_clase_edad,sexo,cod_lesividad,positiva_alcohol,positiva_droga,numero_pasajeros,fugado,day,month,year,bool_laborable
5,-1.818079,-0.496001,-0.514858,0.000252,3,-1.0,1,-0.12309,0.0,0.0,-0.330996,0,-1.66251,-1.433202,-0.82184,-1.435229
12,-0.706536,-0.014323,-0.514858,-0.471443,3,-1.0,1,-0.12309,0.0,0.0,-0.330996,0,-1.66251,-1.433202,-0.82184,-1.435229
14,1.701806,-0.014323,-0.514858,0.236099,3,-1.0,1,-0.12309,0.0,0.0,-0.330996,0,-1.66251,-1.433202,-0.82184,-1.435229
17,-0.150765,0.467356,-0.514858,-0.471443,3,-1.0,1,-0.12309,0.0,0.0,-0.330996,0,-1.66251,-1.433202,-0.82184,-1.435229
18,1.701806,-0.014323,-0.514858,-0.471443,3,-1.0,1,-0.12309,0.0,0.0,0.639787,0,-1.66251,-1.433202,-0.82184,-1.435229


Verificamos la integridad del corte

In [101]:
# TODO remove this on production
df['marca_de_clase_edad'].count() - predict_df['marca_de_clase_edad'].count() - df_ex_predict[
    'marca_de_clase_edad'].count()

0

In [102]:
train_df, val_df = train_test_split(df_ex_predict, test_size=0.2, random_state=1234)

In [103]:
# # TODO Remove this test cell
# train_df.pop('cod_lesividad')
# val_df.pop('cod_lesividad')

In [104]:
classes = [5.0, 7.5, 12.0, 16.0, 19.0, 22.5, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0, 62.0, 67.0, 72.0, 74.0]


def extract_labels(df, target_column, classes):
    labels = df[target_column].values

    one_hot_labels = []

    for label in labels:
        one_hot_vector = [1 if label == cls else 0 for cls in classes]
        one_hot_labels.append(one_hot_vector)

    return np.array(one_hot_labels)

In [105]:
train_labels = extract_labels(train_df, 'marca_de_clase_edad', classes)
val_labels = extract_labels(val_df, 'marca_de_clase_edad', classes)
train_data = train_df.drop('marca_de_clase_edad', axis=1).values
val_data = val_df.drop('marca_de_clase_edad', axis=1).values

# Modelo
Modelo de clasificación con 15 entradas y 17 salidas. 
Activación Leaky ReLu + He uniform, con gradient clipping, Lr = 0.001, 10 capas intermedias completamente conectadas de 32 neuronas, tamaño de batch 140, epochs 20, guarda el mejor modelo com checkpoint en bestmodelv4.h5 . Aplica softmax y categorical cross entropy en la salida

In [106]:
# Crear el modelo
model = Sequential()

# Agregar la capa de entrada
model.add(Dense(64, input_dim=15, kernel_initializer='he_uniform'))
model.add(LeakyReLU(alpha=0.01))

# Agregar 5 capas intermedias completamente conectadas
for _ in range(30):
    model.add(Dense(32, kernel_initializer='he_uniform'))
    model.add(LeakyReLU(alpha=0.01))

# Agregar la capa de salida
model.add(Dense(17, activation='softmax'))

# Configurar el optimizador Adam con gradient clipping
adam = adam_v2.Adam(learning_rate=0.001, clipvalue=1.)

# Compilar el modelo
model.compile(optimizer=adam, loss=tf.losses.categorical_crossentropy, metrics=['accuracy'])

# Callback para guardar el mejor modelo
checkpoint = ModelCheckpoint('bestmodelv4.h5', save_best_only=True, monitor='val_accuracy', mode='max', verbose=1)

model.summary()

# Entrenar el modelo
model.fit(
    train_data, train_labels,
    validation_data=(val_data, val_labels),
    epochs=40,
    batch_size=140,
    callbacks=[checkpoint]
)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_128 (Dense)            (None, 64)                1024      
_________________________________________________________________
leaky_re_lu_124 (LeakyReLU)  (None, 64)                0         
_________________________________________________________________
dense_129 (Dense)            (None, 32)                2080      
_________________________________________________________________
leaky_re_lu_125 (LeakyReLU)  (None, 32)                0         
_________________________________________________________________
dense_130 (Dense)            (None, 32)                1056      
_________________________________________________________________
leaky_re_lu_126 (LeakyReLU)  (None, 32)                0         
_________________________________________________________________
dense_131 (Dense)            (None, 32)               

<tensorflow.python.keras.callbacks.History at 0x7fc6409dbbd0>

In [107]:
model = load_model('bestmodelv4.h5')

model.summary()

# Callback to save the best model during training
checkpoint = ModelCheckpoint('bestmodelv1-0.31422.h5', verbose=1, monitor='val_accuracy', save_best_only=True,
                             mode='auto')

# Callback for TensorBoard
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

# Train the model
model.fit(
    train_data, train_labels,
    validation_data=(val_data, val_labels),
    epochs=40,
    batch_size=140,
    callbacks=[checkpoint]
)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_128 (Dense)            (None, 64)                1024      
_________________________________________________________________
leaky_re_lu_124 (LeakyReLU)  (None, 64)                0         
_________________________________________________________________
dense_129 (Dense)            (None, 32)                2080      
_________________________________________________________________
leaky_re_lu_125 (LeakyReLU)  (None, 32)                0         
_________________________________________________________________
dense_130 (Dense)            (None, 32)                1056      
_________________________________________________________________
leaky_re_lu_126 (LeakyReLU)  (None, 32)                0         
_________________________________________________________________
dense_131 (Dense)            (None, 32)               

2023-11-17 18:22:10.315359: I tensorflow/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2023-11-17 18:22:10.315657: I tensorflow/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2023-11-17 18:22:10.316022: I tensorflow/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.



Epoch 00001: val_accuracy improved from -inf to 0.13084, saving model to bestmodelv1-0.31422.h5
Epoch 2/40

Epoch 00002: val_accuracy did not improve from 0.13084
Epoch 3/40

Epoch 00003: val_accuracy did not improve from 0.13084
Epoch 4/40

Epoch 00004: val_accuracy improved from 0.13084 to 0.13120, saving model to bestmodelv1-0.31422.h5
Epoch 5/40

Epoch 00005: val_accuracy did not improve from 0.13120
Epoch 6/40

Epoch 00006: val_accuracy did not improve from 0.13120
Epoch 7/40

Epoch 00007: val_accuracy did not improve from 0.13120
Epoch 8/40

Epoch 00008: val_accuracy did not improve from 0.13120
Epoch 9/40

Epoch 00009: val_accuracy did not improve from 0.13120
Epoch 10/40

Epoch 00010: val_accuracy did not improve from 0.13120
Epoch 11/40

Epoch 00011: val_accuracy did not improve from 0.13120
Epoch 12/40

Epoch 00012: val_accuracy did not improve from 0.13120
Epoch 13/40

Epoch 00013: val_accuracy did not improve from 0.13120
Epoch 14/40

Epoch 00014: val_accuracy did not impr

<tensorflow.python.keras.callbacks.History at 0x7fc640997490>

In [735]:
tf.config.list_physical_devices('GPU')

[]