# <center> Modelo de ML </center>

## 1. Librerías

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import SparseTopKCategoricalAccuracy

2025-11-17 11:06:07.294955: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Configuraciones

In [3]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 3. Carga de datos

In [4]:
DATA_PATH = "../Datasets/Refined/refined_data.csv"
df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
df.head()

Shape: (9316, 10)


Unnamed: 0,codCliente,codPoliza,Ramo,Y,Salud,Vida,Autos,Cumplimiento,Patrimoniales,Otros
0,37,69,Vida,Autos,0,1,0,0,0,0
1,37,80568,Autos,Salud,0,1,1,0,0,0
2,84,118023,Autos,Vida,0,0,1,0,0,0
3,87,54,Vida,Salud,0,1,0,0,0,0
4,1356,206,Cumplimiento,Patrimoniales,0,0,0,1,0,0


## 4. Preprocesamiento de datos

In [5]:
id_cols = ["codCliente", "codPoliza"]
for col in id_cols:
    if col in df.columns:
        df = df.drop(columns = col)

### 4.1. Revisar desbalance entre clases

In [6]:
print(df["Y"].value_counts(normalize = True))

Y
Salud            0.285316
Patrimoniales    0.263847
Vida             0.223916
Autos            0.197939
Cumplimiento     0.022112
Otros            0.006870
Name: proportion, dtype: float64


### 4.2. Revisar nulos

In [7]:
df.isna().sum()

Ramo             0
Y                0
Salud            0
Vida             0
Autos            0
Cumplimiento     0
Patrimoniales    0
Otros            0
dtype: int64

## 5. Encoding

In [8]:
# All unique product names appearing in either column
all_products = pd.unique(pd.concat([df["Ramo"], df["Y"]], ignore_index=True))

product2id = {p: i for i, p in enumerate(all_products, start=1)}  # start from 1 (0 = padding)
id2product = {i: p for p, i in product2id.items()}

num_items = len(product2id) + 1  # +1 for padding index 0 (even if we don't really pad here)

df["Ramo_id"] = df["Ramo"].map(product2id)
df["Y_id"] = df["Y"].map(product2id)

df[["Ramo", "Y", "Ramo_id", "Y_id"]].head()

Unnamed: 0,Ramo,Y,Ramo_id,Y_id
0,Vida,Autos,1,2
1,Autos,Salud,2,5
2,Autos,Vida,2,1
3,Vida,Salud,1,5
4,Cumplimiento,Patrimoniales,3,4


## 6. Modelos

### 6.1. Modelo base: Matriz de transición

In [9]:
# Count transitions Ramo -> Y
transition_counts = pd.crosstab(df["Ramo"], df["Y"])
transition_probs = transition_counts.div(transition_counts.sum(axis=1), axis=0)  # row-normalize

transition_counts, transition_probs.head()


(Y              Autos  Cumplimiento  Otros  Patrimoniales  Salud  Vida
 Ramo                                                                 
 Autos              0            40      8            458    796   719
 Cumplimiento      93             0     48           1100     52    65
 Otros              2             0      0             13      0     5
 Patrimoniales    376            81      4              0    438   418
 Salud            580            32      2            385      0   879
 Vida             793            53      2            502   1372     0,
 Y                 Autos  Cumplimiento     Otros  Patrimoniales     Salud  \
 Ramo                                                                       
 Autos          0.000000      0.019792  0.003958       0.226620  0.393864   
 Cumplimiento   0.068483      0.000000  0.035346       0.810015  0.038292   
 Otros          0.100000      0.000000  0.000000       0.650000  0.000000   
 Patrimoniales  0.285497      0.061503  0.0030

#### 6.1.1. Transiciones con Markov

In [None]:
def predict_next_markov(current_ramo, top_k=3):
    if current_ramo not in transition_probs.index:
        # Fallback: global frequency of Y
        global_probs = df["Y"].value_counts(normalize=True)
        return list(global_probs.head(top_k).items())
    
    row = transition_probs.loc[current_ramo].sort_values(ascending=False)
    return list(row.head(top_k).items())

[('Salud', 0.5040411462160176),
 ('Autos', 0.2913299044819985),
 ('Patrimoniales', 0.18442321822189567)]

In [11]:
predict_next_markov("Vida", top_k=3)

[('Salud', 0.5040411462160176),
 ('Autos', 0.2913299044819985),
 ('Patrimoniales', 0.18442321822189567)]

### 6.2. Neural sequential model

#### 6.2.1. Construcción de datos

In [12]:
portfolio_cols = ["Salud", "Vida", "Autos", "Cumplimiento", "Patrimoniales", "Otros"]

# X1: current ramo as a sequence of length 1 (shape: (N, 1))
X_seq = df["Ramo_id"].values.reshape(-1, 1).astype("int32")

# X2: portfolio as float32
X_portfolio = df[portfolio_cols].astype("float32").values

# y: target product ID
y = df["Y_id"].values.astype("int32")

#### 6.2.2. División del conjunto de datos

In [13]:
X_seq_train, X_seq_test, X_port_train, X_port_test, y_train, y_test = train_test_split(
    X_seq,
    X_portfolio,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_seq_train.shape, X_seq_test.shape, X_port_train.shape


((7452, 1), (1864, 1), (7452, 6))

#### 6.2.3. Modelo de embeddings

In [14]:
EMBED_DIM = 16

# --- Inputs ---
seq_input = layers.Input(shape=(1,), dtype="int32", name="seq_ramo")     # [current ramo]
port_input = layers.Input(shape=(len(portfolio_cols),), dtype="float32", name="portfolio")

# --- Embedding for current Ramo ---
x = layers.Embedding(
    input_dim=num_items,
    output_dim=EMBED_DIM,
    mask_zero=False,   # no padding needed for length 1
    name="ramo_embedding"
)(seq_input)

# Flatten the embedding (because sequence length = 1)
x = layers.Flatten()(x)

# Concatenate with portfolio features
z = layers.Concatenate()([x, port_input])

# A couple of dense layers
z = layers.Dense(64, activation="relu")(z)
z = layers.Dropout(0.3)(z)
z = layers.Dense(64, activation="relu")(z)
z = layers.Dropout(0.3)(z)

# Output: probability over next product
output = layers.Dense(num_items, activation="softmax", name="next_product")(z)

model = Model(inputs=[seq_input, port_input], outputs=output)
model.summary()


#### 6.2.4. Métricas apropiadas

In [15]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=[
        "sparse_categorical_accuracy",
        SparseTopKCategoricalAccuracy(k=3, name="top3_acc"),
        SparseTopKCategoricalAccuracy(k=5, name="top5_acc"),
    ],
)


#### 6.2.5. Entrenamiento

In [16]:
BATCH_SIZE = 256
EPOCHS = 50

history = model.fit(
    {"seq_ramo": X_seq_train, "portfolio": X_port_train},
    y_train,
    validation_split=0.2,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_top3_acc",
            mode="max",
            patience=5,
            restore_best_weights=True,
        )
    ],
    verbose=1,
)


Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - loss: 1.8104 - sparse_categorical_accuracy: 0.3749 - top3_acc: 0.7022 - top5_acc: 0.8861 - val_loss: 1.6440 - val_sparse_categorical_accuracy: 0.5158 - val_top3_acc: 0.8632 - val_top5_acc: 0.9940
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.4859 - sparse_categorical_accuracy: 0.5112 - top3_acc: 0.8811 - top5_acc: 0.9909 - val_loss: 1.2605 - val_sparse_categorical_accuracy: 0.5171 - val_top3_acc: 0.9531 - val_top5_acc: 0.9940
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.2229 - sparse_categorical_accuracy: 0.5120 - top3_acc: 0.9549 - top5_acc: 0.9928 - val_loss: 1.0823 - val_sparse_categorical_accuracy: 0.5466 - val_top3_acc: 0.9658 - val_top5_acc: 0.9940
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1.1584 - sparse_categorical_accuracy: 0.5056 - top3_acc: 0.95

#### 6.2.6. Evaluación

In [17]:
test_metrics = model.evaluate(
    {"seq_ramo": X_seq_test, "portfolio": X_port_test},
    y_test,
    batch_size=BATCH_SIZE,
    verbose=0,
)

for name, value in zip(model.metrics_names, test_metrics):
    print(f"{name}: {value:.4f}")


loss: 1.1110
compile_metrics: 0.5182


#### 6.2.6. Hacer recomendación

In [18]:
def recommend_next_from_row(row, top_k=5):
    ramo = row["Ramo"]
    ramo_id = product2id[ramo]
    
    seq = np.array([[ramo_id]], dtype="int32")
    portfolio = row[portfolio_cols].values.astype("float32").reshape(1, -1)
    
    probs = model.predict({"seq_ramo": seq, "portfolio": portfolio}, verbose=0)[0]
    
    # Remove padding index 0
    probs[0] = 0.0
    top_ids = probs.argsort()[::-1][:top_k]
    
    return [(id2product[i], float(probs[i])) for i in top_ids]

recommend_next_from_row(df.iloc[0], top_k=5)


[('Salud', 0.503348708152771),
 ('Autos', 0.2464023381471634),
 ('Patrimoniales', 0.16363051533699036),
 ('Cumplimiento', 0.03689230978488922),
 ('Vida', 0.03422253578901291)]