# Renta Nacional

In [1]:
import pandas as pd
df_sc = pd.read_excel("BBDD/SC_COTIZACIONES.xlsx")
print(df_sc.head())

  FECHA_COTIZACION  NUMERO_SOLICITUD  NUMERO_IDENTIFICACION  NUMERO_SECUENCIA  \
0       2024-01-02         125720401              188965131                 1   
1       2024-01-02         125720401              188965150                 1   
2       2024-01-02         125720401              188965169                 1   
3       2024-01-02         125720401              188965188                 1   
4       2024-01-02         125747401              188975930                 1   

  TIPO_RENTA  MESES_DIFERIDOS MODALIDAD_RENTA  MESES_GARANTIZADOS  \
0          I                0               S                   0   
1          I                0               G                 120   
2          I                0               G                 192   
3          I                0               G                 180   
4          I                0               S                   0   

   MESES_AUMENTO_TEMPORAL  PORCENTAJE_AUMENTO_TEMPORAL  ...  PRIMA_UNICA  \
0                     

Cargo el otro df para tratar de cruzarlos.

In [2]:
import pandas as pd
df = pd.read_excel("BBDD/COTIZACIONES.xlsx")
print(df.head())

  FECHA_COTIZACION TIPO_PENSION TIPO_COTIZACION  COTIZANTE  COTIZACION  \
0       2024-01-02            S               E  421008240           1   
1       2024-01-02            S               E  421008240           2   
2       2024-01-02            S               E  421008240           3   
3       2024-01-02            S               E  421008240           4   
4       2024-01-02           VA               E  421005868          11   

  TIPO_INTERMEDIARIO RUT_CORREDOR TIPO_RENTA  MESES_DIFERIDOS MODALIDAD_RENTA  \
0                  S   11370691-0          I                0               S   
1                  S   11370691-0          I                0               G   
2                  S   11370691-0          I                0               G   
3                  S   11370691-0          I                0               G   
4                  S   13982859-3          I                0               G   

   ...        VAN  TASA_VAN  DURATION  DIAS_VALIDEZ  INVALIDA  TIPO_

Vemos si hay coincidencias.

In [5]:
cotizantes_comunes = set(df['COTIZANTE']).intersection(df_sc['COTIZANTE'])
print(f"Cotizantes comunes: {len(cotizantes_comunes)}")

Cotizantes comunes: 125426


Realizamos el modelo.

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# --- Paso 1: Preparación de datos ---

# 1.1 Muestra de 10.000 filas
df_sample = df_sc.sample(n=10000, random_state=42).copy()

# 1.2 Conversión de columnas numéricas con comas
cols_to_convert = [
    "COTIZACION", "RENTA", "PRIMA_UNICA", 
    "RETENCION_AFP", "RETIRO_EXCEDENTES", 
    "TIT_PROPUESTA", "TIT_MAXIMA", 
    "PORCENTAJE_COMISION"
]

for col in cols_to_convert:
    df_sample[col] = df_sample[col].astype(str).str.replace(',', '.', regex=False).astype(float)

# 1.3 Codificación de variables categóricas
categorical_cols = ["TIPO_RENTA", "MODALIDAD_RENTA", "SEGMENTO", "NOMBRE_SEGMENTO"]
df_sample[categorical_cols] = df_sample[categorical_cols].astype("category")

# One-hot encoding
df_encoded = pd.get_dummies(df_sample, columns=categorical_cols)

# 1.4 Features (X) y etiqueta (y)
X = df_encoded.drop(columns=["POSICION_RELATIVA", "FECHA_COTIZACION"])
y = df_sample["POSICION_RELATIVA"].astype(int)

# Guardar índice original antes de imputar
X["index_original"] = df_sample.index

# Imputación de NaN con la media
imputer = SimpleImputer(strategy="mean")
X_imputed = pd.DataFrame(imputer.fit_transform(X.drop(columns=["index_original"])), columns=X.columns[:-1])
X_imputed["index_original"] = X["index_original"].values

# --- Paso 2: Modelo de Random Forest ---

# 2.1 División en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed.drop(columns=["index_original"]),
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

index_test = X_imputed.loc[X_test.index, "index_original"]

# 2.2 Entrenamiento
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)
clf.fit(X_train, y_train)

# 2.3 Evaluación
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# --- Paso 3: Resultados y exportación ---

# 3.1 Probabilidades por clase
y_prob = clf.predict_proba(X_test)
prob_df = pd.DataFrame(y_prob, columns=[f"proba_pos_{cls}" for cls in clf.classes_])
prob_df["prediccion"] = y_pred
prob_df["real"] = y_test.values
prob_df["index_original"] = index_test.values

# 3.2 Agregar columnas originales para análisis
original_test_data = df_sample.loc[prob_df["index_original"].values]
resultados_finales = pd.concat([original_test_data.reset_index(drop=True), prob_df.reset_index(drop=True)], axis=1)

# 3.3 Guardar CSV
resultados_finales.to_csv("predicciones_random_forest.csv", index=False)


              precision    recall  f1-score   support

           0       0.96      0.98      0.97       838
           1       0.44      0.46      0.45       161
           2       0.44      0.54      0.48       309
           3       0.32      0.33      0.33       253
           4       0.35      0.32      0.33       212
           5       0.26      0.19      0.22       118
           6       0.28      0.16      0.20        69
           7       0.31      0.15      0.20        33
           8       0.25      0.14      0.18         7

    accuracy                           0.63      2000
   macro avg       0.40      0.36      0.38      2000
weighted avg       0.62      0.63      0.62      2000



In [12]:
resultados_finales.head()

Unnamed: 0,FECHA_COTIZACION,NUMERO_SOLICITUD,NUMERO_IDENTIFICACION,NUMERO_SECUENCIA,TIPO_RENTA,MESES_DIFERIDOS,MODALIDAD_RENTA,MESES_GARANTIZADOS,MESES_AUMENTO_TEMPORAL,PORCENTAJE_AUMENTO_TEMPORAL,...,proba_pos_3,proba_pos_4,proba_pos_5,proba_pos_6,proba_pos_7,proba_pos_8,proba_pos_9,prediccion,real,index_original
0,2024-10-22,136338101,200457616,1,I,0,S,0,0,0,...,0.43,0.23,0.11,0.02,0.0,0.01,0.0,3,2,802017
1,2024-05-10,130667801,194603767,2,D,24,G,120,0,0,...,0.02,0.03,0.0,0.0,0.0,0.0,0.0,0,0,396710
2,2024-07-03,131699403,196463639,1,I,0,G,228,36,100,...,0.12,0.15,0.26,0.21,0.15,0.0,0.0,5,5,523232
3,2024-02-13,127899001,190770348,1,D,36,S,0,0,0,...,0.07,0.42,0.05,0.0,0.0,0.0,0.0,4,4,130667
4,2024-02-09,127127702,190689372,1,I,0,S,0,0,0,...,0.4,0.38,0.07,0.02,0.01,0.0,0.0,3,4,121811
