# **Importacion de paquetes**
---

In [64]:
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
from matplotlib.ticker import MaxNLocator
import pylab as p
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import recall_score,accuracy_score,classification_report,confusion_matrix
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler,SMOTE

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    from sklearn import model_selection
    import xgboost as xgb

#import xgboost as xgb
import operator
import timeit
import scipy.stats as stats
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D,Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Reshape,TimeDistributed,Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

Entender que producto hay o no segun agrupaciones, por ejemplo grupos de cantidades de renta, id_segmento(universitario, no etx)

# **Preprocesamiento**
---

In [65]:
reader = pd.read_csv('../../Data/dataset_para_modelar.csv')

Sustituimos los valores anómalos

In [66]:
reader.replace(' NA', -1, inplace=True)
reader.replace('         NA', -1, inplace=True)
reader.fillna(-1, inplace=True)

Diccionarios para mapear:

In [67]:
emp_dict = {'N':0,-1:-1,'A':1,'B':2,'F':3,'S':4}
inreaderall_dict = {'N':0,-1:-1,'S':1}
sexo_dict = {'V':0,'H':1,-1:-1}
tiprel_dict = {'A':0,-1:-1,'I':1,'P':2,'N':3,'R':4}
indresi_dict = {'N':0,-1:-1,'S':1}
indext_dict = {'N':0,-1:-1,'S':1}
conyuemp_dict = {'N':0,-1:-1,'S':1}
segmento_dict = {-1:4,'01 - TOP':1,'02 - PARTICULARES':2,'03 - UNIVERSITARIO':3}

Mapeo de valores y conversion de los tipos

In [68]:
reader['cod_persona'] = reader['cod_persona'].astype(np.int32)
reader['imp_renta'] = reader['imp_renta'].astype(np.int32)
#sustituimos el 99 por 2 para que sean valores mas cercanos
reader['xti_rel'] = reader['xti_rel'].map(lambda x: 2 if x == 99 else x).astype(np.int8)

reader['xti_empleado'] = reader['xti_empleado'].map(lambda x: emp_dict[x]).astype(np.int8)
reader['sexo'] = reader['sexo'].map(lambda x: sexo_dict[x]).astype(np.int8)
reader['edad'] = reader['edad'].astype(np.int16)

In [69]:
reader['xti_nuevo_cliente'] = reader['xti_nuevo_cliente'].astype(np.int8)
reader['num_antiguedad'] = reader['num_antiguedad'].map(lambda x: -1 if x == '     NA' else x).astype(int)
reader['num_antiguedad'] = reader['num_antiguedad'].map(lambda x: -1 if x == -999999 else x).astype(np.int16)
reader['xti_rel_1mes'] = reader['xti_rel_1mes'].map(lambda x: -2 if x == 'P' else x).astype(np.float16)
reader['xti_rel_1mes'] = reader['xti_rel_1mes'].astype(np.int8)

In [70]:
reader['tip_rel_1mes'] = reader['tip_rel_1mes'].map(lambda x: tiprel_dict[x]).astype(np.int8)
reader['indresi'] = reader['indresi'].map(lambda x: indresi_dict[x]).astype(np.int8)
reader['indext'] = reader['indext'].map(lambda x: indext_dict[x]).astype(np.int8)

Convertimos des canal a numerico

In [71]:
canal_dict={value:idx+1  for idx, value in enumerate(reader['des_canal'].unique())}
reader['des_canal'] = reader['des_canal'].map(lambda x: canal_dict[x]).astype(np.int16)

In [72]:
pais_dict = {'LV': 102,'CA': 2,'GB': 9,'EC': 19,'BY': 64,'ML': 104,'MT': 118,
 'LU': 59,'GR': 39,'NI': 33,'BZ': 113,'QA': 58,'DE': 10,'AU': 63,'IN': 31,
 'GN': 98,'KE': 65,'HN': 22,'JM': 116,'SV': 53,'TH': 79,'IE': 5,'TN': 85,
 'PH': 91,'ET': 54,'AR': 13,'KR': 87,'GA': 45,'FR': 8,'SG': 66,'LB': 81,
 'MA': 38,'NZ': 93,'SK': 69,'CN': 28,'GI': 96,'PY': 51,'SA': 56,'PL': 30,
 'PE': 20,'GE': 78,'HR': 67,'CD': 112,'MM': 94,'MR': 48,'NG': 83,'HU': 106,
 'AO': 71,'NL': 7,'GM': 110,'DJ': 115,'ZA': 75,'OM': 100,'LT': 103,'MZ': 27,
 'VE': 14,'EE': 52,'CF': 109,'CL': 4,'SL': 97,'DO': 11,'PT': 26,'ES': 0,
 'CZ': 36,'AD': 35,'RO': 41,'TW': 29,'BA': 61,'IS': 107,'AT': 6,'ZW': 114,
 'TR': 70,'CO': 21,'PK': 84,'SE': 24,'AL': 25,'CU': 72,'UY': 77,'EG': 74,'CR': 32,
 'GQ': 73,'MK': 105,'KW': 92,'GT': 44,'CM': 55,'SN': 47,'KZ': 111,'DK': 76,
 'LY': 108,'AE': 37,'PA': 60,'UA': 49,'GW': 99,'TG': 86,'MX': 16,'KH': 95,
 'FI': 23,'NO': 46,'IT': 18,'GH': 88, 'JP': 82,'RU': 43,'PR': 40,'RS': 89,
 'DZ': 80,'MD': 68,-1: 1,'BG': 50,'CI': 57,'IL': 42,'VN': 90,'CH': 3,'US': 15,'HK': 34,
 'CG': 101,'BO': 62,'BR': 17,'BE': 12,'BM': 117}

In [73]:
reader['xti_extra'] = reader['xti_extra'].map(lambda x: inreaderall_dict[x]).astype(np.int8)
reader['pais'] = reader['pais'].map(lambda x: pais_dict[x]).astype(np.int8)
reader['tip_dom'] = reader['tip_dom'].astype(np.int8)
reader['cod_provincia'] = reader['cod_provincia'].astype(np.int8)

In [74]:
reader['xti_actividad_cliente'] = reader['xti_actividad_cliente'].astype(np.int8)
reader['fecha_dato_month'] = reader['mes'].map(lambda x: int(x[5:7])).astype(np.int8)
reader['fecha_dato_year'] = reader['mes'].map(lambda x: int(x[0:4]) - 2015).astype(np.int8)
reader['month_int'] = (reader['fecha_dato_month'] + 12 * reader['fecha_dato_year']).astype(np.int8)
reader.drop('mes',axis=1,inplace=True)

In [75]:
reader['fecha1'] = reader['fecha1'].map(lambda x: '2020-01-01' if x == -1 else x)
reader['fecha_alta_month'] = reader['fecha1'].map(lambda x: int(x[5:7])).astype(np.int16)
reader['fecha_alta_year'] = reader['fecha1'].map(lambda x: int(x[0:4]) - 1995).astype(np.int16)

In [76]:
reader['fecha_alta_day'] = reader['fecha1'].map(lambda x: int(x[8:10])).astype(np.int16)
reader['fecha_alta_month_int'] = (reader['fecha_alta_month'] + 12 * reader['fecha_alta_year']).astype(np.int16)
reader['fecha_alta_day_int'] = (reader['fecha_alta_day'] + 30 * reader['fecha_alta_month'] + 365 * reader['fecha_alta_year']).astype(np.int32)
reader.drop('fecha1',axis=1,inplace=True)

In [77]:
reader['fec_ult_cli_1t'] = reader['fec_ult_cli_1t'].map(lambda x: '2020-01-01' if x == -1 else x)
reader['ult_fec_cli_1t_month'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[5:7])).astype(np.int16)
reader['ult_fec_cli_1t_year'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[0:4]) - 2015).astype(np.int16)
reader['ult_fec_cli_1t_day'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[8:10])).astype(np.int16)
reader['ult_fec_cli_1t_month_int'] = (reader['ult_fec_cli_1t_month'] + 12 * reader['ult_fec_cli_1t_year']).astype(np.int8)
reader.drop('fec_ult_cli_1t',axis=1,inplace=True)

In [78]:
reader['id_segmento'] = reader['id_segmento'].map(lambda x: segmento_dict[x]).astype(np.int8)
target_cols=[f'ind_prod{i}' for i in range(1,26)]
for col in target_cols:
    reader[col] = reader[col].astype(np.int8)

In [84]:
#Obtenemos los productos que si se han comprado por cada segmento
diccionario_segmentos=reader.groupby(by=['id_segmento'])[target_cols].sum().T.to_dict()
diccionario_seg_proc={}
for key,value in diccionario_segmentos.items():
    sol=[]
    for prod,res in value.items():
        if res>0:
            sol.append(prod)
    diccionario_seg_proc[key]=sol
diccionario_seg_proc

{1: ['ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod15',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod21',
  'ind_prod22',
  'ind_prod23',
  'ind_prod24',
  'ind_prod25'],
 2: ['ind_prod1',
  'ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod6',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod15',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod21',
  'ind_prod22',
  'ind_prod23',
  'ind_prod24',
  'ind_prod25'],
 3: ['ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod22',
 

In [85]:
reader.to_csv('sant_limpio.csv',index=False)

In [86]:
reader.rename(columns={'cod_persona':'id'},inplace=True)

In [87]:
cols_valor_unico=[]
muy_bajos_unos=[]
bajos_unos=[]
medios_unos=[]
for col in target_cols:
    if len(reader[col].value_counts().values)<2:
        cols_valor_unico.append(col)
    elif reader[col].value_counts().values[1]/len(reader) <0.001:
        muy_bajos_unos.append(col)
    elif reader[col].value_counts().values[1]/len(reader) <0.005:
        bajos_unos.append(col)
    else:
        medios_unos.append(col)
print('Columnas de valor unico: ',cols_valor_unico)
print('Columnas con muy poco 1: ',muy_bajos_unos)
print('Columnas con pocos 1: ',bajos_unos)
print('Columnas con intermedio 1: ',medios_unos)

Columnas de valor unico:  ['ind_prod2']
Columnas con muy poco 1:  ['ind_prod1', 'ind_prod4']
Columnas con pocos 1:  ['ind_prod10', 'ind_prod11', 'ind_prod17', 'ind_prod21']
Columnas con intermedio 1:  ['ind_prod3', 'ind_prod5', 'ind_prod6', 'ind_prod7', 'ind_prod8', 'ind_prod9', 'ind_prod12', 'ind_prod13', 'ind_prod14', 'ind_prod15', 'ind_prod16', 'ind_prod18', 'ind_prod19', 'ind_prod20', 'ind_prod22', 'ind_prod23', 'ind_prod24', 'ind_prod25']


In [88]:
reader.columns

Index(['Unnamed: 0', 'id', 'pais', 'sexo', 'edad', 'xti_empleado',
       'xti_nuevo_cliente', 'num_antiguedad', 'xti_rel', 'xti_rel_1mes',
       'tip_rel_1mes', 'indresi', 'indext', 'des_canal', 'xti_extra',
       'tip_dom', 'cod_provincia', 'xti_actividad_cliente', 'imp_renta',
       'id_segmento', 'mean_engagement', 'ind_prod1', 'ind_prod2', 'ind_prod3',
       'ind_prod4', 'ind_prod5', 'ind_prod6', 'ind_prod7', 'ind_prod8',
       'ind_prod9', 'ind_prod10', 'ind_prod11', 'ind_prod12', 'ind_prod13',
       'ind_prod14', 'ind_prod15', 'ind_prod16', 'ind_prod17', 'ind_prod18',
       'ind_prod19', 'ind_prod20', 'ind_prod21', 'ind_prod22', 'ind_prod23',
       'ind_prod24', 'ind_prod25', 'fecha_dato_month', 'fecha_dato_year',
       'month_int', 'fecha_alta_month', 'fecha_alta_year', 'fecha_alta_day',
       'fecha_alta_month_int', 'fecha_alta_day_int', 'ult_fec_cli_1t_month',
       'ult_fec_cli_1t_year', 'ult_fec_cli_1t_day',
       'ult_fec_cli_1t_month_int'],
      dtype='object

In [89]:
for col in target_cols:
    reader[col].replace(-1,0,inplace=True)
data_reducida=reader[['id','month_int'] + target_cols].copy()
data_reducida.fillna(0,inplace=True)

In [90]:
data_reducida.sort_values(by = ['id','month_int'],inplace=True)

In [91]:
#Creamos columnas con un shift sobre un dataset reducido para pruebas
DIFF_CONDS = {}
for shift_val in range(1,18):
    name = 'id_shift_' + str(shift_val)
    data_reducida[name] = data_reducida['id'].shift(shift_val).fillna(0).astype(np.int32)
    DIFF_CONDS[shift_val] = ((data_reducida['id'] - data_reducida[name]) != 0)
    data_reducida.drop(name,axis = 1,inplace=True)

for col in target_cols:
    for shift_val in range(1,18):
        name = col + '_s_' + str(shift_val)
        data_reducida[name] = data_reducida[col].shift(shift_val).fillna(0).astype(np.int8)
        data_reducida[name][DIFF_CONDS[shift_val]] = 0

In [92]:
for col in target_cols:
    data_reducida[col] = (data_reducida[col] - data_reducida[col + '_s_1']).astype(np.int8)
    data_reducida[col] = (data_reducida[col] > 0).astype(np.int8)

In [93]:
MIN_MONTH_DICT = data_reducida.groupby('id')['month_int'].min().to_dict()
data_reducida['min_month_int'] = data_reducida['id'].map(lambda x: MIN_MONTH_DICT[x])

data_reducida = data_reducida[data_reducida['min_month_int'] != data_reducida['month_int']]

data_reducida['sum_inds'] = data_reducida[target_cols].sum(axis=1)
data_reducida = data_reducida[(data_reducida['sum_inds'] != 0) | (data_reducida['month_int'] == 18)].copy()
#combined_small = combined_small[(combined_small['sum_inds'] != 0) | (combined_small['month_int'] >= 17)].copy()
data_reducida.to_csv('data_reducida_procesada.csv', index=False)

In [94]:
cols_to_combine = ['edad', 'num_antiguedad', 'des_canal', 'cod_provincia',
       'fecha_alta_day', 'fecha_alta_month', 'fecha_alta_month_int','fecha_alta_day_int',
       'fecha_alta_year', 'fecha_dato_month', 'fecha_dato_year',
       'xti_actividad_cliente',
       'xti_empleado',
       'xti_nuevo_cliente',
       'indext',
       'xti_extra', 'xti_rel', 'xti_rel_1mes', 'indresi',
       'pais', 'imp_renta', 'id_segmento', 'sexo',
       'tip_rel_1mes', 'ult_fec_cli_1t_day', 'ult_fec_cli_1t_month',
       'ult_fec_cli_1t_month_int', 'ult_fec_cli_1t_year']

#Creamos columnas con un shift sobre el dataset inicial
DIFF_CONDS = {}
for shift_val in [1]:
    name = 'id_shift_' + str(shift_val)
    reader[name] = reader['id'].shift(shift_val).fillna(0).astype(np.int32)
    DIFF_CONDS[shift_val] = ((reader['id'] - reader[name]) != 0)
    reader.drop(name,axis = 1,inplace=True)
shifted_feature_names = []
for col in cols_to_combine + target_cols:
    for shift_val in [1]:
        name = col + '_s_' + str(shift_val)
        reader[name] = reader[col].shift(shift_val).fillna(0).astype(np.int32)
        reader[name][DIFF_CONDS[shift_val]] = 0
        if col in cols_to_combine:
            shifted_feature_names.append(name)

In [95]:
reader=reader.iloc[:,1:]

In [96]:
#Creamos columnas con las varianciones de las variables X para predecir
diff_feautres_s1 = []
for col in cols_to_combine:
    name = col + '_s1_diff'
    diff_feautres_s1.append(name)
    reader[name] = (reader[col] - reader[col + '_s_1']).astype(np.int32)

In [97]:
MIN_MONTH_DICT = reader.groupby('id')['month_int'].min().to_dict()
reader['min_month_int'] = reader['id'].map(lambda x: MIN_MONTH_DICT[x]).astype(np.int8)

#Antiguedad minima del usuario
MIN_ANTIGUEDAD_DICT = reader.groupby('id')['num_antiguedad'].min().to_dict()
reader['min_antiguedad'] = reader['id'].map(lambda x: MIN_ANTIGUEDAD_DICT[x]).astype(np.int16)

#Antiguedad maxima del usuario
MAX_ANTIGUEDAD_DICT = reader.groupby('id')['num_antiguedad'].max().to_dict()
reader['max_antiguedad'] = reader['id'].map(lambda x: MAX_ANTIGUEDAD_DICT[x]).astype(np.int16)

#Edad minima del usuario
MIN_AGE_DICT = reader.groupby('id')['edad'].min().to_dict()
reader['min_edad'] = reader['id'].map(lambda x: MIN_AGE_DICT[x]).astype(np.int16)

#Edad maxima del usuario
MAX_AGE_DICT = reader.groupby('id')['edad'].max().to_dict()
reader['max_edad'] = reader['id'].map(lambda x: MAX_AGE_DICT[x]).astype(np.int16)

#Minimo maximo y desviancion estandar de la renta por cada usuario
MIN_RENTA_DICT = reader.groupby('id')['imp_renta'].min().to_dict()
reader['min_renta'] = reader['id'].map(lambda x: MIN_RENTA_DICT[x])
MAX_RENTA_DICT = reader.groupby('id')['imp_renta'].max().to_dict()
reader['max_renta'] = reader['id'].map(lambda x: MAX_RENTA_DICT[x])

In [98]:
RENTA_VAL_COUNTS = reader.groupby('imp_renta')['id'].nunique().to_dict()
reader['renta_freq'] = reader['imp_renta'].map(lambda x: RENTA_VAL_COUNTS[x])
reader.sort_values(by = ['id','month_int'],inplace=True)
#Eliminamos los id duplicados porque hemos reducido la informacion a una sola fila por usuario
combined_nd = reader.drop_duplicates('id')

In [99]:
def model(input_shape, output_shape,dense_units=100,dense_layers=6, dropout_rate=0.5):
    model = Sequential()    
    # Añadir capas Conv2D y MaxPooling2D
    for i in range(dense_layers):
        if i==0:
            model.add(Dense(units=dense_units,activation='relu',input_dim=input_shape))
        else:
            model.add(Dense(units=dense_units,activation='relu'))
            model.add(Dropout(dropout_rate))
    # Capa de salida
    model.add(Dense(output_shape, activation='softmax'))
    
    # Compilar el modelo
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy','recall','categorical_accuracy'])
    
    return model

In [100]:
def convert_tensor(X_train,X_test,y_train,y_test):
    X_train=tf.convert_to_tensor(X_train,dtype=tf.float32)
    X_test=tf.convert_to_tensor(X_test,dtype=tf.float32)
    y_train=tf.convert_to_tensor(y_train,dtype=tf.float32)
    y_test=tf.convert_to_tensor(y_test,dtype=tf.float32)
    return X_train, X_test, y_train,y_test

In [108]:
otros=True
columnas_sin_target=combined_nd.columns
columnas_sin_target=[col for col in columnas_sin_target if col not in target_cols]
resultados={}
for segment in combined_nd['id_segmento'].unique():
    res_seg={}
    acc=[]
    rec=[]
    models=[]
    for prod in diccionario_seg_proc[segment]:
        if prod!='ind_prod2':
            X=combined_nd[columnas_sin_target].copy()
            y=combined_nd[prod]
            tipo='0'
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            if tipo=='NN':
                X_train, X_test, y_train, y_test=convert_tensor(X_train, X_test, y_train, y_test)
                modelo=model(X_train.shape[-1],y_train.shape[-1],100,10,.5)
                callbacks = [
                EarlyStopping(monitor='val_loss', patience=3, verbose=1),  # Parada temprana si la pérdida en validación deja de disminuir
                ModelCheckpoint('modelo_epoch_{epoch:02d}.keras', save_freq='epoch', verbose=1)  # Guardar el modelo en cada época
                ]
                history=modelo.fit(X_train,y_train,epochs=2,batch_size=8,validation_data=(X_test,y_test),callbacks=callbacks)
                y_pred=modelo.predict(X_test)

                accuracy = accuracy_score(y_test.numpy(), y_pred.numpy())
                print(f'Accuracy: {accuracy}')
                recall = recall_score(y_test.numpy(), y_pred.numpy())
                print(f'Recall: {recall}')
                conf = confusion_matrix(y_test.numpy(), y_pred.numpy())
                print(f'Matriz de confusion: {conf}')


            else:
                # Identificar características categóricas
                categorical_features_indices = np.where(X.dtypes != np.float64)[0]

                # Calcular los pesos de las clases
                class_weights = [len(y) / (2 * sum(y == c)) for c in np.unique(y)]
                # Crear el modelo CatBostClassifier
                modelo = CatBoostClassifier(
                    depth=6,
                    loss_function='Logloss',
                    class_weights=class_weights,
                    verbose=1000)

                # Entrenar el modelo
                modelo.fit(X_train, y_train)

                y_pred = modelo.predict_proba(X_test)
                threshold=.5
                y_pred=[1 if p[1]>threshold else 0 for p in y_pred ]
                models.append(modelo)
                # Evaluar la precisión del modelo
                accuracy = accuracy_score(y_test, y_pred)
                acc.append(accuracy)
                print('\n','*'*50)
                print('\n PARA EL SEGMENTO: ',segment)
                print('\n','*'*50)
                print('-'*50,'\nProducto: ',prod)
                print(f'Accuracy: {accuracy}')
                recall = recall_score(y_test, y_pred)
                rec.append(recall)
                print(f'Recall: {recall}')
                conf = confusion_matrix(y_test, y_pred)
                print(f'Matriz de confusion: {conf} ','\n')
        if otros:
            faltan=25-len(acc)
            if faltan>0:
                acc+=[1]*faltan
                rec+=[1]*faltan
    resultados[segment]={'accuracy':acc,'recall':rec,'models':models}


Learning rate set to 0.04838
0:	learn: 0.6354288	total: 14.3ms	remaining: 14.3s
999:	learn: 0.2821387	total: 17.1s	remaining: 0us

 **************************************************

 PARA EL SEGMENTO:  4

 **************************************************
-------------------------------------------------- 
Producto:  ind_prod3
Accuracy: 0.8410645575032065
Recall: 0.904480722473081
Matriz de confusion: [[2661  937]
 [ 550 5208]]  

Learning rate set to 0.04838
0:	learn: 0.6427475	total: 29.2ms	remaining: 29.2s
999:	learn: 0.2125170	total: 16.2s	remaining: 0us

 **************************************************

 PARA EL SEGMENTO:  4

 **************************************************
-------------------------------------------------- 
Producto:  ind_prod5
Accuracy: 0.8399957246686618
Recall: 0.8231404958677686
Matriz de confusion: [[7361 1390]
 [ 107  498]]  

Learning rate set to 0.04838
0:	learn: 0.5641967	total: 13.2ms	remaining: 13.2s
999:	learn: 0.0007486	total: 13.4s	remainin

In [109]:
#Media de los resultados
for segment in resultados.keys():
    print('Valores medios por segmento: ',segment)
    print('Accuracy medio: ', np.mean(resultados[segment]['accuracy']))
    print('Recall medio: ',np.mean(resultados[segment]['recall']))

Valores medios por segmento:  4
Accuracy medio:  0.9638532290306554
Recall medio:  0.908732481166843
Valores medios por segmento:  2
Accuracy medio:  0.9608985321362405
Recall medio:  0.7816858722866797
Valores medios por segmento:  3
Accuracy medio:  0.9577446655524896
Recall medio:  0.8262330727975903
Valores medios por segmento:  1
Accuracy medio:  0.9592054389649982
Recall medio:  0.793933084125231


**Resultados sin filtrado por segmento:**
- Accuracy medio:  0.9219796565483825
- Recall medio:  0.564505857250443
---

**Valores medios por segmento:  4**

- Accuracy medio:  0.8971207287795573
- Recall medio:  0.7402386002440919

**Valores medios por segmento:  2**

- Accuracy medio:  0.9217970642724812
- Recall medio:  0.5633717445733596

**Valores medios por segmento:  3**

- Accuracy medio:  0.9070382642154767
- Recall medio:  0.6177127601546981

**Valores medios por segmento:  1**

- Accuracy medio:  0.9147022814722686
- Recall medio:  0.5691328122618469

In [113]:
import os

In [114]:
for segment in combined_nd['id_segmento'].unique():
    os.mkdir(f'Models/{segment}')
    for prods,models  in zip(diccionario_seg_proc[segment],resultados[segment]['models']):
        raiz=f'Models/{segment}/{prods}.cb'
        models.save_model(raiz)

In [50]:
corr='SMOTE'
if corr=='ROS':
    ros = RandomOverSampler(random_state=42,)
    X_train, y_train = ros.fit_resample(X_train, y_train)
elif corr=='SMOTE':
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
else:
    pass
X_train, X_test, y_train, y_test=convert_tensor(X_train, X_test, y_train, y_test)
print(X_train.shape)
print(y_train.shape)

modelo=model(X_train.shape[-1],1,100,10,0)
callbacks = [
EarlyStopping(monitor='val_loss', patience=3, verbose=1),  # Parada temprana si la pérdida en validación deja de disminuir
#ModelCheckpoint('modelo_epoch_{epoch:02d}.keras', save_freq='epoch', verbose=1)  # Guardar el modelo en cada época
]
history=modelo.fit(X_train,y_train,epochs=2,batch_size=8,validation_data=(X_test,y_test),callbacks=callbacks)


(74822, 121)
(74822,)
Epoch 1/2
[1m9353/9353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 3ms/step - accuracy: 0.6677 - categorical_accuracy: 0.5040 - loss: 13.3735 - recall: 0.7247 - val_accuracy: 0.7936 - val_categorical_accuracy: 0.9957 - val_loss: 0.3809 - val_recall: 0.3333
Epoch 2/2
[1m9353/9353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 4ms/step - accuracy: 0.7776 - categorical_accuracy: 0.4987 - loss: 0.5610 - recall: 0.8527 - val_accuracy: 0.6893 - val_categorical_accuracy: 0.9957 - val_loss: 0.6086 - val_recall: 0.5000


In [51]:
y_pred=modelo.predict(X_test)
print(y_pred)
threshold=.5
y_pred=[1 if p>threshold else 0 for p in y_pred ]
print(y_test.numpy())
print(y_pred)
accuracy = accuracy_score(y_test.numpy(), y_pred)
print(f'Accuracy: {accuracy}')
recall = recall_score(y_test.numpy(), y_pred)
print(f'Recall: {recall}')
conf = confusion_matrix(y_test.numpy(), y_pred)
print(f'Matriz de confusion: {conf}')

[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[[0.00322835]
 [0.        ]
 [0.        ]
 ...
 [0.1906496 ]
 [0.04416687]
 [0.8857743 ]]
[0. 0. 0. ... 0. 0. 0.]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 

In [158]:
# Evaluar la precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')
conf = confusion_matrix(y_test, y_pred)
print(f'Matriz de confusion: {conf}')

Accuracy: 0.9349080803762292
Recall: 0.09090909090909091
Matriz de confusion: [[8692   59]
 [ 550   55]]


In [132]:
list(y_test)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [111]:
clf=CatBoostClassifier()

['ind_prod1',
 'ind_prod2',
 'ind_prod3',
 'ind_prod4',
 'ind_prod5',
 'ind_prod6',
 'ind_prod7',
 'ind_prod8',
 'ind_prod9',
 'ind_prod10',
 'ind_prod11',
 'ind_prod12',
 'ind_prod13',
 'ind_prod14',
 'ind_prod15',
 'ind_prod16',
 'ind_prod17',
 'ind_prod18',
 'ind_prod19',
 'ind_prod20',
 'ind_prod21',
 'ind_prod22',
 'ind_prod23',
 'ind_prod24',
 'ind_prod25']

In [73]:
reader.head()

Unnamed: 0,id,pais,sexo,edad,xti_empleado,xti_nuevo_cliente,num_antiguedad,xti_rel,xti_rel_1mes,tip_rel_1mes,...,indresi_s1_diff,pais_s1_diff,imp_renta_s1_diff,id_segmento_s1_diff,sexo_s1_diff,tip_rel_1mes_s1_diff,ult_fec_cli_1t_day_s1_diff,ult_fec_cli_1t_month_s1_diff,ult_fec_cli_1t_month_int_s1_diff,ult_fec_cli_1t_year_s1_diff
0,178103,0,1,35,0,0,6,1,1,0,...,1,0,87218,2,1,0,1,1,61,5
1,503082,0,0,27,0,0,35,1,1,1,...,1,0,70777,3,0,1,1,1,61,5
2,502996,0,0,37,0,0,35,1,1,0,...,1,0,104035,2,0,0,1,1,61,5
3,503053,0,1,23,0,0,35,1,1,0,...,1,0,136930,3,1,0,1,1,61,5
4,503031,0,1,44,0,0,35,1,1,1,...,1,0,110245,2,1,1,1,1,61,5


In [19]:
reader.columns

Index(['Unnamed: 0', 'cod_persona', 'mes', 'pais', 'sexo', 'edad', 'fecha1',
       'xti_empleado', 'xti_nuevo_cliente', 'num_antiguedad', 'xti_rel',
       'fec_ult_cli_1t', 'xti_rel_1mes', 'tip_rel_1mes', 'indresi', 'indext',
       'des_canal', 'xti_extra', 'tip_dom', 'cod_provincia',
       'xti_actividad_cliente', 'imp_renta', 'id_segmento', 'mean_engagement',
       'ind_prod1', 'ind_prod2', 'ind_prod3', 'ind_prod4', 'ind_prod5',
       'ind_prod6', 'ind_prod7', 'ind_prod8', 'ind_prod9', 'ind_prod10',
       'ind_prod11', 'ind_prod12', 'ind_prod13', 'ind_prod14', 'ind_prod15',
       'ind_prod16', 'ind_prod17', 'ind_prod18', 'ind_prod19', 'ind_prod20',
       'ind_prod21', 'ind_prod22', 'ind_prod23', 'ind_prod24', 'ind_prod25'],
      dtype='object')