# **Importacion de paquetes**
---

In [39]:
import pandas as pd
import numpy as np
import random
import matplotlib.pylab as plt
from matplotlib.ticker import MaxNLocator
import pylab as p
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import recall_score,accuracy_score,classification_report,confusion_matrix
from catboost import CatBoostClassifier
import catboost
from imblearn.over_sampling import RandomOverSampler,SMOTE

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'

with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    from sklearn import model_selection
    import xgboost as xgb

#import xgboost as xgb
import operator
import timeit
import scipy.stats as stats
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D,Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Reshape,TimeDistributed,Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

Entender que producto hay o no segun agrupaciones, por ejemplo grupos de cantidades de renta, id_segmento(universitario, no etx)

# **Preprocesamiento**
---

In [40]:
reader = pd.read_csv('../../Data/dataset_para_modelar.csv')

Sustituimos los valores anómalos

In [41]:
reader.replace(' NA', -1, inplace=True)
reader.replace('         NA', -1, inplace=True)
reader.fillna(-1, inplace=True)

Diccionarios para mapear:

In [42]:
emp_dict = {'N':0,-1:-1,'A':1,'B':2,'F':3,'S':4}
inreaderall_dict = {'N':0,-1:-1,'S':1}
sexo_dict = {'V':0,'H':1,-1:-1}
tiprel_dict = {'A':0,-1:-1,'I':1,'P':2,'N':3,'R':4}
indresi_dict = {'N':0,-1:-1,'S':1}
indext_dict = {'N':0,-1:-1,'S':1}
conyuemp_dict = {'N':0,-1:-1,'S':1}
segmento_dict = {-1:4,'01 - TOP':1,'02 - PARTICULARES':2,'03 - UNIVERSITARIO':3}

Mapeo de valores y conversion de los tipos

In [43]:
reader['cod_persona'] = reader['cod_persona'].astype(np.int32)
reader['imp_renta'] = reader['imp_renta'].astype(np.int32)
#sustituimos el 99 por 2 para que sean valores mas cercanos
reader['xti_rel'] = reader['xti_rel'].map(lambda x: 2 if x == 99 else x).astype(np.int8)

reader['xti_empleado'] = reader['xti_empleado'].map(lambda x: emp_dict[x]).astype(np.int8)
reader['sexo'] = reader['sexo'].map(lambda x: sexo_dict[x]).astype(np.int8)
reader['edad'] = reader['edad'].astype(np.int16)

In [44]:
reader['xti_nuevo_cliente'] = reader['xti_nuevo_cliente'].astype(np.int8)
reader['num_antiguedad'] = reader['num_antiguedad'].map(lambda x: -1 if x == '     NA' else x).astype(int)
reader['num_antiguedad'] = reader['num_antiguedad'].map(lambda x: -1 if x == -999999 else x).astype(np.int16)
reader['xti_rel_1mes'] = reader['xti_rel_1mes'].map(lambda x: -2 if x == 'P' else x).astype(np.float16)
reader['xti_rel_1mes'] = reader['xti_rel_1mes'].astype(np.int8)

In [45]:
reader['tip_rel_1mes'] = reader['tip_rel_1mes'].map(lambda x: tiprel_dict[x]).astype(np.int8)
reader['indresi'] = reader['indresi'].map(lambda x: indresi_dict[x]).astype(np.int8)
reader['indext'] = reader['indext'].map(lambda x: indext_dict[x]).astype(np.int8)

Convertimos des canal a numerico

In [46]:
canal_dict={value:idx+1  for idx, value in enumerate(reader['des_canal'].unique())}
reader['des_canal'] = reader['des_canal'].map(lambda x: canal_dict[x]).astype(np.int16)

In [47]:
pais_dict = {'LV': 102,'CA': 2,'GB': 9,'EC': 19,'BY': 64,'ML': 104,'MT': 118,
 'LU': 59,'GR': 39,'NI': 33,'BZ': 113,'QA': 58,'DE': 10,'AU': 63,'IN': 31,
 'GN': 98,'KE': 65,'HN': 22,'JM': 116,'SV': 53,'TH': 79,'IE': 5,'TN': 85,
 'PH': 91,'ET': 54,'AR': 13,'KR': 87,'GA': 45,'FR': 8,'SG': 66,'LB': 81,
 'MA': 38,'NZ': 93,'SK': 69,'CN': 28,'GI': 96,'PY': 51,'SA': 56,'PL': 30,
 'PE': 20,'GE': 78,'HR': 67,'CD': 112,'MM': 94,'MR': 48,'NG': 83,'HU': 106,
 'AO': 71,'NL': 7,'GM': 110,'DJ': 115,'ZA': 75,'OM': 100,'LT': 103,'MZ': 27,
 'VE': 14,'EE': 52,'CF': 109,'CL': 4,'SL': 97,'DO': 11,'PT': 26,'ES': 0,
 'CZ': 36,'AD': 35,'RO': 41,'TW': 29,'BA': 61,'IS': 107,'AT': 6,'ZW': 114,
 'TR': 70,'CO': 21,'PK': 84,'SE': 24,'AL': 25,'CU': 72,'UY': 77,'EG': 74,'CR': 32,
 'GQ': 73,'MK': 105,'KW': 92,'GT': 44,'CM': 55,'SN': 47,'KZ': 111,'DK': 76,
 'LY': 108,'AE': 37,'PA': 60,'UA': 49,'GW': 99,'TG': 86,'MX': 16,'KH': 95,
 'FI': 23,'NO': 46,'IT': 18,'GH': 88, 'JP': 82,'RU': 43,'PR': 40,'RS': 89,
 'DZ': 80,'MD': 68,-1: 1,'BG': 50,'CI': 57,'IL': 42,'VN': 90,'CH': 3,'US': 15,'HK': 34,
 'CG': 101,'BO': 62,'BR': 17,'BE': 12,'BM': 117}

In [48]:
reader['xti_extra'] = reader['xti_extra'].map(lambda x: inreaderall_dict[x]).astype(np.int8)
reader['pais'] = reader['pais'].map(lambda x: pais_dict[x]).astype(np.int8)
reader['tip_dom'] = reader['tip_dom'].astype(np.int8)
reader['cod_provincia'] = reader['cod_provincia'].astype(np.int8)

In [49]:
reader['xti_actividad_cliente'] = reader['xti_actividad_cliente'].astype(np.int8)
reader['fecha_dato_month'] = reader['mes'].map(lambda x: int(x[5:7])).astype(np.int8)
reader['fecha_dato_year'] = reader['mes'].map(lambda x: int(x[0:4]) - 2015).astype(np.int8)
reader['month_int'] = (reader['fecha_dato_month'] + 12 * reader['fecha_dato_year']).astype(np.int8)
reader.drop('mes',axis=1,inplace=True)

In [50]:
reader['fecha1'] = reader['fecha1'].map(lambda x: '2020-01-01' if x == -1 else x)
reader['fecha_alta_month'] = reader['fecha1'].map(lambda x: int(x[5:7])).astype(np.int16)
reader['fecha_alta_year'] = reader['fecha1'].map(lambda x: int(x[0:4]) - 1995).astype(np.int16)

In [51]:
reader['fecha_alta_day'] = reader['fecha1'].map(lambda x: int(x[8:10])).astype(np.int16)
reader['fecha_alta_month_int'] = (reader['fecha_alta_month'] + 12 * reader['fecha_alta_year']).astype(np.int16)
reader['fecha_alta_day_int'] = (reader['fecha_alta_day'] + 30 * reader['fecha_alta_month'] + 365 * reader['fecha_alta_year']).astype(np.int32)
reader.drop('fecha1',axis=1,inplace=True)

In [52]:
reader['fec_ult_cli_1t'] = reader['fec_ult_cli_1t'].map(lambda x: '2020-01-01' if x == -1 else x)
reader['ult_fec_cli_1t_month'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[5:7])).astype(np.int16)
reader['ult_fec_cli_1t_year'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[0:4]) - 2015).astype(np.int16)
reader['ult_fec_cli_1t_day'] = reader['fec_ult_cli_1t'].map(lambda x: int(x[8:10])).astype(np.int16)
reader['ult_fec_cli_1t_month_int'] = (reader['ult_fec_cli_1t_month'] + 12 * reader['ult_fec_cli_1t_year']).astype(np.int8)
reader.drop('fec_ult_cli_1t',axis=1,inplace=True)

In [53]:
reader['id_segmento'] = reader['id_segmento'].map(lambda x: segmento_dict[x]).astype(np.int8)
target_cols=[f'ind_prod{i}' for i in range(1,26)]
for col in target_cols:
    reader[col] = reader[col].astype(np.int8)

In [54]:
#Obtenemos los productos que si se han comprado por cada segmento
diccionario_segmentos=reader.groupby(by=['id_segmento'])[target_cols].sum().T.to_dict()
diccionario_seg_proc={}
for key,value in diccionario_segmentos.items():
    sol=[]
    for prod,res in value.items():
        if res>0:
            sol.append(prod)
    diccionario_seg_proc[key]=sol
diccionario_seg_proc

{1: ['ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod15',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod21',
  'ind_prod22',
  'ind_prod23',
  'ind_prod24',
  'ind_prod25'],
 2: ['ind_prod1',
  'ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod6',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod15',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod21',
  'ind_prod22',
  'ind_prod23',
  'ind_prod24',
  'ind_prod25'],
 3: ['ind_prod3',
  'ind_prod4',
  'ind_prod5',
  'ind_prod7',
  'ind_prod8',
  'ind_prod9',
  'ind_prod10',
  'ind_prod11',
  'ind_prod12',
  'ind_prod13',
  'ind_prod14',
  'ind_prod16',
  'ind_prod17',
  'ind_prod18',
  'ind_prod19',
  'ind_prod20',
  'ind_prod22',
 

In [55]:
reader.head()

Unnamed: 0.1,Unnamed: 0,cod_persona,pais,sexo,edad,xti_empleado,xti_nuevo_cliente,num_antiguedad,xti_rel,xti_rel_1mes,...,month_int,fecha_alta_month,fecha_alta_year,fecha_alta_day,fecha_alta_month_int,fecha_alta_day_int,ult_fec_cli_1t_month,ult_fec_cli_1t_year,ult_fec_cli_1t_day,ult_fec_cli_1t_month_int
0,0,178103,0,1,35,0,0,6,1,1,...,1,1,20,12,241,7342,1,5,1,61
1,1,503082,0,0,27,0,0,35,1,1,...,1,8,17,10,212,6455,1,5,1,61
2,2,502996,0,0,37,0,0,35,1,1,...,1,8,17,10,212,6455,1,5,1,61
3,3,503053,0,1,23,0,0,35,1,1,...,1,8,17,10,212,6455,1,5,1,61
4,4,503031,0,1,44,0,0,35,1,1,...,1,8,17,10,212,6455,1,5,1,61


In [56]:
reader.to_csv('sant_limpio.csv',index=False)

In [57]:
reader.rename(columns={'cod_persona':'id'},inplace=True)

In [58]:
cols_valor_unico=[]
muy_bajos_unos=[]
bajos_unos=[]
medios_unos=[]
for col in target_cols:
    if len(reader[col].value_counts().values)<2:
        cols_valor_unico.append(col)
    elif reader[col].value_counts().values[1]/len(reader) <0.001:
        muy_bajos_unos.append(col)
    elif reader[col].value_counts().values[1]/len(reader) <0.005:
        bajos_unos.append(col)
    else:
        medios_unos.append(col)
print('Columnas de valor unico: ',cols_valor_unico)
print('Columnas con muy poco : ',muy_bajos_unos)
print('Columnas con pocos : ',bajos_unos)
print('Columnas con intermedio : ',medios_unos)

Columnas de valor unico:  ['ind_prod2']
Columnas con muy poco :  ['ind_prod1', 'ind_prod4']
Columnas con pocos :  ['ind_prod10', 'ind_prod11', 'ind_prod17', 'ind_prod21']
Columnas con intermedio :  ['ind_prod3', 'ind_prod5', 'ind_prod6', 'ind_prod7', 'ind_prod8', 'ind_prod9', 'ind_prod12', 'ind_prod13', 'ind_prod14', 'ind_prod15', 'ind_prod16', 'ind_prod18', 'ind_prod19', 'ind_prod20', 'ind_prod22', 'ind_prod23', 'ind_prod24', 'ind_prod25']


In [59]:
reader.columns

Index(['Unnamed: 0', 'id', 'pais', 'sexo', 'edad', 'xti_empleado',
       'xti_nuevo_cliente', 'num_antiguedad', 'xti_rel', 'xti_rel_1mes',
       'tip_rel_1mes', 'indresi', 'indext', 'des_canal', 'xti_extra',
       'tip_dom', 'cod_provincia', 'xti_actividad_cliente', 'imp_renta',
       'id_segmento', 'mean_engagement', 'ind_prod1', 'ind_prod2', 'ind_prod3',
       'ind_prod4', 'ind_prod5', 'ind_prod6', 'ind_prod7', 'ind_prod8',
       'ind_prod9', 'ind_prod10', 'ind_prod11', 'ind_prod12', 'ind_prod13',
       'ind_prod14', 'ind_prod15', 'ind_prod16', 'ind_prod17', 'ind_prod18',
       'ind_prod19', 'ind_prod20', 'ind_prod21', 'ind_prod22', 'ind_prod23',
       'ind_prod24', 'ind_prod25', 'fecha_dato_month', 'fecha_dato_year',
       'month_int', 'fecha_alta_month', 'fecha_alta_year', 'fecha_alta_day',
       'fecha_alta_month_int', 'fecha_alta_day_int', 'ult_fec_cli_1t_month',
       'ult_fec_cli_1t_year', 'ult_fec_cli_1t_day',
       'ult_fec_cli_1t_month_int'],
      dtype='object

In [60]:
for col in target_cols:
    reader[col].replace(-1,0,inplace=True)
data_reducida=reader[['id','month_int'] + target_cols].copy()
data_reducida.fillna(0,inplace=True)

In [61]:
data_reducida.sort_values(by = ['id','month_int'],inplace=True)

In [62]:
#Creamos columnas con un shift sobre un dataset reducido para pruebas
DIFF_CONDS = {}
for shift_val in range(1,18):
    name = 'id_shift_' + str(shift_val)
    data_reducida[name] = data_reducida['id'].shift(shift_val).fillna(0).astype(np.int32)
    DIFF_CONDS[shift_val] = ((data_reducida['id'] - data_reducida[name]) != 0)
    data_reducida.drop(name,axis = 1,inplace=True)

for col in target_cols:
    for shift_val in range(1,18):
        name = col + '_s_' + str(shift_val)
        data_reducida[name] = data_reducida[col].shift(shift_val).fillna(0).astype(np.int8)
        data_reducida[name][DIFF_CONDS[shift_val]] = 0

In [63]:
for col in target_cols:
    data_reducida[col] = (data_reducida[col] - data_reducida[col + '_s_1']).astype(np.int8)
    data_reducida[col] = (data_reducida[col] > 0).astype(np.int8)

In [64]:
MIN_MONTH_DICT = data_reducida.groupby('id')['month_int'].min().to_dict()
data_reducida['min_month_int'] = data_reducida['id'].map(lambda x: MIN_MONTH_DICT[x])

data_reducida = data_reducida[data_reducida['min_month_int'] != data_reducida['month_int']]

data_reducida['sum_inds'] = data_reducida[target_cols].sum(axis=1)
data_reducida = data_reducida[(data_reducida['sum_inds'] != 0) | (data_reducida['month_int'] == 18)].copy()
#combined_small = combined_small[(combined_small['sum_inds'] != 0) | (combined_small['month_int'] >= 17)].copy()
data_reducida.to_csv('data_reducida_procesada.csv', index=False)

In [65]:
cols_to_combine = ['edad', 'num_antiguedad', 'des_canal', 'cod_provincia',
       'fecha_alta_day', 'fecha_alta_month', 'fecha_alta_month_int','fecha_alta_day_int',
       'fecha_alta_year', 'fecha_dato_month', 'fecha_dato_year',
       'xti_actividad_cliente',
       'xti_empleado',
       'xti_nuevo_cliente',
       'indext',
       'xti_extra', 'xti_rel', 'xti_rel_1mes', 'indresi',
       'pais', 'imp_renta', 'id_segmento', 'sexo',
       'tip_rel_1mes', 'ult_fec_cli_1t_day', 'ult_fec_cli_1t_month',
       'ult_fec_cli_1t_month_int', 'ult_fec_cli_1t_year']

#Creamos columnas con un shift sobre el dataset inicial
DIFF_CONDS = {}
for shift_val in [1]:
    name = 'id_shift_' + str(shift_val)
    reader[name] = reader['id'].shift(shift_val).fillna(0).astype(np.int32)
    DIFF_CONDS[shift_val] = ((reader['id'] - reader[name]) != 0)
    reader.drop(name,axis = 1,inplace=True)
shifted_feature_names = []
for col in cols_to_combine + target_cols:
    for shift_val in [1]:
        name = col + '_s_' + str(shift_val)
        reader[name] = reader[col].shift(shift_val).fillna(0).astype(np.int32)
        reader[name][DIFF_CONDS[shift_val]] = 0
        if col in cols_to_combine:
            shifted_feature_names.append(name)

In [66]:
reader=reader.iloc[:,1:]

In [67]:
#Creamos columnas con las varianciones de las variables X para predecir
diff_feautres_s1 = []
for col in cols_to_combine:
    name = col + '_s1_diff'
    diff_feautres_s1.append(name)
    reader[name] = (reader[col] - reader[col + '_s_1']).astype(np.int32)

In [68]:
MIN_MONTH_DICT = reader.groupby('id')['month_int'].min().to_dict()
reader['min_month_int'] = reader['id'].map(lambda x: MIN_MONTH_DICT[x]).astype(np.int8)

#Antiguedad minima del usuario
MIN_ANTIGUEDAD_DICT = reader.groupby('id')['num_antiguedad'].min().to_dict()
reader['min_antiguedad'] = reader['id'].map(lambda x: MIN_ANTIGUEDAD_DICT[x]).astype(np.int16)

#Antiguedad maxima del usuario
MAX_ANTIGUEDAD_DICT = reader.groupby('id')['num_antiguedad'].max().to_dict()
reader['max_antiguedad'] = reader['id'].map(lambda x: MAX_ANTIGUEDAD_DICT[x]).astype(np.int16)

#Edad minima del usuario
MIN_AGE_DICT = reader.groupby('id')['edad'].min().to_dict()
reader['min_edad'] = reader['id'].map(lambda x: MIN_AGE_DICT[x]).astype(np.int16)

#Edad maxima del usuario
MAX_AGE_DICT = reader.groupby('id')['edad'].max().to_dict()
reader['max_edad'] = reader['id'].map(lambda x: MAX_AGE_DICT[x]).astype(np.int16)

#Minimo maximo y desviancion estandar de la renta por cada usuario
MIN_RENTA_DICT = reader.groupby('id')['imp_renta'].min().to_dict()
reader['min_renta'] = reader['id'].map(lambda x: MIN_RENTA_DICT[x])
MAX_RENTA_DICT = reader.groupby('id')['imp_renta'].max().to_dict()
reader['max_renta'] = reader['id'].map(lambda x: MAX_RENTA_DICT[x])

In [69]:
RENTA_VAL_COUNTS = reader.groupby('imp_renta')['id'].nunique().to_dict()
reader['renta_freq'] = reader['imp_renta'].map(lambda x: RENTA_VAL_COUNTS[x])
reader.sort_values(by = ['id','month_int'],inplace=True)
#Eliminamos los id duplicados porque hemos reducido la informacion a una sola fila por usuario
combined_nd = reader.drop_duplicates('id')

In [70]:
list(combined_nd.columns)

['id',
 'pais',
 'sexo',
 'edad',
 'xti_empleado',
 'xti_nuevo_cliente',
 'num_antiguedad',
 'xti_rel',
 'xti_rel_1mes',
 'tip_rel_1mes',
 'indresi',
 'indext',
 'des_canal',
 'xti_extra',
 'tip_dom',
 'cod_provincia',
 'xti_actividad_cliente',
 'imp_renta',
 'id_segmento',
 'mean_engagement',
 'ind_prod1',
 'ind_prod2',
 'ind_prod3',
 'ind_prod4',
 'ind_prod5',
 'ind_prod6',
 'ind_prod7',
 'ind_prod8',
 'ind_prod9',
 'ind_prod10',
 'ind_prod11',
 'ind_prod12',
 'ind_prod13',
 'ind_prod14',
 'ind_prod15',
 'ind_prod16',
 'ind_prod17',
 'ind_prod18',
 'ind_prod19',
 'ind_prod20',
 'ind_prod21',
 'ind_prod22',
 'ind_prod23',
 'ind_prod24',
 'ind_prod25',
 'fecha_dato_month',
 'fecha_dato_year',
 'month_int',
 'fecha_alta_month',
 'fecha_alta_year',
 'fecha_alta_day',
 'fecha_alta_month_int',
 'fecha_alta_day_int',
 'ult_fec_cli_1t_month',
 'ult_fec_cli_1t_year',
 'ult_fec_cli_1t_day',
 'ult_fec_cli_1t_month_int',
 'edad_s_1',
 'num_antiguedad_s_1',
 'des_canal_s_1',
 'cod_provincia_s_1',

In [71]:
def model(input_shape, output_shape,dense_units=100,dense_layers=6, dropout_rate=0.5):
    model = Sequential()    
    # Añadir capas Conv2D y MaxPooling2D
    for i in range(dense_layers):
        if i==0:
            model.add(Dense(units=dense_units,activation='relu',input_dim=input_shape))
        else:
            model.add(Dense(units=dense_units,activation='relu'))
            model.add(Dropout(dropout_rate))
    # Capa de salida
    model.add(Dense(output_shape, activation='softmax'))
    
    # Compilar el modelo
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy','recall','categorical_accuracy'])
    
    return model

In [72]:
def convert_tensor(X_train,X_test,y_train,y_test):
    X_train=tf.convert_to_tensor(X_train,dtype=tf.float32)
    X_test=tf.convert_to_tensor(X_test,dtype=tf.float32)
    y_train=tf.convert_to_tensor(y_train,dtype=tf.float32)
    y_test=tf.convert_to_tensor(y_test,dtype=tf.float32)
    return X_train, X_test, y_train,y_test

Unnamed: 0,id,pais,sexo,edad,xti_empleado,xti_nuevo_cliente,num_antiguedad,xti_rel,xti_rel_1mes,tip_rel_1mes,...,ult_fec_cli_1t_month_int_s1_diff,ult_fec_cli_1t_year_s1_diff,min_month_int,min_antiguedad,max_antiguedad,min_edad,max_edad,min_renta,max_renta,renta_freq
166144,150095,0,1,40,0,1,1,1,1,0,...,61,5,6,1,10,40,41,108066,108066,1
166139,150176,0,1,78,0,1,1,1,1,0,...,61,5,6,1,10,78,79,-1,-1,11218
166130,150796,0,0,78,0,1,1,1,1,0,...,61,5,6,1,10,78,79,47495,47495,1
166109,151832,0,1,39,0,1,1,1,1,0,...,61,5,6,1,10,39,40,97075,97075,1
166111,151958,0,0,85,0,1,1,1,1,0,...,61,5,6,1,10,85,85,136289,136289,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20664,1537586,0,1,50,0,0,245,1,1,0,...,61,5,1,245,254,50,51,181501,181501,1
20665,1537634,0,0,55,0,0,245,1,1,0,...,61,5,1,245,254,55,56,267601,267601,1
20662,1537661,0,0,78,0,0,245,1,1,0,...,61,5,1,245,254,78,78,69504,69504,1
20663,1537693,0,1,65,2,0,246,1,1,0,...,61,5,1,246,255,65,66,444287,444287,1


In [77]:
otros=True
columnas_sin_target=combined_nd.columns
columnas_sin_target=[col for col in columnas_sin_target if col not in target_cols]
resultados={}
for segment in combined_nd['id_segmento'].unique():
    res_seg={}
    acc=[]
    rec=[]
    models=[]
    for prod in diccionario_seg_proc[segment]:
        if prod!='ind_prod2':
            X=combined_nd[columnas_sin_target].copy()
            y=combined_nd[prod].copy()
            tipo='0'
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            if tipo=='NN':
                X_train, X_test, y_train, y_test=convert_tensor(X_train, X_test, y_train, y_test)
                modelo=model(X_train.shape[-1],y_train.shape[-1],100,10,.5)
                callbacks = [
                EarlyStopping(monitor='val_loss', patience=3, verbose=1),  # Parada temprana si la pérdida en validación deja de disminuir
                ModelCheckpoint('modelo_epoch_{epoch:02d}.keras', save_freq='epoch', verbose=1)  # Guardar el modelo en cada época
                ]
                history=modelo.fit(X_train,y_train,epochs=2,batch_size=8,validation_data=(X_test,y_test),callbacks=callbacks)
                y_pred=modelo.predict(X_test)

                accuracy = accuracy_score(y_test.numpy(), y_pred.numpy())
                print(f'Accuracy: {accuracy}')
                recall = recall_score(y_test.numpy(), y_pred.numpy())
                print(f'Recall: {recall}')
                conf = confusion_matrix(y_test.numpy(), y_pred.numpy())
                print(f'Matriz de confusion: {conf}')


            else:
                # Identificar características categóricas
                categorical_features_indices = np.where(X.dtypes != np.float64)[0]

                # Calcular los pesos de las clases
                class_weights = [len(y) / (2 * sum(y == c)) for c in np.unique(y)]
                # Crear el modelo CatBostClassifier
                modelo = CatBoostClassifier(
                    depth=6,
                    loss_function='Logloss',
                    class_weights=class_weights,
                    verbose=1000)

                # Entrenar el modelo
                modelo.fit(X_train, y_train)

                y_pred = modelo.predict_proba(X_test)
                threshold=.5
                y_pred=[1 if p[1]>threshold else 0 for p in y_pred ]
                models.append(modelo)
                # Evaluar la precisión del modelo
                accuracy = accuracy_score(y_test, y_pred)
                acc.append(accuracy)
                print('\n','*'*50)
                print('\n PARA EL SEGMENTO: ',segment)
                print('\n','*'*50)
                print('-'*50,'\nProducto: ',prod)
                print(f'Accuracy: {accuracy}')
                recall = recall_score(y_test, y_pred)
                rec.append(recall)
                print(f'Recall: {recall}')
                conf = confusion_matrix(y_test, y_pred)
                print(f'Matriz de confusion: {conf} ','\n')
        if otros:
            faltan=25-len(acc)
            if faltan>0:
                acc+=[1]*faltan
                rec+=[1]*faltan
    resultados[segment]={'accuracy':acc,'recall':rec,'models':models}


Learning rate set to 0.04838
0:	learn: 0.6354288	total: 105ms	remaining: 1m 45s
999:	learn: 0.2821387	total: 20s	remaining: 0us

 **************************************************

 PARA EL SEGMENTO:  4

 **************************************************
-------------------------------------------------- 
Producto:  ind_prod3
Accuracy: 0.8410645575032065
Recall: 0.904480722473081
Matriz de confusion: [[2661  937]
 [ 550 5208]]  

Learning rate set to 0.04838
0:	learn: 0.6427475	total: 24.3ms	remaining: 24.3s


KeyboardInterrupt: 

In [109]:
#Media de los resultados
for segment in resultados.keys():
    print('Valores medios por segmento: ',segment)
    print('Accuracy medio: ', np.mean(resultados[segment]['accuracy']))
    print('Recall medio: ',np.mean(resultados[segment]['recall']))

Valores medios por segmento:  4
Accuracy medio:  0.9638532290306554
Recall medio:  0.908732481166843
Valores medios por segmento:  2
Accuracy medio:  0.9608985321362405
Recall medio:  0.7816858722866797
Valores medios por segmento:  3
Accuracy medio:  0.9577446655524896
Recall medio:  0.8262330727975903
Valores medios por segmento:  1
Accuracy medio:  0.9592054389649982
Recall medio:  0.793933084125231


**Resultados sin filtrado por segmento:**
- Accuracy medio:  0.9219796565483825
- Recall medio:  0.564505857250443
---

**Valores medios por segmento:  4**

- Accuracy medio:  0.8971207287795573
- Recall medio:  0.7402386002440919

**Valores medios por segmento:  2**

- Accuracy medio:  0.9217970642724812
- Recall medio:  0.5633717445733596

**Valores medios por segmento:  3**

- Accuracy medio:  0.9070382642154767
- Recall medio:  0.6177127601546981

**Valores medios por segmento:  1**

- Accuracy medio:  0.9147022814722686
- Recall medio:  0.5691328122618469

In [113]:
import os

In [114]:
for segment in combined_nd['id_segmento'].unique():
    os.mkdir(f'Models/{segment}')
    for prods,models  in zip(diccionario_seg_proc[segment],resultados[segment]['models']):
        raiz=f'Models/{segment}/{prods}.cb'
        models.save_model(raiz)

In [81]:
import os
modelos={}
for segment in combined_nd['id_segmento'].unique():
    lista_mods=[]
    for model in os.listdir(f'Models/{segment}'):
        raiz=f'Models/{segment}/{model}'
        cat = CatBoostClassifier()
        cat.load_model(raiz)
        lista_mods.append(cat)
    modelos[segment]=lista_mods

In [114]:
prods_anterior=[f'ind_prod{i}_s_1' for i in range(1,26)]
data_para_predecir=combined_nd.copy()
data_para_predecir.reset_index(inplace=True)
for i in range(1,26):
    data_para_predecir[f'ind_prod{i}_s_1']=data_para_predecir[f'ind_prod{i}']


In [218]:
data_para_predecir.iloc[:,100:].describe()

Unnamed: 0,ind_prod15_s_1,ind_prod16_s_1,ind_prod17_s_1,ind_prod18_s_1,ind_prod19_s_1,ind_prod20_s_1,ind_prod21_s_1,ind_prod22_s_1,ind_prod23_s_1,ind_prod24_s_1,...,ult_fec_cli_1t_month_int_s1_diff,ult_fec_cli_1t_year_s1_diff,min_month_int,min_antiguedad,max_antiguedad,min_edad,max_edad,min_renta,max_renta,renta_freq
count,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,...,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0
mean,0.004767,0.007931,0.002373,0.040659,0.038436,0.021847,0.003271,0.041921,0.046388,0.103444,...,60.990231,4.999081,3.408602,70.56303,79.183416,39.298809,40.031617,103509.4,103512.6,2691.120439
std,0.06888,0.088703,0.048655,0.197502,0.192249,0.146187,0.057097,0.20041,0.210327,0.304541,...,0.705011,0.066516,3.632328,66.734715,67.188528,17.230801,17.22744,295297.5,295297.2,4789.227956
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,1.0,-1.0,0.0,-1.0,2.0,-1.0,-1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,61.0,5.0,1.0,12.0,21.0,24.0,24.0,28260.0,28263.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,61.0,5.0,1.0,44.0,53.0,37.0,38.0,79318.0,79326.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,61.0,5.0,7.0,125.0,134.0,50.0,50.0,133780.0,133784.0,3.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,61.0,5.0,16.0,246.0,255.0,117.0,117.0,28894400.0,28894400.0,11218.0


In [219]:
#Calculamos la prediccion de cada persona en funcion de su segmento, si en el segmento se ha considerado que un producto no se predice ponemos un 0
predicciones_sin_procesar={}
for segment in data_para_predecir['id_segmento'].unique():
    predicciones={}
    X=data_para_predecir[columnas_sin_target].copy()
    for idx,prod in enumerate(diccionario_seg_proc[segment]):
        model=modelos[segment][idx]
        res=model.predict_proba(X)
        predicciones[prod]=res

    for prod in set(set(target_cols)).difference(diccionario_seg_proc[segment]):
        predicciones[prod]=np.zeros(X.shape[0])
    predicciones_sin_procesar[segment]=predicciones

In [220]:
p={}
for persona in data_para_predecir['id'].unique():
    data=data_para_predecir[data_para_predecir['id']==persona]
    indice_persona=data.index[0]
    for key, value in predicciones_sin_procesar.items():
        if data['id_segmento'].values[0]==key:
            seg_pred=predicciones_sin_procesar[key]
            soluciones={}
            for k,v in seg_pred.items():
                if isinstance(v[indice_persona],np.ndarray): #Si es un array devolvemos el segundo elemento que es la probabilidad de la clase 1
                    soluciones[k]=v[indice_persona][1]
                else:
                    soluciones[k]=v[indice_persona]
    p[persona]=soluciones
    

In [221]:
#Convertimos las predicciones a un df
columnas=target_cols + ['id']
df_final = pd.DataFrame(columns=columnas)
for id,k in enumerate(p.keys()):
    dt={j:[v] for j,v in p[k].items()}
    dt['id']=k
    predicciones_df=pd.DataFrame(dt,columns=columnas)
    df_final=pd.concat([df_final,predicciones_df],axis=0)
df_final.head()

Unnamed: 0,ind_prod1,ind_prod2,ind_prod3,ind_prod4,ind_prod5,ind_prod6,ind_prod7,ind_prod8,ind_prod9,ind_prod10,...,ind_prod17,ind_prod18,ind_prod19,ind_prod20,ind_prod21,ind_prod22,ind_prod23,ind_prod24,ind_prod25,id
0,0.0,0.0,5.5e-05,0.0,4e-06,3.8e-05,2.011451e-06,4.3e-05,0.0,3.010401e-06,...,0.0,0.000276,1.5e-05,0.0,0.0,0.0,0.0,2.4e-05,3e-06,5514
0,0.0,0.0,6.4e-05,0.0,2e-06,8.9e-05,1.837039e-07,5e-05,0.0,9.174524e-07,...,0.0,0.000132,4e-06,0.0,0.0,0.0,0.0,1.6e-05,1.2e-05,5541
0,0.0,0.0,3e-05,0.0,2e-06,0.000149,1.976035e-06,4.3e-05,0.0,2.26557e-05,...,0.0,0.000491,7e-06,0.0,0.0,0.0,0.0,0.000106,2e-06,5631
0,0.0,0.0,0.000103,0.0,3e-06,0.000158,1.94927e-07,3.8e-05,0.0,2.818644e-06,...,0.0,0.000341,4e-06,0.0,0.0,0.0,0.0,1.1e-05,1.2e-05,5656
0,0.0,0.0,3e-05,0.0,3e-06,0.000725,5.368101e-06,3.8e-05,0.0,3.896319e-05,...,0.0,0.001075,1.2e-05,0.0,0.0,0.0,0.0,5.9e-05,3e-06,5738


In [222]:
data_para_predecir[f'ind_prod{3}'].describe()

count    46779.000000
mean         0.613053
std          0.487057
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: ind_prod3, dtype: float64

In [223]:
df_final.to_csv('pruebita.csv',index=False)

In [247]:
data=pd.read_csv('pruebita.csv')

In [248]:
data.describe()

Unnamed: 0,ind_prod1,ind_prod2,ind_prod3,ind_prod4,ind_prod5,ind_prod6,ind_prod7,ind_prod8,ind_prod9,ind_prod10,...,ind_prod17,ind_prod18,ind_prod19,ind_prod20,ind_prod21,ind_prod22,ind_prod23,ind_prod24,ind_prod25,id
count,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,...,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0
mean,0.004653301,0.0,0.1227177,0.18594,0.2209158,0.10026,0.122704,0.05375842,0.130493,0.01400396,...,0.02240358,0.1863351,0.01350813,0.01183432,0.009505905,0.05920851,0.09460075,0.2239901,0.009960844,675368.9
std,0.05113982,0.0,0.2572895,0.296086,0.3557951,0.215157,0.2596775,0.1879811,0.265848,0.1088765,...,0.1319669,0.305168,0.07584959,0.07915135,0.06483724,0.1704123,0.2176879,0.3322274,0.07199743,446135.8
min,0.0,0.0,1.222353e-10,0.0,7.584996e-08,0.0,1.002237e-10,2.912019e-11,0.0,2.635886e-12,...,0.0,1.696759e-08,3.997009e-11,0.0,0.0,0.0,0.0,4.445376e-07,4.248143e-11,5514.0
25%,0.0,0.0,9.544491e-07,3.1e-05,3.567097e-05,0.0,2.084432e-05,3.834741e-06,1e-06,1.719861e-06,...,1.79255e-07,0.0007142106,1.957434e-06,4.830547e-09,0.0,5.868473e-08,2.457407e-07,0.0009406846,4.935316e-07,293381.5
50%,1.335811e-08,0.0,4.566914e-05,0.002178,0.0009949808,0.000193,0.0002011072,4.934382e-05,4.4e-05,5.204666e-05,...,1.009342e-06,0.009264848,1.707611e-05,9.482667e-08,2.575673e-08,2.67562e-06,1.847125e-05,0.01356324,9.277251e-06,588304.0
75%,9.166861e-06,0.0,0.008897401,0.298931,0.402589,0.03089,0.04908843,0.0036898,0.020641,0.0001135096,...,7.440108e-06,0.2588512,0.0004818435,8.010995e-06,0.0001041394,0.006165099,0.02710655,0.45195,0.0004087941,1070194.0
max,0.9997809,0.0,0.9993425,0.99103,0.9964295,0.98808,0.9971599,0.9997373,0.989236,0.9999902,...,0.9998301,0.9951061,0.9995094,0.9994665,0.9994041,0.9964837,0.9958459,0.9854988,0.999745,1537800.0


In [249]:
targets_prev=[f'ind_prod{i}_s_1' for i in range(1,26)]
data[targets_prev]=data_para_predecir[target_cols]

In [250]:
data.describe()

Unnamed: 0,ind_prod1,ind_prod2,ind_prod3,ind_prod4,ind_prod5,ind_prod6,ind_prod7,ind_prod8,ind_prod9,ind_prod10,...,ind_prod16_s_1,ind_prod17_s_1,ind_prod18_s_1,ind_prod19_s_1,ind_prod20_s_1,ind_prod21_s_1,ind_prod22_s_1,ind_prod23_s_1,ind_prod24_s_1,ind_prod25_s_1
count,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,...,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0,46779.0
mean,0.004653301,0.0,0.1227177,0.18594,0.2209158,0.10026,0.122704,0.05375842,0.130493,0.01400396,...,0.007931,0.002373,0.040659,0.038436,0.021847,0.003271,0.041921,0.046388,0.103444,0.055132
std,0.05113982,0.0,0.2572895,0.296086,0.3557951,0.215157,0.2596775,0.1879811,0.265848,0.1088765,...,0.088703,0.048655,0.197502,0.192249,0.146187,0.057097,0.20041,0.210327,0.304541,0.228239
min,0.0,0.0,1.222353e-10,0.0,7.584996e-08,0.0,1.002237e-10,2.912019e-11,0.0,2.635886e-12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,9.544491e-07,3.1e-05,3.567097e-05,0.0,2.084432e-05,3.834741e-06,1e-06,1.719861e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.335811e-08,0.0,4.566914e-05,0.002178,0.0009949808,0.000193,0.0002011072,4.934382e-05,4.4e-05,5.204666e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.166861e-06,0.0,0.008897401,0.298931,0.402589,0.03089,0.04908843,0.0036898,0.020641,0.0001135096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.9997809,0.0,0.9993425,0.99103,0.9964295,0.98808,0.9971599,0.9997373,0.989236,0.9999902,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [251]:

for i in range(25):
    # Para verificar si el valor es 0 y asignar 0 en la columna correspondiente
    data.loc[data[targets_prev[i]] == 1, target_cols[i]] = 0

In [253]:
data.drop(targets_prev,axis=1,inplace=True)

In [254]:
data.head()

Unnamed: 0,ind_prod1,ind_prod2,ind_prod3,ind_prod4,ind_prod5,ind_prod6,ind_prod7,ind_prod8,ind_prod9,ind_prod10,...,ind_prod17,ind_prod18,ind_prod19,ind_prod20,ind_prod21,ind_prod22,ind_prod23,ind_prod24,ind_prod25,id
0,0.0,0.0,5.5e-05,0.0,4e-06,3.8e-05,2.011451e-06,4.3e-05,0.0,3.010401e-06,...,0.0,0.000276,1.5e-05,0.0,0.0,0.0,0.0,2.4e-05,3e-06,5514
1,0.0,0.0,6.4e-05,0.0,2e-06,8.9e-05,1.837039e-07,5e-05,0.0,9.174524e-07,...,0.0,0.000132,4e-06,0.0,0.0,0.0,0.0,1.6e-05,1.2e-05,5541
2,0.0,0.0,3e-05,0.0,2e-06,0.000149,1.976035e-06,4.3e-05,0.0,2.26557e-05,...,0.0,0.000491,7e-06,0.0,0.0,0.0,0.0,0.000106,2e-06,5631
3,0.0,0.0,0.000103,0.0,3e-06,0.000158,1.94927e-07,3.8e-05,0.0,2.818644e-06,...,0.0,0.000341,4e-06,0.0,0.0,0.0,0.0,1.1e-05,1.2e-05,5656
4,0.0,0.0,3e-05,0.0,3e-06,0.000725,5.368101e-06,3.8e-05,0.0,3.896319e-05,...,0.0,0.001075,1.2e-05,0.0,0.0,0.0,0.0,5.9e-05,3e-06,5738


In [255]:
data.to_csv('Predicciones_correctas_sin_procesar.csv',index=False)

In [268]:
data=pd.read_csv('Predicciones_correctas_sin_procesar.csv')

In [269]:
def row_to_ordered_dict(row):
    row_dict = row.to_dict()
    sorted_dict = dict(sorted(row_dict.items(), key=lambda item: item[1], reverse=True))
    return sorted_dict

# Crear una nueva columna con los diccionarios ordenados
data['predicted'] = data.apply(row_to_ordered_dict, axis=1)

In [271]:
data['predicted']= data['predicted'].apply(lambda x: [k for k,val in x.items() if (val>.5) & (k!='id')] )

In [276]:
data.rename(columns={'id':'cod_persona'},inplace=True)

In [277]:
data.to_csv('predicciones.csv',index=False)

In [278]:
for i in range(25):
    data.loc[data[target_cols[i]] >= .5,target_cols[i]]=1
    data.loc[data[target_cols[i]] < .5,target_cols[i]]=0

In [280]:
data.to_csv('soluciones.csv',index=False)