## Imports

In [None]:
print("Starting")

import datetime
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import boto3
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error,average_precision_score,mean_squared_error
import joblib
import os

import seaborn as sns
sns.set(font_scale = 1)

import warnings
from sklearn.preprocessing import label_binarize
from multiprocessing import  cpu_count,Pool
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from lifetimes import ModifiedBetaGeoFitter
from lifetimes import GammaGammaFitter
from dateutil.relativedelta import relativedelta 
from lifetimes.utils import summary_data_from_transaction_data
import re
from unicodedata import normalize
import gc
from sklearn.utils import class_weight
from melitk.fda import workspace
import time
import shutil
from base64 import b64decode
import sys
sys.path.append(os.path.dirname(os.path.expanduser(".")))
sys.path.append(os.path.dirname(os.path.expanduser("..")))
sys.path.append(os.path.dirname(os.path.expanduser("../defines")))
from defines import *


# Download the shared `utils` helper module from S3 to the working directory,
# presumably so that the `import utils` below can resolve it.
# NOTE(review): this imports code fetched at runtime; confirm the bucket is
# trusted and consider pinning a known version of the file.
s3 = boto3.client('s3')
s3.download_file("fury-data-apps", "marketing-utils/pzenone/utils.py","utils.py")
import utils

# Name of the binary label column created and read by the training cells.
target_col = "churn"

# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably a train/predict switch - confirm before relying on it.
TRAIN = True

from preprocessing import *
from train_utils import train_combinations,simple_fit,foo_evaluation_classifier,foo_model_classifier,foo_predict_classifier,anti_dummies,caster,picture

In [None]:
# BigQuery / Storage clients.
# The auth payload is read base64-encoded from the SECRET_MODELLING_BIGQUERY
# environment variable; a missing variable raises KeyError here.
AUTH_BIGQUERY = b64decode(os.environ['SECRET_MODELLING_BIGQUERY'])

from google_cloud import BigQuery
from google_cloud import Storage

# `bqs` is the client used by the rest of the notebook for logging
# (`print_gcps`) and file transfer (`upload_file` / `download_file`).
bq = BigQuery(AUTH_BIGQUERY)
bqs = Storage(AUTH_BIGQUERY)

## Levanto todos los países y calculo con BTYD

In [None]:
def preprocess(summary_cal):
    """Run every feature-building step over ``summary_cal`` and return the result.

    The list of steps depends on the module-level ``PAIS``: 'MLB' and 'MLA'
    get three extra leading steps, and 'MLB' additionally gets ``reg_data``.
    Each step is called as ``step(bqs, gcps_path_in, summary_cal, "test")``
    and its return value becomes the input of the next step. The start time,
    the step name, the current frame shape and the elapsed seconds are logged
    through ``bqs.print_gcps``.

    Relies on the globals ``PAIS``, ``bqs``, ``gcps_path_in`` and
    ``bq_log_path``.
    """
    # Country-specific steps go first, followed by the steps every country runs.
    pipeline = []
    if PAIS in ('MLB', 'MLA'):
        pipeline += [prepaid, asset_mgm, install_mp]
    if PAIS == 'MLB':
        pipeline.append(reg_data)
    pipeline += [
        orders_frequencies, IPT, locations, demograficos, compras_shipping,
        target, tarjetas, visitas_short, install_ml, payments, sellers,
        asp_pareto,
    ]

    for step in pipeline:
        began = time.time()
        bqs.print_gcps("--------------------", bq_log_path=bq_log_path, silent=True)
        bqs.print_gcps("Start time:", datetime.datetime.now(),
                       bq_log_path=bq_log_path, silent=True)
        bqs.print_gcps("Starting:", step.__name__, "Summary size:", summary_cal.shape,
                       bq_log_path=bq_log_path)
        summary_cal = step(bqs, gcps_path_in, summary_cal, "test")
        elapsed = time.time() - began
        bqs.print_gcps("tiempo en resolver:", elapsed,
                       bq_log_path=bq_log_path, silent=True)

    return summary_cal

In [None]:
def preprocess_all(bqs,gcps_path_in):
    """Load the ``summary_cal`` parquet, preprocess it and persist the result.

    The processed frame is pickled locally as ``summary_train.pkl``, uploaded
    with ``bqs.upload_file`` to ``gcps_path_in + "summary_train.pkl"`` and
    returned.

    Note that ``preprocess`` reads the module-level ``bqs`` and
    ``gcps_path_in`` instead of receiving the arguments given here.
    """
    pickle_name = "summary_train.pkl"

    processed = preprocess(read_gfs_parquet(gcps_path_in, bqs, 'summary_cal'))
    processed.to_pickle(pickle_name)
    bqs.upload_file(pickle_name, gcps_path_in + pickle_name)

    return processed

In [None]:
def preparo_datos(summary_cal):
    """Split the feature columns into numeric and categorical and cast them.

    'CUS_CUST_ID', the module-level ``target_col``, 'GMV_TARGET' and
    'APP_INSTALL' are left out of both groups and are not cast. Of the
    remaining columns, those whose dtype is float32, float64, int32 or int64
    are cast to ``float``; every other column is cast to ``str``. The casts
    are written back into ``summary_cal`` itself.

    Returns:
        ``(summary_cal, cates, train_cols_nocat)``: the same frame, the list
        of categorical column names and the list of numeric column names,
        both in the frame's column order.
    """
    numeric_kinds = [np.dtype(kind) for kind in ('float32', 'float64', 'int32', 'int64')]

    # Identifier, label and columns that must never be used as features.
    candidates = list(summary_cal.columns)
    for skipped in ('CUS_CUST_ID', target_col, 'GMV_TARGET', 'APP_INSTALL'):
        if skipped in candidates:
            candidates.remove(skipped)

    # Classify on the dtypes as they are before any cast happens.
    col_dtypes = summary_cal.dtypes
    train_cols_nocat = []
    cates = []
    for col in candidates:
        if col_dtypes[col] in numeric_kinds:
            train_cols_nocat.append(col)
        else:
            cates.append(col)

    for col in train_cols_nocat:
        summary_cal[col] = summary_cal[col].astype(float)
    for col in cates:
        summary_cal[col] = summary_cal[col].astype(str)

    return summary_cal, cates, train_cols_nocat


In [None]:
# Preprocess parquet files: builds the feature table, writes it locally as
# summary_train.pkl, uploads it under gcps_path_in and keeps it in memory.
summary_cal = preprocess_all(bqs,gcps_path_in)

### Entrenamiento Modelo de selección de Variables

In [None]:
summary_cal = pd.read_pickle("summary_train.pkl")

# Drop the per-country excluded variables (EXCLUDE_VARIABLES is defined in defines.py).
summary_cal = summary_cal[[x for x in summary_cal.columns if x not in EXCLUDE_VARIABLES[PAIS]]]

# Safety net: re-apply the filter already applied in the ETL pivot.
# The selection used to be computed and then discarded, so it filtered nothing;
# it is now assigned back (as an independent copy, so the label column can be
# added without writing into a slice).
summary_cal = summary_cal.loc[summary_cal.freq_180D < 15].copy()

# Label: 1 when frequency_eval is below int(freq_180D / 2), else 0.
summary_cal[target_col] = np.where((summary_cal["freq_180D"]/2).astype(int) >  summary_cal["frequency_eval"] ,1,0)

# CUS_CUST_ID is not relevant; GMV_TARGET and frequency_eval would leak the target.
train_cols = [x for x in summary_cal.columns if x not in  ['CUS_CUST_ID','GMV_TARGET','frequency_eval']]

# preparo_datos casts columns in place, so hand it a copy rather than a slice.
summary_cal,cates,train_cols_nocat = preparo_datos(summary_cal[train_cols].copy())

# Stratified split: 15% validation, then 15% of the remainder as test.
train, valid= train_test_split(summary_cal,test_size=0.15,random_state=0,stratify=summary_cal[target_col].values)
train, test= train_test_split(train,test_size=0.15,random_state=0,stratify=train[target_col].values)

# Persist the three splits to help debug the model.
train.to_pickle("train.pkl")
test.to_pickle("test.pkl")
# Fixed: the test split used to be written here, so valid.pkl duplicated test.pkl.
valid.to_pickle("valid.pkl")

bqs.upload_file("train.pkl", gcps_path_out + "train.pkl")
bqs.upload_file("test.pkl", gcps_path_out + "test.pkl")
bqs.upload_file("valid.pkl", gcps_path_out + "valid.pkl")

bqs.print_gcps("Churn Rate:\n",summary_cal[target_col].value_counts()/summary_cal.shape[0],bq_log_path=bq_log_path)
proporcion = summary_cal[target_col].value_counts()[1]/summary_cal.shape[0]  # share of rows labelled as churn

In [None]:
# Feature matrices: numeric columns first, categorical columns last.
X_train_, X_valid_, X_test = (
    np.c_[part[train_cols_nocat].values, part[cates].values]
    for part in (train, valid, test)
)
y_train_, y_valid_, y_test = (
    np.array(part[target_col]) for part in (train, valid, test)
)

# Positions of the categorical columns: the trailing len(cates) slots.
cates_ind = list(range(X_train_.shape[1] - len(cates), X_train_.shape[1]))

# Fixed model settings shared by every candidate.
# Options left disabled, kept for reference: 'class_weights': weights_l;
# 'rsm': 0.1 (careful: this only applies with more than 100 features);
# 'max_ctr_complexity': 2.
basic_params = dict(
    use_best_model=True,
    loss_function='Logloss',
    eval_metric='Logloss',
    verbose=False,
    boosting_type="Plain",
    bootstrap_type="Bernoulli",
    iterations=500,
    random_state=45,
)

# Candidate values handed to train_combinations.
hyperparams = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    depth=[1, 3, 5, 7, 10],
    l2_leaf_reg=[1, 10, 50, 100, 150, 300, 500, 750, 1000],
    border_count=[5, 10, 30, 60, 100, 150, 200],
)

# Type associated with each integer-valued hyperparameter.
d_types = dict.fromkeys(('depth', 'l2_leaf_reg', 'border_count'), int)

# Hyperparameter search, then a fit with the selected parameters.
params = train_combinations(
    X_train_, X_valid_, y_train_, y_valid_, "average_precision",
    foo_predict_classifier, foo_model_classifier, basic_params, d_types,
    hyperparams, cates_ind, n_iter=SELECT_VARIABLES_STEPS,
)
best_model = simple_fit(
    params, X_train_, y_train_, X_valid_, y_valid_, foo_model_classifier,
    basic_params, d_types, cates_ind,
)

error_test = foo_evaluation_classifier(y_test, foo_predict_classifier(best_model, X_test))
error_train = foo_evaluation_classifier(y_train_, foo_predict_classifier(best_model, X_train_))
error_validation = foo_evaluation_classifier(y_valid_, foo_predict_classifier(best_model, X_valid_))

for title, score in (
    ("Error Test:\n", error_test),
    ("Error Validation:\n", error_validation),
    ("Error Train:\n", error_train),
):
    bqs.print_gcps(title, score, bq_log_path=bq_log_path)

# Save the result locally. This model is deliberately not uploaded because it
# is the one trained with very few iterations.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

importance = picture(
    np.r_[X_train_, X_valid_], np.r_[y_train_, y_valid_], X_test, y_test,
    cates_ind, best_model, foo_model_classifier, train_cols_nocat, cates,
)

# Most important feature first; read back by the variable-filtering step.
importance.sort_values('Value', ascending=False).to_csv("importance_short.csv", index=False)

### Filtrado de variables por importancia

In [None]:

# Feature importances from the variable-selection model, most important first.
imp = pd.read_csv('importance_short.csv').sort_values('Value', ascending = False)

if imp.empty:
    raise ValueError("importance_short.csv has no rows")

# Values are looked up by column name rather than by position, so the result
# does not depend on the column order of the CSV.
# NOTE(review): this used to read row 1 of the descending sort, i.e. the
# second-largest importance, and failed with a single feature. It now takes
# the true maximum, which can only tighten the `max_importance / 1000` filter.
max_importance = imp['Value'].max()

# Importance obtained by the 'rand' feature, used as the noise floor.
rand_rows = imp.loc[imp.Feature == 'rand', 'Value']
if rand_rows.empty:
    raise ValueError("Feature 'rand' not found in importance_short.csv")
rand_importance = rand_rows.iloc[0]

print(f"max_importance: {max_importance}")
print(f"rand_importance: {rand_importance}")

# Filters used to pick the variables for the full training run:
# strictly above the 'rand' feature, at least max_importance / 1000, and non-zero.
mayores_a_rand = list(imp.loc[imp.Value > rand_importance].Feature)
mayores_orden_4 = list(imp.loc[imp.Value >= (max_importance / 1000)].Feature)
distinto_0 = list(imp.loc[imp.Value > 0.0].Feature)

# Keep the features passing all three filters, in importance order.
selected = set(mayores_a_rand) & set(mayores_orden_4) & set(distinto_0)
total_vars = [x for x in imp.Feature if x in selected]

pd.DataFrame(total_vars, columns = ['Feature']).to_csv("features_model.csv", index = False)

# Upload the selected feature list to Storage.
# NOTE(review): the previous comment said the list includes the cluster and
# the target column; nothing here adds them (the target column is appended
# when the CSV is read back for the final training).
bqs.upload_file("features_model.csv", gcps_path_out + "features_model.csv")

### Entrenamiento Modelo con variables filtradas

In [None]:
bqs.download_file(gcps_path_in + "summary_train.pkl", "summary_train.pkl")
summary_cal = pd.read_pickle("summary_train.pkl")

# Safety net: re-apply the filter already applied in the ETL pivot.
# The selection used to be computed and then discarded, so it filtered nothing;
# it is now assigned back (as an independent copy, so the label column can be
# added without writing into a slice).
summary_cal = summary_cal.loc[summary_cal.freq_180D < 15].copy()

# Label: 1 when frequency_eval is below int(freq_180D / 2), else 0.
summary_cal[target_col] = np.where((summary_cal["freq_180D"]/2).astype(int) >  summary_cal["frequency_eval"] ,1,0)

# Train only on the features kept by the importance filter, plus the label.
train_cols = list(pd.read_csv("features_model.csv").Feature) + [target_col]

# preparo_datos casts columns in place, so hand it a copy rather than a slice.
summary_cal,cates,train_cols_nocat = preparo_datos(summary_cal[train_cols].copy())

# Stratified split: 15% validation, then 15% of the remainder as test.
train, valid= train_test_split(summary_cal,test_size=0.15,random_state=0,stratify=summary_cal[target_col].values)
train, test= train_test_split(train,test_size=0.15,random_state=0,stratify=train[target_col].values)

# Persist the three splits to help debug the model.
train.to_pickle("train.pkl")
test.to_pickle("test.pkl")
# Fixed: the test split used to be written here, so valid.pkl duplicated test.pkl.
valid.to_pickle("valid.pkl")

bqs.upload_file("train.pkl", gcps_path_out + "train.pkl")
bqs.upload_file("test.pkl", gcps_path_out + "test.pkl")
bqs.upload_file("valid.pkl", gcps_path_out + "valid.pkl")

bqs.print_gcps("Churn Rate:\n",summary_cal[target_col].value_counts()/summary_cal.shape[0],bq_log_path=bq_log_path)
proporcion = summary_cal[target_col].value_counts()[1]/summary_cal.shape[0]  # share of rows labelled as churn

# Feature matrices: numeric columns first, categorical columns last.
X_train_, X_valid_, X_test = (
    np.c_[part[train_cols_nocat].values, part[cates].values]
    for part in (train, valid, test)
)
y_train_, y_valid_, y_test = (
    np.array(part[target_col]) for part in (train, valid, test)
)

# Positions of the categorical columns: the trailing len(cates) slots.
cates_ind = list(range(X_train_.shape[1] - len(cates), X_train_.shape[1]))

# Fixed model settings shared by every candidate.
# Options left disabled, kept for reference: 'class_weights': weights_l;
# 'rsm': 0.1 (careful: this only applies with more than 100 features);
# 'max_ctr_complexity': 2.
basic_params = dict(
    use_best_model=True,
    loss_function='Logloss',
    eval_metric='Logloss',
    verbose=False,
    boosting_type="Plain",
    bootstrap_type="Bernoulli",
    iterations=500,
    random_state=45,
)

# Candidate values handed to train_combinations.
hyperparams = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    depth=[1, 3, 5, 7, 10],
    l2_leaf_reg=[1, 10, 50, 100, 150, 300, 500, 750, 1000],
    border_count=[5, 10, 30, 60, 100, 150, 200],
)

# Type associated with each integer-valued hyperparameter.
d_types = dict.fromkeys(('depth', 'l2_leaf_reg', 'border_count'), int)

# Hyperparameter search, then a fit with the selected parameters.
params = train_combinations(
    X_train_, X_valid_, y_train_, y_valid_, "average_precision",
    foo_predict_classifier, foo_model_classifier, basic_params, d_types,
    hyperparams, cates_ind, n_iter=FULL_TRAINING_STEPS,
)
best_model = simple_fit(
    params, X_train_, y_train_, X_valid_, y_valid_, foo_model_classifier,
    basic_params, d_types, cates_ind,
)

error_test = foo_evaluation_classifier(y_test, foo_predict_classifier(best_model, X_test))
error_train = foo_evaluation_classifier(y_train_, foo_predict_classifier(best_model, X_train_))
error_validation = foo_evaluation_classifier(y_valid_, foo_predict_classifier(best_model, X_valid_))

for title, score in (
    ("Error Test:\n", error_test),
    ("Error Validation:\n", error_validation),
    ("Error Train:\n", error_train),
):
    bqs.print_gcps(title, score, bq_log_path=bq_log_path)

# Save the final model locally and upload it.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
bqs.upload_file("model.pkl", gcps_path_out + "model.pkl")

importance = picture(
    np.r_[X_train_, X_valid_], np.r_[y_train_, y_valid_], X_test, y_test,
    cates_ind, best_model, foo_model_classifier, train_cols_nocat, cates,
)
# NOTE(review): lgbmImportance.png is presumably written by picture() - confirm.
bqs.upload_file("lgbmImportance.png", gcps_path_out + "lgbmImportance.png")
# The importance table itself is not written to disk in this step.

### Threshold Óptimo - Precision Recall Curve

In [None]:
# Look for the operating threshold: build a precision/recall table per split.
from sklearn.metrics import precision_score,recall_score


def _threshold_table(frame):
    """Return precision, recall and flagged share of ``frame`` per threshold.

    ``frame`` must hold the label column ``target_col`` and a ``churn_proba``
    column. For every threshold in ``np.arange(0.001, 0.99, 0.01)`` a row is
    flagged when ``churn_proba >= threshold``; the result has one row per
    threshold with the columns ``tresh``, ``precision``, ``recall`` and
    ``cantidad`` (fraction of rows flagged).
    """
    labels = frame[target_col]
    rows = []
    for t in np.arange(0.001,0.99,0.01):
        # Compute the predictions once per threshold and reuse them.
        flagged = np.where(frame.churn_proba >= t,1,0)
        rows.append({"tresh":t,
                     "precision":precision_score(labels,flagged),
                     "recall":recall_score(labels,flagged),
                     "cantidad":int(flagged.sum())/frame.shape[0]})
    return pd.DataFrame(rows)


# Score every split with the positive-class probability.
test["churn_proba"] = np.array(best_model.predict_proba(X_test)[:,1])
valid["churn_proba"] = np.array(best_model.predict_proba(X_valid_)[:,1])
train["churn_proba"] = np.array(best_model.predict_proba(X_train_)[:,1])

df_tresh_test = _threshold_table(test)
df_tresh_valid = _threshold_table(valid)
df_tresh_train = _threshold_table(train)

df_tresh_train.to_csv("errores_train.csv", index = False)
df_tresh_valid.to_csv("errores_validation.csv", index = False)
df_tresh_test.to_csv("errores_test.csv", index = False)


bqs.upload_file("errores_train.csv", gcps_path_out + "errores_train.csv")
bqs.upload_file("errores_validation.csv", gcps_path_out + "errores_validation.csv")
bqs.upload_file("errores_test.csv", gcps_path_out + "errores_test.csv")

# Lowest threshold whose precision on the test split is at least 0.6.
# NOTE(review): the threshold is chosen on the test split, so the test metrics
# reported for it are optimistic; consider choosing it on the validation split.
eligible = df_tresh_test.loc[df_tresh_test.precision >= 0.6]
if eligible.empty:
    # Previously this case surfaced as an opaque IndexError further down.
    raise ValueError("No threshold reaches a precision of 0.6 on the test split")
best_row = eligible.loc[eligible["tresh"].idxmin()]
tresh = best_row["tresh"]
cant = best_row["cantidad"]
pre = best_row["precision"]
recall = best_row["recall"]

In [None]:

# One-row summary of the chosen operating point.
# Built with the DataFrame constructor instead of DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
thresh_cant_pre = pd.DataFrame([{'Threshold': tresh,'Cantidad':cant,'Precision':pre}],
                               columns = ['Threshold','Cantidad','Precision'])
thresh_cant_pre.to_csv("thresh_cant_pre.csv", index = False)
bqs.upload_file("thresh_cant_pre.csv", gcps_path_out + "thresh_cant_pre.csv")

In [None]:
# Precision-recall curve of the training split, one point per candidate
# threshold stored in df_tresh_train.
sns.lineplot(data=df_tresh_train, x="recall", y="precision")

In [None]:
sns.set(font_scale = 1)
plt.figure(figsize=(10,10))

# Number of equally sized score buckets.
bines = 50

# Rank the test-split scores (ties broken by order of appearance) and cut the
# ranks into `bines` quantile buckets labelled 0..bines-1.
df2 = test.copy()
score_rank = df2['churn_proba'].rank(method='first')
df2['buckets'] = pd.qcut(score_rank, bines, labels=list(range(bines)))

# Observed churn rate per bucket, with the overall rate as a horizontal line.
by_bucket = df2.groupby('buckets')
agregados = by_bucket.churn.mean()
overall_rate = df2.churn.mean()
plt.plot(agregados)
plt.hlines(y=overall_rate, xmin=0, xmax=bines-1, color="black")

# Lowest bucket whose churn rate exceeds the overall rate; the smallest score
# in that bucket is printed as the suggested threshold.
tresh_ind = agregados[agregados > overall_rate].index.min()
print("Treshold sugerido:", by_bucket.churn_proba.min()[tresh_ind])


In [None]:
# Metrics reported to the workspace.
# NOTE(review): the only metric is the constant 'rand': 123, which looks like
# a placeholder; none of the errors computed during training are reported.
metrics_dict = {
            'rand': 123
        }
    
workspace.save_metrics(metrics_dict)

import pickle
# NOTE(review): what is stored as the "raw model" is a pickled four-row toy
# DataFrame, not `best_model` - confirm whether this is intentional.
df = pd.DataFrame([1,2,4,5],columns = ["hola"])

serialized_dataset = pickle.dumps(df)
workspace.save_raw_model(serialized_dataset)