## Imports

In [1]:
import datetime
import os
import pickle
import pandas as pd
import s3fs

from melitk.analytics.connectors.core.authentication import Authentication
from melitk.analytics.connectors.teradata import ConnTeradata
from melitk.analytics.connectors.presto import ConnPresto
from melitk.fda import workspace
import datetime
from dateutil.relativedelta import relativedelta
from shared.settings import DATASET_FILENAME, SAMPLE

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from lifetimes.utils import summary_data_from_transaction_data
import numpy as np
import datetime
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from lifetimes import ModifiedBetaGeoFitter
from lifetimes import GammaGammaFitter
import dask.dataframe as dd
from dask.multiprocessing import get
from collections import Counter
from sklearn.model_selection import train_test_split
import gc
import joblib
import boto3
from dateutil.relativedelta import relativedelta

# Number of worker partitions/processes: leave one core free, but never go
# below 1 (a single-core host would otherwise yield 0, which is later used as
# a divisor and as dask `npartitions`).
cores = max(1, cpu_count() - 1)

# Key prefix (inside the bucket) under which this notebook stores artefacts.
s3_path = "ltv-ml/ML/MLM/LTV_long/"

In [2]:
def _write_dataframe_to_csv_on_s3(df, path_s3):
    """Write a dataframe to a CSV object on S3 (without the index column).

    Args:
        df: DataFrame to serialise.
        path_s3: Destination of the form ``s3://<bucket>/<key>``.

    Returns:
        None.

    Raises:
        ValueError: if ``path_s3`` does not contain ``//<bucket>/<key>``.
    """
    import boto3
    from io import StringIO

    # Split on the FIRST '//' and then the FIRST '/', so a key that happens to
    # contain the bucket name again is still parsed correctly.
    _, scheme_sep, remainder = path_s3.partition('//')
    bucket, _, key = remainder.partition('/')
    if not scheme_sep or not bucket or not key:
        raise ValueError("expected a path like s3://bucket/key, got %r" % (path_s3,))

    buffer = StringIO()
    df.to_csv(buffer, index=False)
    boto3.resource('s3').Object(bucket, key).put(Body=buffer.getvalue())
    return None

## Levanto la data

In [4]:
def asigno_quintiles_4(x):
    """Build positional bucket labels for the rows of ``x``.

    The first 10% of rows get "q1", the next 10% "q2", the next 20% "q3" and
    every remaining row "q4" (fractions are truncated to whole rows). Only
    ``x.shape[0]`` is read, so the caller is responsible for sorting ``x``.

    Returns:
        list[str] with exactly ``x.shape[0]`` labels.
    """
    total = x.shape[0]
    tenth = int(total * 0.1)
    fifth = int(total * 0.2)

    labels = ["q1"] * tenth + ["q2"] * tenth + ["q3"] * fifth
    # whatever is left over goes to the last bucket
    labels.extend(["q4"] * (total - len(labels)))
    return labels

In [5]:
def aMultiprocesar_IPT(x):
    """Per-customer statistics of the log10 inter-purchase time (IPT), in days.

    Args:
        x: DataFrame with at least a ``cust`` column and a datetime ``date``
           column (one row per transaction).

    Returns:
        DataFrame with columns ``cust, IPT_mean, IPT_std, IPT_max, IPT_min,
        IPT_sum, IPT_CV``; one row per customer that has at least two distinct
        purchase dates. An empty frame with those columns is returned when no
        customer qualifies.
    """
    ipt_columns = ["cust", "IPT_mean", "IPT_std", "IPT_max", "IPT_min", "IPT_sum", "IPT_CV"]

    aux = x.sort_values(by=["cust", "date"])
    # identical (cust, date) pairs would give a zero gap, i.e. log10(0) = -inf
    aux = aux.drop_duplicates(subset=["cust", "date"])
    if aux.shape[0] == 0:
        return pd.DataFrame([], columns=ipt_columns)

    aux = aux[["cust", "date"]].copy()
    # Gap to the previous purchase of the same customer. Using the grouped
    # `diff` keeps the result row-aligned regardless of how many customers the
    # chunk holds (a groupby.apply changes shape for a single group).
    aux["date"] = aux.groupby("cust")["date"].diff()

    # the first purchase of every customer has no gap; drop it before log10
    aux = aux.dropna(subset=["date"])
    if aux.shape[0] == 0:
        return pd.DataFrame([], columns=ipt_columns)

    aux["date"] = np.log10(aux["date"] / np.timedelta64(1, 'D'))

    # list-of-functions aggregation works on every pandas version, unlike the
    # dict-of-renamers form that was removed in pandas 1.0
    stats = aux.groupby("cust")["date"].agg(["mean", "std", "max", "min", "sum"])
    stats.columns = ["IPT_" + name for name in stats.columns]
    stats = stats.reset_index()
    stats["IPT_CV"] = stats["IPT_std"] / stats["IPT_mean"]
    return stats[ipt_columns]

def aMultiprocesar_sales(x):
    """Per-customer monetary statistics of the ``sales`` column.

    Args:
        x: DataFrame with ``cust`` and numeric ``sales`` columns.

    Returns:
        DataFrame with columns ``cust, money_sum, money_CV, money_mean,
        money_std, money_max, money_min`` (one row per customer).
        ``money_CV`` is ``money_std / money_mean``.
    """
    # list-of-functions aggregation + explicit rename: the dict-of-renamers
    # form of SeriesGroupBy.agg was removed in pandas 1.0
    stats = x.groupby("cust")["sales"].agg(["mean", "std", "max", "min", "sum"])
    stats.columns = ["money_" + name for name in stats.columns]
    stats = stats.reset_index()
    stats["money_CV"] = stats["money_std"] / stats["money_mean"]
    return stats[["cust", "money_sum", "money_CV", "money_mean", "money_std", "money_max", "money_min"]]
    
def _split_by_customer(transactions, n_chunks):
    """Split ``transactions`` into roughly ``n_chunks`` frames, never splitting a customer."""
    cust_ids = list(transactions["cust"].unique())
    # step must be >= 1: with fewer customers than chunks the integer division
    # gives 0 and range() would raise
    step = max(1, len(cust_ids) // max(1, n_chunks))
    return [transactions.loc[transactions["cust"].isin(cust_ids[start:start + step])]
            for start in range(0, len(cust_ids), step)]


def _map_in_pool(worker, frames):
    """Apply ``worker`` to every frame in a process pool (with a progress bar)."""
    pool = Pool(cpu_count())
    try:
        return list(tqdm(pool.imap(worker, frames), total=len(frames)))
    finally:
        # release the workers even when a task raised
        pool.close()
        pool.join()


def preprocesing(country_cal):
    """Build the per-customer feature table for the calibration window.

    Steps: lifetimes RFM summary (weekly), a custom recency, a fitted
    ModifiedBetaGeoFitter (expected purchases, probability alive), a fitted
    GammaGammaFitter (expected average sale), and finally the inter-purchase
    time and sales statistics computed in a process pool.

    Args:
        country_cal: transactions with ``cust``, datetime ``date`` and
            ``sales`` columns.

    Returns:
        DataFrame with one row per customer; missing values are filled with 0.
    """

    ########################### Pre-processing for the purchase model ###########################
    print("Preproc para pareto")

    # Original author's note: the first purchase (first purchase week) is
    # dropped and the monetary value is the average grouped by week.
    summary_cal = summary_data_from_transaction_data(
        country_cal, 'cust', 'date', freq="W",
        monetary_value_col='sales',
        observation_period_end=country_cal.date.max())

    ########################### Custom recency ###########################
    print("Recency frequency")
    # list aggregation + rename (dict-of-renamers was removed in pandas 1.0)
    df = country_cal.groupby('cust')["date"].agg(["max", "min"]).reset_index()
    df.columns = ["cust", "date_max", "date_min"]
    # whole weeks between the customer's last purchase and the end of the window
    df["my_recency"] = (country_cal.date.max() - df["date_max"]) / np.timedelta64(1, "W")
    df["my_recency"] = df["my_recency"].astype(np.int64)

    summary_cal = pd.merge(summary_cal, df, on="cust", how="left").fillna(0)

    ########################### Purchase-count model ###########################
    bgf = ModifiedBetaGeoFitter()
    bgf.fit(summary_cal['frequency'], summary_cal['recency'], summary_cal['T'])
    print("fiteo")
    # NOTE(review): the summary is built with freq="W", so 52*7 looks like a
    # horizon of 364 weeks rather than one year -- confirm the intended unit.
    summary_cal["n_purchases_pred"] = dd.from_pandas(summary_cal, npartitions=cores).apply(
        lambda x: bgf.conditional_expected_number_of_purchases_up_to_time(
            52*7, x['frequency'], x['recency'], x['T']),
        axis=1, meta=('float')).compute(scheduler='processes')
    print("npred")

    def apply_pAlive(x):
        # some models return the probability wrapped in an array; unpack it
        y = bgf.conditional_probability_alive(x['frequency'], x['recency'], x['T'])
        if isinstance(y, np.ndarray):
            return y[0]
        return y

    summary_cal["p_alive"] = dd.from_pandas(summary_cal, npartitions=cores).apply(
        apply_pAlive, axis=1, meta=('float')).compute(scheduler='processes')
    print("p_alive")

    ########################### Monetary model ###########################
    # Original author's note: imputing values makes the fit fail to converge,
    # and changing the penalizer did not help. Zero monetary values are
    # therefore replaced by the smallest positive one.
    summary_gg = summary_cal.copy()
    summary_gg.loc[summary_gg["monetary_value"] == 0, "monetary_value"] = min(
        summary_gg.loc[summary_gg["monetary_value"] > 0]["monetary_value"])
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_gg['frequency'] + 1,
            summary_gg['monetary_value'])

    print("fit")

    resu = ggf.conditional_expected_average_profit(
        summary_gg['frequency'],
        summary_gg['monetary_value']).reset_index()

    resu.columns = ["ind", "montary_sales_forecast"]
    summary_cal["montary_sales_forecast"] = resu["montary_sales_forecast"]
    # expected value = predicted number of purchases * predicted average sale
    summary_cal["clv_pareto"] = summary_cal["n_purchases_pred"] * summary_cal["montary_sales_forecast"]
    # NOTE(review): this multiplies by n_purchases_pred a second time; kept
    # as in the original -- confirm it is intended.
    summary_cal["target_pareto"] = summary_cal["n_purchases_pred"] * summary_cal["clv_pareto"]

    ########################### Inter-purchase time ###########################
    print("IPT")
    # one set of customer-aligned chunks serves both pooled computations
    customer_frames = _split_by_customer(country_cal, cores)

    df = pd.concat(_map_in_pool(aMultiprocesar_IPT, customer_frames))
    summary_cal = pd.merge(summary_cal, df, on="cust", how="left").fillna(0)

    ########################### Sales ############################
    print("sales")
    df = pd.concat(_map_in_pool(aMultiprocesar_sales, customer_frames))
    summary_cal = pd.merge(summary_cal, df, on="cust", how="left").fillna(0)

    return summary_cal

In [4]:
def L_porcentage(country_cal,vectorizer,L_num):
    """Share of each customer's purchases that falls in every category.

    Args:
        country_cal: DataFrame with ``cust`` and the category column ``L_num``.
        vectorizer: fitted CountVectorizer whose vocabulary is the set of
            category ids (as strings).
        L_num: name of the category column, e.g. ``"CATEG_L1"``; the part
            after the underscore becomes the prefix of the output columns.

    Returns:
        DataFrame with ``cust`` plus one ``<level>_<category>`` column per
        vocabulary entry, holding the fraction of that customer's rows.
    """
    # drop rows without a category, then turn the ids into the string tokens
    # the vectorizer was fitted on (copy so the caller's frame is not touched)
    country_cal = country_cal.dropna(subset=[L_num]).copy()
    country_cal[L_num] = country_cal[L_num].apply(lambda x: str(int(x)))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back for old installations. The vocabulary does not
    # depend on the group, so compute it once.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    def categories(y):
        dtm = vectorizer.transform(y[L_num].values)  # sparse document-term matrix
        words_freq = np.asarray(dtm.sum(axis=0))
        return pd.DataFrame(words_freq / words_freq.sum(),columns = vocab)

    cust_ids = list(country_cal["cust"].unique())

    # process the customers in 8 batches; step is at least 1 so fewer than 8
    # customers no longer makes range() fail with a zero step
    step = max(1, len(cust_ids)//8)
    l_dfs = [country_cal.loc[country_cal["cust"].isin(cust_ids[start:start + step])]
             for start in range(0, len(cust_ids), step)]

    aux = []
    for batch in l_dfs:
        aux.append(
            dd.from_pandas(batch, npartitions=cores).groupby("cust").apply(
                lambda x:categories(x)).compute(scheduler='processes').reset_index().drop(["level_1"],axis = 1))
        # progress marker written straight to file descriptor 1
        os.write(1,b"\ntermino batch")

    L = L_num.split('_')[1]
    resu = pd.concat(aux)
    resu.columns = [x if(x == "cust") else L + "_"+x for x in resu.columns]
    return resu

In [5]:
def quintil_past(country_qpast):
    """Rank customers by past value and return their bucket as an integer.

    Value is (number of distinct active weeks) * (total sales) over the
    transactions in ``country_qpast``. Customers are sorted by that value,
    labelled with ``asigno_quintiles_4`` and the labels are encoded as
    q1 -> 3, q2 -> 2, q3 -> 1, q4 -> 0.

    Returns:
        DataFrame with columns ``cust`` and ``quintil_past``.
    """
    # collapse every date to its week so a customer counts once per week
    weekly = country_qpast.set_index("date").to_period("W").to_timestamp().reset_index()
    weekly = weekly.drop_duplicates(subset=["date", "cust"])
    freq = weekly.groupby("cust")["sales"].agg(["size"]).reset_index()
    freq.columns = ["cust", "freq_q"]

    print(country_qpast.columns)
    money = country_qpast.groupby("cust")["sales"].sum().reset_index()
    print(money.columns)
    money.columns = ["cust", "sales_q"]

    ranked = pd.merge(freq, money, on="cust", how="outer")
    ranked["clv_q"] = ranked["freq_q"] * ranked["sales_q"]

    # labels are positional, so the frame must be sorted best-first
    ranked = ranked.sort_values(by="clv_q", ascending=False)
    ranked["quintil_past"] = asigno_quintiles_4(ranked)

    label_codes = {"q4": 0, "q3": 1, "q2": 2, "q1": 3}
    ranked = ranked.replace({"quintil_past": label_codes})
    return ranked[["cust", "quintil_past"]]

In [6]:
def month_data_size(summary_1,inic,country):
    """Add ``size_0`` .. ``size_3``: distinct purchase dates per customer in
    each of the four one-month windows preceding ``inic``.

    Window ``k`` covers ``[inic - (k+1) months, inic - k months)``. Customers
    without purchases in a window get 0 (note that ``fillna(0)`` is applied to
    the whole merged frame).
    """
    for lag in range(4):
        upper = inic - relativedelta(months = lag)
        lower = inic - relativedelta(months = (lag+1))
        window = country.loc[(country.date < upper) & (country.date >= lower)]

        unique_days = window.drop_duplicates(subset = ["cust","date"])
        counts = unique_days.groupby("cust").size().reset_index(name = "size")[["cust","size"]]
        counts.columns = ["cust","size_"+str(lag)]

        summary_1 = pd.merge(summary_1,counts,on="cust",how = "left").fillna(0)
    return summary_1

def month_data_sales(summary_1,inic,country):
    """Add ``sales_0`` .. ``sales_3``: total sales per customer in each of the
    four one-month windows preceding ``inic``.

    Window ``i`` covers ``[inic - (i+1) months, inic - i months)``. Customers
    without purchases in a window get 0 (``fillna(0)`` is applied to the whole
    merged frame).
    """
    for i in range(4):
        c_aux = country.loc[(country.date < inic - relativedelta(months = (i))) & (country.date >= inic - relativedelta(months = (i+1)))]
        # Sum only the column that is kept: summing the whole frame also tries
        # to add up the datetime `date` column, which pandas 2.x rejects.
        aux = c_aux.groupby("cust")["sales"].sum().reset_index()
        aux.columns = ["cust","sales_"+str(i)]
        summary_1 = pd.merge(summary_1,aux,on="cust",how = "left").fillna(0)
    return summary_1

def month_data_recency(summary_1,inic,country):
    """Add ``recency_0`` .. ``recency_3``: weeks between a customer's last
    purchase in a monthly window and the last purchase of anyone in it.

    Window ``i`` covers ``[inic - (i+1) months, inic - i months)``. Customers
    without purchases in a window get -1 (``fillna(-1)`` is applied to the
    whole merged frame).
    """
    for i in range(4):
        c_aux = country.loc[(country.date < inic - relativedelta(months = (i))) & (country.date >= inic - relativedelta(months = (i+1)))]
        # plain reduction instead of the dict-of-renamers agg removed in
        # pandas 1.0; only the latest date per customer is needed
        last_purchase = c_aux.groupby('cust')["date"].max()
        # recency expressed in (fractional) weeks
        aux = ((c_aux.date.max() - last_purchase)/np.timedelta64(1, "W")).reset_index()
        aux.columns = ["cust","recency_"+str(i)]
        summary_1 = pd.merge(summary_1,aux,on="cust",how = "left").fillna(-1)
    return summary_1

In [None]:
def get_real_date(actual,expect):
    """Return ``expect`` unless it lies after ``actual``.

    When ``expect`` is later than the calendar day of ``actual``, the same
    month/day in the year before ``actual`` is returned instead.
    """
    today = datetime.date(actual.year, actual.month, actual.day)
    if (today - expect).days >= 0:
        return expect
    # the event has not happened yet this year: fall back to last year's
    return datetime.date(actual.year - 1, expect.month, expect.day)

# keep the hot-sale date that is closest to the pivot
def nearest_date(items,pivot):
    """Return the element of ``items`` closest (in whole days) to ``pivot``.

    ``pivot`` is reduced to its calendar day first; on a tie the earliest
    element in ``items`` wins.
    """
    pivot_day = datetime.date(pivot.year, pivot.month, pivot.day)

    def days_away(candidate):
        return abs((candidate - pivot_day).days)

    return min(items, key=days_away)

def last_hot(summary_1,near_hot,country,window_days=5):
    """Add ``size_last_hot``: distinct purchase dates per customer around a
    hot-sale date.

    Args:
        summary_1: per-customer frame with a ``cust`` column.
        near_hot: centre of the window (``datetime.date``, ``datetime`` or
            ``Timestamp``).
        country: transactions with ``cust`` and datetime ``date`` columns.
        window_days: half-width of the window in days; the window is
            ``[near_hot - window_days, near_hot + window_days)``.

    Returns:
        ``summary_1`` left-merged with the counts; missing values in the whole
        merged frame are filled with 0.
    """
    # Callers pass a plain datetime.date; recent pandas refuses to compare a
    # datetime64 column with it, so normalise to a Timestamp first.
    center = pd.Timestamp(near_hot)
    half_window = pd.Timedelta(days=window_days)

    c_aux = country.loc[(country.date < (center + half_window)) & (country.date >= (center - half_window))]
    aux = c_aux.drop_duplicates(
        subset = ["cust","date"]).groupby("cust").size().reset_index(name = "size")[["cust","size"]]
    aux.columns = ["cust","size_last_hot"]
    summary_1 = pd.merge(summary_1,aux,on="cust",how = "left").fillna(0)

    return summary_1

In [None]:
def month_lag_12(summary_1,inic,country):
    """Add ``size_lag_12``: distinct purchase dates per customer in the
    three-month window ``[inic - 12 months, inic - 9 months)``.

    Prints the first and last transaction date found in the window. Missing
    values in the whole merged frame are filled with 0.
    """
    window_end = inic - relativedelta(months = 12-3)
    window_start = inic - relativedelta(months = (12))
    in_window = (country.date < window_end) & (country.date >= window_start)
    c_aux = country.loc[in_window]
    print(c_aux.date.min(), c_aux.date.max())

    unique_days = c_aux.drop_duplicates(subset = ["cust","date"])
    counts = unique_days.groupby("cust").size().reset_index(name = "size")[["cust","size"]]
    counts.columns = ["cust","size_lag_12"]

    return pd.merge(summary_1,counts,on="cust",how = "left").fillna(0)

In [10]:
# Load the customer -> shipping-state table and normalise the state names:
# lower-case everything, then map the spelling variants below onto one
# canonical (accented) name.
locations = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Location.csv",sep = "|")
locations["SHP_ADD_STATE_NAME_R"] = locations["SHP_ADD_STATE_NAME_R"].str.lower()

_state_aliases = [
    ("distrito federal", "estado de méxico"),
    ("estado de mexico", "estado de méxico"),
    ("ciudad de mexico", "estado de méxico"),
    ("nuevo leon", "nuevo león"),
    ("san luis potosi", "san luis potosí"),
    ("yucatan", "yucatán"),
]
for _alias, _canonical in _state_aliases:
    locations.loc[locations.SHP_ADD_STATE_NAME_R == _alias,"SHP_ADD_STATE_NAME_R"] = _canonical

In [3]:
# Load the two raw transaction extracts and stack them into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
country = pd.concat(
    [pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Raw_Extend_MLM_1_2019-07-20.csv",sep = "|"),
     pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Raw_Extend_MLM_2_2019-07-20.csv",sep = "|")],
    ignore_index = True)
# Alternative small sample kept from the original notebook:
#country = pd.read_csv("s3://fda-labs/ltv-ml/ML/sample_Raw_MLA.csv")[["date","cust","sales","CATEG_L1","CATEG_L2","CATEG_L3"]]
country.columns = ["date","cust","sales","CATEG_L1","CATEG_L2","CATEG_L3"]
# unparseable dates become NaT instead of raising
country["date"] = pd.to_datetime(country["date"],infer_datetime_format=True,errors = "coerce")

In [10]:
#country = country.sample(frac = 0.01)

In [17]:
# Build the TRAIN feature table. The calibration window is the 18 months that
# end 6 months before the last observed transaction date.
end_cal = country.date.max() - relativedelta(months=6)
start_cal = end_cal - relativedelta(months=18)   # 18 months of training

country_cal = country.loc[country['date'] >= start_cal]
country_cal = country_cal.loc[country_cal['date'] <= end_cal]
print(country_cal.date.min(),country_cal.date.max())

# |||||||||| Load frequency, recency, my_recency, etc. - IPT - sales  ||||||||||
summary_cal = preprocesing(country_cal)  # pass all the data so the models fit well

past_end = (end_cal - relativedelta(months = 3))

analisis_users = country.loc[(country['date'] > past_end) & (country['date'] <= end_cal)]["cust"] # the cohort: users with a transaction in the last 3 months
country_cal = country_cal.loc[country_cal.cust.isin(analisis_users.values)] # keep only the transactions of that cohort

summary_cal = summary_cal.loc[summary_cal.cust.isin(analisis_users.values)] # keep only the analysis users

# |||||||||| Load share of purchases per category  ||||||||||
vectorizer = CountVectorizer(token_pattern=r'[0-9].*',                         
                             lowercase=False)

# Fit the count vectorizer and store it on S3
vectorizer.fit([str(int(x)) for x in country_cal["CATEG_L1"].unique() if (x==x)])  # x == x is False for NaN, so NaNs are skipped and every category is captured
joblib.dump(vectorizer, "countVectorizer.sav")
s3 = boto3.client('s3')
s3.upload_file("countVectorizer.sav", "fda-labs", s3_path + "countVectorizer_L1.sav")

summary_cal = pd.merge(summary_cal, L_porcentage(country_cal[["cust","CATEG_L1"]],  # pass only the columns it needs
                                                 vectorizer,"CATEG_L1"),on="cust",how = "left")

# |||||||||| Assign the bucket the user had in the recent past  ||||||||||
past_inic = (end_cal - relativedelta(months = 6))
past_end = (end_cal - relativedelta(months = 3))

country_eval = country.loc[(country['date'] > past_end) & (country['date'] <= end_cal)] # the cohort: users with a transaction in the last 3 months
country_eval = country.loc[(country['date'] > past_inic) & (country['date'] <= end_cal) &
                          country.cust.isin(country_eval.cust.values)] # take what they spent over the last 6 months

summary_cal = pd.merge(summary_cal,
                       quintil_past(country_eval),
                      on = "cust", how = "left").fillna(0) # users missing from the ranking get the last bucket (0)

# |||||||||| Add locations ||||||||||
locations = locations[["CUS_CUST_ID","SHP_ADD_STATE_NAME_R"]]
locations.columns = ["cust","SHP_ADD_STATE_NAME_R"]
summary_cal = pd.merge(summary_cal,
                       locations,
                      on = "cust", how = "left").fillna("estado de méxico") # users without a known state default to "estado de méxico"

locations_ = locations.SHP_ADD_STATE_NAME_R.value_counts()[:31].index  # the 31 most frequent state names
# one-hot encode the state; "tlaxcala" is dropped from the dummy columns
aux = pd.get_dummies(summary_cal.SHP_ADD_STATE_NAME_R).drop(["tlaxcala"],axis = 1)
# add any state column that is missing from this sample
for loc in locations_:
    if(loc not in aux.columns):
        aux[loc] = 0

summary_cal = pd.concat([summary_cal,aux],axis = 1).drop(["SHP_ADD_STATE_NAME_R"],axis = 1)

# |||||||||| Add monthly data for the last four months ||||||||||
summary_cal = month_data_size(summary_cal,end_cal,country_cal)
summary_cal = month_data_sales(summary_cal,end_cal,country_cal)
summary_cal = month_data_recency(summary_cal,end_cal,country_cal)

# |||||||||| Add visit data ||||||||||
visitas = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Visitas_train.csv",sep = "|")
# both date columns are turned into "days before end_cal"
visitas["recency_date"] = pd.to_datetime(visitas["recency_date"],infer_datetime_format=True,errors = "coerce")
visitas["recency_date"] = (end_cal  - visitas["recency_date"])/np.timedelta64(1, "D")
visitas["first_date"] = pd.to_datetime(visitas["first_date"],infer_datetime_format=True,errors = "coerce")
visitas["first_date"] = (end_cal  - visitas["first_date"])/np.timedelta64(1, "D")
cols = cols = [x for x in visitas.columns if("L1" in x)]
visitas["L1_tot"] = visitas[cols].sum(axis = 1)

summary_cal = pd.merge(summary_cal,visitas,on="cust",how="left")
# users without visit data get -1 for the two day counts and 0 elsewhere
summary_cal["recency_date"] = summary_cal["recency_date"].fillna(-1)
summary_cal["first_date"] = summary_cal["first_date"].fillna(-1)
summary_cal = summary_cal.fillna(0)

# add the second visits file (same processing, columns suffixed with "_2")
visitas = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Visitas_train2.csv",sep = "|")
visitas["recency_date"] = pd.to_datetime(visitas["recency_date"],infer_datetime_format=True,errors = "coerce")
visitas["recency_date"] = (end_cal  - visitas["recency_date"])/np.timedelta64(1, "D")
visitas["first_date"] = pd.to_datetime(visitas["first_date"],infer_datetime_format=True,errors = "coerce")
visitas["first_date"] = (end_cal  - visitas["first_date"])/np.timedelta64(1, "D")
cols = cols = [x for x in visitas.columns if("L1" in x)]
visitas["L1_tot"] = visitas[cols].sum(axis = 1)

visitas.columns = [x+"_2" if(x != "cust") else x for x in visitas.columns]

summary_cal = pd.merge(summary_cal,visitas,on="cust",how="left")
summary_cal["recency_date_2"] = summary_cal["recency_date_2"].fillna(-1)
summary_cal["first_date_2"] = summary_cal["first_date_2"].fillna(-1)
summary_cal = summary_cal.fillna(0)

# |||||||||| Add purchase-behaviour data ||||||||||
compras = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Tipos_compras_train.csv",sep = "|")
compras_cols = ["CANTIDAD_COMPRAS_FREE_SHIPPING", "CANTIDAD_COMPRAS_NO_ENVIOS","CANTIDAD_COMPRAS_NO_CARRITO",
"CANTIDAD_COMPRAS_ENVIO_FULLFILMENT","CANTIDAD_COMPRAS_ENVIO_DROP_OFF","CANTIDAD_COMPRAS_ENVIO_EXPRESS",
"CANTIDAD_COMPRAS_ENVIO_EXPRESS_DOMICILIO","CANTIDAD_COMPRAS_ENVIO_EXPRESS_RETIRO_SUCURSAL","CANTIDAD_COMPRAS_ITEMS_NUEVOS"]

compras = compras[["CUS_CUST_ID"]+compras_cols]
compras.columns = ["cust"]+ compras_cols
summary_cal = pd.merge(summary_cal,compras,on="cust",how="left").fillna(0)

# |||||||||| Add customer-experience claims data ||||||||||
claims = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Claims_train.csv",sep = "|")
claims = claims.groupby("CUS_CUST_ID").sum().reset_index()

claims.columns = ["cust" if(x == "CUS_CUST_ID") else x for x in claims.columns]
summary_cal = pd.merge(summary_cal,
                       claims,
                      on = "cust", how = "left").fillna(0) # users without claims get 0

# |||||||||| 12-month lag ||||||||||
summary_cal = month_lag_12(summary_cal,end_cal,country)

# |||||||||| Hot sale info ||||||||||
# candidate dates are May 28, Jul 16 and Dec 20; the one nearest to end_cal is used
near_hot = nearest_date([get_real_date(end_cal,datetime.date(end_cal.year,5,28)),
              get_real_date(end_cal,datetime.date(end_cal.year,7,16)),
              get_real_date(end_cal,datetime.date(end_cal.year,12,20))],end_cal)

summary_cal = last_hot(summary_cal,near_hot,country)

# |||||||||| Weekly data over the last 12 weeks ||||||||||
# size_<i>_weekly = distinct purchase dates in the week ending i weeks before end_cal
for i in range(12):
    print("alive")
    fin = end_cal - relativedelta(weeks = i)
    study = country_eval.loc[(country_eval.date <= fin) & (country_eval.date >= (fin - relativedelta(weeks = 1)))]
    study = study.drop_duplicates(subset = ["cust","date"]).groupby("cust").size().reset_index(name = "size_"+str(i)+"_weekly")
    summary_cal = pd.merge(summary_cal,study,on="cust",how = "left").fillna(0)

In [18]:
# Target: spend and activity in the 3 months that follow the calibration window
end_eval = end_cal + relativedelta(months=3)

country_eval = country.loc[(country['date'] > end_cal) & (country['date'] <= end_eval)]

# total sales per customer in the evaluation window
aux_plata = country_eval[["cust","sales"]].groupby("cust")["sales"].agg(["sum"]).reset_index()
# number of distinct weeks with a purchase per customer
aux_transac = country_eval.set_index("date").to_period("W").to_timestamp().reset_index().drop_duplicates(
    subset = ["date","cust"]).groupby("cust")["sales"].agg(["size"]).reset_index()

aux_plata.columns = ["cust","money_eval"]
aux_transac.columns = ["cust","frequency_eval"]

aux_plata = pd.merge(aux_plata,aux_transac,on="cust",how = "outer").fillna(0)
aux_plata["engage_ltv"] = aux_plata["money_eval"]*aux_plata["frequency_eval"]

# customers with no purchase in the window get 0; labels are positional, so
# the frame is sorted by engage_ltv (highest first) before labelling
summary_cal = pd.merge(summary_cal,aux_plata,on="cust",how = "left").fillna(0)
summary_cal = summary_cal.sort_values(by = "engage_ltv",ascending = False)
summary_cal["quintil_true"] = asigno_quintiles_4(summary_cal)

In [23]:
# Persist the train table locally as a pickle and upload it to the fda-labs bucket
summary_cal.to_pickle("cal.pkl")
s3 = boto3.client('s3')
s3.upload_file("cal.pkl", "fda-labs", s3_path+"summary_train.pkl")

In [24]:
# Stratified 10% sample: draw 10% of the rows of every quintil_true label
# (no random seed is set, so the sample differs between runs)
l_1 = []
for q in summary_cal.quintil_true.unique():
    l_1.append(summary_cal.loc[summary_cal.quintil_true == q].sample(frac=0.1))

# store the sample as CSV next to the pickled train table
_write_dataframe_to_csv_on_s3(pd.concat(l_1),"s3://fda-labs/"+s3_path+"sample_summary_train.csv")

In [16]:
# Now the same for the test set!

In [21]:
# (year 0 ~ year 1.5) -- (year 1.5 ~ year 1.8)   : train -- test
#                      (year 0 ~ year 1.5) -- (year 1.5 ~ year 1.8)   : same for the final test, but with 1 year of overlap

# Build the TEST feature table. The calibration window is the 18 months that
# end 3 months before the last observed transaction date.
end_cal = country.date.max() - relativedelta(months=3)  # original note: "keep 1 year for training"
start_cal = end_cal - relativedelta(months=18)

country_cal = country.loc[country['date'] >= start_cal]
country_cal = country_cal.loc[country_cal['date'] <= end_cal]

print(country_cal.date.min(),country_cal.date.max())

# |||||||||| Load frequency, recency, my_recency, etc. - IPT - sales  ||||||||||
summary_cal = preprocesing(country_cal)

past_end = (end_cal - relativedelta(months = 3))

analisis_users = country.loc[(country['date'] > past_end) & (country['date'] <= end_cal)]["cust"] # the cohort: users with a transaction in the last 3 months
country_cal = country_cal.loc[country_cal.cust.isin(analisis_users.values)] # keep only the transactions of that cohort

summary_cal = summary_cal.loc[summary_cal.cust.isin(analisis_users.values)] # keep only the analysis users

# |||||||||| Load share of purchases per category  ||||||||||
# reuses the `vectorizer` fitted in the train cell
summary_cal = pd.merge(summary_cal, L_porcentage(country_cal[["cust","CATEG_L1"]],  # pass only the columns it needs
                                                 vectorizer,"CATEG_L1"),on="cust",how = "left")

# |||||||||| Assign the bucket the user had in the recent past  ||||||||||
past_inic = (end_cal - relativedelta(months = 6))
past_end = (end_cal - relativedelta(months = 3))

country_eval = country.loc[(country['date'] > past_end) & (country['date'] <= end_cal)] # the cohort: users with a transaction in the last 3 months
country_eval = country.loc[(country['date'] > past_inic) & (country['date'] <= end_cal) &
                          country.cust.isin(country_eval.cust.values)] # take what they spent over the last 6 months

summary_cal = pd.merge(summary_cal,
                       quintil_past(country_eval),
                      on = "cust", how = "left").fillna(0) # users missing from the ranking get the last bucket (0)

# |||||||||| Add locations ||||||||||

summary_cal = pd.merge(summary_cal,
                       locations,    # columns were already renamed in the train cell
                       on="cust", how = "left").fillna("estado de méxico") # users without a known state default to "estado de méxico"

# one-hot encode the state; "tlaxcala" is dropped from the dummy columns
aux = pd.get_dummies(summary_cal.SHP_ADD_STATE_NAME_R).drop(["tlaxcala"],axis = 1)
# add any state column that is missing from this sample (`locations_` comes from the train cell)
for loc in locations_:
    if(loc not in aux.columns):
        aux[loc] = 0

summary_cal = pd.concat([summary_cal,aux],axis = 1).drop(["SHP_ADD_STATE_NAME_R"],axis = 1)

# |||||||||| Add monthly data for the last four months ||||||||||
summary_cal = month_data_size(summary_cal,end_cal,country_cal)
summary_cal = month_data_sales(summary_cal,end_cal,country_cal)
summary_cal = month_data_recency(summary_cal,end_cal,country_cal)

# |||||||||| Add visit data ||||||||||
visitas = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Visitas_test.csv",sep = "|")
# both date columns are turned into "days before end_cal"
visitas["recency_date"] = pd.to_datetime(visitas["recency_date"],infer_datetime_format=True,errors = "coerce")
visitas["recency_date"] = (end_cal  - visitas["recency_date"])/np.timedelta64(1, "D")
visitas["first_date"] = pd.to_datetime(visitas["first_date"],infer_datetime_format=True,errors = "coerce")
visitas["first_date"] = (end_cal  - visitas["first_date"])/np.timedelta64(1, "D")
cols = cols = [x for x in visitas.columns if("L1" in x)]
visitas["L1_tot"] = visitas[cols].sum(axis = 1)

summary_cal = pd.merge(summary_cal,visitas,on="cust",how="left")
# users without visit data get -1 for the two day counts and 0 elsewhere
summary_cal["recency_date"] = summary_cal["recency_date"].fillna(-1)
summary_cal["first_date"] = summary_cal["first_date"].fillna(-1)
summary_cal = summary_cal.fillna(0)

# add the second visits file (same processing, columns suffixed with "_2")
visitas = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Visitas_test2.csv",sep = "|")
visitas["recency_date"] = pd.to_datetime(visitas["recency_date"],infer_datetime_format=True,errors = "coerce")
visitas["recency_date"] = (end_cal  - visitas["recency_date"])/np.timedelta64(1, "D")
visitas["first_date"] = pd.to_datetime(visitas["first_date"],infer_datetime_format=True,errors = "coerce")
visitas["first_date"] = (end_cal  - visitas["first_date"])/np.timedelta64(1, "D")
cols = cols = [x for x in visitas.columns if("L1" in x)]
visitas["L1_tot"] = visitas[cols].sum(axis = 1)

visitas.columns = [x+"_2" if(x != "cust") else x for x in visitas.columns]

summary_cal = pd.merge(summary_cal,visitas,on="cust",how="left")
summary_cal["recency_date_2"] = summary_cal["recency_date_2"].fillna(-1)
summary_cal["first_date_2"] = summary_cal["first_date_2"].fillna(-1)
summary_cal = summary_cal.fillna(0)

# |||||||||| Add purchase-behaviour data ||||||||||
compras = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Tipos_compras_test.csv",sep = "|")
compras_cols = ["CANTIDAD_COMPRAS_FREE_SHIPPING", "CANTIDAD_COMPRAS_NO_ENVIOS","CANTIDAD_COMPRAS_NO_CARRITO",
"CANTIDAD_COMPRAS_ENVIO_FULLFILMENT","CANTIDAD_COMPRAS_ENVIO_DROP_OFF","CANTIDAD_COMPRAS_ENVIO_EXPRESS",
"CANTIDAD_COMPRAS_ENVIO_EXPRESS_DOMICILIO","CANTIDAD_COMPRAS_ENVIO_EXPRESS_RETIRO_SUCURSAL","CANTIDAD_COMPRAS_ITEMS_NUEVOS"]

compras = compras[["CUS_CUST_ID"]+compras_cols]
compras.columns = ["cust"]+ compras_cols
summary_cal = pd.merge(summary_cal,compras,on="cust",how="left").fillna(0)

# |||||||||| Add customer-experience claims data ||||||||||
claims = pd.read_csv("s3://fda-labs/ltv-ml/ML/MLM/Datasets/Claims_test.csv",sep = "|")
claims = claims.groupby("CUS_CUST_ID").sum().reset_index()

claims.columns = ["cust" if(x == "CUS_CUST_ID") else x for x in claims.columns]
summary_cal = pd.merge(summary_cal,
                       claims,
                      on = "cust", how = "left").fillna(0) # users without claims get 0

# |||||||||| 12-month lag ||||||||||
summary_cal = month_lag_12(summary_cal,end_cal,country)

# |||||||||| Hot sale info ||||||||||
# candidate dates are May 28, Jul 16 and Dec 20; the one nearest to end_cal is used
near_hot = nearest_date([get_real_date(end_cal,datetime.date(end_cal.year,5,28)),
              get_real_date(end_cal,datetime.date(end_cal.year,7,16)),
              get_real_date(end_cal,datetime.date(end_cal.year,12,20))],end_cal)

summary_cal = last_hot(summary_cal,near_hot,country)

# |||||||||| Weekly data over the last 12 weeks ||||||||||
# size_<i>_weekly = distinct purchase dates in the week ending i weeks before end_cal
for i in range(12):
    print("alive")
    fin = end_cal - relativedelta(weeks = i)
    study = country_eval.loc[(country_eval.date <= fin) & (country_eval.date >= (fin - relativedelta(weeks = 1)))]
    study = study.drop_duplicates(subset = ["cust","date"]).groupby("cust").size().reset_index(name = "size_"+str(i)+"_weekly")
    summary_cal = pd.merge(summary_cal,study,on="cust",how = "left").fillna(0)

2017-10-20 00:00:00 2019-04-20 00:00:00
Preproc para pareto
Recency frequency
fiteo
npred
p_alive
fit
IPT


100%|██████████| 40/40 [04:35<00:00,  6.89s/it]  


sales


100%|██████████| 40/40 [00:04<00:00,  8.43it/s]
  if sys.path[0] == '':
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  if sys.path[0] == '':
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  if sys.path[0] == '':
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  if sys.path[0] == '':
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  if sys.path[0] == '':
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  

Index(['date', 'cust', 'sales', 'CATEG_L1', 'CATEG_L2', 'CATEG_L3'], dtype='object')
Index(['cust', 'sales'], dtype='object')


In [22]:
# Target: spend and activity in the 3 months that follow the calibration window
end_eval = end_cal + relativedelta(months=3)

country_eval = country.loc[(country['date'] > end_cal) & (country['date'] <= end_eval)]

# total sales per customer in the evaluation window
aux_plata = country_eval[["cust","sales"]].groupby("cust")["sales"].agg(["sum"]).reset_index()
# number of distinct weeks with a purchase per customer
aux_transac = country_eval.set_index("date").to_period("W").to_timestamp().reset_index().drop_duplicates(
    subset = ["date","cust"]).groupby("cust")["sales"].agg(["size"]).reset_index()

aux_plata.columns = ["cust","money_eval"]
aux_transac.columns = ["cust","frequency_eval"]

aux_plata = pd.merge(aux_plata,aux_transac,on="cust",how = "outer").fillna(0)
aux_plata["engage_ltv"] = aux_plata["money_eval"]*aux_plata["frequency_eval"]

# customers with no purchase in the window get 0; labels are positional, so
# the frame is sorted by engage_ltv (highest first) before labelling
summary_cal = pd.merge(summary_cal,aux_plata,on="cust",how = "left").fillna(0)
summary_cal = summary_cal.sort_values(by = "engage_ltv",ascending = False)
summary_cal["quintil_true"] = asigno_quintiles_4(summary_cal)

In [12]:
# Persist the test table locally as a pickle and upload it to the fda-labs bucket
# (reuses the same local scratch file name as the train table)
summary_cal.to_pickle("cal.pkl")
s3 = boto3.client('s3')
s3.upload_file("cal.pkl", "fda-labs", s3_path+"summary_test.pkl")

In [5]:
def read_pickle_csv(path):
    """Download a pickled object from S3 and load it with pandas.

    Despite the name, the object is read with ``pd.read_pickle`` (not as CSV).

    Args:
        path: location of the form ``s3://<bucket>/<key>``.

    Returns:
        Whatever object the pickle contains (a DataFrame in this notebook).
    """
    import os
    import tempfile

    import boto3
    import pandas as pd

    parts = path.split('/')
    bucket = parts[2]
    resto = '/'.join(parts[3:])

    # A unique scratch file avoids clobbering (or racing on) a fixed
    # "aux.pkl" in the working directory.
    fd, local_path = tempfile.mkstemp(suffix=".pkl")
    os.close(fd)
    try:
        boto3.client('s3').download_file(bucket, resto, local_path)
        # NOTE: unpickling can execute arbitrary code; only use this on
        # objects written by trusted jobs.
        return pd.read_pickle(local_path)
    finally:
        # remove the scratch file even when the download or the load fails
        os.remove(local_path)

In [6]:
# Reload the test table that was uploaded as summary_test.pkl
summary_cal = read_pickle_csv("s3://fda-labs/"+s3_path+"summary_test.pkl")

In [8]:
# Transactions of the 12 weeks that end at end_cal (both bounds inclusive).
# NOTE(review): the cell counters (In [8] here, In [7] on the next cell) show
# this ran AFTER end_cal was reassigned there -- run order differs from file order.
country_eval = country.loc[(country.date <= end_cal) & (country.date >= (end_cal - relativedelta(weeks = 12)))]

In [7]:
# Recover end_cal from the reloaded table: the latest `date_max` over all customers
end_cal = summary_cal.date_max.max()

In [9]:
# Weekly activity over the last 12 weeks for the reloaded table:
# size_<i>_weekly = distinct purchase dates in the week ending i weeks before end_cal.
for i in range(12):
    print("alive")
    weekly_col = "size_"+str(i)+"_weekly"
    if weekly_col in summary_cal.columns:
        # The column is already there (e.g. the pickle was written after the
        # weekly loop ran); merging again would create "_x"/"_y" duplicates.
        continue
    fin = end_cal - relativedelta(weeks = i)
    study = country_eval.loc[(country_eval.date <= fin) & (country_eval.date >= (fin - relativedelta(weeks = 1)))]
    study = study.drop_duplicates(subset = ["cust","date"]).groupby("cust").size().reset_index(name = weekly_col)
    summary_cal = pd.merge(summary_cal,study,on="cust",how = "left").fillna(0)
    

alive
alive
alive
alive
alive
alive
alive
alive
alive
alive
alive
alive


In [10]:
# Preview: first 10 rows of the columns from position 180 onwards
summary_cal[summary_cal.columns[180:]].head(10)

Unnamed: 0,FLAG_Claim_other,FLAG_CLAIM_CLOSED,TIEMPO_RESOLUCION,size_lag_12,size_last_hot,size_0_weekly,size_1_weekly,size_2_weekly,size_3_weekly,size_4_weekly,size_5_weekly,size_6_weekly,size_7_weekly,size_8_weekly,size_9_weekly,size_10_weekly,size_11_weekly
0,0.0,1.0,0.0,0.0,0.0,5.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,72.0,3.0,6.0,7.0,8.0,7.0,6.0,7.0,7.0,4.0,7.0,5.0,5.0,6.0
2,0.0,32.0,123.0,3.0,9.0,7.0,7.0,6.0,7.0,8.0,7.0,7.0,6.0,5.0,6.0,7.0,5.0
3,0.0,0.0,0.0,0.0,0.0,7.0,6.0,3.0,2.0,4.0,3.0,2.0,3.0,0.0,2.0,3.0,5.0
4,0.0,0.0,0.0,3.0,5.0,4.0,5.0,1.0,4.0,6.0,5.0,1.0,0.0,0.0,3.0,5.0,5.0
5,0.0,1.0,0.0,10.0,0.0,1.0,5.0,3.0,3.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0,1.0
6,0.0,2.0,17.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,4.0,3.0,1.0,1.0,0.0,0.0,0.0
7,0.0,2.0,6.0,1.0,3.0,3.0,3.0,6.0,4.0,7.0,1.0,5.0,1.0,2.0,4.0,1.0,1.0
8,0.0,2.0,71.0,12.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0
9,0.0,0.0,0.0,12.0,6.0,0.0,2.0,2.0,1.0,1.0,3.0,2.0,1.0,0.0,2.0,3.0,1.0


In [11]:
# Display the calibration end date currently in use
end_cal

Timestamp('2019-04-20 00:00:00')