### Generacion de target para modelos de upsell.
#### Consideraciones:
1. Se considera como target los eventos 108 que impliquen un incremento en el revenue del producto basico.
2. Que el nuevo producto se mantenga por aproximadamente 3 meses.
3. Ya que algunos eventos 108 son posteriores a eventos 100108 (agendas), consideraremos como target los siguientes casos:
- Agendas en el periodo de interes.
- Los upgrades en el periodo de interes que no tuvieron una agenda previa.

Esquema.


              start_date_agendas   start_date          end_date       end_date_upgrade  cancel_date
        --------------|-----------------|-----------------|-----------------|-----------------|------------------

In [12]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a Kedro context so the notebook can use the project's catalog and configuration.
# The notebook directory is hard-coded below; `Path.cwd()` is the alternative when the
# kernel is started from inside the notebooks folder.
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The Kedro project root is the parent of the notebooks directory.
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Read every config file matching the credentials*/parameters* patterns (top level and nested folders).
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")
from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [8]:
# Load the four catalog entries used to build the upsell target.
# Per the log output of this cell, each one is a SQLPartitionedDataSet; the functions
# below query them through `filter_by`.
upgrades_basicos=catalog.load('upgrades_basicos')
eop=catalog.load("eop")
cliente_activo=catalog.load("cliente_activo")
agendas_basicos=catalog.load("agendas_basicos")

2021-03-25 20:55:15,958 - kedro.io.data_catalog - INFO - Loading data from `upgrades_basicos` (SQLPartitionedDataSet)...
2021-03-25 20:55:15,960 - kedro.io.data_catalog - INFO - Loading data from `eop` (SQLPartitionedDataSet)...
2021-03-25 20:55:15,961 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...
2021-03-25 20:55:15,962 - kedro.io.data_catalog - INFO - Loading data from `agendas_basicos` (SQLPartitionedDataSet)...


In [9]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Fetch the active-customer partition for the period preceding ``date``.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml``; it is queried through its ``filter_by`` method
    date:
        period being processed; the partition actually loaded is ``get_previous_month(date)``

    Returns
    -------
    pd.DataFrame
        result of ``cliente_activo.filter_by`` for the previous period
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # Customers are taken from the period before ``date``
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [27]:
def _sql_in_list(values) -> str:
    """Render ``values`` as a parenthesised, comma-separated SQL ``IN`` list.

    Interpolating a Python tuple into the query gives the same text for two or more
    items, but a one-item tuple renders as ``(135,)``, which is not valid SQL.
    ``repr`` is used per item so the text matches what tuple interpolation produced.
    """
    return "(" + ", ".join(repr(value) for value in values) + ")"


def create_target_upsell(upgrades_basicos: SQLPartitionedDataSet,
                         eop: SQLPartitionedDataSet,
                         agendas_basicos: SQLPartitionedDataSet,
                         cliente_activo: pd.DataFrame,
                         parameters: Dict,
                         date: str) -> None:
    """Build the upsell target for Basic products for one period and optionally save it.

    #### Products from PLAN VIGENTE including :
        - DIRECTV ACCES MIX
        - DIRECTV BRONCE SD
        - DIRECTV BRONCE HD ONLY
        - DIRECTV PLATA SD
        - DIRECTV PLATA HD ONLY
        - DIRECTV ORO MIX
        - DIRECTV ORO HD ONLY
    #### Target definition:
        - Existing customer moves to a higher ranked product of the same technology (event 108).
        - No later event for the new product within ``activation_window`` days.
    ---
    ## Target methodology
        1. Stop if ``date`` plus the largest ``*window`` parameter is later than the previous Sunday.
        2. If a file for ``date`` already exists in the target folder and ``overwrite`` is False,
           read it and stop.
        3. Load the customer base (EOP) of the previous period, excluding ORO products.
        4. Load upgrade events (108) between ``date`` and ``date + 2 * calculation_window``.
        5. Load agendas from 28 days before ``date`` up to ``date + calculation_window``.
        6. Load events between ``date`` and ``date + activation_window`` for the cancellation rule.
        7. Apply the rank/technology rule, the cancellation rule and the agenda rules,
           then merge back to EOP and keep only customers present in ``cliente_activo``.

    Parameters
    ----------
    upgrades_basicos:
        dataset defined in ``catalog_raw.yml`` with raw data information related to upgrades of Basic products
    eop:
        dataset defined in ``catalog_raw.yml`` with raw data information related to the client's EOP state
    agendas_basicos:
        dataset queried for agendas; must expose ``CUSTOMER_ID``, ``PRODUCTO_ID`` and the agenda date
    cliente_activo:
        pandas dataframe with active customers for period (only ``CUSTOMER_ID`` is used)
    parameters:
        set of project parameters defined in ``parameters.yml``
    date:
        period to process

    Returns
    -------
    None
        The target is written to parquet when ``parameters["write_to_parquet"]`` is truthy;
        nothing is returned to the caller.
    """

    # Initialize logger
    log = initialize_logger()

    table_name = "target_upsell"
    write_to_parquet = parameters["write_to_parquet"]
    target_parameters = parameters["targets"][table_name]
    overwrite = target_parameters["overwrite"]
    log.info(f"Start the process of create upsell target for {date}")

    # Check if target can be created (date + max window < previous sunday)
    max_window = max(value for key, value in target_parameters.items() if key.endswith("window"))
    upper_bound = (pd.to_datetime(date) + timedelta(days=max_window)).normalize()
    # Read the clock once so both terms of the subtraction refer to the same instant
    today = dt.today()
    previous_sunday = today - timedelta(days=today.weekday() + 1)

    if upper_bound > previous_sunday:
        log.info(f"Cannot create upsell target for {date}: Not enough future information")
        return None

    # Compare with what is already processed
    path = f"{parameters['paths']['target_path']}{table_name}/"
    os.makedirs(path, exist_ok=True)
    match = [file for file in os.listdir(path) if str(date) in file]
    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet.
        # NOTE(review): the loaded frame is neither returned nor used; the read is kept
        # only so an unreadable file still fails here as it did before.
        log.info(f"Reading {match[0]} table")
        pd.read_parquet(path + match[0], engine="pyarrow")
        return None

    product_rank = target_parameters["upsell_products_rank"]
    # The parameter maps product name -> product code; invert it to look names up by code
    product_tecnology = {code: name for name, code in target_parameters["product_and_tecnology"].items()}

    calculation_window = target_parameters["calculation_window"]
    activation_window = target_parameters["activation_window"]

    start_ts = pd.to_datetime(date)
    start_date = date
    end_date = (start_ts + timedelta(days=calculation_window)).strftime("%Y%m%d")
    cancel_date = (start_ts + timedelta(days=activation_window)).strftime("%Y%m%d")
    end_date_upgrades = (start_ts + timedelta(days=2 * calculation_window)).strftime("%Y%m%d")
    start_date_agendas = (start_ts - timedelta(days=28)).strftime("%Y%m%d")

    # Get EoP active clients from previous period to exclude new clients.
    # ORO products are left out of the base of customers that can make an upgrade.
    upgradable_codes = [code for code, name in product_tecnology.items() if "ORO" not in name]
    period_to_load = get_previous_month(start_date)
    df_clientes = eop.filter_by(
        condition=f"PRC_TIPO_ID = 3 AND PRC_CODIGO  IN {_sql_in_list(upgradable_codes)}",
        date=period_to_load)
    log.info(f"EOP shape {df_clientes.shape[0]}")

    # Get the user technology: third word of the product name, with MIX treated as SD
    df_clientes["tecno_eop"] = df_clientes["PRC_CODIGO"].map(product_tecnology)
    df_clientes["tecno_eop"] = [name.split(" ")[2] for name in df_clientes["tecno_eop"]]
    df_clientes["tecno_eop"] = np.where(df_clientes.tecno_eop == "MIX", "SD", df_clientes.tecno_eop)

    # Get Upgrades for target creation (any configured product may be the destination)
    destination_codes = list(product_tecnology)
    df_upgrades = upgrades_basicos.filter_by(
        condition=f"EVENTO_ID = 108 AND PRODUCTO_ID IN {_sql_in_list(destination_codes)}",
        date=[start_date, end_date_upgrades],
        target=True)
    # Technology of the basic product the customer moved to
    df_upgrades["tecno_up"] = [name.split(" ")[2] for name in df_upgrades.PRODUCTO_ID.map(product_tecnology)]
    df_upgrades["tecno_up"] = np.where(df_upgrades.tecno_up == "MIX", "SD", df_upgrades.tecno_up)

    # Get Agendas
    df_agenda = agendas_basicos.filter_by(date=[start_date_agendas, end_date])
    df_agenda.rename(columns={"FECHA": "FECHA_AGENDA"}, inplace=True)

    # Get events used by the cancellation rule
    # NOTE(review): no EVENTO_ID condition is passed here, so this presumably returns every
    # event type the dataset exposes, not only cancellations (133, 142) - confirm.
    df_cancelations = upgrades_basicos.filter_by(date=[start_date, cancel_date], target=True)
    # Keep only the latest event by CUSTOMER, PRODUCT (FECHA ascending, keep last)
    df_cancelations.sort_values(["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"], ascending=[False, False, True],
                                inplace=True)
    df_cancelations.drop_duplicates(subset=["CUSTOMER_ID", "PRODUCTO_ID"], keep="last", inplace=True)

    df_clientes_upgrades = pd.merge(
        df_clientes[["CUSTOMER_ID", "PRC_CODIGO", "tecno_eop"]],
        df_upgrades[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA", "tecno_up"]],
        on=["CUSTOMER_ID"],
        how="inner",
        validate="1:m")

    # Keep the latest upgrade event per customer and initial product
    df_clientes_upgrades.sort_values(["CUSTOMER_ID", "PRC_CODIGO", "FECHA"], ascending=[False, False, True],
                                     inplace=True)
    df_clientes_upgrades.drop_duplicates(subset=["CUSTOMER_ID", "PRC_CODIGO"], keep="last", inplace=True)

    del df_upgrades
    gc.collect()

    # Rank initial product (PRC_CODIGO) from EOP table; each rank maps to a list of codes
    df_product_rank = pd.DataFrame(product_rank.items(),
                                   columns=["PRODUCTO_RANK_INI", "PRC_CODIGO"]).explode("PRC_CODIGO")
    df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades,
                                           df_product_rank,
                                           on="PRC_CODIGO",
                                           how="left",
                                           validate="m:1")

    # Rank last product (PRODUCTO_ID) from plan_evento table
    df_product_rank.rename(columns={"PRC_CODIGO": "PRODUCTO_ID", "PRODUCTO_RANK_INI": "PRODUCTO_RANK_END"},
                           inplace=True)
    df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades_ranked,
                                           df_product_rank,
                                           on="PRODUCTO_ID",
                                           how="left",
                                           validate="m:1")

    # Calculate target based on initial and end product plus technology:
    # the new product must rank higher and keep the same technology
    is_higher_rank = (df_clientes_upgrades_ranked["PRODUCTO_RANK_END"]
                      > df_clientes_upgrades_ranked["PRODUCTO_RANK_INI"])
    is_same_tecnology = df_clientes_upgrades_ranked["tecno_eop"] == df_clientes_upgrades_ranked["tecno_up"]
    df_clientes_upgrades_ranked["TARGET"] = np.where(is_higher_rank & is_same_tecnology, 1, 0)
    log.info(f" Number of events 108 ending as upgrades {df_clientes_upgrades_ranked.TARGET.sum()}")

    # Merge with the later events to check for activation period
    # (FECHA_x is the upgrade date, FECHA_y the latest event date for the new product)
    df_target = pd.merge(df_clientes_upgrades_ranked,
                         df_cancelations[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"]],
                         on=["CUSTOMER_ID", "PRODUCTO_ID"],
                         how="left")

    del df_clientes_upgrades_ranked, df_cancelations
    gc.collect()

    # Compute time difference between events, in days
    df_target["DATE_DIFF"] = (df_target["FECHA_y"] - df_target["FECHA_x"]) / np.timedelta64(1, "D")
    # An event after the upgrade but inside the activation window voids the target
    voided = (df_target["DATE_DIFF"] > 0) & (df_target["DATE_DIFF"] <= activation_window)
    df_target["TARGET"] = np.where(voided, 0, df_target["TARGET"])
    log.info(f" Number of events 108 ending as upgrades after cancelation rule {df_target.TARGET.sum()}")
    df_target = drop_extra_rename_remaining(df_target)

    # Remove duplicates prioritizing upgrades
    df_target.sort_values(["CUSTOMER_ID", "TARGET"], ascending=False, inplace=True)
    df_target.drop_duplicates(subset=["CUSTOMER_ID"], keep="first", inplace=True)

    # Attach agendas of the customer's initial product
    df_final = pd.merge(df_target,
                        df_agenda,
                        left_on=['CUSTOMER_ID', 'PRC_CODIGO'],
                        right_on=['CUSTOMER_ID', 'PRODUCTO_ID'],
                        how='left')

    # Drop upgrades whose agenda is dated before the start of the period
    upgrades_agendados_antes = df_final[df_final.FECHA_AGENDA < start_ts.strftime("%Y%m%d")]
    df_final = df_final.drop(upgrades_agendados_antes.index)
    log.info(f" Number of events 108 ending as upgrades after agenda rule {df_final.TARGET.sum()}")

    # Drop upgrades dated more than 28 days after the start that have no agenda
    upgrades_futuros = df_final[df_final.FECHA > (start_ts + timedelta(days=28)).strftime("%Y%m%d")]
    df_final = df_final.drop(upgrades_futuros[upgrades_futuros.FECHA_AGENDA.isna()].index)
    log.info(f" Number of events 108 ending as upgrades after upgrades future rule {df_final.TARGET.sum()}")

    df_final.sort_values(["CUSTOMER_ID", "TARGET"], ascending=False, inplace=True)
    df_final.drop_duplicates(subset=["CUSTOMER_ID"], keep="first", inplace=True)

    # Merge back to EOP
    df_final = drop_extra_rename_remaining(df_final)

    df_final = pd.merge(df_clientes[["CUSTOMER_ID", "PRC_CODIGO"]],
                        df_final[["CUSTOMER_ID", "TARGET", "FECHA", "PRODUCTO_ID"]],
                        on="CUSTOMER_ID",
                        how="left",
                        validate="1:1")

    # Keep active customers only. Take an explicit copy so the column assignments below
    # modify `target` itself instead of a slice of `df_final`.
    target = df_final.loc[df_final.CUSTOMER_ID.isin(cliente_activo.CUSTOMER_ID.unique())].copy()

    del df_target, df_final
    gc.collect()

    # Customers without a qualifying upgrade come out of the left merge as NaN -> 0
    target["TARGET"] = target["TARGET"].fillna(0).astype(np.int32)
    target["DATE_EXP"] = period_to_load
    target["DATE_CALC"] = date
    # `columns=` is required: a bare mapper renames index labels, not columns
    target.rename(columns={"FECHA": "FECHA_TARGET"}, inplace=True)

    if write_to_parquet:
        file = f"{parameters['paths']['target_path']}{table_name}/{table_name}_{date}.parquet"
        target.to_parquet(file, engine="pyarrow")

    log.info(f"size {target.shape[0]}")
    # Guard the rate against an empty customer base
    total_customers = target["CUSTOMER_ID"].nunique()
    positive_customers = target.loc[target["TARGET"] == 1, "CUSTOMER_ID"].nunique()
    rate = np.round(100 * positive_customers / total_customers, 2) if total_customers else 0.0
    log.info(f"Exporting target for period {start_date} and rate {rate}%")

    return None

In [28]:
# Ask the project helper for the periods to process for the 'target_upsell' table.
# Per the printed output, the result is a list of YYYYMMDD strings.
dates=calculate_dates_to_process_for_target(parameters,'target_upsell')
print(dates)

['20181203', '20181231', '20190128', '20190225', '20190325', '20190422', '20190520', '20190617', '20190715', '20190812', '20190909', '20191007', '20191104', '20191202', '20191230', '20200127', '20200224', '20200323', '20200420', '20200518', '20200615', '20200713', '20200810', '20200907', '20201005', '20201102', '20201130', '20201228', '20210125', '20210222', '20210322', '20210419', '20210517', '20210614', '20210712']


In [29]:
# Build the upsell target one period at a time: first the active customers
# for the period, then the target itself.
for date in dates:
    cliente_activo_df = create_cliente_activo(cliente_activo=cliente_activo, date=date)
    create_target_upsell(
        upgrades_basicos=upgrades_basicos,
        eop=eop,
        agendas_basicos=agendas_basicos,
        cliente_activo=cliente_activo_df,
        parameters=parameters,
        date=date,
    )

2021-03-25 21:03:05,748 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201811
2021-03-25 21:03:08,458 - aa_engine_pkg.assets.utils.utilities - INFO - Start the process of create upsell target for 20181203
select DATE_EXP, CUSTOMER_ID, PRC_CODIGO, PRODUCTO, PRC_TIPO_ID, TEC_ID, MOP, TENURE from stg_uy_eop_customer where DATE_EXP = 201811 and PRC_TIPO_ID = 3 AND PRC_CODIGO  IN (135, 216, 217, 147, 169)
2021-03-25 21:03:09,537 - aa_engine_pkg.assets.utils.utilities - INFO - EOP shape 18942
select * from stg_uy_plan_evento where PROD_CATEGORY_ID = 3 and EVENTO_ID IN (107,108,133,142) and FECHA > to_date('20181203235900', 'yyyymmddhh24miss') and FECHA <= to_date('20190128235900', 'yyyymmddhh24miss') and EVENTO_ID = 108 AND PRODUCTO_ID IN (135, 216, 217, 147, 169, 132, 139)
select CUSTOMER_ID, PRODUCTO_ID, PRODUCTO, TRUNC(FECHA) FECHA_AGENDA from stg_uy_plan_