In [36]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook
#current_dir = Path.cwd()
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks_uy/")
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")
from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [37]:
date='20200516'

In [38]:
upgrades_basicos=catalog.load('upgrades_basicos')
eop=catalog.load("eop")
cliente_activo=catalog.load("cliente_activo")

2021-01-12 13:12:31,985 - kedro.io.data_catalog - INFO - Loading data from `upgrades_basicos` (SQLPartitionedDataSet)...
2021-01-12 13:12:31,986 - kedro.io.data_catalog - INFO - Loading data from `eop` (SQLPartitionedDataSet)...
2021-01-12 13:12:31,987 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [39]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Creates master table with features related to EoP state of customers for one period of data

    Parameters
    ----------
    cliente_activo:
        dataset defined in ´catalog.yml´ - list of active customers at EoP for the given period
    date:
        period to process
    
    Returns
    -------
        Mastertable with information of clientes at EoP
    """

    # Initialize logger
    log = initialize_logger()

    # Load active clientes for period
    log.info(f"Creating cliente_activo...")
    period_to_load = get_previous_month(date)
    df_clientes_activos = cliente_activo.filter_by(date=period_to_load)

    # Return
    return df_clientes_activos

In [40]:
cliente_activo_df= create_cliente_activo(cliente_activo,date)

2021-01-12 13:12:33,482 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 202004


In [41]:
# Initialize logger
log = initialize_logger()

table_name = "target_upsell"
write_to_parquet = parameters["write_to_parquet"]
overwrite = parameters["targets"][table_name]["overwrite"]
end_date = str(parameters["end_date"])
log.info(f"Start the process of create upsell target for {date}")

2021-01-12 13:12:37,735 - aa_engine_pkg.assets.utils.utilities - INFO - Start the process of create upsell target for 20200516


In [42]:
# Check if target can be created (date + max window < previous sunday)
target_parameters = parameters["targets"][table_name]
max_window = max([target_parameters[x] for x in target_parameters.keys() if x.endswith("window")])
upper_bound = (pd.to_datetime(date) + timedelta(days=max_window)).strftime("%Y%m%d")
previous_sunday = dt.today() - timedelta(days=dt.today().weekday() + 1)

In [43]:
product_rank = parameters["targets"]["target_upsell"]["upsell_products_rank"]
products_allowed_to_move=parameters["targets"]["target_upsell"]["products_allowed_to_move"]
product_tecnology=parameters["targets"]["target_upsell"]["product_and_tecnology"]
product_tecnology = {value : key for (key, value) in product_tecnology.items()}

start_date = date
end_date = (pd.to_datetime(date) + timedelta(days=parameters["targets"]["target_upsell"]["calculation_window"])).strftime("%Y%m%d")
cancel_date = (pd.to_datetime(date) + timedelta(days=parameters["targets"]["target_upsell"]["activation_window"])).strftime("%Y%m%d")

# Get EoP active clients from previous period to exclude new clients
period_to_load = get_previous_month(start_date)
df_clientes = eop.filter_by(condition=f"PRC_TIPO_ID = 3 AND PRC_CODIGO  IN {tuple(products_allowed_to_move)}",
                             #base of customers that can made an upgrade
                             date=period_to_load)


select DATE_EXP, CUSTOMER_ID, PRC_CODIGO, PRODUCTO, PRC_TIPO_ID, TEC_ID, MOP, TENURE from stg_uy_eop_customer where DATE_EXP = 202004 and PRC_TIPO_ID = 3 AND PRC_CODIGO  IN (216, 217, 147, 169, 135, 132)


In [52]:
df_clientes["tecno_eop"]=df_clientes["PRC_CODIGO"].map(product_tecnology)
df_clientes.head()

Unnamed: 0,DATE_EXP,CUSTOMER_ID,PRC_CODIGO,PRODUCTO,PRC_TIPO_ID,TEC_ID,MOP,TENURE,tecno_eop
0,202004,21332,132,ORO,3,10,Invoice,222,DIRECTV ORO MIX
1,202004,21350,132,ORO,3,10,Tarjeta de Credito,222,DIRECTV ORO MIX
2,202004,21717,132,ORO,3,10,Tarjeta de Credito,221,DIRECTV ORO MIX
3,202004,21780,135,BRONCE,3,10,Invoice,221,DIRECTV ACCESS MIX
4,202004,8069,132,ORO,3,20,Tarjeta de Credito,223,DIRECTV ORO MIX


In [53]:
# Get the user tecnology
df_clientes["tecno_eop"]=df_clientes["PRC_CODIGO"].map(product_tecnology)
df_clientes["tecno_eop"]=[y.split(" ")[2] for x,y in enumerate(df_clientes["tecno_eop"])]

In [54]:
df_clientes.head()

Unnamed: 0,DATE_EXP,CUSTOMER_ID,PRC_CODIGO,PRODUCTO,PRC_TIPO_ID,TEC_ID,MOP,TENURE,tecno_eop
0,202004,21332,132,ORO,3,10,Invoice,222,MIX
1,202004,21350,132,ORO,3,10,Tarjeta de Credito,222,MIX
2,202004,21717,132,ORO,3,10,Tarjeta de Credito,221,MIX
3,202004,21780,135,BRONCE,3,10,Invoice,221,MIX
4,202004,8069,132,ORO,3,20,Tarjeta de Credito,223,MIX


In [55]:
df_clientes.tecno_eop.unique()

array(['MIX', 'SD', 'HD'], dtype=object)

In [68]:
df_cancelations = upgrades_basicos.filter_by(date=[start_date,
                                                   cancel_date],
                                             target=True)

df_clientes_upgrades = pd.merge(
    df_clientes[["CUSTOMER_ID", "PRC_CODIGO","tecno_eop"]],
    df_upgrades[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA","tecno_up"]],
    on=["CUSTOMER_ID"],
    how="inner",
    validate="1:m")
del df_upgrades;
gc.collect()

df_clientes_upgrades.sort_values(["CUSTOMER_ID", "PRC_CODIGO", "FECHA"], ascending=[False, False, True],
                                 inplace=True)
df_clientes_upgrades.drop_duplicates(subset=["CUSTOMER_ID", "PRC_CODIGO"], keep="last", inplace=True)

df_product_rank = pd.DataFrame(product_rank.items(), columns=["PRODUCTO_RANK_INI", "PRC_CODIGO"])
df_product_rank = df_product_rank.explode("PRC_CODIGO")


select * from stg_uy_plan_evento where PROD_CATEGORY_ID = 3 and EVENTO_ID IN (107,108,133,142) and FECHA > to_date('20200516235900', 'yyyymmddhh24miss') and FECHA <= to_date('20200911235900', 'yyyymmddhh24miss')


In [69]:
# Rank initial product (PRC_CODIGO) from EOP table
df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades,
                                       df_product_rank,
                                       on="PRC_CODIGO",
                                       how="left",
                                       validate="m:1")

del df_clientes_upgrades
gc.collect()

125

In [70]:
# Rank last product (PRODUCTO_ID) from plan_evento table
df_product_rank.rename(columns={"PRC_CODIGO": "PRODUCTO_ID",
                                "PRODUCTO_RANK_INI": "PRODUCTO_RANK_END"}, inplace=True)
df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades_ranked,
                                       df_product_rank,
                                       on="PRODUCTO_ID",
                                       how="left",
                                       validate="m:1")
df_clientes_upgrades_ranked.head()

Unnamed: 0,CUSTOMER_ID,PRC_CODIGO,tecno_eop,PRODUCTO_ID,FECHA,tecno_up,PRODUCTO_RANK_INI,PRODUCTO_RANK_END
0,55722003,169,HD,139,2020-06-01 15:26:45,HD,4,7
1,55713629,217,HD,217,2020-05-25 18:09:46,HD,2,2
2,55685312,169,HD,169,2020-05-21 14:13:32,HD,4,4
3,55683634,217,HD,147,2020-06-04 17:45:58,SD,2,3
4,55459215,169,HD,135,2020-06-01 00:07:09,MIX,4,5


In [73]:
#Solving the situation of MIX, this tecnology is SD
df_clientes_upgrades_ranked["tecno_eop"]=np.where(df_clientes_upgrades_ranked.tecno_eop.isin(["MIX"]),"SD",df_clientes_upgrades_ranked.tecno_eop)
df_clientes_upgrades_ranked["tecno_up"]=np.where(df_clientes_upgrades_ranked.tecno_up.isin(["MIX"]),"SD",df_clientes_upgrades_ranked.tecno_up)

In [75]:
# Calculate target based on initial and end product plus tecnology
df_clientes_upgrades_ranked["TARGET"] = np.where((df_clientes_upgrades_ranked["PRODUCTO_RANK_END"] > \
                                                 df_clientes_upgrades_ranked["PRODUCTO_RANK_INI"]) & ((df_clientes_upgrades_ranked["tecno_eop"] == \
                                                                                                       df_clientes_upgrades_ranked["tecno_up"])), 1, 0)

In [76]:
df_clientes_upgrades_ranked.head()

Unnamed: 0,CUSTOMER_ID,PRC_CODIGO,tecno_eop,PRODUCTO_ID,FECHA,tecno_up,PRODUCTO_RANK_INI,PRODUCTO_RANK_END,TARGET
0,55722003,169,HD,139,2020-06-01 15:26:45,HD,4,7,1
1,55713629,217,HD,217,2020-05-25 18:09:46,HD,2,2,0
2,55685312,169,HD,169,2020-05-21 14:13:32,HD,4,4,0
3,55683634,217,HD,147,2020-06-04 17:45:58,SD,2,3,0
4,55459215,169,HD,135,2020-06-01 00:07:09,SD,4,5,0


In [77]:
# Keep only first move by CUSTOMER, PRODUCT
df_cancelations.sort_values(["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"], ascending=[False, False, True], inplace=True)
df_cancelations.drop_duplicates(subset=["CUSTOMER_ID", "PRODUCTO_ID"], keep="last", inplace=True)

# Merge with target df to check for activation period
df_target = pd.merge(df_clientes_upgrades_ranked,
                     df_cancelations[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"]],
                     on=["CUSTOMER_ID", "PRODUCTO_ID"],
                     how="left",
                     validate="1:m")

del df_clientes_upgrades_ranked, df_cancelations;
gc.collect()

# Compute time difference between events
df_target["DATE_DIFF"] = (df_target["FECHA_y"] - df_target["FECHA_x"]) / np.timedelta64(1, "D")
log.info(f" Number of events 108 ending as upgrades before product changes rule {df_target.TARGET.sum()}")

df_target["TARGET"] = np.where((df_target["DATE_DIFF"] > 0) & \
                               (df_target["DATE_DIFF"] <= parameters["targets"]["target_upsell"][
                                   "activation_window"]),
                               0,
                               df_target["TARGET"])
df_target = drop_extra_rename_remaining(df_target)
log.info(f" Number of events 108 ending as upgrades after product changes rule {df_target.TARGET.sum()}")

2021-01-12 13:46:28,645 - aa_engine_pkg.assets.utils.utilities - INFO -  Number of events 108 ending as upgrades before product changes rule 275
2021-01-12 13:46:28,652 - aa_engine_pkg.assets.utils.utilities - INFO -  Number of events 108 ending as upgrades after product changes rule 245


In [79]:
# Merge back to EOP
df_final = pd.merge(df_clientes[["CUSTOMER_ID", "PRC_CODIGO"]],
                    df_target[["CUSTOMER_ID", "TARGET", "FECHA", "PRODUCTO_ID"]],
                    on="CUSTOMER_ID",
                    how="left",
                    validate="1:1")

target=df_final.loc[df_final.CUSTOMER_ID.isin(cliente_activo_df.CUSTOMER_ID.unique())]

del df_target, df_final;
gc.collect()

target["TARGET"].fillna(0, inplace=True)
target["TARGET"] =  target["TARGET"].astype(np.int32)
target["DATE_EXP"] = period_to_load
target["DATE_CALC"] = date
target.rename({"FECHA": "FECHA_TARGET"}, inplace=True)

In [80]:
target.head()

Unnamed: 0,CUSTOMER_ID,PRC_CODIGO,TARGET,FECHA,PRODUCTO_ID,DATE_EXP,DATE_CALC
0,21332,132,0,NaT,,202004,20200516
1,21350,132,0,NaT,,202004,20200516
2,21717,132,0,NaT,,202004,20200516
3,21780,135,0,NaT,,202004,20200516
4,8069,132,0,NaT,,202004,20200516


In [81]:
target.TARGET.mean()

0.006070968381405491

In [82]:
target.loc[target.TARGET==1]

Unnamed: 0,CUSTOMER_ID,PRC_CODIGO,TARGET,FECHA,PRODUCTO_ID,DATE_EXP,DATE_CALC
231,53426359,169,1,2020-06-08 10:01:57,139,202004,20200516
566,53740668,169,1,2020-06-12 12:21:16,139,202004,20200516
667,53016110,147,1,2020-06-01 14:15:57,132,202004,20200516
677,53216165,147,1,2020-05-23 12:32:51,135,202004,20200516
718,53314122,147,1,2020-06-01 00:09:00,135,202004,20200516
...,...,...,...,...,...,...,...
40064,52403899,147,1,2020-05-27 12:45:23,132,202004,20200516
40200,52411278,169,1,2020-06-01 19:11:31,139,202004,20200516
40207,52412175,147,1,2020-06-01 11:40:21,132,202004,20200516
40251,52604445,169,1,2020-06-05 12:00:23,139,202004,20200516


In [None]:
def create_target_upsell(upgrades_basicos: SQLPartitionedDataSet,
                         eop: SQLPartitionedDataSet,
                         cliente_activo: pd.DataFrame,
                         parameters: Dict,
                         date: str) -> pd.DataFrame:
    """Function that takes care of generating the target feature for the up-sell model
    Up-selling includes:
        - Switching to a higher level of service on the same tecnology (e.g., from Silver SD to Gold SD)

    Target definition:
        - Existing customer switches service.
        - Stays with new product for at least 84 days

    Target methodology:
        0. Definition of product ranking (update as needed)
        1. Loading of customer base (e.g., all post-paid customers in Colombia)
        2. Loading of upgrade events (108) for period of interest (calculation_window)
        3. Loading of events (107,108,133,142) to detect customers that switch products for period of
        interest (activation_window)
        4. For customers that have an event (108) in the calculation_window, compare previous service to new one to determine if it
    is an upgrade using the product ranking.
        5. For said customers in (4), check if another event happens in the activation_window after the 108 event. If it does not happen, then the
    customers are target for the model.

    Parameters
    ----------
    upgrades_basicos: 
        dataset defined in ``catalog_raw.yml`` with raw data information related to upgrades of programming service products
    eop:
        dataset defined in ``catalog_raw.yml`` with raw data information related to the client's EoP state
    date:
        period to process
    parameters:
        set of project parameters defined in ``parameters.yml``
    
    Returns
    -------
    pd.DataFrame
        Master table with up-sell target feature for one period (date+1; date+calculation_window)
    """

    # Initialize logger
    log = initialize_logger()
    
    table_name = "target_upsell"
    write_to_parquet = parameters["write_to_parquet"]
    overwrite = parameters["targets"][table_name]["overwrite"]
    end_date = str(parameters["end_date"])
    log.info(f"Start the process of create upsell target for {date}")
    
    # Check if target can be created (date + max window < previous sunday)
    target_parameters = parameters["targets"][table_name]
    max_window = max([target_parameters[x] for x in target_parameters.keys() if x.endswith("window")])
    upper_bound = (pd.to_datetime(date) + timedelta(days=max_window)).strftime("%Y%m%d")
    previous_sunday = dt.today() - timedelta(days=dt.today().weekday() + 1)
    
    if pd.to_datetime(upper_bound, format="%Y%m%d") > previous_sunday:
        log.info(f"Cannot create upsell target for {date}: Not enough future information")
        return None

    # Compare with what is already processed
    path = f"{parameters['paths']['target_path']}{table_name}/"
    os.makedirs(path, exist_ok=True)
    processed_dates = os.listdir(path)
    match = [file for file in processed_dates if str(date) in file]
    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_final = pd.read_parquet(path + match[0], engine="pyarrow")

    else:
        product_rank = parameters["targets"]["target_upsell"]["upsell_products_rank"]
        products_allowed_to_move=parameters["targets"]["target_upsell"]["products_allowed_to_move"]
        product_tecnology=parameters["targets"]["target_upsell"]["product_and_tecnology"]
        product_tecnology = {value : key for (key, value) in product_tecnology.items()}

        start_date = date
        end_date = (pd.to_datetime(date) + timedelta(days=parameters["targets"]["target_upsell"]["calculation_window"])).strftime("%Y%m%d")
        cancel_date = (pd.to_datetime(date) + timedelta(days=parameters["targets"]["target_upsell"]["activation_window"])).strftime("%Y%m%d")

        # Get EoP active clients from previous period to exclude new clients
        period_to_load = get_previous_month(start_date)
        df_clientes = eop.filter_by(condition=f"PRC_TIPO_ID = 3 AND PRC_CODIGO  IN {tuple(products_allowed_to_move)}",
                                     #base of customers that can made an upgrade
                                     date=period_to_load)

        # Get the user tecnology
        df_clientes["tecno_eop"]=df_clientes["PRC_CODIGO"].map(product_tecnology)
        df_clientes["tecno_eop"]=[y.split(" ")[2] for x,y in enumerate(df_clientes["tecno_eop"])]

        # Get data for target creation
        moves=tuple([ value[0] for (key,value) in product_rank.items()])
        df_upgrades = upgrades_basicos.filter_by(condition=f"EVENTO_ID = 108 AND PRODUCTO_ID IN {moves}",
                                                 date=[start_date, end_date],
                                                 target=True)
        #Tecnology of the basic product.
        df_upgrades["tecno_up"]=[y.split(" ")[1] for x,y in enumerate(df_upgrades["PRODUCTO"])]

        df_cancelations = upgrades_basicos.filter_by(date=[start_date,
                                                           cancel_date],
                                                     target=True)

        df_clientes_upgrades = pd.merge(
            df_clientes[["CUSTOMER_ID", "PRC_CODIGO","tecno_eop"]],
            df_upgrades[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA","tecno_up"]],
            on=["CUSTOMER_ID"],
            how="inner",
            validate="1:m")
        del df_upgrades;
        gc.collect()

        df_clientes_upgrades.sort_values(["CUSTOMER_ID", "PRC_CODIGO", "FECHA"], ascending=[False, False, True],
                                         inplace=True)
        df_clientes_upgrades.drop_duplicates(subset=["CUSTOMER_ID", "PRC_CODIGO"], keep="last", inplace=True)

        df_product_rank = pd.DataFrame(product_rank.items(), columns=["PRODUCTO_RANK_INI", "PRC_CODIGO"])
        df_product_rank = df_product_rank.explode("PRC_CODIGO")


            # Rank initial product (PRC_CODIGO) from EOP table
        df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades,
                                               df_product_rank,
                                               on="PRC_CODIGO",
                                               how="left",
                                               validate="m:1")

        del df_clientes_upgrades
        gc.collect()

        # Rank last product (PRODUCTO_ID) from plan_evento table
        df_product_rank.rename(columns={"PRC_CODIGO": "PRODUCTO_ID",
                                        "PRODUCTO_RANK_INI": "PRODUCTO_RANK_END"}, inplace=True)
        df_clientes_upgrades_ranked = pd.merge(df_clientes_upgrades_ranked,
                                               df_product_rank,
                                               on="PRODUCTO_ID",
                                               how="left",
                                               validate="m:1")

        #Solving the situation of MIX, this tecnology is SD
        df_clientes_upgrades_ranked["tecno_eop"]=np.where(df_clientes_upgrades_ranked.tecno_eop.isin(["MIX"]),"SD",df_clientes_upgrades_ranked.tecno_eop)
        df_clientes_upgrades_ranked["tecno_up"]=np.where(df_clientes_upgrades_ranked.tecno_up.isin(["MIX"]),"SD",df_clientes_upgrades_ranked.tecno_up)
        
        # Calculate target based on initial and end product plus tecnology
        df_clientes_upgrades_ranked["TARGET"] = np.where((df_clientes_upgrades_ranked["PRODUCTO_RANK_END"] > \
                                                         df_clientes_upgrades_ranked["PRODUCTO_RANK_INI"]) & ((df_clientes_upgrades_ranked["tecno_eop"] == \
                                                                                                               df_clientes_upgrades_ranked["tecno_up"])), 1, 0)

        # Keep only first move by CUSTOMER, PRODUCT
        df_cancelations.sort_values(["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"], ascending=[False, False, True], inplace=True)
        df_cancelations.drop_duplicates(subset=["CUSTOMER_ID", "PRODUCTO_ID"], keep="last", inplace=True)

        # Merge with target df to check for activation period
        df_target = pd.merge(df_clientes_upgrades_ranked,
                             df_cancelations[["CUSTOMER_ID", "PRODUCTO_ID", "FECHA"]],
                             on=["CUSTOMER_ID", "PRODUCTO_ID"],
                             how="left",
                             validate="1:m")

        del df_clientes_upgrades_ranked, df_cancelations;
        gc.collect()

        # Compute time difference between events
        df_target["DATE_DIFF"] = (df_target["FECHA_y"] - df_target["FECHA_x"]) / np.timedelta64(1, "D")
        log.info(f" Number of events 108 ending as upgrades before product changes rule {df_target.TARGET.sum()}")

        df_target["TARGET"] = np.where((df_target["DATE_DIFF"] > 0) & \
                                       (df_target["DATE_DIFF"] <= parameters["targets"]["target_upsell"][
                                           "activation_window"]),
                                       0,
                                       df_target["TARGET"])
        df_target = drop_extra_rename_remaining(df_target)
        log.info(f" Number of events 108 ending as upgrades after product changes rule {df_target.TARGET.sum()}")

        # Merge back to EOP
        df_final = pd.merge(df_clientes[["CUSTOMER_ID", "PRC_CODIGO"]],
                            df_target[["CUSTOMER_ID", "TARGET", "FECHA", "PRODUCTO_ID"]],
                            on="CUSTOMER_ID",
                            how="left",
                            validate="1:1")
        
        target=df_final.loc[df_final.CUSTOMER_ID.isin(cliente_activo.CUSTOMER_ID.unique())]
        
        del df_target, df_final;
        gc.collect()

        target["TARGET"].fillna(0, inplace=True)
        target["TARGET"] =  target["TARGET"].astype(np.int32)
        target["DATE_EXP"] = period_to_load
        target["DATE_CALC"] = date
        target.rename({"FECHA": "FECHA_TARGET"}, inplace=True)
        
        if write_to_parquet:
            file = f"{parameters['paths']['target_path']}{table_name}/{table_name}_{date}.parquet"
            target.to_parquet(file, engine="pyarrow")

        # Return
        log.info(
            f"""Exporting target for period {start_date} and rate {
            np.round(100 * target[target['TARGET'] == 1]['CUSTOMER_ID'].nunique() / target['CUSTOMER_ID'].nunique(), 2)
            }%""")

    return target
    