In [1]:
# Load kedro environment (not needed in .py)
import os
from pathlib import Path
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook.
# The notebooks directory defaults to the original server location, but can be
# overridden through the AA_ENGINE_NOTEBOOK_DIR environment variable so the
# notebook can run from another checkout without editing this cell.
current_dir = Path(os.environ.get("AA_ENGINE_NOTEBOOK_DIR",
                                  "/u01/share/cesar/aa_engine_uy/notebooks/"))
# The kedro project root is the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Configuration entries matching the credentials*/parameters* patterns
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

# NOTE(review): the star import is presumably where `pd`, `np`, `timedelta`,
# `reduce`, `Dict` and the helper functions used below come from (none of them
# is imported explicitly in this notebook) - confirm and import them explicitly.
from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Reference date for this run, as a YYYYMMDD string
date = "20180605"

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Fetch the active customers for the period that precedes ``date``

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP;
        it is queried through its ``filter_by`` method
    date:
        period to process; the period actually loaded is
        ``get_previous_month(date)``

    Returns
    -------
        The result of ``cliente_activo.filter_by`` for that previous period
        (annotated as a ``pd.DataFrame``)
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # Customer status is read for the month before the processing date
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [4]:
# Fetch the `cliente_activo` entry from the kedro catalog
cliente_activo = catalog.load("cliente_activo")

2020-12-30 20:26:32,032 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [5]:
# Active customers for the period preceding `date`
cliente_activo_df = create_cliente_activo(cliente_activo, date)

2020-12-30 20:26:32,943 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [6]:
# Fetch the `plan_evento` entry from the kedro catalog
plan_evento = catalog.load("plan_evento")

2020-12-30 20:26:35,003 - kedro.io.data_catalog - INFO - Loading data from `plan_evento` (SQLPartitionedDataSet)...


In [13]:
# Columns cast to str before categories are imputed
vars_to_string = [
    "PROD_CATEGORY_ID",
    "EVENTO_ID",
]

# Columns expanded into dummy variables
vars_to_dummy = [
    "CAT_COMBINED",
]

# Grouping keys used when building the dummy variables
vars_to_group_by = [
    "CUSTOMER_ID",
    "FECHA",
]

# Identifier columns left out of the final renaming step
id_cols = [
    "CUSTOMER_ID",
    "DATE_EXP",
    "DATE_CALC",
]

# Category values passed to impute_categories for each column
categories_dict = {
    "EVENTO_ID": [
        "108", "123", "133", "142", "171",
        "5229", "100133", "200142", "100142",
    ],
    "PROD_CATEGORY_ID": ["1", "3", "5", "6"],
}

# Look-back windows for the past aggregations (used with period_freq="D")
past_periods = [
    14,
    21,
    28,
    84,
    168,
    252,
    336,
]




In [10]:
# Logger used by the step-by-step cells that follow
log = initialize_logger()
log.info("Creating master table plan evento")

# Number of days of events to look back, taken from the project parameters
plan_evento_params = parameters["masters"]["plan_evento"]
lookback_days = plan_evento_params["look_back_days"]

2020-12-30 20:27:39,337 - aa_engine_pkg.assets.utils.utilities - INFO - Creating master table plan evento


In [11]:
# Start of the look-back window: `lookback_days` days before `date`, as YYYYMMDD
start_date = (pd.to_datetime(date) - timedelta(days=lookback_days)).strftime("%Y%m%d")

# Calculate period to load for active clients
# (the customers themselves were already loaded into `cliente_activo_df`)
log.info("Loading table clientes")
period_to_load = get_previous_month(date)
df_clientes = cliente_activo_df

# Load the plan_evento rows between `start_date` and `date`, without duplicates.
# NOTE: this rebinds `plan_evento` from the catalog dataset to the filtered
# result, so a second run of this cell would call `filter_by` on that result
# instead of on the dataset - reload `plan_evento` before re-running.
log.info("Loading table plan_evento")
plan_evento = plan_evento.filter_by(date=[start_date,
                                          date]).drop_duplicates()

# Format variables: ids become strings, FECHA becomes a YYYYMMDD string
log.info("Formatting string variables")
plan_evento[vars_to_string] = plan_evento[vars_to_string].astype(str)
plan_evento["FECHA"] = plan_evento["FECHA"].dt.strftime("%Y%m%d")

# Keep only events of active customers; validate="1:m" makes pandas raise if
# CUSTOMER_ID is not unique in `df_clientes`
log.info("Merging tables")
plan_evento = pd.merge(df_clientes,
                       plan_evento,
                       on=["CUSTOMER_ID"],
                       how="inner",
                       validate="1:m")

2020-12-30 20:27:44,705 - aa_engine_pkg.assets.utils.utilities - INFO - Loading table clientes
2020-12-30 20:27:44,707 - aa_engine_pkg.assets.utils.utilities - INFO - Loading table plan_evento
select CUSTOMER_ID, FECHA, EVENTO_ID, PROD_CATEGORY_ID from stg_uy_plan_evento where EVENTO_ID <> 118 and PROD_CATEGORY_ID is not null and PRODUCTO_ID is not null and FECHA >= to_date('20170704', 'yyyymmdd') and FECHA < to_date('20180605', 'yyyymmdd')
2020-12-30 20:27:52,878 - aa_engine_pkg.assets.utils.utilities - INFO - Formatting string variables
2020-12-30 20:27:54,908 - aa_engine_pkg.assets.utils.utilities - INFO - Merging tables


In [12]:
# Preview the merged events table
plan_evento.head()

Unnamed: 0,CUSTOMER_ID,FECHA,EVENTO_ID,PROD_CATEGORY_ID
0,146123,20180517,100142,5
1,146123,20180517,100142,3
2,146123,20180601,142,5
3,146123,20180601,142,3
4,146197,20180101,142,1


In [14]:
# Impute categories using the value lists in `categories_dict`.
# NOTE(review): the return value is not used, so `impute_categories` presumably
# modifies `plan_evento` in place - confirm against the helper.
impute_categories(plan_evento,
                  "EVENTO_ID",
                  categories_dict["EVENTO_ID"]
                  )
impute_categories(plan_evento,
                  "PROD_CATEGORY_ID",
                  categories_dict["PROD_CATEGORY_ID"]
                  )

# Combined category: "<EVENTO_ID>_<PROD_CATEGORY_ID>" (both columns are strings here)
plan_evento["CAT_COMBINED"] = plan_evento["EVENTO_ID"] + "_" + plan_evento["PROD_CATEGORY_ID"]

# One dummy table per variable in `vars_to_dummy`, grouped by `vars_to_group_by`
df_list = []
for var in vars_to_dummy:
    log.info(f'---- {var}')
    df_list.append(create_dummy_variables(df=plan_evento,
                                          vars_to_groupby=vars_to_group_by,
                                          var_to_dummy=var,
                                          include_total=False,
                                          include_subtotal=False))

# Outer-join all dummy tables on the grouping keys
df_plan_evento_dummies = reduce(lambda left, right: pd.merge(left, right, on=vars_to_group_by, how="outer"),
                                df_list)


2020-12-30 20:46:42,626 - numexpr.utils - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-30 20:46:42,627 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
2020-12-30 20:46:42,725 - aa_engine_pkg.assets.utils.utilities - INFO - ---- CAT_COMBINED


In [15]:
# Preview the dummy variables per CUSTOMER_ID and FECHA
df_plan_evento_dummies.head()

Unnamed: 0,CUSTOMER_ID,FECHA,CAT_COMBINED_100142_1,CAT_COMBINED_100142_3,CAT_COMBINED_100142_5,CAT_COMBINED_100142_OTHER,CAT_COMBINED_108_1,CAT_COMBINED_108_3,CAT_COMBINED_108_5,CAT_COMBINED_123_1,...,CAT_COMBINED_200142_5,CAT_COMBINED_200142_OTHER,CAT_COMBINED_5229_1,CAT_COMBINED_5229_3,CAT_COMBINED_5229_5,CAT_COMBINED_5229_OTHER,CAT_COMBINED_OTHER_1,CAT_COMBINED_OTHER_3,CAT_COMBINED_OTHER_5,CAT_COMBINED_OTHER_OTHER
0,8069,20180122,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,8069,20180301,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8176,20180412,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,8855,20180418,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,8855,20180504,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [16]:
# Calculate past variables: aggregate the dummies with np.nansum for each
# window in `past_periods`, between `start_date` and `date`
df_plan_evento_dummies_past = add_relative_calculate_past(df_plan_evento_dummies,
                                                          date_col="FECHA",
                                                          id_cols=["CUSTOMER_ID"],
                                                          start_date=start_date,
                                                          end_date=date,
                                                          periods=past_periods,
                                                          agg=[np.nansum],
                                                          period_freq="D")

# Add date variables
df_plan_evento_dummies_past["DATE_EXP"] = period_to_load
df_plan_evento_dummies_past["DATE_CALC"] = date

# Change variable names: apply the table prefix to every column except `id_cols`.
# NOTE(review): the return value is not used, so `rename_table` presumably
# renames the columns in place - confirm against the helper.
table_preffix = parameters["masters"]["plan_evento"]["table_preffix"]
rename_table(df_plan_evento_dummies_past,
             preffix=table_preffix,
             ids_to_exclude=id_cols)

2020-12-30 20:48:12,493 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 20170704 and 20180605
2020-12-30 20:48:12,535 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 14
2020-12-30 20:48:12,652 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 21
2020-12-30 20:48:12,780 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 28
2020-12-30 20:48:12,917 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 84
2020-12-30 20:48:13,133 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 168
2020-12-30 20:48:13,439 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 252
2020-12-30 20:48:13,746 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 336


In [17]:
# Preview the final table with the past-window aggregations
df_plan_evento_dummies_past.head()

Unnamed: 0,CUSTOMER_ID,EVE_CAT_COMBINED_200142_3_nansum_14,EVE_CAT_COMBINED_108_5_nansum_14,EVE_CAT_COMBINED_200142_1_nansum_14,EVE_CAT_COMBINED_OTHER_1_nansum_14,EVE_CAT_COMBINED_100142_5_nansum_14,EVE_CAT_COMBINED_133_3_nansum_14,EVE_CAT_COMBINED_108_1_nansum_14,EVE_CAT_COMBINED_5229_3_nansum_14,EVE_CAT_COMBINED_171_5_nansum_14,...,EVE_CAT_COMBINED_123_1_nansum_336,EVE_CAT_COMBINED_171_1_nansum_336,EVE_CAT_COMBINED_5229_OTHER_nansum_336,EVE_CAT_COMBINED_171_3_nansum_336,EVE_CAT_COMBINED_OTHER_3_nansum_336,EVE_CAT_COMBINED_142_3_nansum_336,EVE_CAT_COMBINED_200142_OTHER_nansum_336,EVE_CAT_COMBINED_142_OTHER_nansum_336,DATE_EXP,DATE_CALC
0,8069,,,,,,,,,,...,0,0,0,0,0,0,0,0,201805,20180605
1,8176,,,,,,,,,,...,0,0,0,0,0,0,0,0,201805,20180605
2,8855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,2,0,0,0,201805,20180605
3,8864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,0,201805,20180605
4,9289,,,,,,,,,,...,0,1,0,0,0,0,0,0,201805,20180605


In [None]:
def create_master_plan_evento(plan_evento: SQLPartitionedDataSet,
                              cliente_activo: pd.DataFrame,
                              parameters: Dict,
                              date: str) -> pd.DataFrame:
    """Creates mastertable with customer's events features for one period of data

    If a previously exported file for ``date`` whose path contains
    ``plan_evento`` is found and the ``overwrite`` parameter is ``False``, that
    parquet file is read and returned. Otherwise the table is rebuilt from
    ``plan_evento`` and, when ``write_to_parquet`` is set, exported.

    The function reads the module-level settings ``vars_to_string``,
    ``vars_to_dummy``, ``vars_to_group_by``, ``id_cols``, ``categories_dict``
    and ``past_periods``.

    Parameters
    ----------
    plan_evento:
        dataset defined in ``catalog.yml`` with raw data information related to customer events
    cliente_activo:
        dataframe of active clients at EoP; merged 1:m on ``CUSTOMER_ID``
    parameters:
        set of project parameters defined in ``parameters.yml``
    date:
        period to process

    Returns
    -------
    pd.DataFrame:
        mastertable with customer events features for one period
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "plan_evento"
    table_params = parameters["masters"][table_name]
    overwrite = table_params["overwrite"]

    # Check if table was already created. The membership test is done on
    # str(file) so it works for plain strings and for path objects alike.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        return pd.read_parquet(match[0], engine="pyarrow")

    log.info("Creating master table plan evento")
    lookback_days = table_params["look_back_days"]

    # Start of the look-back window, formatted like `date` (YYYYMMDD)
    start_date = (pd.to_datetime(date) - timedelta(days=lookback_days)).strftime("%Y%m%d")

    # Calculate period to load for active clients
    log.info("Loading table clientes")
    period_to_load = get_previous_month(date)

    # Load the events between start_date and date, without duplicates.
    # A separate local name keeps the `plan_evento` argument untouched.
    log.info("Loading table plan_evento")
    df_eventos = plan_evento.filter_by(date=[start_date, date]).drop_duplicates()

    # Format variables: ids become strings, FECHA becomes a YYYYMMDD string
    log.info("Formatting string variables")
    df_eventos[vars_to_string] = df_eventos[vars_to_string].astype(str)
    df_eventos["FECHA"] = df_eventos["FECHA"].dt.strftime("%Y%m%d")

    # Keep only events of active customers; validate="1:m" makes pandas raise
    # if CUSTOMER_ID is not unique in `cliente_activo`
    log.info("Merging tables")
    df_eventos = pd.merge(cliente_activo,
                          df_eventos,
                          on=["CUSTOMER_ID"],
                          how="inner",
                          validate="1:m")

    # Impute categories.
    # NOTE(review): the return value is not used, so `impute_categories`
    # presumably modifies the dataframe in place - confirm against the helper.
    for column in ("EVENTO_ID", "PROD_CATEGORY_ID"):
        impute_categories(df_eventos, column, categories_dict[column])

    df_eventos["CAT_COMBINED"] = df_eventos["EVENTO_ID"] + "_" + df_eventos["PROD_CATEGORY_ID"]

    # One dummy table per variable, then outer-join them on the grouping keys
    df_list = []
    for var in vars_to_dummy:
        log.info(f'---- {var}')
        df_list.append(create_dummy_variables(df=df_eventos,
                                              vars_to_groupby=vars_to_group_by,
                                              var_to_dummy=var,
                                              include_total=False,
                                              include_subtotal=False))

    df_plan_evento_dummies = reduce(lambda left, right: pd.merge(left, right, on=vars_to_group_by, how="outer"),
                                    df_list)

    # Calculate past variables
    df_plan_evento_dummies_past = add_relative_calculate_past(df_plan_evento_dummies,
                                                              date_col="FECHA",
                                                              id_cols=["CUSTOMER_ID"],
                                                              start_date=start_date,
                                                              end_date=date,
                                                              periods=past_periods,
                                                              agg=[np.nansum],
                                                              period_freq="D")

    # Add date variables
    df_plan_evento_dummies_past["DATE_EXP"] = period_to_load
    df_plan_evento_dummies_past["DATE_CALC"] = date

    # Change variable names (every column except `id_cols` gets the prefix)
    table_preffix = table_params["table_preffix"]
    rename_table(df_plan_evento_dummies_past,
                 preffix=table_preffix,
                 ids_to_exclude=id_cols)

    if write_to_parquet:
        # The path is built by plain concatenation, so `master_path` is
        # expected to end with a path separator
        file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
        df_plan_evento_dummies_past.to_parquet(file, engine="pyarrow")

    log.info(
        f"Exporting {df_plan_evento_dummies_past.shape[0]} rows and {df_plan_evento_dummies_past.shape[1]} columns")

    return df_plan_evento_dummies_past