In [1]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook
# NOTE(review): the project root is derived from a hard-coded absolute path, so this
# cell only works on the machine/user it was written for. The commented-out line
# below is the portable alternative when the kernel is started from ``notebooks/``.
#current_dir = Path.cwd()
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The kedro project root is the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Configuration entries matching the given glob patterns.
# NOTE(review): ``credentials`` presumably holds secrets - avoid displaying it in cell output.
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

# NOTE(review): the star import is presumably what brings ``pd``, ``np``, ``timedelta``,
# ``Dict`` and the helper functions used below (``initialize_logger``,
# ``get_previous_month``, ...) into scope - none of them are imported explicitly
# in this notebook. Confirm against ``aa_engine_pkg.assets.utils``.
from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Load the customers that were active at end of period (EoP) for one processing date.

    Parameters
    ----------
    cliente_activo:
        partitioned SQL dataset declared in ``catalog.yml``; it is queried through
        its ``filter_by`` method
    date:
        period to process; the data actually loaded is the period returned by
        ``get_previous_month(date)``

    Returns
    -------
    pd.DataFrame
        Result of ``cliente_activo.filter_by`` for that period (active customers at EoP)
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # Active customers are read from the month preceding ``date``
    return cliente_activo.filter_by(date=get_previous_month(date))

In [3]:
# Look-back window lengths handed to ``add_relative_calculate_past`` as ``periods``
# inside ``create_master_mudanza`` (which calls it with ``period_freq="D"``).
past_periods = [7, 28, 84, 168]

In [4]:
def create_master_mudanza(mudanza: SQLPartitionedDataSet,
                          cliente_activo: pd.DataFrame,
                          parameters: Dict,
                          date: str) -> pd.DataFrame:
    """Creates master table with customer change of residency features for one period of data

    If a master table for ``date`` is already listed by
    ``get_mastertable_paths_by_period`` it is read back from parquet and returned
    as is; otherwise the table is built from the raw data and, when
    ``parameters["write_to_parquet"]`` is truthy, written to parquet.

    Parameters
    ----------
    mudanza:
        dataset defined in ``catalog_raw.yml`` with raw data information related to customer change of residency
    cliente_activo:
        DataFrame of active clients at EoP; only its ``CUSTOMER_ID`` column is used
    parameters:
        set of project parameters defined in ``parameters.yml``; the keys read here are
        ``write_to_parquet``, ``masters.global.look_back_days`` and ``paths.master_path``
    date:
        period to process, formatted as ``%Y%m%d``

    Returns
    -------
    pd.DataFrame
        Master table with customer change of residency features for one period
    """

    # Initialize logger (once; it is shared by both branches below)
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "mudanza"

    # Check if table was already created.
    # ``str(file)`` is applied before the substring test so the filter also works
    # when the helper returns ``Path`` objects instead of plain strings.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if match:
        # If table is found, read parquet and skip the computation entirely
        log.info(f"Reading {match[0]} table")
        return pd.read_parquet(match[0], engine="pyarrow")

    log.info("Creating master table mudanza")

    # Read parameters: the look-back window starts ``look_back_days`` before ``date``
    lookback_days = parameters["masters"]["global"]["look_back_days"]
    start_date = (pd.to_datetime(date) - timedelta(days=lookback_days)).strftime("%Y%m%d")

    # Period used to tag the output (DATE_EXP); same period used to load active clients
    period_to_load = get_previous_month(date)
    df_clientes = cliente_activo[["CUSTOMER_ID"]]

    # Get raw mudanza records between start_date and date
    df_mudanza = mudanza.filter_by(date=[start_date,
                                         date]).drop_duplicates()

    # Format date as a %Y%m%d string (the form passed on as ``date_col`` below)
    df_mudanza["FECHA"] = df_mudanza["FECHA"].dt.strftime("%Y%m%d")

    # Keep only mudanza records that belong to active clients
    df_master_mudanza = df_clientes.merge(df_mudanza,
                                          on="CUSTOMER_ID",
                                          how="inner")

    # Calculate days elapsed between each mudanza and the processing date,
    # then keep the smallest and largest value per customer
    df_master_mudanza["DAYS"] = (pd.to_datetime(date) - pd.to_datetime(df_master_mudanza["FECHA"],
                                                                       format="%Y%m%d")) / np.timedelta64(1, "D")
    df_days = flatten_df(df_master_mudanza.groupby("CUSTOMER_ID").agg({"DAYS": [np.nanmin, np.nanmax]}))

    # Each row counts as one mudanza; summed per look-back window below
    df_master_mudanza["N"] = 1

    log.info("Creating past variables...")
    # NOTE: ``past_periods`` is a module-level list defined outside this function
    df_mudanza_past = add_relative_calculate_past(df=df_master_mudanza,
                                                  id_cols=["CUSTOMER_ID"],
                                                  periods=past_periods,
                                                  columns_to_expand=["N"],
                                                  agg=["sum"],
                                                  date_col="FECHA",
                                                  start_date=start_date,
                                                  end_date=date,
                                                  period_freq="D")

    # ``validate="1:1"`` makes pandas raise if either side has repeated CUSTOMER_ID values
    df_mudanza_past = df_mudanza_past.merge(df_days, on="CUSTOMER_ID", validate="1:1")

    # Add date variables
    df_mudanza_past["DATE_EXP"] = period_to_load
    df_mudanza_past["DATE_CALC"] = date

    # Rename table; the return value is not used, so this presumably renames the
    # columns in place - TODO confirm against ``rename_table``
    rename_table(df=df_mudanza_past,
                 preffix="MUD",
                 ids_to_exclude=["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]
                 )

    log.info(f"Exporting {df_mudanza_past.shape[0]} rows and {df_mudanza_past.shape[1]} columns")

    if write_to_parquet:
        log.info(f"\n------ Writing {table_name} for period {date} to parquet ------")
        file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
        df_mudanza_past.to_parquet(file, engine="pyarrow")

    return df_mudanza_past

In [5]:
# Fetch both raw datasets from the kedro catalog (the log output below reports
# them as SQLPartitionedDataSet objects)
cliente_activo = catalog.load("cliente_activo")
mudanza = catalog.load("mudanza")

2020-12-30 20:20:22,303 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...
2020-12-30 20:20:22,305 - kedro.io.data_catalog - INFO - Loading data from `mudanza` (SQLPartitionedDataSet)...


In [7]:
# Dates to process for the "mudanza" master table, as returned by the project helper
dates = calculate_dates_to_process_for_master(parameters, table_name="mudanza")
print(dates)

['20181203', '20181231', '20190128', '20190225', '20190325', '20190422']


In [8]:
# Build the mudanza master table for every date in ``dates``
for date in dates:
    print(f"Processing date {date}")
    cliente_activo_df = create_cliente_activo(cliente_activo, date)
    # The returned DataFrame is discarded here; only the function's side effects
    # (logging and the optional parquet write) are kept.
    create_master_mudanza(mudanza, cliente_activo_df, parameters, date)
    

Processing date 20181203
2020-12-30 20:20:34,936 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201811


  % ((self.server_version_info,))


2020-12-30 20:20:38,654 - aa_engine_pkg.assets.utils.utilities - INFO - Creating master table mudanza
select CUSTOMER_ID, FECHA, DOMICILIO_INS_FACT, DATE_EXP from stg_uy_mudanza where FECHA >= to_date('20180618', 'yyyymmdd') and FECHA < to_date('20181203', 'yyyymmdd')
2020-12-30 20:20:39,169 - aa_engine_pkg.assets.utils.utilities - INFO - Creating past variables...
2020-12-30 20:20:39,170 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 20180618 and 20181203
2020-12-30 20:20:39,186 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 7
2020-12-30 20:20:39,195 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 28
2020-12-30 20:20:39,204 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 84
2020-12-30 20:20:39,214 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 168
2020-12-30 20:20:39,347 - aa_engine_pkg.assets.utils.utilities - INFO - Exporting 8675 rows and 9