In [1]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a kedro context so the catalog, credentials and parameters are available
# in the notebook.
# The notebook directory defaults to the original shared-server location, but it
# can be overridden with the AA_ENGINE_NOTEBOOK_DIR environment variable so the
# notebook can run from another checkout without editing this cell.
import os

current_dir = Path(os.environ.get("AA_ENGINE_NOTEBOOK_DIR",
                                  "/u01/share/cesar/aa_engine_uy/notebooks/"))
# The kedro project root is taken to be the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
credentials = context.config_loader.get("credentials*", "credentials*/**")
parameters = context.config_loader.get("parameters*", "parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Processing date passed to the functions and cells below
# (value looks like YYYYMMDD - TODO confirm the expected format)
date = "20180605"

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Load the active customers for the month preceding ``date``.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP;
        it is queried through its ``filter_by`` method
    date:
        period to process; the period actually loaded is the one returned by
        ``get_previous_month(date)``

    Returns
    -------
    pd.DataFrame
        Result of ``cliente_activo.filter_by`` for the previous month
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # Active customers are taken from the month before the processing date
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [4]:
# Fetch the `cliente_activo` entry from the kedro catalog
cliente_activo = catalog.load("cliente_activo")

2020-12-30 18:38:38,061 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [5]:
# Active customers for the month preceding `date`
cliente_activo_df = create_cliente_activo(cliente_activo=cliente_activo, date=date)

2020-12-30 18:38:41,750 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [6]:
# Fetch the `eventos_facturados` entry from the kedro catalog
eventos_facturados = catalog.load("eventos_facturados")

2020-12-30 18:38:46,595 - kedro.io.data_catalog - INFO - Loading data from `eventos_facturados` (SQLPartitionedDataSet)...


In [7]:
# Columns that are scaled and aggregated further down in this notebook
num_vars = ["MONTO", "BALANCE_BROUGHT_FORWARD"]

# Grouping key (not referenced by the cells visible in this notebook)
vars_to_group_by = ["CUSTOMER_ID"]

# Identifier columns passed to rename_table as `ids_to_exclude`
id_cols = ["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]

# Look-back windows passed to add_relative_calculate_past as `periods`
past_periods = list(range(1, 7))


In [13]:
# Initialize the logger used by the exploratory cells below
log = initialize_logger()

In [9]:
# Read the look-back window from the project parameters and derive the periods to query
look_back_months = parameters["masters"]["global"]["look_back_months"]
periods_to_load = tuple(get_last_k_periods(date, look_back_months))
start_date = periods_to_load[-1]  # last element returned by get_last_k_periods
period_to_load = get_previous_month(date)

# Keep only the customer identifier of the active customers
# NOTE(review): df_clientes is not referenced by the later exploratory cells, while
# create_master_eventos_fact inner-joins it with df_ef - confirm the merge was
# skipped here on purpose
df_clientes = cliente_activo_df[["CUSTOMER_ID"]]

# Load the billed events for every selected period and drop duplicated rows
df_ef = (
    eventos_facturados
    .filter_by_period(date=periods_to_load)
    .drop_duplicates()
)

select * from stg_uy_evento_facturado where DATE_EXP in ('201806', '201805', '201804', '201803', '201802', '201801')


In [11]:
# Preview the first rows of the loaded billed events
df_ef.head()

Unnamed: 0,NRO_FACTURA,CUSTOMER_ID,FECHA_FACTURACION,BALANCE_BROUGHT_FORWARD,MONTO,FINANCIAL_ACCOUNT_ID,BALANCE,DATE_EXP
0,13663356,53586698,2018-04-10,1451.28,1027.0,473911,0.0,201804
1,13663357,2462051,2018-04-10,1745.0,1745.0,205846,0.0,201804
2,13663358,53590960,2018-04-10,1936.64,-591.29,474338,0.0,201804
3,13663359,2462481,2018-04-10,1364.0,1364.0,205852,0.0,201804
4,13663360,53461575,2018-04-10,1467.0,1600.33,460888,0.0,201804


In [14]:
# Scale each amount column with scale_values (grouping column: DATE_EXP) and
# overwrite the column in df_ef with the scaled result
for var in ("MONTO", "BALANCE_BROUGHT_FORWARD"):
    log.info(f"----{var}")
    df_ef[var] = scale_values(
        df=df_ef,
        vars_to_groupby=["DATE_EXP"],
        var_to_scale=var,
        by_ranking=False,
    )

2020-12-30 18:42:53,874 - aa_engine_pkg.assets.utils.utilities - INFO - ----MONTO
2020-12-30 18:42:54,085 - aa_engine_pkg.assets.utils.utilities - INFO - ----BALANCE_BROUGHT_FORWARD


In [17]:
# Aggregate both amount columns (mean / max / min, NaN-aware) per customer over
# each look-back window in `past_periods`
df_ef_past = add_relative_calculate_past(
    df_ef,
    id_cols=["CUSTOMER_ID"],
    date_col="DATE_EXP",
    start_date=start_date,
    end_date=period_to_load,
    periods=past_periods,
    period_freq="M",
    agg={
        column: [np.nanmean, np.nanmax, np.nanmin]
        for column in ("MONTO", "BALANCE_BROUGHT_FORWARD")
    },
)

2020-12-30 18:43:40,890 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 201801 and 201805
2020-12-30 18:43:41,173 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 1
2020-12-30 18:43:41,345 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 2
2020-12-30 18:43:41,542 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 3
2020-12-30 18:43:41,760 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 4
2020-12-30 18:43:42,012 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 5
2020-12-30 18:43:42,286 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 6


In [18]:
# Stamp every row with the period being processed
df_ef_past["DATE_EXP"] = period_to_load

# Prefix the feature columns with the table prefix from the parameters, passing
# id_cols as the columns to exclude; the return value is not used
table_preffix = parameters["masters"]["eventos_facturados"]["table_preffix"]
rename_table(df_ef_past, preffix=table_preffix, ids_to_exclude=id_cols)

In [19]:
# Preview the resulting feature table after renaming
df_ef_past.head()

Unnamed: 0,CUSTOMER_ID,EVF_MONTO_nanmean_1,EVF_MONTO_nanmax_1,EVF_MONTO_nanmin_1,EVF_BALANCE_BROUGHT_FORWARD_nanmean_1,EVF_BALANCE_BROUGHT_FORWARD_nanmax_1,EVF_BALANCE_BROUGHT_FORWARD_nanmin_1,EVF_MONTO_nanmean_2,EVF_MONTO_nanmax_2,EVF_MONTO_nanmin_2,...,EVF_BALANCE_BROUGHT_FORWARD_nanmean_5,EVF_BALANCE_BROUGHT_FORWARD_nanmax_5,EVF_BALANCE_BROUGHT_FORWARD_nanmin_5,EVF_MONTO_nanmean_6,EVF_MONTO_nanmax_6,EVF_MONTO_nanmin_6,EVF_BALANCE_BROUGHT_FORWARD_nanmean_6,EVF_BALANCE_BROUGHT_FORWARD_nanmax_6,EVF_BALANCE_BROUGHT_FORWARD_nanmin_6,DATE_EXP
0,53586698,0.283254,0.283254,0.283254,0.285699,0.285699,0.285699,0.282242,0.283254,0.28123,...,0.190066,0.369196,0.052684,0.247675,0.367913,0.058302,0.190066,0.369196,0.052684,201805
1,2462051,0.438455,0.438455,0.438455,0.438725,0.438725,0.438725,0.437364,0.438455,0.436272,...,0.434859,0.439774,0.431606,0.434526,0.438455,0.430254,0.434859,0.439774,0.431606,201805
2,53590960,0.240068,0.240068,0.240068,0.0,0.0,0.0,0.120034,0.240068,0.0,...,0.175442,0.473642,0.0,0.237175,0.471458,0.0,0.175442,0.473642,0.0,201805
3,2462481,0.356099,0.356099,0.356099,0.357523,0.357523,0.357523,0.35505,0.356099,0.354,...,0.35231,0.357523,0.348816,0.351761,0.356099,0.348085,0.35231,0.357523,0.348816,201805
4,53461575,0.421595,0.421595,0.421595,0.407892,0.407892,0.407892,0.413314,0.421595,0.405032,...,0.349422,0.480812,0.052423,0.421695,0.47871,0.371267,0.349422,0.480812,0.052423,201805


In [None]:
def create_master_eventos_fact(eventos_facturados: SQLPartitionedDataSet,
                               cliente_activo: pd.DataFrame,
                               parameters: Dict,
                               date: str) -> pd.DataFrame:
    """Creates master table with billing features for one period of data

    If a stored table for this period is found and ``overwrite`` is ``False`` in
    the parameters, that parquet file is read back instead of rebuilding the table.

    Parameters
    ----------
    eventos_facturados:
        dataset defined in ``catalog_raw.yml`` with raw data information related to customer billing state at EoP
    cliente_activo:
        dataframe with the active clients at EoP; only its ``CUSTOMER_ID`` column is used
    parameters:
        set of project parameters defined in ``parameters.yml``
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Master table with billing features for one period

    Notes
    -----
    ``past_periods`` and ``id_cols`` are not parameters: they are read from the
    enclosing (module / notebook) scope and must be defined before this
    function is called.
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "eventos_facturados"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created.
    # Each entry is converted with str() before the substring test, so the check
    # works whether the helper returns plain strings or pathlib.Path objects
    # (``"x" in Path(...)`` raises TypeError).
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        return pd.read_parquet(match[0], engine="pyarrow")

    # Read parameters
    look_back_months = parameters["masters"]["global"]["look_back_months"]
    periods_to_load = tuple(get_last_k_periods(date, look_back_months))
    start_date = periods_to_load[-1]
    period_to_load = get_previous_month(date)

    # Restrict the billed events of the selected periods to the active clients.
    # validate="1:m" makes pandas raise if CUSTOMER_ID is duplicated in cliente_activo.
    df_clientes = cliente_activo[["CUSTOMER_ID"]]
    df_ef = eventos_facturados.filter_by_period(date=periods_to_load).drop_duplicates()
    log.info("Merging with EOP...")
    df_ef = pd.merge(df_clientes,
                     df_ef,
                     on=["CUSTOMER_ID"],
                     how="inner",
                     validate="1:m")

    # Scale the amount columns, using DATE_EXP as the grouping column
    for var in ["MONTO", "BALANCE_BROUGHT_FORWARD"]:
        log.info(f"----{var}")
        df_ef[var] = scale_values(df=df_ef,
                                  vars_to_groupby=["DATE_EXP"],
                                  var_to_scale=var,
                                  by_ranking=False)

    # Aggregate the scaled amounts per customer over each look-back window
    df_ef_past = add_relative_calculate_past(df_ef,
                                             id_cols=["CUSTOMER_ID"],
                                             date_col="DATE_EXP",
                                             start_date=start_date,
                                             end_date=period_to_load,
                                             periods=past_periods,
                                             period_freq="M",
                                             agg={'MONTO': [np.nanmean, np.nanmax, np.nanmin],
                                                  'BALANCE_BROUGHT_FORWARD': [np.nanmean, np.nanmax, np.nanmin]}
                                             )

    # Add date variables
    df_ef_past["DATE_EXP"] = period_to_load

    # Change variable names (the return value of rename_table is not used)
    table_preffix = parameters["masters"][table_name]["table_preffix"]
    rename_table(df_ef_past,
                 preffix=table_preffix,
                 ids_to_exclude=id_cols)

    # Logged once; the original emitted this same message a second time after the write
    log.info(f"Exporting {df_ef_past.shape[0]} rows and {df_ef_past.shape[1]} columns")

    if write_to_parquet:
        log.info(f"\n------ Writing {table_name} for period {date} to parquet ------")
        file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
        df_ef_past.to_parquet(file, engine="pyarrow")

    return df_ef_past