In [7]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a Kedro context so the project catalog and configuration can be used
# from inside the notebook.
# NOTE(review): the notebooks directory below is a hard-coded absolute path on
# a shared machine; the commented-out line is the portable alternative when
# the kernel is started from the notebooks folder.
#current_dir = Path.cwd()
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The project root is taken to be the parent of the notebooks directory.
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Read the configuration entries matching the credentials / parameters patterns.
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet
# NOTE(review): this wildcard import is presumably what brings `pd`, `np`,
# `Dict` and the helper functions used below (initialize_logger,
# get_last_k_periods, scale_values, ...) into scope — confirm, and prefer
# explicit imports.
from aa_engine_pkg.assets.utils import *

In [19]:
# Fetch the two raw inputs from the Kedro catalog. Per the log output below
# both entries are SQLPartitionedDataSet objects; the SQL queries are only
# issued later, when the filter methods are called on them.
eventos_facturados = catalog.load("eventos_facturados")
cliente_activo = catalog.load("cliente_activo")

2021-06-15 20:23:31,118 - kedro.io.data_catalog - INFO - Loading data from `eventos_facturados` (SQLPartitionedDataSet)...
2021-06-15 20:23:31,120 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [21]:
# Processing date used for the rest of the notebook (looks like YYYYMMDD).
date = "20201207"
# NOTE(review): this rebinds `cliente_activo` from the catalog dataset to the
# value returned by create_cliente_activo, so re-running the cell without
# reloading the catalog entry first would pass the already-filtered result
# back in.
cliente_activo = create_cliente_activo(cliente_activo,date)

2021-06-15 20:23:39,030 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_eop_customer where PRC_TIPO_ID = 3 and DATE_EXP = 202011


In [22]:
log = initialize_logger()

# Periods covered by the look-back window configured for the master tables
look_back_months = parameters["masters"]["global"]["look_back_months"]
periods_to_load = tuple(get_last_k_periods(date, look_back_months))
# Last element of the window and the month preceding the processing date
start_date = periods_to_load[-1]
period_to_load = get_previous_month(date)

# Keep only billing events that belong to active clients
df_clientes = cliente_activo[["CUSTOMER_ID"]]
df_ef = (
    eventos_facturados
    .filter_by_period(date=periods_to_load)
    .drop_duplicates()
)
log.info("Merging with EOP...")
# validate="1:m" makes the merge fail if CUSTOMER_ID is duplicated on the client side
df_ef = df_clientes.merge(
    df_ef,
    on=["CUSTOMER_ID"],
    how="inner",
    validate="1:m",
)

select * from stg_uy_evento_facturado where DATE_EXP in ('202012', '202011', '202010', '202009', '202008', '202007')
2021-06-15 20:24:08,978 - aa_engine_pkg.assets.utils.utilities - INFO - Merging with EOP...


In [23]:
# Preview the first rows of the merged billing events.
df_ef.head()

Unnamed: 0,CUSTOMER_ID,NRO_FACTURA,FECHA_FACTURACION,BALANCE_BROUGHT_FORWARD,MONTO,FINANCIAL_ACCOUNT_ID,BALANCE,DATE_EXP
0,70100,21043842,2020-11-10,1047.0,1047.0,6129,0.0,202011
1,70100,20105254,2020-07-10,1047.0,1047.0,6129,0.0,202007
2,70100,20335438,2020-08-10,1047.0,1047.0,6129,0.0,202008
3,70100,20564622,2020-09-10,1047.0,1047.0,6129,0.0,202009
4,70100,20806923,2020-10-12,1047.0,1047.0,6129,0.0,202010


In [24]:
# Summary statistics of the balance column, to inspect its range and outliers.
df_ef.BALANCE_BROUGHT_FORWARD.describe()

count    671865.000000
mean       2472.621845
std        1410.160641
min      -29707.000000
25%        1608.000000
50%        2221.000000
75%        3019.000000
max       84790.000000
Name: BALANCE_BROUGHT_FORWARD, dtype: float64

In [43]:
# Toy series for checking how rank-based scaling behaves, including ties
# (40 and 66 each appear twice) and one extreme value (777).
a = pd.Series([87,30,25,41,12,40,40,120,66,66,777,40.5])
print(a)
# Tied values receive the average of their positions (4.5 and 8.5 in the output).
print(a.rank())
# Dividing the rank by the number of observations gives a score in (0, 1].
print(a.rank()/len(a))

0      87.0
1      30.0
2      25.0
3      41.0
4      12.0
5      40.0
6      40.0
7     120.0
8      66.0
9      66.0
10    777.0
11     40.5
dtype: float64
0     10.0
1      3.0
2      2.0
3      7.0
4      1.0
5      4.5
6      4.5
7     11.0
8      8.5
9      8.5
10    12.0
11     6.0
dtype: float64
0     0.833333
1     0.250000
2     0.166667
3     0.583333
4     0.083333
5     0.375000
6     0.375000
7     0.916667
8     0.708333
9     0.708333
10    1.000000
11    0.500000
dtype: float64


In [44]:
# Median of the toy series, for comparison with the rank-based scores above.
a.median()

40.75

In [49]:
# Share of observations that are <= 87; matches the rank-based score of 87 above
(a <= 87).mean()

0.8333333333333334

In [25]:
# Store the output of scale_values for the balance in a new column, leaving
# the raw column untouched so both can be compared in the next cell.
# NOTE(review): presumably the scaling is computed separately within each
# DATE_EXP group and by_ranking=False selects a non-rank-based method —
# confirm against scale_values.
df_ef["BALANCE_BROUGHT_FORWARD_SCALED"] = scale_values(df=df_ef,
                                      vars_to_groupby=["DATE_EXP"],
                                      var_to_scale="BALANCE_BROUGHT_FORWARD",
                                      by_ranking=False)

In [53]:
# Raw vs. scaled balance for the rows whose scaled value is at least 0.85
df_ef.loc[
    df_ef["BALANCE_BROUGHT_FORWARD_SCALED"] >= 0.85,
    ["BALANCE_BROUGHT_FORWARD", "BALANCE_BROUGHT_FORWARD_SCALED"],
]

Unnamed: 0,BALANCE_BROUGHT_FORWARD,BALANCE_BROUGHT_FORWARD_SCALED
75,4855.29,1.000000
100,5365.00,0.998569
121,4344.00,0.901620
122,4344.00,0.907159
123,4344.00,0.905843
...,...,...
670576,4984.47,0.928385
670598,4943.19,0.927557
670787,4867.04,0.917821
670870,4626.60,0.869277


In [9]:
# Numeric billing columns that the master table scales and aggregates.
num_vars = ["MONTO", "BALANCE_BROUGHT_FORWARD"]

# NOTE(review): not referenced anywhere else in the visible code — confirm it is still needed.
vars_to_group_by = ["CUSTOMER_ID"]

# Identifier columns excluded from the prefix renaming of the master table.
id_cols = ["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]

# Look-back windows passed as `periods` to add_relative_calculate_past
# (used there with period_freq="M").
past_periods = [1, 2, 3, 4, 5, 6]


def create_master_eventos_fact(eventos_facturados: SQLPartitionedDataSet,
                               cliente_activo: pd.DataFrame,
                               parameters: Dict,
                               date: str) -> pd.DataFrame:
    """Creates master table with billing features for one period of data

    When a master table file for ``date`` whose path contains
    ``eventos_facturados`` already exists and the ``overwrite`` parameter is
    ``False``, that parquet file is read and returned as-is. Otherwise the
    table is rebuilt from the raw data and, if ``write_to_parquet`` is set,
    written to the master path.

    Parameters
    ----------
    eventos_facturados:
        dataset defined in ``catalog_raw.yml`` with raw data information related to customer billing state at EoP
    cliente_activo:
        dataframe with the active clients at EoP; only its ``CUSTOMER_ID`` column is used
    parameters:
        set of project parameters defined in ``parameters.yml``
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Master table with billing features for one period
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "eventos_facturados"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created.
    # The comparison is done on the string form of each path so that both
    # ``str`` and ``pathlib.Path`` entries are supported (``in`` is not
    # defined for Path objects).
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_ef_past = pd.read_parquet(match[0], engine="pyarrow")

    else:
        # Read parameters
        look_back_months = parameters["masters"]["global"]["look_back_months"]
        periods_to_load = tuple(get_last_k_periods(date, look_back_months))
        start_date = periods_to_load[-1]
        period_to_load = get_previous_month(date)

        # Keep only billing events of active clients
        df_clientes = cliente_activo[["CUSTOMER_ID"]]
        df_ef = eventos_facturados.filter_by_period(date=periods_to_load).drop_duplicates()
        log.info("Merging with EOP...")
        df_ef = pd.merge(df_clientes,
                         df_ef,
                         on=["CUSTOMER_ID"],
                         how="inner",
                         validate="1:m")

        # Scale every numeric variable within each period (overwrites the raw column)
        for var in num_vars:
            log.info(f"----{var}")
            df_ef[var] = scale_values(df=df_ef,
                                      vars_to_groupby=["DATE_EXP"],
                                      var_to_scale=var,
                                      by_ranking=False)

        # Same NaN-aware aggregations for every numeric variable
        agg_functions = [np.nanmean, np.nanmax, np.nanmin]
        df_ef_past = add_relative_calculate_past(df_ef,
                                                 id_cols=["CUSTOMER_ID"],
                                                 date_col="DATE_EXP",
                                                 start_date=start_date,
                                                 end_date=period_to_load,
                                                 periods=past_periods,
                                                 period_freq="M",
                                                 agg={var: list(agg_functions) for var in num_vars}
                                                 )

        # Add date variables
        df_ef_past["DATE_EXP"] = period_to_load

        # Change variable names
        # NOTE(review): the return value is ignored, so rename_table presumably
        # renames the columns in place — confirm.
        table_preffix = parameters["masters"][table_name]["table_preffix"]
        rename_table(df_ef_past,
                     preffix=table_preffix,
                     ids_to_exclude=id_cols)
        log.info(f"Exporting {df_ef_past.shape[0]} rows and {df_ef_past.shape[1]} columns")

        if write_to_parquet:
            log.info(f"\n------ Writing {table_name} for period {date} to parquet ------")
            # master_path is concatenated directly, so it is expected to end with a path separator
            file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_ef_past.to_parquet(file, engine="pyarrow")

    return df_ef_past


In [20]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Loads the customers that were active at EoP for the period preceding ``date``

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP for the given period
    date:
        period to process

    Returns
    -------
        Result of filtering ``cliente_activo`` by the month previous to ``date``
    """

    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # The snapshot to read is the one of the month before the processing date
    return cliente_activo.filter_by(date=get_previous_month(date))