In [1]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook.
# The shared-server checkout is used whenever it exists, so behavior on that
# machine is unchanged; anywhere else the kernel's working directory is used
# instead (it is then expected to be the project's notebooks folder).
_shared_notebooks_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
current_dir = _shared_notebooks_dir if _shared_notebooks_dir.is_dir() else Path.cwd()
# The Kedro project root is the parent of the notebooks folder
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# NOTE(review): credentials are loaded into the notebook namespace; avoid displaying them in cell outputs
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Processing date in yyyymmdd format, shared by the cells below
date = "20180605"

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Loads the active customers at end of period (EoP) for one processing date

    The period requested from the dataset is the one returned by
    ``get_previous_month(date)``, not ``date`` itself.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP;
        queried through its ``filter_by`` method
    date:
        period to process

    Returns
    -------
        Whatever ``cliente_activo.filter_by`` yields for the previous month
        (annotated as a ``pd.DataFrame`` of active customers)
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # Active customers are looked up for the month before the processing date
    return cliente_activo.filter_by(date=get_previous_month(date))

In [4]:
# Fetch the `cliente_activo` dataset object from the Kedro catalog
cliente_activo = catalog.load("cliente_activo")

2020-12-30 19:40:13,805 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [5]:
# Active customers for the period derived from `date`
cliente_activo_df = create_cliente_activo(cliente_activo, date)

2020-12-30 19:40:14,886 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [6]:
# Fetch the `mora` dataset object from the Kedro catalog
mora = catalog.load("mora")

2020-12-30 19:40:18,683 - kedro.io.data_catalog - INFO - Loading data from `mora` (SQLPartitionedDataSet)...


In [7]:
# Numeric columns that each get a "<name>_scaled" companion column:
# the last invoice amount, one MORA_<bucket> column per bucket, and the total overdue amount
vars_to_scale = [
    "MONTO_ULT_FACTURA",
    *[f"MORA_{bucket}" for bucket in ("1_30", "31_60", "61_90", "91_120", "121_150", "151_180", "180")],
    "TOT_AMT_OVERDUE",
]

# Identifier columns, excluded from renaming when the table prefix is applied
id_cols = ["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]

# Look-back windows used for the past aggregations: 1, 4, 12 and 24 weeks, expressed in days
past_periods = [7 * weeks for weeks in (1, 4, 12, 24)]

In [8]:
# Notebook-level logger used by the exploratory cells below
log = initialize_logger()

In [9]:
# Extraction window: it starts `look_back_days` before the processing date
look_back_days = parameters["masters"]["mora"]["look_back_days"]
start_date = (pd.to_datetime(date) - timedelta(days=look_back_days)).strftime("%Y%m%d")

log.info("Reading tables...")
# Period label of the active-customer snapshot (month preceding `date`)
period_to_load = get_previous_month(date)

# Only the customer key is needed to restrict the delinquency rows
df_clientes = cliente_activo_df[["CUSTOMER_ID"]]

# Delinquency rows between start_date and date, de-duplicated
df_morosidad = mora.filter_by(date=[start_date, date]).drop_duplicates()

# FECHA_PAGO: PAGO parsed as a date; EDAD_MORA: days elapsed from FECHA_PAGO to FECHA
df_morosidad = df_morosidad.assign(
    FECHA_PAGO=lambda frame: pd.to_datetime(frame["PAGO"].astype(str)),
    EDAD_MORA=lambda frame: (frame["FECHA"] - frame["FECHA_PAGO"]) / np.timedelta64(1, "D"),
)

# FECHA is turned into a yyyymmdd string only after EDAD_MORA has been derived from it
df_morosidad = df_morosidad.assign(FECHA=lambda frame: frame["FECHA"].dt.strftime("%Y%m%d"))

# Keep only delinquency rows that belong to active customers
log.info("Merging with EOP..")
df_morosidad_activo = df_clientes.merge(df_morosidad, on="CUSTOMER_ID", how="inner")


2020-12-30 19:41:06,682 - aa_engine_pkg.assets.utils.utilities - INFO - Reading tables...
select CUSTOMER_ID, FECHA, MONTO_ULT_FACTURA, PAGO, MORA_1_30, MORA_31_60, MORA_61_90, MORA_91_120, MORA_121_150, MORA_151_180, MORA_180, TOT_AMT_OVERDUE, DATE_EXP from stg_uy_cliente_moroso where FECHA >= to_date('20180313', 'yyyymmdd') and FECHA < to_date('20180605', 'yyyymmdd')
2020-12-30 19:41:08,639 - aa_engine_pkg.assets.utils.utilities - INFO - Merging with EOP..


In [10]:
# Preview the first rows of the delinquency data restricted to active customers
df_morosidad_activo.head()

Unnamed: 0,CUSTOMER_ID,FECHA,MONTO_ULT_FACTURA,PAGO,MORA_1_30,MORA_31_60,MORA_61_90,MORA_91_120,MORA_121_150,MORA_151_180,MORA_180,TOT_AMT_OVERDUE,DATE_EXP,FECHA_PAGO,EDAD_MORA
0,165445,20180603,3236.5,20180510,1565.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,201806,2018-05-10,24.0
1,165445,20180601,3236.5,20180510,1565.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,201806,2018-05-10,22.0
2,165445,20180604,3236.5,20180510,1565.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,201806,2018-05-10,25.0
3,165445,20180602,3236.5,20180510,1565.0,0.0,0.0,0.0,0.0,0.0,0.0,1565.0,201806,2018-05-10,23.0
4,176591,20180604,3913.0,20180510,1907.5,0.0,0.0,0.0,0.0,0.0,0.0,1907.5,201806,2018-05-10,25.0


In [11]:
# Add one "<column>_scaled" column per entry of vars_to_scale.
# scale_values receives the frame as it stands at each iteration, grouped by DATE_EXP.
log.info("Scaling numeric variables...")
for column in vars_to_scale:
    log.info(f"---- {column}")
    df_morosidad_activo[f"{column}_scaled"] = scale_values(
        df=df_morosidad_activo,
        vars_to_groupby=["DATE_EXP"],
        var_to_scale=column,
        by_ranking=False,
    )

2020-12-30 19:44:02,099 - aa_engine_pkg.assets.utils.utilities - INFO - Scaling numeric variables...
2020-12-30 19:44:02,100 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MONTO_ULT_FACTURA
2020-12-30 19:44:02,122 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_1_30
2020-12-30 19:44:02,137 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_31_60
2020-12-30 19:44:02,152 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_61_90
2020-12-30 19:44:02,166 - numexpr.utils - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-30 19:44:02,167 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
2020-12-30 19:44:02,176 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_91_120
2020-12-30 19:44:02,191 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_121_150
2020-12-30 19:44:02,209 - aa_engine_pkg.assets.utils.utilities - INFO - ---- MORA_151_180
2020-12-30 19:44:02,229 - aa_engine_pkg.asse

In [12]:
# PROP_OVERDUE relates TOT_AMT_OVERDUE to MONTO_ULT_FACTURA; the helper's return value is
# not used, so it is expected to add the column to the frame in place
create_evolution_variables(
    df=df_morosidad_activo,
    var_name="PROP_OVERDUE",
    numerator="TOT_AMT_OVERDUE",
    denominator="MONTO_ULT_FACTURA",
)

# Number of bills in default: one row per distinct (customer, PAGO) pair, each counted once
df_morosidad_activo["PAGO"] = df_morosidad_activo["PAGO"].astype(str)
df_bills = (
    df_morosidad_activo[["CUSTOMER_ID", "PAGO"]]
    .drop_duplicates()
    .assign(FACTURAS=1)
)
df_bills = add_relative_calculate_past(
    df=df_bills,
    id_cols=["CUSTOMER_ID"],
    periods=past_periods,
    agg={"FACTURAS": [np.size]},
    date_col="PAGO",
    start_date=start_date,
    end_date=date,
    period_freq="D",
)

# Maximum of each delinquency measure over every look-back window in past_periods
log.info("Creating past variables...")
df_morosidad_activo_past = add_relative_calculate_past(
    df=df_morosidad_activo,
    id_cols=["CUSTOMER_ID"],
    periods=past_periods,
    agg={
        feature: [np.nanmax]
        for feature in (
            "TOT_AMT_OVERDUE_scaled",
            "PROP_OVERDUE",
            "EDAD_MORA",
            "MORA_1_30_scaled",
            "MORA_31_60_scaled",
            "MORA_61_90_scaled",
            "MORA_91_120_scaled",
            "MORA_121_150_scaled",
            "MORA_151_180_scaled",
            "MORA_180_scaled",
        )
    },
    date_col="FECHA",
    start_date=start_date,
    end_date=date,
    period_freq="D",
)

# Attach the bill counts to the aggregated delinquency features
df_morosidad_activo_past = df_morosidad_activo_past.merge(df_bills, on=["CUSTOMER_ID"])

# Stamp every row with the active-customer period and the processing date
df_morosidad_activo_past["DATE_EXP"] = period_to_load
df_morosidad_activo_past["DATE_CALC"] = date

# Prefix the feature columns with the table prefix, leaving the id columns untouched
table_preffix = parameters["masters"]["mora"]["table_preffix"]
rename_table(df_morosidad_activo_past,
             preffix=table_preffix,
             ids_to_exclude=id_cols)

2020-12-30 19:44:27,494 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 20180313 and 20180605
2020-12-30 19:44:27,507 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 7
2020-12-30 19:44:27,516 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 28
2020-12-30 19:44:27,569 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 84
2020-12-30 19:44:27,639 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 168
2020-12-30 19:44:27,815 - aa_engine_pkg.assets.utils.utilities - INFO - Creating past variables...
2020-12-30 19:44:27,816 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 20180313 and 20180605
2020-12-30 19:44:27,871 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 7
2020-12-30 19:44:27,931 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 28
2020-12-30 19:44:27,990 - aa_engine_pk

In [13]:
# Preview the first rows of the resulting master table
df_morosidad_activo_past.head()

Unnamed: 0,CUSTOMER_ID,MOR_TOT_AMT_OVERDUE_scaled_nanmax_7,MOR_PROP_OVERDUE_nanmax_7,MOR_EDAD_MORA_nanmax_7,MOR_MORA_1_30_scaled_nanmax_7,MOR_MORA_31_60_scaled_nanmax_7,MOR_MORA_61_90_scaled_nanmax_7,MOR_MORA_91_120_scaled_nanmax_7,MOR_MORA_121_150_scaled_nanmax_7,MOR_MORA_151_180_scaled_nanmax_7,...,MOR_MORA_91_120_scaled_nanmax_168,MOR_MORA_121_150_scaled_nanmax_168,MOR_MORA_151_180_scaled_nanmax_168,MOR_MORA_180_scaled_nanmax_168,MOR_FACTURAS_size_7,MOR_FACTURAS_size_28,MOR_FACTURAS_size_84,MOR_FACTURAS_size_168,DATE_EXP,DATE_CALC
0,165445,0.257159,0.483547,25.0,0.393859,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,201805,20180605
1,176591,0.324183,0.487478,25.0,0.500623,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,201805,20180605
2,195026,0.223402,0.241317,25.0,0.340087,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,201805,20180605
3,218815,0.429856,0.488718,22.0,0.668953,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,201805,20180605
4,220990,0.608423,0.488621,25.0,0.953398,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,201805,20180605


In [None]:
def create_master_mora(mora: SQLPartitionedDataSet,
                       cliente_activo: pd.DataFrame,
                       parameters: Dict,
                       date: str) -> pd.DataFrame:
    """Creates master table with customer default features for one period of data

    When a master table file for ``date`` is already found and ``overwrite`` is
    ``False`` in the parameters, that file is read back from parquet. Otherwise the
    table is rebuilt from the raw delinquency data and, if ``write_to_parquet`` is
    set, written to parquet.

    Parameters
    ----------
    mora:
        dataset defined in ``catalog_raw.yml`` with raw data information related to customers in default;
        queried through ``filter_by(date=[start_date, date])``
    cliente_activo:
        DataFrame of active clients at EoP; only its ``CUSTOMER_ID`` column is used
    parameters:
        set of project parameters defined in ``parameters.yml``. Keys read here:
        ``write_to_parquet``, ``masters.mora.overwrite``, ``masters.mora.look_back_days``,
        ``masters.mora.table_preffix`` and ``paths.master_path``
    date:
        period to process (``yyyymmdd`` string)

    Returns
    -------
    pd.DataFrame
        Master table with customer default features for one period

    Notes
    -----
    The function reads the module-level names ``vars_to_scale``, ``past_periods`` and
    ``id_cols``; they must be defined before it is called.
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "mora"
    table_params = parameters["masters"][table_name]
    overwrite = table_params["overwrite"]

    # Check if table was already created.
    # The name is matched against the string form of each path so the check also works
    # when the helper returns Path objects (``in`` is not supported on a Path itself).
    # NOTE(review): this is a substring match on the whole path - confirm no other
    # master table path can contain "mora".
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if match and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_morosidad_activo_past = pd.read_parquet(match[0], engine="pyarrow")

    else:
        log.info("Creating master table mora")

        # Extraction window starts ``look_back_days`` before the processing date
        look_back_days = table_params["look_back_days"]
        start_date = (pd.to_datetime(date) - timedelta(days=look_back_days)).strftime("%Y%m%d")

        # Calculate period to load for active clients
        log.info("Reading tables...")
        period_to_load = get_previous_month(date)

        df_clientes = cliente_activo[["CUSTOMER_ID"]]

        # Get delinquency table for the window, without duplicated rows
        df_morosidad = mora.filter_by(date=[start_date, date]).drop_duplicates()

        # FECHA_PAGO is PAGO parsed as a date; EDAD_MORA is the number of days
        # elapsed from FECHA_PAGO to FECHA
        df_morosidad["FECHA_PAGO"] = pd.to_datetime(df_morosidad["PAGO"].astype(str))
        df_morosidad["EDAD_MORA"] = (df_morosidad["FECHA"] - df_morosidad["FECHA_PAGO"]) / np.timedelta64(1, "D")

        # FECHA becomes a yyyymmdd string only after EDAD_MORA has been derived from it
        df_morosidad["FECHA"] = df_morosidad["FECHA"].dt.strftime("%Y%m%d")

        # Keep only delinquency rows that belong to active customers
        log.info("Merging with EOP..")
        df_morosidad_activo = df_clientes.merge(df_morosidad,
                                                on="CUSTOMER_ID",
                                                how="inner")

        # Add one "<var>_scaled" column per variable, computed by scale_values grouped by DATE_EXP
        log.info("Scaling numeric variables...")
        for var in vars_to_scale:
            log.info(f"---- {var}")
            df_morosidad_activo[var + "_scaled"] = scale_values(df=df_morosidad_activo,
                                                                vars_to_groupby=["DATE_EXP"],
                                                                var_to_scale=var,
                                                                by_ranking=False)

        # The return value is not used: the helper is expected to add PROP_OVERDUE
        # to the frame in place (the column is aggregated below)
        create_evolution_variables(df=df_morosidad_activo,
                                   var_name='PROP_OVERDUE',
                                   numerator='TOT_AMT_OVERDUE',
                                   denominator='MONTO_ULT_FACTURA')

        # Creating number of bills in default: one row per distinct (customer, PAGO) pair
        df_morosidad_activo["PAGO"] = df_morosidad_activo["PAGO"].astype(str)
        df_bills = df_morosidad_activo[["CUSTOMER_ID", "PAGO", ]].drop_duplicates()
        df_bills["FACTURAS"] = 1
        df_bills = add_relative_calculate_past(df=df_bills,
                                               id_cols=["CUSTOMER_ID"],
                                               periods=past_periods,
                                               agg={'FACTURAS': [np.size]},
                                               date_col="PAGO",
                                               start_date=start_date,
                                               end_date=date,
                                               period_freq="D")

        # Create expanded variables: maximum of each measure over every look-back window
        log.info("Creating past variables...")
        df_morosidad_activo_past = add_relative_calculate_past(df=df_morosidad_activo,
                                                               id_cols=["CUSTOMER_ID"],
                                                               periods=past_periods,
                                                               agg={
                                                                   "TOT_AMT_OVERDUE_scaled": [np.nanmax],
                                                                   "PROP_OVERDUE": [np.nanmax],
                                                                   "EDAD_MORA": [np.nanmax],
                                                                   "MORA_1_30_scaled": [np.nanmax],
                                                                   "MORA_31_60_scaled": [np.nanmax],
                                                                   "MORA_61_90_scaled": [np.nanmax],
                                                                   "MORA_91_120_scaled": [np.nanmax],
                                                                   "MORA_121_150_scaled": [np.nanmax],
                                                                   "MORA_151_180_scaled": [np.nanmax],
                                                                   "MORA_180_scaled": [np.nanmax]},
                                                               date_col="FECHA",
                                                               start_date=start_date,
                                                               end_date=date,
                                                               period_freq="D")

        # Attach the bill counts to the aggregated delinquency features
        df_morosidad_activo_past = df_morosidad_activo_past.merge(df_bills, on=["CUSTOMER_ID"])

        # Add date variables
        df_morosidad_activo_past["DATE_EXP"] = period_to_load
        df_morosidad_activo_past["DATE_CALC"] = date

        # Change variable names (return value is not used; the frame is renamed by the helper)
        table_preffix = table_params["table_preffix"]
        rename_table(df_morosidad_activo_past,
                     preffix=table_preffix,
                     ids_to_exclude=id_cols)

        # Logged once, before the optional write
        log.info(f"Exporting {df_morosidad_activo_past.shape[0]} rows and {df_morosidad_activo_past.shape[1]} columns")

        if write_to_parquet:
            log.info(f"\n------ Writing {table_name} for period {date} to parquet ------")
            # master_path is concatenated as-is, so it is expected to end with a path separator
            file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_morosidad_activo_past.to_parquet(file, engine="pyarrow")

    return df_morosidad_activo_past