In [1]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook
# (alternative kept for reference: derive the directory from the kernel's working directory)
#current_dir = Path.cwd()
# NOTE(review): absolute, machine-specific path - the notebook only runs where this
# directory exists; consider making it configurable.
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The kedro project root is taken to be the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Read the configuration entries matching the credentials / parameters glob patterns
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Processing date used by every cell below (looks like YYYYMMDD - TODO confirm);
# the period actually loaded is derived from it with get_previous_month.
date='20180605'

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Load the customers that were active at end of period (EoP) for one period.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml``; it is queried through its ``filter_by`` method
    date:
        period to process; the period actually loaded is ``get_previous_month(date)``

    Returns
    -------
    pd.DataFrame
        result of ``cliente_activo.filter_by`` for the previous month
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # The EoP snapshot that is loaded is the one for the month before `date`
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [4]:
# Fetch the `cliente_activo` catalog entry; this name is later passed to create_cliente_activo
cliente_activo=catalog.load("cliente_activo")

2020-12-30 12:27:55,533 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [10]:
# Active customers for the month preceding `date`; used below to restrict the customer table
cliente_activo_df= create_cliente_activo(cliente_activo,date)

2020-12-30 12:32:50,866 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


In [7]:
# Fetch the `clientes` catalog entry; it is queried below with filter_by_period
clientes=catalog.load("clientes")

2020-12-30 12:32:21,714 - kedro.io.data_catalog - INFO - Loading data from `clientes` (SQLPartitionedDataSet)...


In [26]:
# Columns cast to str and passed through format_string_variable
string_vars = ["CP", "PROVINCIA"]

# Columns expanded into dummy variables
vars_to_dummy = ["PROVINCIA"]

# Category lists handed to impute_categories, keyed by column name
categories_dict = {
    "PROVINCIA": [
        'MONTEVIDEO', 'CANELONES', 'MALDONADO', 'COLONIA',
        'SAN JOSE', 'PAYSANDU', 'SALTO', 'TACUAREMBO',
        'RIO NEGRO', 'SORIANO', 'ROCHA', 'FLORIDA',
    ],
    "CP_META": [
        '11', '12', '14', '15', '16', '20', '27', '30',
        '33', '37', '40', '45', '50', '55', '60', '65',
        '70', '75', '80', '85', '90', '91', '94', '97',
    ],
}

# Key column(s) used for grouping and merging
id_cols = ["CUSTOMER_ID"]

# Numeric feature columns carried into the final table
num_vars = ["EDAD"]

In [27]:

# Initialize logger
log = initialize_logger()
# Exploratory walk-through of the table-building steps.
# Period to load: the month before `date`
period_to_load = get_previous_month(date)

# Only the customer identifier of the active customers is needed for the join
df_cliente_activo = cliente_activo_df[["CUSTOMER_ID"]]

# Load the customer rows for the period and drop fully duplicated rows
df_clientes = clientes.filter_by_period(date=period_to_load).drop_duplicates()

log.info("Merging with EOP...")
# Default (inner) join on CUSTOMER_ID; validate="1:m" makes pandas raise if
# CUSTOMER_ID is not unique on the active-customer side
df_clientes = pd.merge(df_cliente_activo,
                       df_clientes,
                       on="CUSTOMER_ID",
                       validate="1:m")

select CUSTOMER_ID, DATE_EXP, FECHA_CREATE, FECHANACIMIENTO, CP, PROVINCIA, DOMICILO_INS_FACT from stg_uy_clientes where GENERO NOT LIKE 'Compania' and CUSTOMER_STATUS LIKE 'N%' and DATE_EXP = 201805


  % ((self.server_version_info,))


2020-12-30 14:10:59,793 - aa_engine_pkg.assets.utils.utilities - INFO - Merging with EOP...


In [28]:
# Preview the merged rows; the recorded output shows one row per address type for a customer
df_clientes.head()

Unnamed: 0,CUSTOMER_ID,DATE_EXP,FECHA_CREATE,FECHANACIMIENTO,CP,PROVINCIA,DOMICILO_INS_FACT
0,144472,201805,2005-11-24 16:52:47,1963-11-05,55002,ARTIGAS,DOMICILIO DE FACTURACION
1,144472,201805,2005-11-24 16:52:47,1963-11-05,55002,ARTIGAS,DOMICILIO DE INSTALACION
2,145360,201805,2005-12-12 18:10:40,1955-01-09,15000,CANELONES,DOMICILIO DE INSTALACION
3,146099,201805,2005-12-20 17:35:39,1958-03-13,11500,MONTEVIDEO,DOMICILIO DE FACTURACION
4,146099,201805,2005-12-20 17:35:39,1958-03-13,11500,MONTEVIDEO,DOMICILIO DE INSTALACION


In [29]:
# Drop duplicated entries: keep only the "DOMICILIO DE INSTALACION" record and, for each
# CUSTOMER_ID, the row with the latest FECHA_CREATE (ascending sort + keep="last")

log.info("Dropping duplicated entries...")
df_clientes = df_clientes[df_clientes["DOMICILO_INS_FACT"] == "DOMICILIO DE INSTALACION"]

df_clientes = df_clientes.sort_values(["CUSTOMER_ID", "FECHA_CREATE"], ascending=[False, True]).drop_duplicates(
    subset=["CUSTOMER_ID"], keep="last")

log.info("Formatting variables...")
# Format string variables
for var in string_vars:
    df_clientes[var] = df_clientes[var].astype(str)
    format_string_variable(df_clientes, var)

# Calculate age in years at the first day of the DATE_EXP month.
# 365.2425 days is the duration numpy assigns to one "Y"; it is spelled out explicitly
# because pandas 2.x rejects the ambiguous np.timedelta64(1, "Y") unit.
df_clientes["EDAD"] = (pd.to_datetime(df_clientes["DATE_EXP"], format="%Y%m") - df_clientes[
    "FECHANACIMIENTO"]) / pd.Timedelta(days=365.2425)

2020-12-30 14:10:59,972 - aa_engine_pkg.assets.utils.utilities - INFO - Dropping duplicated entries...
2020-12-30 14:11:00,125 - aa_engine_pkg.assets.utils.utilities - INFO - Formatting variables...


In [30]:
# Preview after de-duplication: one row per customer, with the new EDAD column
df_clientes.head()

Unnamed: 0,CUSTOMER_ID,DATE_EXP,FECHA_CREATE,FECHANACIMIENTO,CP,PROVINCIA,DOMICILO_INS_FACT,EDAD
91525,53966010,201805,2018-05-30 17:04:02,1956-12-01,12500,MONTEVIDEO,DOMICILIO DE INSTALACION,61.413992
16749,53964986,201805,2018-05-30 12:02:03,1950-08-05,40001,RIVERA,DOMICILIO DE INSTALACION,67.738557
79822,53964888,201805,2018-05-30 11:28:03,1976-11-16,40002,RIVERA,DOMICILIO DE INSTALACION,41.45465
75538,53964645,201805,2018-05-30 09:40:02,1988-05-09,55000,ARTIGAS,DOMICILIO DE INSTALACION,29.977344
9266,53963514,201805,2018-05-29 16:39:07,1981-08-17,90600,CANELONES,DOMICILIO DE INSTALACION,36.704381


In [31]:
# Create CP meta-category: first two characters of the (string) CP value
df_clientes["CP_META"] = df_clientes["CP"].str[:2]
# Return value is ignored, so impute_categories is presumably in-place - TODO confirm
impute_categories(df_clientes,
                  "CP_META",
                  categories_dict["CP_META"])
# NOTE(review): unlike create_master_clientes, this cell does not pass PROVINCIA
# through impute_categories before creating the dummies.

# Create PROVINCIA dummies
log.info("Creating dummy variables....")
# One dummy frame per variable listed in vars_to_dummy
df = []
for var in vars_to_dummy:
    log.info(f'---- {var}')
    df.append(create_dummy_variables(df=df_clientes,
                                     vars_to_groupby=id_cols,
                                     var_to_dummy=var,
                                     include_total=False))

# Outer-join all dummy frames on the id columns into a single frame
df_dummies = reduce(lambda left, right: pd.merge(left, right, on=id_cols, how="outer"), df)

2020-12-30 14:11:03,061 - aa_engine_pkg.assets.utils.utilities - INFO - Creating dummy variables....
2020-12-30 14:11:03,062 - aa_engine_pkg.assets.utils.utilities - INFO - ---- PROVINCIA


In [32]:
# Preview the dummy frame (one PROVINCIA_* column per value plus PROVINCIA_SUBTOTAL in the recorded output)
df_dummies.head()

Unnamed: 0,CUSTOMER_ID,PROVINCIA_ARTIGAS,PROVINCIA_CANELONES,PROVINCIA_CERRO_LARGO,PROVINCIA_COLONIA,PROVINCIA_DURAZNO,PROVINCIA_FLORES,PROVINCIA_FLORIDA,PROVINCIA_LAVALLEJA,PROVINCIA_MALDONADO,...,PROVINCIA_PAYSANDU,PROVINCIA_RIO_NEGRO,PROVINCIA_RIVERA,PROVINCIA_ROCHA,PROVINCIA_SALTO,PROVINCIA_SAN_JOSE,PROVINCIA_SORIANO,PROVINCIA_TACUAREMBO,PROVINCIA_TREINTA_Y_TRES,PROVINCIA_SUBTOTAL
0,8069,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,8176,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,8677,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,8686,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,8855,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [33]:
# Join with num vars and expand
df_clientes_dummies = pd.merge(df_clientes[id_cols + num_vars],
                               df_dummies,
                               on=id_cols,
                               how="inner",
                               validate="1:1")

df_clientes_dummies["DATE_EXP"] = period_to_load
df_clientes_dummies["DATE_CALC"] = date

table_preffix = parameters["masters"]["clientes"]["table_preffix"]
rename_table(df_clientes_dummies,
             preffix=table_preffix,
             ids_to_exclude=id_cols + ["DATE_EXP"])

In [34]:
# Display the final table; the recorded output includes a row with a missing CLI_EDAD
df_clientes_dummies

Unnamed: 0,CUSTOMER_ID,CLI_EDAD,CLI_PROVINCIA_ARTIGAS,CLI_PROVINCIA_CANELONES,CLI_PROVINCIA_CERRO_LARGO,CLI_PROVINCIA_COLONIA,CLI_PROVINCIA_DURAZNO,CLI_PROVINCIA_FLORES,CLI_PROVINCIA_FLORIDA,CLI_PROVINCIA_LAVALLEJA,...,CLI_PROVINCIA_RIVERA,CLI_PROVINCIA_ROCHA,CLI_PROVINCIA_SALTO,CLI_PROVINCIA_SAN_JOSE,CLI_PROVINCIA_SORIANO,CLI_PROVINCIA_TACUAREMBO,CLI_PROVINCIA_TREINTA_Y_TRES,CLI_PROVINCIA_SUBTOTAL,DATE_EXP,CLI_DATE_CALC
0,53966010,61.413992,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
1,53964986,67.738557,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,201805,20180605
2,53964888,41.454650,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,201805,20180605
3,53964645,29.977344,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
4,53963514,36.704381,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72952,8855,71.281409,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
72953,8686,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
72954,8677,51.875124,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605
72955,8176,45.487587,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,201805,20180605


In [None]:
def create_master_clientes(cliente_activo: pd.DataFrame,
                           clientes: SQLPartitionedDataSet,
                           parameters: Dict,
                           date: str) -> pd.DataFrame:
    """Creates mastertable with customer features for one period of data

    When a parquet file for this table is already listed for ``date`` and the
    ``overwrite`` parameter is ``False``, that file is read and returned as is.
    Otherwise the table is built from ``clientes``, restricted to the customers in
    ``cliente_activo``, and written to parquet when ``write_to_parquet`` is set.

    Parameters
    ----------
    cliente_activo:
        dataframe with a ``CUSTOMER_ID`` column listing the active clients at EoP
    clientes:
        dataset defined in ``catalog.yml`` with raw customer information (e.g. demographic data)
    parameters:
        set of project parameters defined in ``parameters.yml``; the keys read here are
        ``write_to_parquet``, ``masters.clientes.overwrite``,
        ``masters.clientes.table_preffix`` and ``paths.master_path``
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        mastertable with customer features for one period

    Notes
    -----
    Reads the module-level names ``string_vars``, ``categories_dict``,
    ``vars_to_dummy``, ``id_cols`` and ``num_vars``; they must be defined before
    this function is called.
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "clientes"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created.
    # The substring test is done on str(file) so it works for both str and Path entries.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if f"master_{table_name}_2" in str(file)]

    if match and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_clientes_dummies = pd.read_parquet(match[0], engine="pyarrow")

    else:
        # If table is not found (or must be overwritten), then create it:
        # Calculate dates
        period_to_load = get_previous_month(date)

        df_cliente_activo = cliente_activo[["CUSTOMER_ID"]]

        # Load the customer rows for the period and drop fully duplicated rows
        df_clientes = clientes.filter_by_period(date=period_to_load).drop_duplicates()

        log.info("Merging with EOP...")
        # Default (inner) join; validate="1:m" raises if CUSTOMER_ID repeats on the left side
        df_clientes = pd.merge(df_cliente_activo,
                               df_clientes,
                               on="CUSTOMER_ID",
                               validate="1:m")

        # Drop duplicated entries: keep only the "DOMICILIO DE INSTALACION" record and,
        # for each CUSTOMER_ID, the row with the latest FECHA_CREATE
        log.info("Dropping duplicated entries...")
        df_clientes = df_clientes[df_clientes["DOMICILO_INS_FACT"] == "DOMICILIO DE INSTALACION"]

        df_clientes = df_clientes.sort_values(["CUSTOMER_ID", "FECHA_CREATE"], ascending=[False, True]).drop_duplicates(
            subset=["CUSTOMER_ID"], keep="last")

        log.info("Formatting variables...")
        # Format string variables
        for var in string_vars:
            df_clientes[var] = df_clientes[var].astype(str)
            format_string_variable(df_clientes, var)

        # Calculate age in years at the first day of the DATE_EXP month.
        # 365.2425 days is the duration numpy assigns to one "Y"; it is spelled out
        # explicitly because pandas 2.x rejects the ambiguous np.timedelta64(1, "Y") unit.
        df_clientes["EDAD"] = (pd.to_datetime(df_clientes["DATE_EXP"], format="%Y%m") - df_clientes[
            "FECHANACIMIENTO"]) / pd.Timedelta(days=365.2425)

        # Dropping original variable
        df_clientes = df_clientes.drop(columns="FECHANACIMIENTO")

        # Impute categories
        log.info("Imputting categories...")
        impute_categories(df_clientes,
                          "PROVINCIA",
                          categories_dict["PROVINCIA"]
                          )

        # Create CP meta-category: first two characters of the (string) CP value
        df_clientes["CP_META"] = df_clientes["CP"].str[:2]
        impute_categories(df_clientes,
                          "CP_META",
                          categories_dict["CP_META"])

        # Create PROVINCIA dummies
        log.info("Creating dummy variables....")
        df = []
        for var in vars_to_dummy:
            log.info(f'---- {var}')
            df.append(create_dummy_variables(df=df_clientes,
                                             vars_to_groupby=id_cols,
                                             var_to_dummy=var,
                                             include_total=False))

        # Outer-join all dummy frames on the id columns into a single frame
        df_dummies = reduce(lambda left, right: pd.merge(left, right, on=id_cols, how="outer"), df)

        # Join with num vars and expand
        df_clientes_dummies = pd.merge(df_clientes[id_cols + num_vars],
                                       df_dummies,
                                       on=id_cols,
                                       how="inner",
                                       validate="1:1")

        # Stamp the loaded period and the processing date on every row
        df_clientes_dummies["DATE_EXP"] = period_to_load
        df_clientes_dummies["DATE_CALC"] = date

        # Prefix the columns except the ids and DATE_EXP (return value is not used)
        table_preffix = parameters["masters"][table_name]["table_preffix"]
        rename_table(df_clientes_dummies,
                     preffix=table_preffix,
                     ids_to_exclude=id_cols + ["DATE_EXP"])

        log.info(f"Exporting {df_clientes_dummies.shape[0]} rows and {df_clientes_dummies.shape[1]} columns")

        if write_to_parquet:
            file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_clientes_dummies.to_parquet(file, engine="pyarrow")

    return df_clientes_dummies
