In [2]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Load a Kedro context so the project's catalog and configuration can be used from the notebook.
# NOTE(review): the notebooks folder is hard-coded as an absolute path, so this cell only runs
# where that folder exists; the commented-out line below is presumably the portable alternative
# when the kernel is started from the notebooks folder - confirm before switching.
#current_dir = Path.cwd()
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The project root handed to load_context is the parent of the notebooks folder.
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Configuration read through the context's config loader with the glob patterns given.
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [3]:
# Processing date as an 8-character string (YYYYMMDD); later cells derive the period from it
date = "20180605"

In [4]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Load the list of active customers for the period that precedes ``date``

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP for the given period
    date:
        period to process; the partition that is read is ``get_previous_month(date)``

    Returns
    -------
        Result of ``cliente_activo.filter_by`` for that partition (annotated as a DataFrame)
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # The dataset is filtered by the month before the processing date, not by the date itself
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [5]:
# Load the `cliente_activo` catalog entry (the log below reports it as a SQLPartitionedDataSet)
cliente_activo=catalog.load("cliente_activo")

2020-12-30 17:48:09,651 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [6]:
# Run create_cliente_activo for `date`; the result is reused below to filter the EoP rows
cliente_activo_df= create_cliente_activo(cliente_activo,date)

2020-12-30 17:48:12,256 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [7]:
# Load the `eop` catalog entry (the log below reports it as a SQLPartitionedDataSet)
eop=catalog.load("eop")

2020-12-30 17:48:14,993 - kedro.io.data_catalog - INFO - Loading data from `eop` (SQLPartitionedDataSet)...


In [8]:
# Product code (PRC_CODIGO) -> label stored in the SUP_PRODUCTO column.
# Every code appears exactly once: the previous literal repeated 139, 217 and 216, and a
# repeated key in a dict literal silently overwrites the earlier entry.
dict_product = {
    # PLATA
    147: "PLATA", 169: "PLATA",
    # ORO
    139: "ORO", 132: "ORO", 134: "ORO", 133: "ORO",
    # HBO
    130: "HBO", 31: "HBO", 27: "HBO",
    # FOX
    129: "FOX", 28: "FOX",
    # DEPORTES
    44: "DEPORTES", 160: "DEPORTES", 116: "DEPORTES",
    121: "DEPORTES", 113: "DEPORTES", 105: "DEPORTES",
    # BRONCE
    135: "BRONCE", 217: "BRONCE", 216: "BRONCE",
    # ADULTOS
    138: "ADULTOS", 43: "ADULTOS", 158: "ADULTOS",
    141: "ADULTOS", 142: "ADULTOS", 42: "ADULTOS",
}

# Identifier columns, passed to rename_table as `ids_to_exclude`
id_cols = ["CUSTOMER_ID", "DATE_EXP"]


In [14]:
# Initialize logger
log = initialize_logger()
# Scratch, step-by-step run of the table-building branch of create_master_eop
# (no parquet lookup is done here).
log.info("Reading table...")

# An 8-character date is converted with get_previous_month; any other length is used as given
if len(date) == 8:
    date_exp = get_previous_month(date)
else:
    date_exp = str(date)

# Load the EoP rows of the period and keep only customers present in cliente_activo_df
# NOTE(review): this emits the same "Reading table..." message a second time
log.info("Reading table...")
df_clientes = eop.filter_by_period(date=date_exp).drop_duplicates()
df_clientes_activos = cliente_activo_df[["CUSTOMER_ID"]]

# Inner join on CUSTOMER_ID; validate="m:1" makes pandas raise if CUSTOMER_ID is
# duplicated on the right-hand (active customers) side
df_clientes = pd.merge(df_clientes,
                       df_clientes_activos,
                       on="CUSTOMER_ID",
                       how="inner",
                       validate="m:1")

# Calculate number of products per customer
# One row per CUSTOMER_ID and one column per PRC_TIPO_ID value, aggregated with len and
# filled with 0 for missing combinations
# NOTE(review): no value column is passed to pivot_table; presumably the result has plain
# PRC_TIPO_ID column labels (the rename below relies on that) - confirm on the pandas version in use
log.info("Calculating number of products")
df_products = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "PRC_TIPO_ID"]],
                             index="CUSTOMER_ID",
                             columns="PRC_TIPO_ID",
                             aggfunc=len,
                             fill_value=0).reset_index()

# Give the product-type columns 1, 3 and 5 readable names
df_products.rename(columns={1: "N_PREMIUM",
                            3: "N_BASICO",
                            5: "N_HARDWARE"}, inplace=True)

2020-12-30 18:04:49,824 - aa_engine_pkg.assets.utils.utilities - INFO - Reading table...
2020-12-30 18:04:49,827 - aa_engine_pkg.assets.utils.utilities - INFO - Reading table...
select DATE_EXP, CUSTOMER_ID, PRC_CODIGO, PRODUCTO, PRC_TIPO_ID, TEC_ID, MOP, TENURE from stg_uy_eop_customer where DATE_EXP = 201805
2020-12-30 18:05:01,184 - aa_engine_pkg.assets.utils.utilities - INFO - Calculating number of products


In [15]:
# Format and calculate mop variables
log.info("Calculating mop")
# NOTE(review): the return value is discarded, so format_string_variable presumably
# rewrites the MOP column of df_clientes in place - confirm against the helper
format_string_variable(df_tmp=df_clientes, var="MOP")

2020-12-30 18:05:19,766 - aa_engine_pkg.assets.utils.utilities - INFO - Calculating mop


In [16]:
# Preview the first rows of df_clientes after the MOP formatting step
df_clientes.head()

Unnamed: 0,DATE_EXP,CUSTOMER_ID,PRC_CODIGO,PRODUCTO,PRC_TIPO_ID,TEC_ID,MOP,TENURE
0,201805,53212468,129,FOX+ High Definition,1,25,INVOICE,10.0
1,201805,53212468,130,HBO Max - High Definition,1,25,INVOICE,10.0
2,201805,53212468,138,ADULT PACK,1,25,INVOICE,10.0
3,201805,53212468,139,ORO,3,25,INVOICE,10.0
4,201805,53212468,153,IRD HD ONLY,5,25,INVOICE,


In [19]:
# Collapse the MOP text into a few categories. np.select takes the first condition
# that matches for each row, so the order of condlist/choicelist matters.
condlist = [df_clientes.MOP.str.contains("BANC"),
            df_clientes.MOP.str.contains("TARJETA"),
            df_clientes.MOP.str.contains("INVOICE")
            ]

# Labels paired position-by-position with condlist
choicelist = ["DEBITO", "TARJETA", "INVOICE"]

# Rows matching none of the conditions get "OTHER"; the MOP column is overwritten
df_clientes["MOP"] = np.select(condlist, choicelist, default="OTHER")

# Per-customer dummy variables for the MOP categories listed in `values`
# NOTE(review): create_master_eop passes an additional "ANTICIPADO" value here, although
# the np.select above can only produce DEBITO, TARJETA, INVOICE or OTHER - confirm which
# list is intended
df_mop = create_dummy_variables(df_clientes,
                                vars_to_groupby=["CUSTOMER_ID"],
                                var_to_dummy="MOP",
                                values=["DEBITO", "INVOICE", "TARJETA"],
                                include_total=False,
                                include_subtotal=False)

['DEBITO']


In [21]:
# Format numeric vars
log.info("Calculating tenure")
df_clientes["TENURE"] = df_clientes["TENURE"].astype(float)
# Largest TENURE per customer. The group-by `max` skips NaN, which is what
# `.agg(np.nanmax)` resolved to; passing the NumPy callable is deprecated in pandas >= 2.1.
df_tenure = df_clientes.groupby("CUSTOMER_ID")["TENURE"].max().reset_index(name="TENURE")

# For non-hardware products, categorization of products
# Turn the code -> label mapping into a two-column lookup frame (PRC_CODIGO, SUP_PRODUCTO)
log.info("Creating product type variables")
products = pd.DataFrame.from_dict(dict_product, orient="index", columns=["SUP_PRODUCTO"]).reset_index()
products.rename(columns={"index": "PRC_CODIGO"}, inplace=True)

2020-12-30 18:11:33,752 - aa_engine_pkg.assets.utils.utilities - INFO - Calculating tenure
2020-12-30 18:11:33,842 - aa_engine_pkg.assets.utils.utilities - INFO - Creating product type variables


In [22]:
# Keep the rows whose product type is not 5 (the type renamed to N_HARDWARE earlier)
mask_software = (df_clientes["PRC_TIPO_ID"] != 5)
# Attach the SUP_PRODUCTO label; validate="m:1" makes pandas raise if a PRC_CODIGO
# is duplicated in the lookup frame
df_clientes = pd.merge(df_clientes.loc[mask_software],
                       products,
                       on="PRC_CODIGO",
                       how="left",
                       validate="m:1")
# Codes missing from the lookup get "OTHER". Assign the result back instead of calling
# fillna(inplace=True) on the selected column: that chained form updates a temporary
# object and leaves the frame untouched when pandas Copy-on-Write is active.
df_clientes["SUP_PRODUCTO"] = df_clientes["SUP_PRODUCTO"].fillna("OTHER")

# One row per customer, one column per SUP_PRODUCTO label, aggregated with len
df_product_types = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "SUP_PRODUCTO"]],
                                  index="CUSTOMER_ID",
                                  columns="SUP_PRODUCTO",
                                  aggfunc=len,
                                  fill_value=0).reset_index()

# NOTE(review): the return value is discarded, so rename_table presumably renames the
# columns in place, adding the prefix to all columns except id_cols - confirm
rename_table(df_product_types,
             preffix="N",
             ids_to_exclude=id_cols)


In [23]:
# Creating tech id variables
# df_clientes was reassigned in the previous cell, so only rows with PRC_TIPO_ID != 5 are counted here
log.info("Creating technology variables")
df_clientes["TEC_ID"] = df_clientes["TEC_ID"].astype(str)
# NOTE(review): return value discarded; impute_categories presumably restricts TEC_ID to
# the listed categories in place - confirm against the helper
impute_categories(df_clientes, "TEC_ID", ["10", "25", "30"])
# One row per customer, one column per TEC_ID value, aggregated with len
df_tech_id = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "TEC_ID"]],
                            index="CUSTOMER_ID",
                            columns="TEC_ID",
                            aggfunc=len,
                            fill_value=0).reset_index()
rename_table(df_tech_id,
             preffix="N_TECH",
             ids_to_exclude=id_cols)

# Merge together all dfs
# Outer joins on CUSTOMER_ID, applied left to right; validate="1:1" makes pandas raise
# if CUSTOMER_ID is duplicated in any of the frames
df_list = [df_products, df_mop, df_tenure, df_product_types, df_tech_id]
df_clientes_grouped = reduce(
    lambda left, right: pd.merge(left, right, on=["CUSTOMER_ID"], how="outer", validate="1:1"), df_list)

# DATE_EXP is the period that was read, DATE_CALC the processing date it was derived from
df_clientes_grouped["DATE_EXP"] = date_exp
df_clientes_grouped["DATE_CALC"] = date

# Apply the table prefix configured under parameters["masters"]["eop"], leaving id_cols out
table_preffix = parameters["masters"]["eop"]["table_preffix"]
rename_table(df_clientes_grouped,
             preffix=table_preffix,
             ids_to_exclude=id_cols)

2020-12-30 18:15:55,141 - aa_engine_pkg.assets.utils.utilities - INFO - Creating technology variables


In [None]:
def create_master_eop(eop: SQLPartitionedDataSet,
                      cliente_activo: pd.DataFrame,
                      parameters: Dict,
                      date: str) -> pd.DataFrame:
    """Creates master table with features related to EoP state of customers for one period of data

    If a parquet file for this table and ``date`` is among the paths returned by
    ``get_mastertable_paths_by_period`` and ``overwrite`` is ``False``, that file is read and
    returned. Otherwise the table is built from ``eop`` and ``cliente_activo`` and, when
    ``parameters["write_to_parquet"]`` is truthy, written to parquet before being returned.

    The function also reads the module-level names ``dict_product`` (product code -> label)
    and ``id_cols`` (identifier columns excluded from renaming).

    Parameters
    ----------
    eop:
        dataset defined in ``catalog.yml`` with raw data information related to EoP customer state
    cliente_activo:
        list of active customers at EoP for the given period; only its ``CUSTOMER_ID`` column
        is used, and it must not contain duplicated ids (the merge is validated as ``m:1``)
    parameters:
        set of project parameters defined in ``parameters.yml``; the keys read are
        ``write_to_parquet``, ``masters.eop.overwrite``, ``masters.eop.table_preffix`` and
        ``paths.master_path``
    date:
        period to process, as a string. An 8-character value is converted with
        ``get_previous_month``; any other length is used as the period directly

    Returns
    -------
        Mastertable with information of clientes at EoP
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "eop"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    # Test against str(file): a substring check on a Path object raises TypeError, while
    # str() is a no-op if the helper already returns strings
    match = [str(file) for file in files if table_name + "_" + date in str(file)]

    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_clientes_grouped = pd.read_parquet(match[0], engine="pyarrow")

    else:
        # If table is not found, then create it:
        log.info("Reading table...")

        # Change date format if needed
        if len(date) == 8:
            date_exp = get_previous_month(date)
        else:
            date_exp = str(date)

        # Load active clientes
        log.info("Reading table...")
        df_clientes = eop.filter_by_period(date=date_exp).drop_duplicates()
        df_clientes_activos = cliente_activo[["CUSTOMER_ID"]]

        # Inner join keeps only EoP rows of active customers
        df_clientes = pd.merge(df_clientes,
                               df_clientes_activos,
                               on="CUSTOMER_ID",
                               how="inner",
                               validate="m:1")

        # Calculate number of products per customer
        # (one row per customer, one column per PRC_TIPO_ID value, aggregated with len)
        log.info("Calculating number of products")
        df_products = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "PRC_TIPO_ID"]],
                                     index="CUSTOMER_ID",
                                     columns="PRC_TIPO_ID",
                                     aggfunc=len,
                                     fill_value=0).reset_index()

        df_products.rename(columns={1: "N_PREMIUM",
                                    3: "N_BASICO",
                                    5: "N_HARDWARE"}, inplace=True)

        # Format and calculate mop variables
        log.info("Calculating mop")
        format_string_variable(df_tmp=df_clientes, var="MOP")

        # np.select takes the first matching condition per row, so the order matters
        condlist = [df_clientes.MOP.str.contains("BANC"),
                    df_clientes.MOP.str.contains("TARJETA"),
                    df_clientes.MOP.str.contains("INVOICE")
                    ]

        choicelist = ["DEBITO", "TARJETA", "INVOICE"]

        df_clientes["MOP"] = np.select(condlist, choicelist, default="OTHER")

        # NOTE(review): "ANTICIPADO" is requested below although the np.select above can
        # only produce DEBITO, TARJETA, INVOICE or OTHER - confirm the intended categories
        df_mop = create_dummy_variables(df_clientes,
                                        vars_to_groupby=["CUSTOMER_ID"],
                                        var_to_dummy="MOP",
                                        values=["DEBITO", "INVOICE", "TARJETA", "ANTICIPADO"],
                                        include_total=False,
                                        include_subtotal=False)

        # Format numeric vars
        log.info("Calculating tenure")
        df_clientes["TENURE"] = df_clientes["TENURE"].astype(float)
        # Largest TENURE per customer. The group-by `max` skips NaN, which is what
        # `.agg(np.nanmax)` resolved to; passing the NumPy callable is deprecated in pandas >= 2.1.
        df_tenure = df_clientes.groupby("CUSTOMER_ID")["TENURE"].max().reset_index(name="TENURE")

        # For non-hardware products, categorization of products
        log.info("Creating product type variables")
        products = pd.DataFrame.from_dict(dict_product, orient="index", columns=["SUP_PRODUCTO"]).reset_index()
        products.rename(columns={"index": "PRC_CODIGO"}, inplace=True)

        # From here on df_clientes only holds rows with PRC_TIPO_ID != 5 (non-hardware)
        mask_software = (df_clientes["PRC_TIPO_ID"] != 5)
        df_clientes = pd.merge(df_clientes.loc[mask_software],
                               products,
                               on="PRC_CODIGO",
                               how="left",
                               validate="m:1")
        # Codes missing from dict_product get "OTHER". Assign the result back instead of
        # calling fillna(inplace=True) on the selected column: that chained form updates a
        # temporary object and leaves the frame untouched when pandas Copy-on-Write is active.
        df_clientes["SUP_PRODUCTO"] = df_clientes["SUP_PRODUCTO"].fillna("OTHER")

        df_product_types = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "SUP_PRODUCTO"]],
                                          index="CUSTOMER_ID",
                                          columns="SUP_PRODUCTO",
                                          aggfunc=len,
                                          fill_value=0).reset_index()

        rename_table(df_product_types,
                     preffix="N",
                     ids_to_exclude=id_cols)

        # Creating tech id variables
        log.info("Creating technology variables")
        df_clientes["TEC_ID"] = df_clientes["TEC_ID"].astype(str)
        impute_categories(df_clientes, "TEC_ID", ["10", "25", "30"])
        df_tech_id = pd.pivot_table(data=df_clientes[["CUSTOMER_ID", "TEC_ID"]],
                                    index="CUSTOMER_ID",
                                    columns="TEC_ID",
                                    aggfunc=len,
                                    fill_value=0).reset_index()
        rename_table(df_tech_id,
                     preffix="N_TECH",
                     ids_to_exclude=id_cols)

        # Merge together all dfs
        # (outer joins on CUSTOMER_ID; validate="1:1" raises on duplicated customers)
        df_list = [df_products, df_mop, df_tenure, df_product_types, df_tech_id]
        df_clientes_grouped = reduce(
            lambda left, right: pd.merge(left, right, on=["CUSTOMER_ID"], how="outer", validate="1:1"), df_list)

        # DATE_EXP is the period that was read, DATE_CALC the date passed by the caller
        df_clientes_grouped["DATE_EXP"] = date_exp
        df_clientes_grouped["DATE_CALC"] = date

        table_preffix = parameters["masters"]["eop"]["table_preffix"]
        rename_table(df_clientes_grouped,
                     preffix=table_preffix,
                     ids_to_exclude=id_cols)

        if write_to_parquet:
            # master_path is concatenated as-is, so it is expected to end with a path separator
            file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_clientes_grouped.to_parquet(file, engine="pyarrow")

    log.info(f"Exporting {df_clientes_grouped.shape[0]} rows and {df_clientes_grouped.shape[1]} columns")

    # Return
    return df_clientes_grouped