In [18]:
# Load kedro environment (not needed in .py)
import os
from pathlib import Path
from typing import Dict

# Imported explicitly so the cells below do not depend on whatever the
# wildcard import further down happens to re-export.
import numpy as np
import pandas as pd
from kedro.framework.context import load_context

# Load a context to be able to work in the notebook.
# The notebooks directory can be overridden through the AA_ENGINE_NOTEBOOKS_DIR
# environment variable (e.g. set it to the output of `pwd` when running from
# another checkout); when it is unset the original shared location is used.
current_dir = Path(os.environ.get("AA_ENGINE_NOTEBOOKS_DIR",
                                  "/u01/share/cesar/aa_engine_uy/notebooks/"))
# The kedro project root is the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

# NOTE(review): wildcard import kept (and kept last, as in the original) because
# the cells below rely on helpers it provides (initialize_logger,
# get_previous_month, scale_values, ...); consider replacing it with an
# explicit name list once the full set is known.
from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Reference date for this run (looks like a YYYYMMDD string - TODO confirm the expected format).
# The helpers used below derive the monthly periods to query from it.
date='20180605'

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Fetch the active customers for the month preceding ``date``.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP for the given period
    date:
        period to process; the month before it is the one requested from the dataset

    Returns
    -------
    pd.DataFrame
        Whatever ``cliente_activo.filter_by`` returns for that previous month
        (information of clientes at EoP)
    """
    logger = initialize_logger()
    logger.info(f"Creating cliente_activo...")

    # The dataset is queried for the period before `date`, not for `date` itself
    previous_period = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_period)

In [19]:
# Fetch the `cliente_activo` catalog entry; the kedro log reports it as a SQLPartitionedDataSet,
# i.e. this is the dataset object that is later filtered by period, not the rows themselves.
cliente_activo=catalog.load("cliente_activo")

2020-12-29 12:21:50,736 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [5]:
# Active customers for the month before `date`
# (for this run the emitted query filters on DATE_EXP = 201805).
cliente_activo_df= create_cliente_activo(cliente_activo,date)

2020-12-29 12:17:54,251 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [6]:
# Key columns that are excluded from prefixing when the master table columns are renamed
id_cols = ["CUSTOMER_ID", "DATE_EXP"]

# Look-back windows passed as `periods` to add_relative_calculate_past
past_periods = [1, 3, 6]

# NOTE(review): the two lists below are not referenced by any later cell in this notebook
num_vars = ["ARPU", "FLAG_DISCOUNT", "PROP_DISCOUNT"]
vars_to_group_by = ["CUSTOMER_ID"]

In [7]:
# Fetch the `arpu_quality` catalog entry (reported by kedro as a SQLPartitionedDataSet);
# the actual rows are pulled later through filter_by_period.
arpu_quality= catalog.load("arpu_quality")

2020-12-29 12:18:03,259 - kedro.io.data_catalog - INFO - Loading data from `arpu_quality` (SQLPartitionedDataSet)...


In [8]:
# Initialize logger (reused by the remaining cells)
log = initialize_logger()

# Run settings read from the project parameters:
#   write_to_parquet - when truthy, the finished table is written to parquet in the export cell
#   overwrite        - compared with `is False` to decide whether an existing file may be reused
write_to_parquet = parameters["write_to_parquet"]
# Name of the master table; used for the parameter lookup, the cache check and the output path
table_name = "arpu_quality"
overwrite = parameters["masters"][table_name]["overwrite"]

In [9]:
# Check if table was already created
files = get_mastertable_paths_by_period(parameters=parameters, period=date)
# Test membership against the string form of each entry: the result is already
# normalised with str(), and `table_name in file` would raise TypeError if the
# helper returned pathlib.Path objects instead of plain strings.
match = [str(file) for file in files if table_name in str(file)]

In [10]:
# True only when a previously written table exists AND overwriting is explicitly disabled;
# in that case the saved parquet could be read back instead of rebuilding the table.
len(match) > 0 and overwrite is False

False

In [35]:
# The reuse check above evaluated to False, so the table is built from scratch.
# The active customers were already fetched by create_cliente_activo; this only aliases that frame.
log.info("Loading active customers")
df_clientes = cliente_activo_df

2020-12-29 12:30:34,239 - aa_engine_pkg.assets.utils.utilities - INFO - Loading active customers


In [36]:
# Getting periods to load
# The window length comes from the global master settings.
look_back_months = parameters["masters"]["global"]["look_back_months"]
periods_to_load = get_last_k_periods(date, look_back_months)
# Last element of the returned sequence; in this run that is the oldest period ('201801')
start_date = periods_to_load[-1]
# Converted to a tuple before it is handed to filter_by_period
# (the emitted SQL shows it rendered as an IN (...) list)
periods_to_load = tuple(periods_to_load)
# Month before `date`: used as end_date of the look-back window and as the DATE_EXP stamp of the result
period_to_load = get_previous_month(date)

In [37]:
# Inspect the periods that will be queried
periods_to_load

('201806', '201805', '201804', '201803', '201802', '201801')

In [38]:
# Get arpu_quality table
# Pull the rows for every period in the window, then drop fully duplicated rows
log.info("Loading arpu")
df_aq = arpu_quality.filter_by_period(date=periods_to_load).drop_duplicates()

2020-12-29 12:30:38,189 - aa_engine_pkg.assets.utils.utilities - INFO - Loading arpu
select * from stg_uy_arpu_quality where CHARGES_YYYYMM in ('201806', '201805', '201804', '201803', '201802', '201801')


In [39]:
log.info("Merging tables")
# Inner join keeps only the ARPU rows that belong to active customers.
# validate="1:m" makes pandas raise if CUSTOMER_ID is repeated on the customer side.
df_aq = df_clientes.merge(df_aq,
                          on=["CUSTOMER_ID"],
                          how="inner",
                          validate="1:m")

2020-12-29 12:31:04,573 - aa_engine_pkg.assets.utils.utilities - INFO - Merging tables


In [40]:
# Fixing variables
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on an intermediate object
# and leaves df_aq untouched when pandas Copy-on-Write is active.
df_aq["DISCOUNT_AMOUNT"] = df_aq["DISCOUNT_AMOUNT"].fillna(0)

# Create flag to check for discounts (1 when any non-zero discount was applied)
df_aq['FLAG_DISCOUNT'] = np.where(df_aq['DISCOUNT_AMOUNT'] == 0, 0, 1)

# Change discount sign to positive
df_aq["DISCOUNT_AMOUNT"] = np.abs(df_aq["DISCOUNT_AMOUNT"])

# Generate discount ratio between discount and charges.
# The return value is not used: the helper is relied on to add PROP_DISCOUNT
# to df_aq itself (the column is read right below).
create_evolution_variables(df=df_aq,
                           var_name='PROP_DISCOUNT',
                           denominator='CHARGES_AMOUNT',
                           numerator='DISCOUNT_AMOUNT')

# Scale value (within each DATE_EXP group, ranking-based)
# NOTE(review): ARPU and PROP_DISCOUNT are overwritten in place, so re-running
# this cell scales already-scaled values; re-run the merge cell first.
df_aq["ARPU"] = scale_values(df=df_aq,
                             vars_to_groupby=["DATE_EXP"],
                             var_to_scale="ARPU",
                             by_ranking=True)

df_aq["PROP_DISCOUNT"] = scale_values(df=df_aq,
                                      vars_to_groupby=["DATE_EXP"],
                                      var_to_scale="PROP_DISCOUNT",
                                      by_ranking=True)

2020-12-29 12:31:14,553 - numexpr.utils - INFO - Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-12-29 12:31:14,554 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [41]:
# Calculate past variables
# For each window in past_periods the helper aggregates the scaled features per customer;
# the resulting columns are named <variable>_<aggregation>_<window> (e.g. ARPU_nanmean_3),
# as shown by the preview of df_aq_past further down.
# The window spans start_date .. period_to_load (201801 .. 201805 in this run, per the log).
df_aq_past = add_relative_calculate_past(df_aq,
                                         id_cols=["CUSTOMER_ID"],
                                         date_col="DATE_EXP",
                                         start_date=start_date,
                                         end_date=period_to_load,
                                         periods=past_periods,
                                         period_freq="M",
                                         agg={'ARPU': [np.nanmean],
                                              'FLAG_DISCOUNT': [np.nansum],
                                              'PROP_DISCOUNT': [np.nanmean]},
                                         )

2020-12-29 12:31:18,762 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 201801 and 201805
2020-12-29 12:31:19,859 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 1
2020-12-29 12:31:19,998 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 3
2020-12-29 12:31:20,272 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 6


In [42]:
# Create ratios
# Compare the 1-period ARPU aggregate with the 3- and 6-period aggregates.
# The return value is not used: the helper is relied on to add the new column
# to df_aq_past itself (both ratio columns appear in the preview further down).
log.info("Calculating ratios")
create_evolution_variables(df=df_aq_past,
                           var_name='RATIO_ARPU_1_3',
                           numerator='ARPU_nanmean_1',
                           denominator='ARPU_nanmean_3')
create_evolution_variables(df=df_aq_past,
                           var_name='RATIO_ARPU_1_6',
                           numerator='ARPU_nanmean_1',
                           denominator='ARPU_nanmean_6')

# Add date variables: every row is stamped with the month before `date`
df_aq_past["DATE_EXP"] = period_to_load

2020-12-29 12:31:24,841 - aa_engine_pkg.assets.utils.utilities - INFO - Calculating ratios


In [43]:
# Change variable names
# Prefix every column except the id columns with the table prefix from the parameters
# (the preview below shows the `AQY_` prefix applied and CUSTOMER_ID / DATE_EXP left as-is).
# The call's return value is ignored, so the rename is applied to df_aq_past itself.
table_preffix = parameters["masters"]["arpu_quality"]["table_preffix"]
rename_table(df_aq_past,
             preffix=table_preffix,
             ids_to_exclude=id_cols)

In [44]:
# Preview the first rows of the finished master table
df_aq_past.head()

Unnamed: 0,CUSTOMER_ID,AQY_ARPU_nanmean_1,AQY_FLAG_DISCOUNT_nansum_1,AQY_PROP_DISCOUNT_nanmean_1,AQY_ARPU_nanmean_3,AQY_FLAG_DISCOUNT_nansum_3,AQY_PROP_DISCOUNT_nanmean_3,AQY_ARPU_nanmean_6,AQY_FLAG_DISCOUNT_nansum_6,AQY_PROP_DISCOUNT_nanmean_6,AQY_RATIO_ARPU_1_3,AQY_RATIO_ARPU_1_6,DATE_EXP
0,144472,0.953749,0.0,0.038905,0.952094,0.0,0.041554,0.952033,0.0,0.042571,1.001738,1.001802,201805
1,145360,0.963571,0.0,0.030811,0.964279,0.0,0.031307,0.964615,0.0,0.031821,0.999266,0.998918,201805
2,146099,0.904295,0.0,0.079089,0.902656,0.0,0.082472,0.902659,0.0,0.083819,1.001816,1.001812,201805
3,146123,,,,0.390366,0.0,0.472731,0.384929,0.0,0.479281,,,201805
4,146197,0.979763,0.0,0.017082,0.979985,0.0,0.017949,0.974408,0.0,0.023502,0.999774,1.005495,201805


In [31]:
# Preview the parquet path used by the export step (same expression as in the export cell)
f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"

'/data/uy_po/master/master_arpu_quality/master_arpu_quality_20180605.parquet'

In [45]:
# Persist the master table only when the run is configured to write parquet files
if write_to_parquet:
    file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
    df_aq_past.to_parquet(file, engine="pyarrow")

# Logged regardless of whether the file was written
log.info(f"Exporting {df_aq_past.shape[0]} rows and {df_aq_past.shape[1]} columns")

2020-12-29 12:45:56,310 - aa_engine_pkg.assets.utils.utilities - INFO - Exporting 116786 rows and 13 columns


In [None]:
def create_master_arpu_quality(arpu_quality: SQLPartitionedDataSet,
                               cliente_activo: pd.DataFrame,
                               parameters: Dict,
                               date: str) -> pd.DataFrame:
    """Creates master table with ARPU features for one period of data

    If a master table for ``date`` was already written and overwriting is
    disabled in ``parameters``, that parquet file is read and returned.
    Otherwise the table is rebuilt from the raw ARPU data of the look-back
    window and, when ``write_to_parquet`` is set, written to parquet.

    Parameters
    ----------
    arpu_quality:
        dataset defined in ``catalog.yml`` with raw data information related to ARPU
    cliente_activo:
        DataFrame of active clients at EoP; ``CUSTOMER_ID`` must be unique in it
        (enforced by the merge validation)
    parameters:
        set of project parameters defined in ``parameters.yml``. Keys read here:
        ``write_to_parquet``, ``masters.arpu_quality.overwrite``,
        ``masters.arpu_quality.table_preffix``, ``masters.global.look_back_months``
        and ``paths.master_path``
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Mastertable with ARPU features for one period

    Raises
    ------
    pandas.errors.MergeError
        If ``CUSTOMER_ID`` is repeated in ``cliente_activo`` (``validate="1:m"``)
    """
    # Kept local so the function does not depend on variables that only exist
    # in the calling notebook session.
    id_cols = ["CUSTOMER_ID", "DATE_EXP"]
    # Must contain 1, 3 and 6: the ratio step below reads ARPU_nanmean_1/_3/_6
    past_periods = [1, 3, 6]

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "arpu_quality"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created. Membership is tested against the
    # string form of each entry so pathlib.Path results work as well as strings.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if match and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        return pd.read_parquet(match[0], engine="pyarrow")

    # If not, create table
    log.info("Loading active customers")

    # Getting periods to load
    look_back_months = parameters["masters"]["global"]["look_back_months"]
    periods_to_load = get_last_k_periods(date, look_back_months)
    # Last element of the returned sequence is used as start of the window
    start_date = periods_to_load[-1]
    periods_to_load = tuple(periods_to_load)
    # Month before `date`: end of the window and DATE_EXP stamp of the result
    period_to_load = get_previous_month(date)

    # Get arpu_quality table
    log.info("Loading arpu")
    df_aq = arpu_quality.filter_by_period(date=periods_to_load).drop_duplicates()

    # Keep only ARPU rows of active customers; one customer row may match many ARPU rows
    log.info("Merging tables")
    df_aq = pd.merge(cliente_activo,
                     df_aq,
                     on=["CUSTOMER_ID"],
                     how="inner",
                     validate="1:m")

    # Fixing variables
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: that chained in-place call leaves df_aq untouched
    # when pandas Copy-on-Write is active.
    df_aq["DISCOUNT_AMOUNT"] = df_aq["DISCOUNT_AMOUNT"].fillna(0)

    # Create flag to check for discounts
    df_aq['FLAG_DISCOUNT'] = np.where(df_aq['DISCOUNT_AMOUNT'] == 0, 0, 1)

    # Change discount sign to positive
    df_aq["DISCOUNT_AMOUNT"] = np.abs(df_aq["DISCOUNT_AMOUNT"])

    # Generate discount ratio between discount and charges
    # (return value unused: the helper is relied on to add the column to df_aq)
    create_evolution_variables(df=df_aq,
                               var_name='PROP_DISCOUNT',
                               denominator='CHARGES_AMOUNT',
                               numerator='DISCOUNT_AMOUNT')

    # Scale values within each DATE_EXP group, ARPU first and then PROP_DISCOUNT
    for var_to_scale in ("ARPU", "PROP_DISCOUNT"):
        df_aq[var_to_scale] = scale_values(df=df_aq,
                                           vars_to_groupby=["DATE_EXP"],
                                           var_to_scale=var_to_scale,
                                           by_ranking=True)

    # Calculate past variables
    df_aq_past = add_relative_calculate_past(df_aq,
                                             id_cols=["CUSTOMER_ID"],
                                             date_col="DATE_EXP",
                                             start_date=start_date,
                                             end_date=period_to_load,
                                             periods=past_periods,
                                             period_freq="M",
                                             agg={'ARPU': [np.nanmean],
                                                  'FLAG_DISCOUNT': [np.nansum],
                                                  'PROP_DISCOUNT': [np.nanmean]},
                                             )

    # Create ratios: 1-period ARPU aggregate against the 3- and 6-period ones
    log.info("Calculating ratios")
    for window in (3, 6):
        create_evolution_variables(df=df_aq_past,
                                   var_name=f"RATIO_ARPU_1_{window}",
                                   numerator="ARPU_nanmean_1",
                                   denominator=f"ARPU_nanmean_{window}")

    # Add date variables
    df_aq_past["DATE_EXP"] = period_to_load

    # Change variable names (id columns keep their original names);
    # the return value is ignored, so the rename is applied to df_aq_past itself
    table_preffix = parameters["masters"][table_name]["table_preffix"]
    rename_table(df_aq_past,
                 preffix=table_preffix,
                 ids_to_exclude=id_cols)

    if write_to_parquet:
        file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
        df_aq_past.to_parquet(file, engine="pyarrow")

    # Logged regardless of whether the file was written
    log.info(f"Exporting {df_aq_past.shape[0]} rows and {df_aq_past.shape[1]} columns")

    return df_aq_past
