In [1]:
# Load kedro environment (not needed in .py)
from pathlib import Path
from kedro.framework.context import load_context

# Build a kedro context so the project's catalog and configuration can be used from the notebook.
# NOTE(review): the notebooks directory is pinned to an absolute path on a shared drive, so this
# cell only works where that path exists; the portable alternative is kept commented out below.
#current_dir = Path.cwd()
current_dir = Path("/u01/share/cesar/aa_engine_uy/notebooks/")
# The project root handed to kedro is the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Read the configuration matching the credentials* / parameters* glob patterns.
# NOTE(review): `credentials` is not referenced by any later cell visible in this notebook.
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [2]:
# Processing date as a YYYYMMDD string; read as a global by the cells below
date='20180605'

In [3]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Return the customers that were active at end of period (EoP) for one period of data.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP
    date:
        period to process; the customers are loaded for the month preceding it

    Returns
    -------
    pd.DataFrame
        Result of filtering ``cliente_activo`` by the month preceding ``date``
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # The active-customer snapshot is always taken from the month before the processing date
    previous_month = get_previous_month(date)
    return cliente_activo.filter_by(date=previous_month)

In [4]:
# Load the `cliente_activo` catalog entry; it is handed to create_cliente_activo in the next cell
cliente_activo=catalog.load("cliente_activo")

2020-12-30 14:22:24,069 - kedro.io.data_catalog - INFO - Loading data from `cliente_activo` (SQLPartitionedDataSet)...


In [5]:
# Materialise the active customers for the month preceding `date`
cliente_activo_df= create_cliente_activo(cliente_activo,date)

2020-12-30 14:22:25,056 - aa_engine_pkg.assets.utils.utilities - INFO - Creating cliente_activo...
select distinct CUSTOMER_ID from stg_uy_customer_status where UPPER(STATUS) LIKE '%ACTIVO%' and DATE_EXP = 201805


  % ((self.server_version_info,))


In [6]:
# Load the `echi` catalog entry (call-center interaction records); it is filtered by date further down
echi=catalog.load("echi")

2020-12-30 14:22:29,655 - kedro.io.data_catalog - INFO - Loading data from `echi` (SQLPartitionedDataSet)...


In [12]:
# Text columns passed one by one to format_string_variable
string_vars = ["DISPOSITION_DESC"]

# Grain of the daily aggregation and of the dummy variables: one row per customer and call day
groupby_cols = ["CUSTOMER_ID", "FECHA"]

# Identifier columns that rename_table leaves without the table prefix
id_cols = ["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]

# Look-back windows handed to add_relative_calculate_past (used there with period_freq="D")
past_periods = [7, 14, 21, 28, 84, 168]


In [13]:
# Initialize logger
log = initialize_logger()
# Read parameters
# Number of days of call history to pull before `date`
lookback_days = parameters["masters"]["global"]["look_back_days"]
start_date = pd.to_datetime(date) - timedelta(days=lookback_days)

# Calculate period to load for active clients
log.info("Loading table clientes")
# period_to_load is not used in this cell; a later cell writes it into the DATE_EXP column
period_to_load = get_previous_month(date)
# Only the customer key is needed to restrict the call records to active customers
df_clientes = cliente_activo_df[["CUSTOMER_ID"]]

log.info("Loading table echi")
# Pull the records between start_date and date; the query printed in this cell's output
# uses `>=` for the start and `<` for the end, so `date` itself is excluded
df_echi = echi.filter_by(date=[start_date.strftime("%Y%m%d"),
                               date])

log.info("Merging with EOP...")
# No `how` is given, so pandas performs an inner join: only calls made by active customers remain.
# validate="1:m" makes the merge raise if CUSTOMER_ID is duplicated in df_clientes.
df_echi = pd.merge(df_clientes,
                   df_echi,
                   on="CUSTOMER_ID",
                   validate="1:m")

# Formatting variables
# FECHA holds the call day as a YYYYMMDD string
df_echi["FECHA"] = df_echi["TMP_FECHA"].dt.strftime("%Y%m%d")
log.info("Formatting string vars...")
# The helper's return value is ignored, so it presumably edits the column in place — TODO confirm
for var in string_vars:
    format_string_variable(df_echi, var)


2020-12-30 15:02:25,164 - aa_engine_pkg.assets.utils.utilities - INFO - Loading table clientes
2020-12-30 15:02:25,167 - aa_engine_pkg.assets.utils.utilities - INFO - Loading table echi
select CLI_IBS_ID as CUSTOMER_ID, TMP_FECHA, DISPOSITION_DESC, SKILL_DESDE, DURATION, TALKTIME, HOLDABN from stg_uy_echi where TMP_FECHA >= to_date('20171219', 'yyyymmdd') and TMP_FECHA < to_date('20180605', 'yyyymmdd')
2020-12-30 15:02:44,082 - aa_engine_pkg.assets.utils.utilities - INFO - Merging with EOP...
2020-12-30 15:02:46,126 - aa_engine_pkg.assets.utils.utilities - INFO - Formatting string vars...


In [10]:
# Preview the call records after the merge with the active customers
df_echi.head()

Unnamed: 0,CUSTOMER_ID,TMP_FECHA,DISPOSITION_DESC,SKILL_DESDE,DURATION,TALKTIME,HOLDABN,FECHA
0,145360,2018-02-24,ABANDONED,UY_V_IVR_ENC_IV_OVE,8,0,0,20180224
1,145360,2018-02-24,ANSWERED,UY_S_TST_TST_IB_POS,166,160,0,20180224
2,146197,2018-02-10,INTERFLOWED,NO INFORMADO,90,0,0,20180210
3,149934,2018-03-13,FORCED_DISCONNECT,UY_V_IVR_ENC_IV_OVE,1,0,0,20180313
4,149934,2018-03-13,ANSWERED,UY_V_IVR_ENC_IV_OVE,66,65,0,20180313


In [25]:
# List the distinct disposition values present in the loaded data
df_echi.DISPOSITION_DESC.unique()

array(['ABANDONED', 'ANSWERED', 'INTERFLOWED', 'FORCED_DISCONNECT'],
      dtype=object)

In [14]:
# Collapse the call records to one row per groupby_cols combination, then give
# the two count-like aggregates their final feature names
df_echi_daily = flatten_df(
    df_echi.groupby(groupby_cols).agg(
        {
            "SKILL_DESDE": "nunique",
            "DURATION": ["mean", "max"],
            "TALKTIME": ["mean", "max"],
            "HOLDABN": "sum",
        }
    )
).rename(
    columns={
        "SKILL_DESDE_nunique": "N_INSTANCIAS",
        "HOLDABN_sum": "N_HOLDS",
    }
)

In [15]:
# Preview the per-customer, per-day aggregates
df_echi_daily.head()

Unnamed: 0,CUSTOMER_ID,FECHA,N_INSTANCIAS,DURATION_mean,DURATION_max,TALKTIME_mean,TALKTIME_max,N_HOLDS
0,8031,20180104,1,13.0,13,0.0,0,0
1,8031,20180118,2,67.0,130,65.5,129,0
2,8069,20180122,2,119.0,270,106.0,240,0
3,8686,20180303,2,660.333333,1900,657.666667,1894,0
4,8686,20180505,3,154.0,258,98.333333,257,0


In [16]:
# Create ratio between talktime and call duration
# The return value is discarded; TALK_DUR_RATIO shows up as a column of df_echi_daily in the
# merged preview further down, so the helper adds it to the frame in place.
# NOTE(review): a zero talk time gives 7.69e-11 rather than 0 in that preview — presumably the
# helper adds a small epsilon to the ratio; confirm in create_evolution_variables.
create_evolution_variables(df_echi_daily,
                           var_name="TALK_DUR_RATIO",
                           numerator="TALKTIME_mean",
                           denominator="DURATION_mean"
                           )

In [17]:
# Constrain DISPOSITION_DESC to the listed categories. The return value is discarded, so any
# change must be applied to df_echi in place.
# NOTE(review): presumably values outside the list are replaced by a default — confirm in
# impute_categories. "CONNECTED" is listed here although the aggregation further down has it
# commented out as unavailable for this country.
impute_categories(df_echi,
                  "DISPOSITION_DESC",
                  ["ABANDONED", "ANSWERED", "CONNECTED",
                   "FORCED_DISCONNECT", "INTERFLOWED"]
                  )

In [18]:
log.info("Creating dummies...")
log.info("---- DISPOSITION_DESC")
# Build one DISPOSITION_DESC_<value> column per disposition at the groupby_cols grain, with the
# total and subtotal columns switched off.
# NOTE(review): whether the columns hold counts or 0/1 flags depends on create_dummy_variables — confirm
df_echi_dummies = create_dummy_variables(df_echi,
                                         var_to_dummy="DISPOSITION_DESC",
                                         vars_to_groupby=groupby_cols,
                                         include_total=False,
                                         include_subtotal=False)

2020-12-30 15:06:50,833 - aa_engine_pkg.assets.utils.utilities - INFO - Creating dummies...
2020-12-30 15:06:50,835 - aa_engine_pkg.assets.utils.utilities - INFO - ---- DISPOSITION_DESC


In [19]:
# Join the daily aggregates with the disposition dummies on the shared grain;
# validate="1:1" makes pandas raise if either side has duplicated keys.
# The result is stored back into df_echi_dummies, so run this cell only once per kernel state.
df_echi_dummies = df_echi_daily.merge(
    df_echi_dummies,
    how="inner",
    on=groupby_cols,
    validate="1:1",
)

In [20]:
# Preview the daily aggregates joined with the disposition dummies
df_echi_dummies.head()

Unnamed: 0,CUSTOMER_ID,FECHA,N_INSTANCIAS,DURATION_mean,DURATION_max,TALKTIME_mean,TALKTIME_max,N_HOLDS,TALK_DUR_RATIO,DISPOSITION_DESC_ABANDONED,DISPOSITION_DESC_ANSWERED,DISPOSITION_DESC_FORCED_DISCONNECT,DISPOSITION_DESC_INTERFLOWED
0,8031,20180104,1,13.0,13,0.0,0,0,7.692308e-11,0,0,0,1
1,8031,20180118,2,67.0,130,65.5,129,0,0.9776119,0,1,0,0
2,8069,20180122,2,119.0,270,106.0,240,0,0.8907563,0,1,0,0
3,8686,20180303,2,660.333333,1900,657.666667,1894,0,0.9959616,1,1,0,0
4,8686,20180505,3,154.0,258,98.333333,257,0,0.6385281,0,1,1,0


In [22]:
log.info("Calculating past variables...")
# Calculate past variables
# Each daily column is aggregated per customer over every window in past_periods (in days,
# period_freq="D"), counted back from `date`. In the preview below the resulting columns are
# named <column>_<aggregation>_<window> and are empty for customers with no calls in a window.
# NaN-aware numpy reducers are used for every aggregation.
df_echi_dummies_past = add_relative_calculate_past(df_echi_dummies,
                                                   date_col="FECHA",
                                                   id_cols=["CUSTOMER_ID"],
                                                   start_date=start_date,
                                                   end_date=date,
                                                   periods=past_periods,
                                                   agg={"N_INSTANCIAS": np.nansum,
                                                        "N_HOLDS": np.nansum,
                                                        "DURATION_mean": np.nanmean,
                                                        "DURATION_max": np.nanmax,
                                                        "TALK_DUR_RATIO": [np.nanmean, np.nanmax],
                                                        "DISPOSITION_DESC_ABANDONED": np.nansum,
                                                        "DISPOSITION_DESC_ANSWERED": np.nansum,
                                                        #"DISPOSITION_DESC_CONNECTED": np.nansum,  # this field is not available for Uruguay
                                                        "DISPOSITION_DESC_FORCED_DISCONNECT": np.nansum,
                                                        "DISPOSITION_DESC_INTERFLOWED": np.nansum
                                                        },
                                                   period_freq="D")

2020-12-30 15:08:56,567 - aa_engine_pkg.assets.utils.utilities - INFO - Calculating past variables...
2020-12-30 15:08:56,569 - aa_engine_pkg.assets.utils.utilities - INFO - Adding relative date between 2017-12-19 00:00:00 and 20180605
2020-12-30 15:08:56,624 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 7
2020-12-30 15:08:56,661 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 14
2020-12-30 15:08:56,707 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 21
2020-12-30 15:08:56,757 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 28
2020-12-30 15:08:56,809 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 84
2020-12-30 15:08:56,908 - aa_engine_pkg.assets.utils.utilities - INFO - Expanding table for period: 168


In [23]:
# Add date variables
# period_to_load was computed in the loading cell above (month preceding `date`)
df_echi_dummies_past["DATE_EXP"] = period_to_load
df_echi_dummies_past["DATE_CALC"] = date

# Change variable names
# The prefix is read from the parameters; columns listed in id_cols are left unprefixed.
# rename_table's return value is ignored — the preview below shows the prefixed columns,
# so the frame is renamed in place.
table_preffix = parameters["masters"]["echi"]["table_preffix"]
rename_table(df_echi_dummies_past,
             preffix=table_preffix,
             ids_to_exclude=id_cols)

In [24]:
# Preview the final feature table
df_echi_dummies_past.head()

Unnamed: 0,CUSTOMER_ID,ECH_N_INSTANCIAS_nansum_7,ECH_N_HOLDS_nansum_7,ECH_DURATION_mean_nanmean_7,ECH_DURATION_max_nanmax_7,ECH_TALK_DUR_RATIO_nanmean_7,ECH_TALK_DUR_RATIO_nanmax_7,ECH_DISPOSITION_DESC_ABANDONED_nansum_7,ECH_DISPOSITION_DESC_ANSWERED_nansum_7,ECH_DISPOSITION_DESC_FORCED_DISCONNECT_nansum_7,...,ECH_DURATION_mean_nanmean_168,ECH_DURATION_max_nanmax_168,ECH_TALK_DUR_RATIO_nanmean_168,ECH_TALK_DUR_RATIO_nanmax_168,ECH_DISPOSITION_DESC_ABANDONED_nansum_168,ECH_DISPOSITION_DESC_ANSWERED_nansum_168,ECH_DISPOSITION_DESC_FORCED_DISCONNECT_nansum_168,ECH_DISPOSITION_DESC_INTERFLOWED_nansum_168,DATE_EXP,DATE_CALC
0,8031,,,,,,,,,,...,40.0,130,0.488806,0.977612,0,1,0,1,201805,20180605
1,8069,,,,,,,,,,...,119.0,270,0.890756,0.890756,0,1,0,0,201805,20180605
2,8686,,,,,,,,,,...,407.166667,1900,0.817245,0.995962,1,2,1,0,201805,20180605
3,8855,3.0,0.0,210.666667,559.0,0.169304,0.169304,0.0,1.0,0.0,...,623.738889,2966,0.618503,0.913462,0,6,2,3,201805,20180605
4,8864,3.0,0.0,66.333333,173.0,0.396985,0.396985,0.0,1.0,0.0,...,222.083333,1124,0.375341,0.740741,0,3,0,3,201805,20180605


In [None]:
def create_master_echi(echi: SQLPartitionedDataSet,
                       cliente_activo: pd.DataFrame,
                       parameters: Dict,
                       date: str) -> pd.DataFrame:
    """Creates master table with features related to interactions with call center for one period of data

    If a master-table file whose path contains ``echi`` already exists for ``date`` it is read
    from parquet and returned as is. Otherwise the table is computed from the raw records and,
    when ``parameters["write_to_parquet"]`` is truthy, written to parquet before being returned.

    The function reads the module-level lists ``string_vars``, ``groupby_cols``, ``id_cols`` and
    ``past_periods``; they must be defined before it is called.

    Parameters
    ----------
    echi:
        dataset defined in ``catalog_raw.yml`` with raw data information related to interactions with call center
    cliente_activo:
        active customers at EoP for the given period; only the ``CUSTOMER_ID`` column is used
        and it must be unique (the merge is validated as ``1:m``)
    parameters:
        set of project parameters defined in ``parameters.yml``
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Master table with call center interactions features for one period
    """

    # Initialize logger
    log = initialize_logger()

    write_to_parquet = parameters["write_to_parquet"]
    table_name = "echi"

    # Check if table was already created.
    # The membership test is done on the string form of each entry: path objects do not
    # support the `in` operator, and for plain strings the result is unchanged.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if match:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_echi_dummies_past = pd.read_parquet(match[0], engine="pyarrow")

    else:
        # Read parameters
        lookback_days = parameters["masters"]["global"]["look_back_days"]
        start_date = pd.to_datetime(date) - timedelta(days=lookback_days)

        # Calculate period to load for active clients
        log.info("Loading table clientes")
        period_to_load = get_previous_month(date)
        df_clientes = cliente_activo[["CUSTOMER_ID"]]

        log.info("Loading table echi")
        df_echi = echi.filter_by(date=[start_date.strftime("%Y%m%d"),
                                       date])

        # Inner join (pandas default): only calls made by active customers are kept
        log.info("Merging with EOP...")
        df_echi = pd.merge(df_clientes,
                           df_echi,
                           on="CUSTOMER_ID",
                           validate="1:m")

        # Formatting variables
        df_echi["FECHA"] = df_echi["TMP_FECHA"].dt.strftime("%Y%m%d")
        log.info("Formatting string vars...")
        for var in string_vars:
            format_string_variable(df_echi, var)

        # Calculate daily stats
        df_echi_daily = flatten_df(df_echi.groupby(groupby_cols).agg({"SKILL_DESDE": "nunique",
                                                                      "DURATION": ["mean", "max"],
                                                                      "TALKTIME": ["mean", "max"],
                                                                      "HOLDABN": "sum"}))

        df_echi_daily.rename(columns={"SKILL_DESDE_nunique": "N_INSTANCIAS",
                                      "HOLDABN_sum": "N_HOLDS"}, inplace=True)

        # Create ratio between talktime and call duration
        create_evolution_variables(df_echi_daily,
                                   var_name="TALK_DUR_RATIO",
                                   numerator="TALKTIME_mean",
                                   denominator="DURATION_mean"
                                   )
        impute_categories(df_echi,
                          "DISPOSITION_DESC",
                          ["ABANDONED", "ANSWERED", "CONNECTED",
                           "FORCED_DISCONNECT", "INTERFLOWED"]
                          )

        log.info("Creating dummies...")
        log.info("---- DISPOSITION_DESC")
        df_echi_dummies = create_dummy_variables(df_echi,
                                                 var_to_dummy="DISPOSITION_DESC",
                                                 vars_to_groupby=groupby_cols,
                                                 include_total=False,
                                                 include_subtotal=False)

        # Merge
        df_echi_dummies = pd.merge(df_echi_daily,
                                   df_echi_dummies,
                                   on=groupby_cols,
                                   how="inner",
                                   validate="1:1")

        log.info("Calculating past variables...")
        # Calculate past variables
        df_echi_dummies_past = add_relative_calculate_past(df_echi_dummies,
                                                           date_col="FECHA",
                                                           id_cols=["CUSTOMER_ID"],
                                                           start_date=start_date,
                                                           end_date=date,
                                                           periods=past_periods,
                                                           agg={"N_INSTANCIAS": np.nansum,
                                                                "N_HOLDS": np.nansum,
                                                                "DURATION_mean": np.nanmean,
                                                                "DURATION_max": np.nanmax,
                                                                "TALK_DUR_RATIO": [np.nanmean, np.nanmax],
                                                                "DISPOSITION_DESC_ABANDONED": np.nansum,
                                                                "DISPOSITION_DESC_ANSWERED": np.nansum,
                                                                #"DISPOSITION_DESC_CONNECTED": np.nansum,  # this field is not available for Uruguay
                                                                "DISPOSITION_DESC_FORCED_DISCONNECT": np.nansum,
                                                                "DISPOSITION_DESC_INTERFLOWED": np.nansum
                                                                },
                                                           period_freq="D")

        # Add date variables
        df_echi_dummies_past["DATE_EXP"] = period_to_load
        df_echi_dummies_past["DATE_CALC"] = date

        # Change variable names
        table_preffix = parameters["masters"]["echi"]["table_preffix"]
        rename_table(df_echi_dummies_past,
                     preffix=table_preffix,
                     ids_to_exclude=id_cols)

        # Report the table size once (this message used to be logged twice in this branch)
        log.info(f"Exporting {df_echi_dummies_past.shape[0]} rows and {df_echi_dummies_past.shape[1]} columns")

        if write_to_parquet:
            log.info(f"\n------ Writing {table_name} for period {date} to parquet ------")
            output_path = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_echi_dummies_past.to_parquet(output_path, engine="pyarrow")

    return df_echi_dummies_past