In [None]:
# Load kedro environment (not needed in .py)
import os
from pathlib import Path

from kedro.framework.context import load_context

# Load a context to be able to work in the notebook.
# The notebooks directory defaults to the shared deployment path below; set the
# AA_ENGINE_NOTEBOOKS_DIR environment variable to point at another checkout
# instead of editing this cell.
#current_dir = Path.cwd()
current_dir = Path(os.environ.get("AA_ENGINE_NOTEBOOKS_DIR",
                                  "/u01/share/cesar/aa_engine_uy/notebooks/"))
# The kedro project root is the parent of the notebooks directory
proj_path = current_dir.parent
context = load_context(proj_path)
catalog = context.catalog
# Configuration entries matching the credentials / parameters glob patterns
credentials = context.config_loader.get("credentials*","credentials*/**")
parameters = context.config_loader.get("parameters*","parameters*/**")

from aa_engine_pkg.assets.utils import *
from aa_engine_pkg.assets.core.data.kedro.catalog_expansion.partitioned_sql import SQLPartitionedDataSet

In [None]:
# Period to process in this notebook run (parsed with pd.to_datetime downstream)
date = "20180605"

In [None]:
def create_cliente_activo(cliente_activo: SQLPartitionedDataSet,
                          date: str) -> pd.DataFrame:
    """Load the customers that were active at end of period (EoP) for one period of data.

    Parameters
    ----------
    cliente_activo:
        dataset defined in ``catalog.yml`` - list of active customers at EoP for the given period
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Rows of ``cliente_activo`` filtered to the period given by ``get_previous_month(date)``
    """
    logger = initialize_logger()
    logger.info("Creating cliente_activo...")

    # The snapshot that is read is the one for the period *before* `date`,
    # as computed by get_previous_month.
    return cliente_activo.filter_by(date=get_previous_month(date))

In [None]:
# Fetch the "cliente_activo" dataset from the kedro catalog created in the setup cell
cliente_activo=catalog.load("cliente_activo")

In [None]:
# Active customers for the period derived from `date`; later passed to
# create_master_campanas as its `cliente_activo` argument (presumably — confirm in the calling cell)
cliente_activo_df= create_cliente_activo(cliente_activo,date)

In [1]:
# Fetch the "campanas" dataset from the kedro catalog.
# NOTE: `catalog` is defined in the setup cell at the top of the notebook; running this cell
# on a fresh kernel without that cell raises NameError.
campanas= catalog.load("campanas")

NameError: name 'catalog' is not defined

In [None]:

# Columns that create_master_campanas passes, one at a time, to
# group_categorical_variables as `var_to_count`. Read as a notebook global.
vars_to_dummy = ["OFERTA_COMBINADA"]

# Window lengths that create_master_campanas passes as `periods` to
# add_relative_calculate_past (called with period_freq="D"). Read as a notebook global.
past_periods = [14, 21, 28, 84, 168, 252, 336]

In [None]:
def create_master_campanas(campanas: SQLPartitionedDataSet,
                           cliente_activo: pd.DataFrame,
                           parameters: Union[Dict, None],
                           date: str
                           ) -> pd.DataFrame:
    """Creates master table with offer features for one period of data

    If a master table whose path contains ``"campanas"`` is already listed for ``date`` and
    ``parameters["masters"]["campanas"]["overwrite"]`` is ``False``, that parquet file is read
    and returned. Otherwise the table is built from the raw offers and, when
    ``parameters["write_to_parquet"]`` is truthy, written to parquet before being returned.

    Besides its arguments, this function reads the module-level names ``vars_to_dummy`` and
    ``past_periods``; they must be defined before the function is called.

    Parameters
    ----------
    campanas:
        dataset defined in ``catalog_raw.yml`` with raw data information related to client's offers
    cliente_activo:
        dataset defined in ``catalog_raw.yml`` with raw data information related to active clients at EoP
    parameters:
        set of project parameters defined in ``parameters.yml``. Although annotated as optional,
        it is indexed unconditionally, so ``None`` is not supported.
    date:
        period to process

    Returns
    -------
    pd.DataFrame
        Master table with offer features for one period
    """

    log = initialize_logger()

    # Read parameters
    write_to_parquet = parameters["write_to_parquet"]
    table_name = "campanas"
    overwrite = parameters["masters"][table_name]["overwrite"]

    # Check if table was already created.
    # The membership test is done on str(file) so it works whether the helper
    # returns plain strings or path objects.
    files = get_mastertable_paths_by_period(parameters=parameters, period=date)
    match = [str(file) for file in files if table_name in str(file)]

    if len(match) > 0 and overwrite is False:
        # If table is found, read parquet:
        log.info(f"Reading {match[0]} table")
        df_final = pd.read_parquet(match[0], engine="pyarrow")

    else:
        # Read parameters
        look_back_days = parameters["masters"][table_name]["look_back_days"]
        calc_date = pd.to_datetime(date)
        start_date = (calc_date - timedelta(days=look_back_days)).strftime("%Y%m%d")

        # Calculate period to load for active clients
        period_to_load = get_previous_month(date)
        log.info("Loading campanas...")
        df_campanas = campanas.filter_by(date=[start_date, date]).drop_duplicates()

        # Keep only offers that belong to active clients
        df_clientes = cliente_activo

        df_campanas = pd.merge(df_clientes,
                               df_campanas,
                               on=["CUSTOMER_ID"],
                               how="inner")

        log.info("Creating variables...")
        # Calculate offer duration / days since start / until end.
        # END_DATE is parsed with errors="coerce" (unparseable values become NaT);
        # START_DATE is used as-is, so it is assumed to already be datetime-typed.
        end_date = pd.to_datetime(df_campanas.END_DATE, errors="coerce")
        one_day = np.timedelta64(1, "D")
        # NOTE(review): OFFER_DURATION is computed but not selected for the expansion below — confirm intent
        df_campanas["OFFER_DURATION"] = (end_date - df_campanas.START_DATE) / one_day
        df_campanas["DAYS_SINCE_START_OFFER"] = (calc_date - df_campanas.START_DATE) / one_day
        df_campanas["DAYS_TO_END_OFFER"] = (end_date - calc_date) / one_day

        description = df_campanas["DESCRIPTION"]

        # Extract percentages of offers from their description
        df_campanas["PORC_OFFER"] = description.str.extract(r'(\d+)%')[0].astype(float)

        # Extract discount values of offers from their description
        df_campanas["VALUE_OFFER"] = description.str.extract(r'\$(\d+)')[0].astype(float)

        # Make a mask with position of values
        maskvalue = df_campanas["VALUE_OFFER"].notna()
        maskporc = df_campanas["PORC_OFFER"].notna()

        # Create a unique ranking scaled for Percentage and value.
        # PORC_OFFER is processed last, so rows that have both a value and a
        # percentage end up with the percentage-based scaling.
        maskall = dict(zip(["VALUE_OFFER", "PORC_OFFER"],
                           (maskvalue, maskporc)))
        for var, mask in maskall.items():
            df_campanas.loc[mask, "MONTO_OFFER_SCALED"] = scale_values(df=df_campanas.loc[mask, :],
                                                                       vars_to_groupby=["DATE_EXP"],
                                                                       var_to_scale=var,
                                                                       by_ranking=False)

        # Extract length of offer from their description ("<n> M" or "<n>X", case-insensitive);
        # only one of the two groups can match, so their sum is the matched number (0 if none)
        tuple_len = description.str.extract(r'(?:(\d+)[ ]*M|(\d+)X)', flags=re.IGNORECASE)[[0, 1]].fillna(
            0).astype(int)
        df_campanas["LENGTH_OFFER"] = tuple_len.sum(axis=1)

        # Upper-case the description once and reuse it for every keyword test
        description_upper = description.str.upper()

        # Create the offer-type variable (first matching condition wins).
        # Every condition uses na=False so a missing description yields False
        # instead of NaN, which np.select cannot use as a boolean condition.
        condlist = [description_upper.str.contains("DSCTO|RET|DESC", na=False),
                    description_upper.str.contains("RENTA", na=False),
                    description_upper.str.contains("UPGRADE", na=False),
                    description_upper.str.contains("NUEVO", na=False),
                    description_upper.str.contains("RECON", na=False)
                    ]
        choicelist = ["RETE",
                      "RENT",
                      "UPGR",
                      "NUEV",
                      "RECO"]
        df_campanas["TIPO_OFERTA"] = np.select(condlist, choicelist, default="OTROS")

        # Create the offer product-type variable (first matching condition wins)
        condlist = [description_upper.str.contains(
            "PREMIUM|FOX|HBO|EXXXOTICO|HOTPACK|ADULT|CLAXON|HUSTLER", na=False),
                    description_upper.str.contains("PLATA|ORO|BRONCE|SICO", na=False),
                    description_upper.str.contains("BUNDLE", na=False),
                    description_upper.str.contains("NET", na=False),
                    ]
        choicelist = ["PREMIUM",
                      "BASICO",
                      "BUNDLE",
                      "NET"]
        df_campanas["PRODUCTO_OFERTA"] = np.select(condlist, choicelist, default="OTROS")

        df_campanas["OFERTA_COMBINADA"] = df_campanas["TIPO_OFERTA"] + "_" + df_campanas["PRODUCTO_OFERTA"]
        df_campanas["FECHA"] = df_campanas["START_DATE"].dt.strftime("%Y%m%d")

        # Create the dummy variables (vars_to_dummy is a module-level name)
        df_list = []
        for var in vars_to_dummy:
            log.info(f'---- {var}')
            df_list.append(group_categorical_variables(df_campanas,
                                                       vars_to_groupby=["CUSTOMER_ID", "FECHA"],
                                                       var_to_count=var,
                                                       other_category=None))

        # Reduce list of list to create a data table
        df = reduce(lambda left, right: pd.merge(left, right, on=["CUSTOMER_ID", "FECHA"], how="outer"), df_list)

        # Total number of offers per row: sum of every column whose name contains "OFERTA"
        ofer_cols = [c for c in df.columns if "OFERTA" in c]

        df["N_OFERTAS"] = df[ofer_cols].sum(axis=1)

        # Join with num vars and expand (past_periods is a module-level name)
        df_expanded_offers = add_relative_calculate_past(df=df_campanas[["CUSTOMER_ID",
                                                                         "FECHA",
                                                                         "DAYS_SINCE_START_OFFER",
                                                                         "DAYS_TO_END_OFFER",
                                                                         "LENGTH_OFFER",
                                                                         "MONTO_OFFER_SCALED"]],
                                                         id_cols=["CUSTOMER_ID"],
                                                         periods=past_periods,
                                                         agg=[np.nanmean, np.nanmin, np.nanmax],
                                                         date_col="FECHA",
                                                         start_date=start_date,
                                                         end_date=date,
                                                         period_freq="D")

        # Join with num vars and expand
        df_expanded_products = add_relative_calculate_past(df=df,
                                                           id_cols=["CUSTOMER_ID"],
                                                           periods=past_periods,
                                                           agg=[np.nansum],
                                                           date_col="FECHA",
                                                           start_date=start_date,
                                                           end_date=date,
                                                           period_freq="D")

        # Merge; validate="1:1" makes pandas raise if CUSTOMER_ID is duplicated on either side
        df_final = df_expanded_products.merge(df_expanded_offers, on="CUSTOMER_ID", validate="1:1")

        # Add date variables
        df_final["DATE_EXP"] = period_to_load
        df_final["DATE_CALC"] = date

        # Rename table
        # NOTE(review): the return value is ignored, so this relies on rename_table
        # modifying df_final in place — confirm against the helper.
        rename_table(df=df_final,
                     preffix=parameters["masters"][table_name]["table_preffix"],
                     ids_to_exclude=["CUSTOMER_ID", "DATE_EXP", "DATE_CALC"]
                     )

        # Return
        log.info(f"Exporting {df_final.shape[0]} rows and {df_final.shape[1]} columns")

        if write_to_parquet:
            # master_path is concatenated directly, so it is expected to end with a path separator
            file = f"{parameters['paths']['master_path']}master_{table_name}/master_{table_name}_{date}.parquet"
            df_final.to_parquet(file, engine="pyarrow")

    return df_final