**This notebook is for:**
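1. Building the MASTER table: the model population joined with the processed variable sets, written to parquet and uploaded to the GCP bucket.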

In [1]:
import sys, os 
# Make the project's ``utils`` folder importable. The project root is taken
# to be the parent of the current working directory.
prj_path = os.path.dirname(os.getcwd())
utils_path = f"{prj_path}/utils"
if utils_path not in sys.path:
    print('adding utils to path ')
    # Position 1 keeps sys.path[0] (the script/notebook entry) in front.
    sys.path.insert(1, utils_path)

adding utils to path 


In [2]:
from utilities import *
from processing_functions import *

from google_cloud import BigQuery, Storage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

import os
import sys

import boto3
import base64

# BigQuery and Cloud Storage clients. Both are built from the same
# credential, which is read base64-encoded from the environment.
AUTH_BIGQUERY = base64.b64decode(os.environ['SECRET_AUTH_BIGQUERY_MODEL'])
bq, storage = BigQuery(AUTH_BIGQUERY), Storage(AUTH_BIGQUERY)

# Load the configuration from every .yml file found in the utils folder.
# NOTE(review): `glob` is not imported explicitly in this notebook; it is
# presumably brought in by one of the star imports above -- confirm.
files = glob.glob(f"{utils_path}/*.yml")
conf = load_conf(files)

In [3]:
# site = conf['SITE']
# backbone_table = conf['BACKBONE_TABLE_NAME']
# time = ('2022-01-05', '2022-02-23')

# master_vars_path = conf['MASTERS_VARS_PATHS']
# mater_path_gcp = conf['MASTER_BUCKET_PATH']

# prj, bu, model = conf['PROYECT'],conf['BUSINESS_UNIT'],conf['MODEL']
# bucket_path = f'{prj}/{bu}/{model}/{site}/'

In [18]:
def master_processing(conf:dict,
                      site: str,
                      proc_funs: dict,
                      to_parquet: bool = True
                     ) -> pd.DataFrame:
    """Create the master table (ABT) for training a model and upload it to GCP.

    Steps:
    1. For every variable set listed in ``conf['PREFIX_MASTERS']``, download
       its files from the bucket into a temporary sub-directory and
       consolidate them into a single ``<var>.parquet`` file.
    2. Download the model population for ``site`` from the backbone table.
    3. Run the processing function registered for each variable set in
       ``proc_funs`` (functions declared in processing_functions.py, e.g.
       ``{'PUSH_VARS': proc_pushvars}``) and left-join the result to the
       population on ``CUS_CUST_ID`` and ``SENT_DATE``.
    4. Log the columns that still contain NaN, derive the column ``W`` from
       ``EVENT_TYPE`` (``'sent'`` -> 1, ``'control'`` -> 0), drop
       ``EVENT_TYPE`` and rename ``CONVERSION`` to ``Y``.
    5. Write the table as ``<YYYYmmddHHMM>.parquet`` and upload it to the
       bucket under the ``train`` sub-directory.

    Parameters:
    conf: dictionary with parameters. Keys read here: 'BACKBONE_TABLE_NAME',
        'MASTERS_VARS_PATHS', 'MASTER_BUCKET_PATH', 'MASTER_LOCAL_PATH',
        'PROYECT', 'BUSINESS_UNIT', 'MODEL' and 'PREFIX_MASTERS'.
    site: site identifier (e.g. 'MLB'); used in the bucket path and to
        select the model population.
    proc_funs: dictionary {variable set name: processing function}; it needs
        one entry per key of ``conf['PREFIX_MASTERS']``.
    to_parquet: when True the parquet file is written under
        ``os.getcwd() + conf['MASTER_LOCAL_PATH']`` and kept there after the
        upload. When False the file is only staged inside the temporary
        directory (the upload always needs a local file) and is removed
        together with it.

    Returns:
    The master DataFrame.

    Raises:
    KeyError: if a required configuration key or a ``proc_funs`` entry is
        missing.
    pandas.errors.MergeError: if a merge is not one-to-one on
        ``CUS_CUST_ID`` / ``SENT_DATE``.
    """
    #Some configurations.
    backbone_table = conf['BACKBONE_TABLE_NAME']
    master_vars_path = conf['MASTERS_VARS_PATHS']
    mater_path_gcp = conf['MASTER_BUCKET_PATH']
    master_local_path = conf['MASTER_LOCAL_PATH']
    prj, bu, model = conf['PROYECT'],conf['BUSINESS_UNIT'],conf['MODEL']
    bucket_path = f'{prj}/{bu}/{model}/{site}/'
    # Materialised once and reused by both loops below.
    vars_ = list(conf['PREFIX_MASTERS'].keys())
    temporal_dir = random_name(10)
    logger.info(f'Ancilary Temporal dir: {temporal_dir}')

    try:
        for var in vars_:
            logger.info(f">>> DOWNLOAD {var}")
            temporal_subdir = f"{temporal_dir}/{var}"
            os.makedirs(temporal_subdir, exist_ok=True)
            logger.info(f"Temporal subdir created: {temporal_subdir}.")

            download_(temporal_dir = temporal_subdir,
                      bucket_path = bucket_path,
                      bucket_var = master_vars_path[var],
                      is_prod = False)

            # Consolidate the downloaded files into <var>.parquet.
            omnesprouno(temporal_dir = temporal_subdir,
                        parquet_name = f'{var}')

            logger.info("Remove temp files.")
            remove_content(dir_path = temporal_subdir,
                           exceptions = f'{var}.parquet')

        logger.info(f'>>> DOWNLOAD MODEL POPULATION.')
        master = model_population(backbone_table = backbone_table, site = site)
        logger.info(f'>>> MODEL POPULATION SIZE {master.CUS_CUST_ID.nunique()}.')
        logger.info(f'>>> PROCESSING VARS')
        for var in vars_:
            temporal_subdir = f"{temporal_dir}/{var}"
            df = pd.read_parquet(temporal_subdir + f'/{var}.parquet')
            df = proc_funs[var](df.copy())
            # validate='1:1' makes pandas raise if either side has duplicated
            # keys, so the population can never be silently multiplied.
            master = master.merge(df, on = ['CUS_CUST_ID','SENT_DATE'],
                                  how = 'left',
                                  validate='1:1')
            del df

        logger.info(f"-------- Final NaN Report --------")
        report = master.isnull().sum()
        report = report.loc[report>0]
        logger.info(f"{list(report.to_dict().items())}")

        master['W'] = remap_column(serie = master['EVENT_TYPE'],dict_ = {'sent':1, 'control':0})
        master = (master
                  .drop(columns='EVENT_TYPE')
                  .rename(columns={"CONVERSION": "Y"}))

        # The upload needs a local file whatever the value of `to_parquet`,
        # so the file name and directory are always defined. With
        # to_parquet=False the file is staged inside the temporary directory
        # and disappears with it.
        format_time = datetime.now().strftime('%Y%m%d%H%M')
        if to_parquet:
            output_path = f"{os.getcwd()}{master_local_path}"
        else:
            output_path = f"{temporal_dir}/"
        os.makedirs(output_path, exist_ok=True)
        local_file = f"{output_path}{format_time}.parquet"
        logger.info(f"Writing master ABT on {local_file}")
        master.to_parquet(local_file)

        #Ready for write ABT on GCP.
        subdir = 'train'
        gs_destination_path = f"{bucket_path}{subdir}{mater_path_gcp}/{format_time}.parquet"
        logger.info(f"Writing master ABT on {gs_destination_path}")
        storage.upload_file(source_file = local_file,
                            gs_destination_path = gs_destination_path)
    finally:
        # Always drop the scratch directory, even when a step above failed.
        # The local master directory is intentionally NOT removed: it may
        # hold earlier masters and the file requested through `to_parquet`.
        logger.info(f"rm -r {temporal_dir}")
        shutil.rmtree(temporal_dir, ignore_errors=True)

    return master

In [19]:
# One processing function per variable set; the keys are the variable set
# names that master_processing looks up in proc_funs.
proc_funs = {
    'PUSH_VARS': proc_pushvars,
    'BEHAVIOUR_VARS': proc_behaviour,
}
# Build the master table for site MLB; the returned frame is the cell output.
master_processing(conf=conf, site='MLB', proc_funs=proc_funs, to_parquet=True)
                  


2022-03-22 14:33:59 Ancilary Temporal dir: RSybTOQLXK
2022-03-22 14:33:59 >>> DOWNLOAD PUSH_VARS
2022-03-22 14:33:59 Temporal subdir created: RSybTOQLXK/PUSH_VARS.
2022-03-22 14:33:59 Looking in: bucket_path: gs://marketing-modelling/ML/NB/MLB/ and subdir /dataset/PUSH_VARS
2022-03-22 14:33:59 Listed files on : RSybTOQLXK/PUSH_VARS
2022-03-22 14:33:59 file : RSybTOQLXK/PUSH_VARS/000000000000.parquet
2022-03-22 14:33:59 Reading and load to memory
2022-03-22 14:33:59 Total Concat DataFrame size : (694536, 5)
2022-03-22 14:33:59 Total DataFrame size to write: (694536, 5)
2022-03-22 14:33:59 Writing on  : RSybTOQLXK/PUSH_VARS/PUSH_VARS.parquet
2022-03-22 14:33:59 Remove temp files.
2022-03-22 14:33:59 >>> DOWNLOAD BEHAVIOUR_VARS
2022-03-22 14:33:59 Temporal subdir created: RSybTOQLXK/BEHAVIOUR_VARS.
2022-03-22 14:33:59 Looking in: bucket_path: gs://marketing-modelling/ML/NB/MLB/ and subdir /dataset/BEHAVIOUR_VARS
2022-03-22 14:34:00 Listed files on : RSybTOQLXK/BEHAVIOUR_VARS
2022-03-22 14

Unnamed: 0,CUS_CUST_ID,SENT_DATE,Y,PUSHVAR_R_PUSH_LAST_15D_VS_REST_OF_MONTH,PUSHVAR_TOTAL_PUSH_LAST_7D,PUSHVAR_RECENCY,PUSHVAR_R_PUSH_RANK,BEHAVIOURVARS_LOG_DATE_COUNT_7D,BEHAVIOURVARS_AVG_PRICE_ITEM_7D,BEHAVIOURVARS_RECENCY,...,BEHAVIOURVARS_MAX_FROM_INSTALL_ML,BEHAVIOURVARS_MIN_FROM_INSTALL_ML,BEHAVIOURVARS_MAX_FROM_INSTALL_MP,BEHAVIOURVARS_MIN_FROM_INSTALL_MP,BEHAVIOURVARS_TOTAL_PAYMENTS_AMT_SUM,BEHAVIOURVARS_WALLET_PAYMENTS_AMT_SUM,BEHAVIOURVARS_AGGREGATOR_PAYMENTS_AMT_SUM,BEHAVIOURVARS_TOTAL_PAYMENTS_COUNT,BEHAVIOURVARS_OS_IS_ANDROID,W
0,1045397197,2022-01-05,0,-1,0,8,0.324189,0,0.000000,8,...,-1.0,-1.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,1
1,1051004037,2022-01-05,0,-1,1,1,0.324189,1,298.130000,1,...,1.0,1.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,1
2,1025618491,2022-01-05,0,0,1,7,0.715074,4,115.172500,2,...,9.0,31.0,27.0,42.0,39.604744,0.0,5.542258,1,1,1
3,1050792165,2022-01-05,0,-1,0,8,0.324189,0,0.000000,8,...,1.0,1.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,1
4,1051189371,2022-01-05,1,-1,0,8,0.324189,0,0.000000,8,...,1.0,1.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694531,155993910,2022-03-15,0,-1,3,3,0.324189,3,110.548889,1,...,5.0,5.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,0
694532,1082119485,2022-03-15,0,-1,0,8,0.324189,2,144.360000,1,...,14.0,14.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,0
694533,1088438839,2022-03-15,0,-1,2,2,0.324189,2,165.566852,3,...,3.0,3.0,-1.0,-1.0,0.000000,0.0,0.000000,0,1,0
694534,1087719125,2022-03-15,0,-1,2,2,0.324189,1,79.000000,4,...,4.0,4.0,-1.0,-1.0,19.782570,0.0,5.542258,1,1,0


In [7]:
# Load a previously written master table from a local parquet file.
# NOTE(review): the path is hard-coded to one specific run timestamp and is
# relative to the working directory -- confirm the file exists before running.
master = pd.read_parquet('MASTER_TABLE/202203221355.parquet')

In [8]:
# Print column names, non-null counts and dtypes of the master table.
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 694536 entries, 0 to 694535
Data columns (total 37 columns):
 #   Column                                         Non-Null Count   Dtype         
---  ------                                         --------------   -----         
 0   CUS_CUST_ID                                    694536 non-null  int64         
 1   SENT_DATE                                      694536 non-null  datetime64[ns]
 2   Y                                              694536 non-null  int64         
 3   PUSHVAR_R_PUSH_LAST_15D_VS_REST_OF_MONTH       694536 non-null  int64         
 4   PUSHVAR_TOTAL_PUSH_LAST_7D                     694536 non-null  int64         
 5   PUSHVAR_RECENCY                                694536 non-null  int64         
 6   PUSHVAR_R_PUSH_RANK                            694536 non-null  float64       
 7   BEHAVIOURVARS_LOG_DATE_COUNT_7D                694536 non-null  int64         
 8   BEHAVIOURVARS_AVG_PRICE_ITEM_7D             

In [9]:
# Contingency table of raw counts: outcome Y (rows) by group W (columns).
pd.crosstab(
    index=master["Y"].map({0: 'not_buy', 1: 'buy'}),
    columns=master["W"].map({0: 'control', 1: 'treatment'}),
    normalize=False,
)

W,control,treatment
Y,Unnamed: 1_level_1,Unnamed: 2_level_1
buy,1097,13197
not_buy,69035,611207


In [10]:
# Same contingency table, normalised per column: each group's buy / not_buy
# shares add up to 1.
pd.crosstab(
    index=master["Y"].map({0: 'not_buy', 1: 'buy'}),
    columns=master["W"].map({0: 'control', 1: 'treatment'}),
    normalize='columns',
)

W,control,treatment
Y,Unnamed: 1_level_1,Unnamed: 2_level_1
buy,0.015642,0.021135
not_buy,0.984358,0.978865


In [None]:
# Write the in-memory master to <temporal_dir>/MASTER/<timestamp>.parquet.
# NOTE(review): `temporal_dir` is not assigned in any visible cell (it only
# exists as a local inside master_processing) -- confirm it is defined before
# running this cell on a fresh kernel.
time_format = datetime.today().strftime("%Y%m%d_%H%M")
os.makedirs(f'{temporal_dir}/MASTER/', exist_ok=True)
master.to_parquet(f'{temporal_dir}/MASTER/{time_format}.parquet')

In [None]:
# Reload a master table saved by an earlier run.
# NOTE(review): the directory name and timestamp are hard-coded -- confirm the
# file still exists, or point this at the file produced by the current run.
master = pd.read_parquet('zawciyCmhE/MASTER/20220302_1209.parquet')

In [None]:
# Share of each outcome value in Y, expressed as a percentage.
master["Y"].value_counts(normalize=True).mul(100)

In [None]:
# Share of each group value in W, expressed as a percentage.
master["W"].value_counts(normalize=True).mul(100)

In [None]:
# Outcome Y (rows) by group W (columns), normalised per column so each
# group's shares add up to 1.
pd.crosstab(
    index=master["Y"].map({0: 'not_buy', 1: 'buy'}),
    columns=master["W"].map({0: 'control', 1: 'treatment'}),
    normalize='columns',
)

In [None]:
# Outcome Y (rows) by group W (columns) as raw counts.
pd.crosstab(
    index=master["Y"].map({0: 'not_buy', 1: 'buy'}),
    columns=master["W"].map({0: 'control', 1: 'treatment'}),
    normalize=False,
)