In [ ]:
%%configure -f
{
"conf": {
    "spark.sql.autoBroadcastJoinThreshold": -1,
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 8
   }
}

In [ ]:
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, FloatType, StringType, BooleanType
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, StandardScalerModel, StringIndexerModel

In [ ]:
# Notebook parameters. Every value defaults to an empty string and is parsed
# or expanded by later cells.
# NOTE(review): presumably these are overridden by the calling pipeline
# (parameters cell) - confirm.
batch_id = ''  # only forwarded to the logging custom dimensions in this notebook
transformed_data_path = ''  # input location (parquet); time_slice_folder is appended to it
prepped_data_path = ''  # output location (parquet) for the scaled data; time_slice_folder is inserted before its last segment
features_path = ''  # output location (parquet) for the feature-name list, written when training
model_path = ''  # folder holding the indexer and scaler models; time_slice_folder is appended to it
id_feat = ''  # comma-separated identifier column names, split into a list later
date_feat = ''  # column used for the first_year filter and the calendar features
first_year = ''  # parsed with int(); records with an earlier year are removed
allowed_null_pct = ''  # parsed with float(); compared against a null fraction (null count / record count)
training = ''  # evaluated to a truthy/falsy value: fit and save the models when truthy, load them otherwise
time_slice_folder = ''  # path segment added to the four locations above

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# The Application Insights connection string is read through mssparkutils so
# that no secret is stored in the notebook.
# NOTE(review): "keyvault" is presumably the linked-service name - confirm.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell in an interactive session would stack a second
# AzureLogHandler and every record would be exported twice.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used for the `with tracer.span(...)` sections; every span is sampled.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Looked up once and reused for both the custom dimensions and the message.
notebook_name = mssparkutils.runtime.context['notebookname']

# Spool parameters: the raw (still string-typed) run parameters are attached
# to the first log record as custom dimensions.
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'transformed_data_path': transformed_data_path,
    'prepped_data_path': prepped_data_path,
    'features_path': features_path,
    'model_path': model_path,
    'id_feat': id_feat,
    'date_feat': date_feat,
    'first_year': first_year,
    'allowed_null_pct': allowed_null_pct,
    'training': training,
    'time_slice_folder': time_slice_folder,
    'notebook_name': notebook_name
} }

logger.info(f"{notebook_name}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Scope every configured location to the current time slice. Parameters that
# were left blank are not touched.
# Input and model folders get the time slice appended; the prepped-data and
# features locations get it inserted just before their last path segment.
if transformed_data_path != "":
    transformed_data_path = f'{transformed_data_path}/{time_slice_folder}'
    logger.info(f'transformed_data_path = {transformed_data_path}')
if prepped_data_path != "":
    parent, _, leaf = prepped_data_path.rpartition("/")
    prepped_data_path = "/".join((parent, time_slice_folder, leaf))
    logger.info(f'prepped_data_path = {prepped_data_path}')
if features_path != "":
    parent, _, leaf = features_path.rpartition("/")
    features_path = "/".join((parent, time_slice_folder, leaf))
    logger.info(f'features_path = {features_path}')
if model_path != "":
    model_path = f'{model_path}/{time_slice_folder}'
    logger.info(f'model_path = {model_path}')

In [ ]:
# Casting parameters: the values are strings at this point and are converted
# to the types the rest of the notebook works with.
import ast

# Comma-separated identifier columns; blanks around each name are removed so
# that a value such as "a, b" still resolves to existing column names.
id_feat = [name.strip() for name in id_feat.split(",")]
first_year = int(first_year)
allowed_null_pct = float(allowed_null_pct)
# ast.literal_eval only accepts Python literals, so an externally supplied
# parameter can no longer execute arbitrary code the way eval() could.
# capitalize() lets "true"/"false"/"TRUE" through as well as "True"/"False".
training = bool(ast.literal_eval(training.strip().capitalize()))

In [ ]:
# Declared layout of the transformed dataset as (column name, Spark type) pairs,
# in column order. Every field is declared non-nullable.
# A trailing comment on a StringType entry names another type, carried over
# from the original declaration.
# NOTE(review): presumably the type the column is meant to have once the
# upstream schema settles - confirm.
_FIELD_SPECS = [
    ('issuer_id', StringType),
    ('issued_date', DateType),
    ('number_of_transactions', IntegerType),
    ('total_buyers', IntegerType),
    ('total_credit_notes', FloatType),
    ('total_debit_notes', FloatType),
    ('total_invoices', FloatType),
    ('total_purchase_invoices', FloatType),
    ('total_export_invoices', FloatType),
    ('number_of_transactions_to_self', FloatType),
    ('total_voucher_to_self', FloatType),
    ('total_taxable_services', FloatType),
    ('total_non_taxable_services', FloatType),
    ('total_taxable_goods', FloatType),
    ('total_non_taxable_goods', FloatType),
    ('total_taxable', FloatType),
    ('total_non_taxable', FloatType),
    ('total_sales', FloatType),
    ('total_discounts', FloatType),
    ('total_voucher', FloatType),
    ('total_tax', FloatType),
    ('number_of_purchases', IntegerType),
    ('total_suppliers', FloatType),
    ('total_purchases', FloatType),
    ('pagerank_score', FloatType),
    ('taxpayer_type', StringType),
    ('taxpayer_size', StringType),
    ('main_activity', StringType),
    ('sec1_activity', StringType),
    ('sec2_activity', StringType),
    ('employees_number', StringType),  # IntegerType()
    ('legal_reg_date', StringType),  # DateType()
    ('tax_reg_date', StringType),  # DateType()
    ('e_inv_enroll_date', StringType),  # DateType()
    ('total_capital', StringType),  # FloatType()
    ('reported_assets', StringType),  # BooleanType()
    ('social_capital', StringType),  # FloatType()
    ('total_assets', StringType),  # FloatType()
    ('total_fixed_assets', StringType),  # FloatType()
    ('total_liabilities', StringType),  # FloatType()
    ('gross_income', StringType),  # FloatType()
    ('net_income', StringType),  # FloatType()
    ('total_vat_sales', StringType),  # FloatType()
    ('credited_einvoicing_value', StringType),  # FloatType()
    ('state', StringType),
    ('municipality', StringType),
    ('city', StringType),
    ('ratio_sales_purchases', FloatType),
    ('ratio_tax_sales', FloatType),
    ('ratio_sales_employees', FloatType),
    ('ratio_buyers_suppliers', FloatType),
    ('ratio_in_out', FloatType),
    ('act01', FloatType),
    ('total_voucher_act01', FloatType),
    ('act02', FloatType),
    ('total_voucher_act02', FloatType),
    ('act03', FloatType),
    ('total_voucher_act03', FloatType),
    ('act04', FloatType),
    ('total_voucher_act04', FloatType),
    ('act05', FloatType),
    ('total_voucher_act05', FloatType),
    ('min_distance_from_supplier', IntegerType),
    ('min_distance_from_customer', IntegerType),
    ('min_depth_of_supply_chain', IntegerType),
    ('min_place_in_supply_chain', FloatType),
    ('max_distance_from_supplier', IntegerType),
    ('max_distance_from_customer', IntegerType),
    ('max_depth_of_supply_chain', IntegerType),
    ('max_place_in_supply_chain', FloatType),
    ('issuer_id_indexed', IntegerType),
]

schema = StructType([
    StructField(field_name, field_type(), False)
    for field_name, field_type in _FIELD_SPECS
])

In [ ]:
# The explicit schema is not applied on read until the upstream schema is
# formalized.
# issuer_id is dropped because the index of issuer_id is now used for id_feat.
# It may be better to build that index in this notebook and drop
# issuer_id_indexed from the transformed datasets instead.
with tracer.span('Loading transformed data'):
    # `header` is a CSV reader option and has no meaning for Parquet, which
    # stores its own schema, so it is not passed.
    df = spark.read.parquet(transformed_data_path).drop('issuer_id')
    # m is the total record count; later cells use it as the denominator of
    # the null percentage.
    m = df.count()
    logger.info(f'Number of records: {m}')

In [ ]:
# Cast the loaded columns to the types declared in `schema`.
# Kept switched off until the upstream schema is fully stable. An explicit
# flag replaces the unconditional `continue` that left the loop body
# unreachable.
enforce_schema = False
if enforce_schema:
    available_columns = set(df.columns)
    for field in schema:
        # Date columns are excluded from casting for now.
        if isinstance(field.dataType, DateType):
            continue
        # Declared columns that are absent from df (issuer_id is dropped at
        # load time) cannot be cast and are skipped.
        if field.name not in available_columns:
            continue
        df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))

In [ ]:
# Removing features with a high percentage of null values.
# All null counts are computed in one aggregation (a single Spark job)
# instead of one filter-and-count job per column.
null_count_row = ()
if df.columns:
    null_count_row = df.select(
        [F.sum(F.isnull(feat).cast('int')).alias(feat) for feat in df.columns]
    ).first()

if m == 0:
    # With no records there is nothing to measure; every feature is kept
    # instead of failing on a division by zero.
    logger.warning('No records loaded: null percentage filter keeps every feature')

allowed_null_feats = []
for feat, null_count in zip(df.columns, null_count_row):
    # The sum over an empty dataset is NULL, hence the `or 0`.
    null_pct = (null_count or 0) / m if m else 0.0
    if null_pct <= allowed_null_pct:
        allowed_null_feats.append(feat)
    else:
        logger.info(f'Feature {feat} has {null_pct*100:.2f}% of null values')

# Keep only the columns whose null fraction is within the allowed limit.
df_allowed_null = df.select(allowed_null_feats)

In [ ]:
# Drop every record that still holds a null in any of the kept columns.
# The filters are chained one column at a time; only nulls are removed.
df_notnull = df_allowed_null

for column_name in df_allowed_null.columns:
    has_value = F.col(column_name).isNotNull()
    df_notnull = df_notnull.filter(has_value)

not_null_total = df_notnull.count()
logger.info(f'Not null records: {not_null_total:,}')

In [ ]:
# Keep only the records whose date_feat year is first_year or later.
is_recent = F.year(F.col(date_feat)) >= first_year
df_recent = df_notnull.filter(is_recent)
recent_total = df_recent.count()
logger.info(f'Number of records since {first_year}: {recent_total:,}')

In [ ]:
# Date data augmentation: derive calendar components from date_feat, one new
# column per (output column, extraction function) pair, in this order.
calendar_parts = [
    ('_dayofweek', F.dayofweek),
    ('_dayofmonth', F.dayofmonth),
    ('_dayofyear', F.dayofyear),
    ('_weekofyear', F.weekofyear),
    ('_month', F.month),
    ('_quarter', F.quarter),
    ('_year', F.year),
]

df_augmented = df_recent
for part_name, extract_part in calendar_parts:
    df_augmented = df_augmented.withColumn(part_name, extract_part(date_feat))

In [ ]:
# Date to int: every date-typed column gets a companion "<name>_int" column
# holding its unix timestamp. The original date column is kept at this stage.
date_feats = [name for name, type_name in df_augmented.dtypes if type_name == 'date']

df_date_int = df_augmented

for date_column in date_feats:
    logger.info(f'Casting date feature {date_column} to int ...')
    df_date_int = df_date_int.withColumn(f'{date_column}_int', F.unix_timestamp(date_column))


In [ ]:
# Index every string-typed column into a numeric "<name>_indexed" column.
# When training, the indexer is fitted (most frequent label first) and saved
# under model_path; otherwise the previously saved indexer is loaded.
string_feats = [name for name, type_name in df_date_int.dtypes if type_name == 'string']
string_feats_indexed = [f'{name}_indexed' for name in string_feats]
indexer_path = model_path + '/_ijungle_indexer.pkl'
if not training:
    model = StringIndexerModel.load(indexer_path)
else:
    indexer = StringIndexer(
        inputCols=string_feats,
        outputCols=string_feats_indexed,
        stringOrderType='frequencyDesc',
    )
    model = indexer.fit(df_date_int)
    model.write().overwrite().save(indexer_path)
df_string_indexed = model.transform(df_date_int)

In [ ]:
# Assemble the features to scale: every column that is not an identifier, a
# raw date column or a raw string column goes into the 'feats' vector.
columns = df_string_indexed.columns
feats_to_remove = id_feat + date_feats + string_feats
feats = [name for name in columns if name not in feats_to_remove]
assembler = VectorAssembler(inputCols=feats, outputCol='feats')
assembled = assembler.transform(df_string_indexed)
# Only the identifier columns and the assembled vector are carried forward.
df_assembled = assembled.select(*id_feat, 'feats')

In [ ]:
# Store features: when training, persist the (position, name) list of the
# assembled features to features_path.
if training:
    indexed_feats = list(enumerate(feats))
    features_df = spark.createDataFrame(indexed_feats, ['id', 'feat'])
    features_df.write.mode('overwrite').parquet(features_path)

In [ ]:
# Scale features: when training, fit a StandardScaler on the assembled vector
# and save it under model_path; otherwise load the previously saved scaler.
scaler_path = model_path + '/_ijungle_scaler.pkl'
if not training:
    model = StandardScalerModel.load(scaler_path)
else:
    scaler = StandardScaler(inputCol='feats', outputCol='scaled')
    model = scaler.fit(df_assembled)
    model.write().overwrite().save(scaler_path)
# Only the identifier columns and the scaled vector are kept.
scaled_all = model.transform(df_assembled)
df_scaled = scaled_all.select(*id_feat, 'scaled')

In [ ]:
# Persist the scaled dataset as parquet files, replacing any previous output
# at prepped_data_path.
df_scaled.write.parquet(prepped_data_path, mode='overwrite')