# Data Processing
This notebook's main task is to perform data pre-processing.

## Importing Libraries

In [1]:
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql import functions as F

from azure.storage.blob import ContainerClient, BlobClient, BlobServiceClient
from io import BytesIO, StringIO
from datetime import datetime, timedelta
from notebookutils import mssparkutils

# set the notebook completed flag
# NOTE(review): initialised to 'not_completed' here; none of the cells visible
# in this notebook reads or updates it afterwards — confirm it is still needed.
notebook_completed_status = 'not_completed'

## Import common constants and variables
Import the shared constants from the `common/constants` notebook.

In [2]:
%run common/constants

## Data Ingestion
Ingesting the VAT Tax dataset from Azure Data Lake Store (ADLS)


In [3]:
%%pyspark

# Load the most recent VAT tax snapshot from the staging container in ADLS.
# Produces the notebook-level variables `current_date` (name of the latest day
# folder), `tax_data` (Spark DataFrame) and `dataframe_records` (row count).
# Any failure is printed and re-raised as ValueError.
try:

    # root folder of the staged VAT tax data; every entry directly below it
    # must be named as a date that '%Y-%m-%d' can parse
    PATH = f'abfss://{STAGING_CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/{VAT_TAX_FOLDER}/'

    # get the latest day loaded into ADLS
    vat_tax_files = mssparkutils.fs.ls(PATH)
    dates_folder = []
    # parsed date -> folder name exactly as it is stored in ADLS
    folder_by_date = {}

    for file in vat_tax_files:
        folder_date = datetime.strptime(file.name, '%Y-%m-%d')
        dates_folder.append(folder_date)
        folder_by_date.setdefault(folder_date, file.name)
    if len(dates_folder) < 1:
        raise Exception(f'{PATH} has no date (day) refrenced')

    # Use the stored folder name of the most recent date rather than
    # re-formatting the parsed date: strptime accepts both '2023-01-05' and
    # '2023-1-5', so rebuilding the name (previously via the glibc-only
    # '%-m'/'%-d' codes) could point at a folder that does not exist.
    current_date = folder_by_date[max(dates_folder)]

    # now setup the connection string with current date (PATH already ends in '/')
    CONNECTION_STR = f'{PATH}{current_date}/'

    # NOTE(review): `header` is a CSV reader option and looks unnecessary for
    # parquet — confirm before removing.
    tax_data = spark.read.load(path=CONNECTION_STR, format='parquet', header=True)
    dataframe_records = tax_data.count()

    # perform a empty dataframe test and throw exception if required
    if dataframe_records <= RECORDS_THRESH:
        raise Exception('Dataframe has low records count')

    tax_data.show(10)
    print('Raw current tax datafrem rows: ', dataframe_records, 'and columns: ', len(tax_data.columns))

except Exception as error:
    print(f'Error in {error}')
    # chain the original exception so its traceback is not lost
    raise ValueError(error) from error

In [8]:
# Quick sanity peek at the loaded data: called without an argument, head()
# returns only the first row.
tax_data.head()

## Data pre-processing
Perform data pre-processing, including type casting, removing nulls, etc.

In [168]:
# Clean `tax_data` in three steps and record the data-quality metrics:
#   1. drop duplicate rows            -> duplicates_ratio
#   2. drop columns with many nulls   -> data_quality_nulls
#   3. drop columns with low variance -> data_quality_variance
# `tax_data` is reassigned in place, so re-running this cell operates on the
# already-cleaned frame. Any failure is printed and re-raised as ValueError.
try:
    # 1. drop duplicate rows across all columns
    # The row count must be taken BEFORE de-duplicating; measuring both counts
    # afterwards would always yield a ratio of exactly 1.0.
    rows_before_dedup = tax_data.count()
    tax_data = tax_data.drop_duplicates()
    rows_after_dedup = tax_data.count()
    # share of rows that survive de-duplication (1.0 == no duplicates)
    duplicates_ratio = rows_after_dedup / rows_before_dedup if rows_before_dedup else 1.0
    print(f'Dataframe completenss ratio (duplicates) is {duplicates_ratio}')

    # 2. Calculate null values for each column
    # remember the column count before dropping so the ratio below is
    # "dropped / original" and cannot divide by zero when everything is dropped
    columns_before_null_test = len(tax_data.columns)
    # one-row frame: per column, 1 - (non-null rows / all rows)
    df_null_ratio = tax_data.agg(*[(1 - F.count(c) / F.count('*')).alias(c) for c in tax_data.columns]).toPandas()
    drop_columns = list(df_null_ratio.loc[:, (df_null_ratio >= COLUMN_QUALITY_THRESH).any()].columns)

    print(f'{len(drop_columns)} columns will be dropped for null test.')
    tax_data = tax_data.drop(*drop_columns)
    # save the parameter for MLOps data quality trigger
    data_quality_nulls = len(drop_columns) / columns_before_null_test if columns_before_null_test else 0.0

    # the aggregation below cannot run without columns; fail with a clear message
    if not tax_data.columns:
        raise Exception('All columns were dropped by the null test')

    # 3. drop columns for low variances
    columns_before_variance_test = len(tax_data.columns)
    # one-row frame: number of distinct values per column
    df_low_variance = tax_data.agg(*[F.count_distinct(c).alias(c) for c in tax_data.columns]).toPandas()
    drop_columns = list(df_low_variance.loc[:, (df_low_variance <= COLUMN_VARIANCE_THRESH).any()].columns)

    print(f'{len(drop_columns)} columns will be dropped for low variance test.')
    tax_data = tax_data.drop(*drop_columns)

    # save the parameter for MLOps data quality trigger
    data_quality_variance = len(drop_columns) / columns_before_variance_test

    print(f'New dataframe row: {tax_data.count()}, columns: {len(tax_data.columns)}')

except Exception as error:
    print(f'Error in {error}')
    # chain the original exception so its traceback is not lost
    raise ValueError(error) from error

## Data Quality Analysis Output
Performing data quality analysis and exporting the results to the pipeline for MLOps trigger and pipeline conditional triggers

In [169]:
# Gate the run on the column-drop ratios computed during pre-processing.
# Leaves the outcome in `data_qualtity`; a failed check is printed and
# re-raised as ValueError, which stops the notebook at this cell.
try:
    # optimistic default, only overwritten when a check trips
    data_qualtity = 'data_quality_passed'

    # NOTE(review): both thresholds are also used per column during
    # pre-processing (null ratio / distinct count); here they are compared
    # against the fraction of dropped columns — confirm this reuse is intended.
    quality_breached = (
        data_quality_nulls >= COLUMN_QUALITY_THRESH
        or data_quality_variance >= COLUMN_VARIANCE_THRESH
    )

    if quality_breached:
        data_qualtity = 'data_quality_failed'
        raise Exception(f'The dataframe did not pass data quality check')

except Exception as error:
    print(f'Error in {error}')
    raise ValueError(error)

## Write to Datastore
Writing the results to the processed Azure Data Lake Store (ADLS) container

In [170]:
# Persist the cleaned frame as CSV and hand the data quality flag back to the
# caller of this notebook. Write failures are printed and re-raised as ValueError.
try:
    # destination folder in the processed container, using the `current_date`
    # value computed during ingestion
    CONNECTION_STR = f'abfss://{DATA_PROCESSED_CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/{VAT_TAX_PROCESSED_FOLDER}/{current_date}'

    # configure the CSV writer step by step, then write;
    # 'overwrite' replaces whatever already exists at the destination
    csv_writer = tax_data.write.format('csv')
    csv_writer = csv_writer.option('header', True).option('encoding', 'utf-8')
    csv_writer.mode('overwrite').save(CONNECTION_STR)

except Exception as error:
    print(f'Error in {error}')
    raise ValueError(error)

# This call has to stay outside the try/except block
# (presumably so the except handler cannot intercept the exit — TODO confirm).
# Exit the notebook with the data_quality value.
mssparkutils.notebook.exit(data_qualtity)