# Data Drift Analyser
This notebook performs data drift analysis on the given dataframes.

In [25]:
# NOTE(review): this value is not read by any cell visible here; presumably a pipeline-supplied parameter — confirm before removing
data_quality = ''

## Importing Libraries

In [29]:
from pyspark.sql import SparkSession
import numpy as np
import pandas as pd
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql import functions as F

from azure.storage.blob import ContainerClient, BlobClient, BlobServiceClient
from io import BytesIO, StringIO
from datetime import datetime, timedelta
from ast import literal_eval
from notebookutils import mssparkutils
import matplotlib.pyplot as plt 

# data drift
from alibi_detect.cd import KSDrift

# set the notebook completed flag
# NOTE(review): starts as 'not_completed'; no cell visible here updates it afterwards — confirm whether it is still needed
notebook_completed_status = 'not_completed'

## Import common constants and variables
Importing constants from a notebook

In [30]:
%run /common/constants

## Data Ingestion
Ingesting two datasets from the ADLS containers:
1. The current daily VAT tax dataset
2. The previous day's (baseline) VAT tax dataset

In [32]:
%%pyspark

# read the specific fields from ADLS
try:
    # Root folder in ADLS; it is expected to contain one sub-folder per load date
    PATH = f'abfss://{STAGING_CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/{VAT_TAX_FOLDER}/'

    # list the date folders that have been loaded into ADLS
    vat_tax_files = mssparkutils.fs.ls(PATH)

    # Map every parsed date to the folder name exactly as it is stored.
    # Re-formatting the parsed date (e.g. with the platform-specific '%-m'/'%-d' codes)
    # does not reproduce zero-padded names such as '2023-01-05', so the stored name is
    # reused when the read paths are built.
    folder_by_date = {
        datetime.strptime(file.name, '%Y-%m-%d'): file.name
        for file in vat_tax_files
    }
    dates_folder = list(folder_by_date)
    if len(dates_folder) < 1:
        raise Exception(f'{PATH} has no date (day) refrenced')

    # the most recent load date and the calendar day before it (the baseline)
    latest_day = max(dates_folder)
    baseline_day = latest_day - timedelta(days=1)
    if baseline_day not in folder_by_date:
        # fail with a clear message instead of an opaque Spark "path does not exist" error
        raise Exception(f'{PATH} has no folder for the previous day {baseline_day:%Y-%m-%d}')

    current_date = folder_by_date[latest_day]
    previous_date = folder_by_date[baseline_day]

    # read the current day
    CONNECTION_STR_CURRENT_DAY = f'{PATH}{current_date}/'
    tax_data = spark.read.load(path=CONNECTION_STR_CURRENT_DAY, format='parquet', header=True)
    dataframe_records = tax_data.count()

    # read the previous day (baseline)
    CONNECTION_STR_PREVIOUS_DAY = f'{PATH}{previous_date}/'
    tax_data_baseliine = spark.read.load(path=CONNECTION_STR_PREVIOUS_DAY, format='parquet', header=True)
    dataframe_baseline_records = tax_data_baseliine.count()

    # preview the current data and report the size of both frames
    tax_data.show(10)
    print('Raw current tax datafrem rows: ', dataframe_records, 'and columns: ', len(tax_data.columns))
    print('Raw baseline tax datafrem rows: ', dataframe_baseline_records, 'and columns: ', len(tax_data_baseliine.columns))

except Exception as error:
    print(f'Error in {error}')
    # chain explicitly so the original traceback stays attached to the re-raised error
    raise ValueError(error) from error

## Data Drift Analysis
Performing statistical data drift detection by comparing the current dataset with the previous day's (baseline) dataset.
The Kolmogorov-Smirnov (K-S) test is used.

In [34]:
try:
    def rank_feature_drift(preds, feature_names, p_val=0.05):
        """Rank features from most to least likely to have drifted.

        Parameters
        ----------
        preds : dict
            Drift detector output; the per-feature p-values are read from
            ``preds["data"]["p_val"]``.
        feature_names : sequence of str
            Feature names, expected in the same order as the p-values.
        p_val : float, default 0.05
            Significance threshold; a feature is flagged when its p-value is
            strictly below this value.

        Returns
        -------
        pandas.DataFrame
            Columns ``feature``, ``p_val`` and ``is_significant_drift``, sorted by
            ascending p-value (lowest p-value = strongest evidence of a
            distribution difference).

        Raises
        ------
        ValueError
            Wraps any error hit while building the ranking. The result is returned
            from the ``try`` body rather than from a ``finally`` clause, because a
            ``return`` inside ``finally`` would discard this exception.
        """
        try:
            # accept plain lists as well as arrays so the fancy indexing below works
            vals = np.asarray(preds["data"]["p_val"])

            # Warn when the number of features and p-values differ.
            # An explicit check is used because `assert` is removed under `python -O`.
            if len(feature_names) != len(vals):
                print("Ensure prediction is being run with all features.")

            # argsort is ascending by default, so the lowest p-value comes first
            sort_index = np.argsort(vals)
            features_sorted = [feature_names[idx] for idx in sort_index]
            vals_sorted = vals[sort_index]

            return pd.DataFrame(
                {
                    "feature": features_sorted,
                    "p_val": vals_sorted,
                    "is_significant_drift": vals_sorted < p_val,
                }
            )

        except Exception as error:
            print(f'Error in {error}')
            raise ValueError(error) from error

    # convert the spark dataframe to pandas (both the current and the previous)
    tax_data_target_df = tax_data.toPandas()
    tax_data_baseliine_df = tax_data_baseliine.toPandas() 

    # select which features to perform the K-S test
    drift_feature_list = ['IncomeTax','IncomeTaxTarget','ValueAddedTax','TargetValueAddedTax','CorporationTax','TargetCorporationTax']

    # setup the feature for drift analysis
    feature_target = tax_data_target_df[drift_feature_list].dropna().to_numpy()
    feature_baseline = tax_data_baseliine_df[drift_feature_list].dropna().to_numpy()

    # Initialise the drift detector using the K-S method
    KS_drift_model = KSDrift(x_ref=feature_baseline, p_val=0.05, alternative='two-sided')

    # Perform the drift by feature_baseline
    KS_drift_predict = KS_drift_model.predict(feature_target, return_p_val=True,return_distance=True)
    KS_drift_by_feature = rank_feature_drift(KS_drift_predict, drift_feature_list)
    KS_drift_df = pd.DataFrame(list(zip(drift_feature_list, KS_drift_predict.get('data').get('distance'), KS_drift_predict.get('data').get('p_val'))), 
    columns=['feature', 'drift_score', 'p_value']).sort_values(by='drift_score',ascending=False)

    # show the results and plot
    print(KS_drift_df)
    ax = KS_drift_df.plot(x='feature', y='drift_score', kind='bar', figsize=(12,8), fontsize=12, legend=False)
    ax.set_title('Feature drift scores', fontsize=16)

    # plot the results
except Exception as error:
    print(f'Error in {error}')
    raise ValueError(error)

Prepare the drift results

In [35]:
# prepare the results of the drift analysis
try:
    # Batch-level drift flag reported by the K-S detector.
    # Per the alibi-detect documentation this flag is an integer (1 = drift, 0 = no drift),
    # so it is compared numerically; a comparison against the string '1' is never true
    # for an integer and would always select 'inference'.
    is_drift_flag = KS_drift_predict.get('data').get('is_drift')
    overall_drift = int(is_drift_flag) == 1

    # decide for train or inference based on drift
    train_or_inference = 'train' if overall_drift else 'inference'

except Exception as error:
    print(f'Error in {error}')
    raise ValueError(error) from error

# this line has to be outside the try/except block
# exit the notebook, handing the train_or_inference decision back to the caller
mssparkutils.notebook.exit(train_or_inference)