![Alt text](images/banner.png)

## Connection and Data Validation Notebook

### Load the Credentials

These environment variables are automatically set in WS Pipelines and are needed to access various services.

In [1]:
import os
# User access token read from the environment; os.getenv returns None when
# the USER_ACCESS_TOKEN variable is not set.
TOKEN = os.getenv("USER_ACCESS_TOKEN")

In [4]:
## Imports

!pip install tensorflow-data-validation==1.14.0
!pip install ibm-watson-studio-pipelines==0.2.12
!pip install python-dotenv==1.0.0

Collecting tensorflow-data-validation==1.14.0
  Downloading tensorflow_data_validation-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.2.0
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<3,>=2.13
  Downloading tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tfx-bsl<1.15,>=1.14.0
  Downloading tfx_bsl-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.5/22.5 MB[0m [3

In [5]:
from botocore.client import Config
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import tensorflow_data_validation as tfdv
import numpy as np
import pandas as pd
from ibm_watson_studio_pipelines import WSPipelines
import warnings


# Ignore every Python warning from this point on (presumably to keep the cell
# outputs readable - NOTE(review): this also hides deprecation warnings).
warnings.filterwarnings("ignore")

2023-10-27 13:15:59.297635: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-27 13:15:59.297683: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-27 13:15:59.297714: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Loading Variables and Utils from common python file

In this section we load the variables and functions from the common python file. This file contains the variables and functions that are common to all the notebooks in this project.

In [6]:
import vars_and_utils as vars_and_utils

## Load the Training Data 

This step checks whether the training data exists in a defined Db2 table. If it does not, the data is downloaded from the web and stored in the project space as a .csv file.

In [7]:
gcr_df = vars_and_utils.load_german_credit_risk_data()

## Encode the target column as integers for ease of use with OpenScale
risk_encoding = {'Risk': 1, 'No Risk': 0}
gcr_df['Risk'] = gcr_df['Risk'].map(risk_encoding)
gcr_df.head()

not implemented
Error while loading data from db2. downloading csv file to filesystem instead
Downloaded and saved as data/german_credit_data_biased_training.csv
loading data to pandas dataframe


Unnamed: 0,CheckingStatus,LoanDuration,CreditHistory,LoanPurpose,LoanAmount,ExistingSavings,EmploymentDuration,InstallmentPercent,Sex,OthersOnLoan,...,OwnsProperty,Age,InstallmentPlans,Housing,ExistingCreditsCount,Job,Dependents,Telephone,ForeignWorker,Risk
0,0_to_200,31,credits_paid_to_date,other,1889,100_to_500,less_1,3,female,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,0
1,less_0,18,credits_paid_to_date,car_new,462,less_100,1_to_4,2,female,none,...,savings_insurance,37,stores,own,2,skilled,1,none,yes,0
2,less_0,15,prior_payments_delayed,furniture,250,less_100,1_to_4,2,male,none,...,real_estate,28,none,own,2,skilled,1,yes,no,0
3,0_to_200,28,credits_paid_to_date,retraining,3693,less_100,greater_7,3,male,none,...,savings_insurance,32,none,own,1,skilled,1,none,yes,0
4,no_checking,28,prior_payments_delayed,education,6235,500_to_1000,greater_7,3,male,none,...,unknown,57,none,own,2,skilled,1,none,yes,1


## Data Validation 

In [8]:
@dataclass
class Datavalidation:
    """
    Data validation helper built on TensorFlow Data Validation (tfdv).

    Bundles the steps used in this notebook: splitting the dataframe into
    train/test subsets, persisting a split as CSV, generating statistics,
    inferring a schema, comparing two sets of statistics and checking
    statistics for anomalies against a reference schema.

    Attributes:
        dataframe: The full dataset that ``split_data`` splits.
        mask_per: Fraction (0..1) of rows to assign to the training split.
    """
    dataframe : pd.DataFrame
    mask_per : float


    def split_data(self,seed=32):
        """
        Randomly split ``self.dataframe`` into train and test splits.

        Every row gets a uniform random number; rows whose number is
        ``<= self.mask_per`` go to the training split, the rest to the
        test split.

        Args:
            seed: Value passed to ``np.random.seed`` so the split is reproducible.

        Returns:
            Tuple ``(training_data, testing_data)`` of dataframes.
        """
        np.random.seed(seed)
        mask = np.random.rand(len(self.dataframe)) <= self.mask_per
        # Index the instance's own dataframe (not a notebook-level global) so
        # the mask always has the same length as the data it is applied to.
        training_data = self.dataframe[mask]
        testing_data = self.dataframe[~mask]

        print(f"No. of training examples: {training_data.shape[0]}")
        print(f"No. of testing examples: {testing_data.shape[0]}")

        return training_data, testing_data

    # TODO: Replace with Db2/filesystem
    def save_data_in_filesystem(self,df,filename):
        """
        Save a dataframe as a CSV file (without the index column).

        Best effort: any exception raised while writing is printed together
        with a failure message instead of being propagated, so callers must
        verify the file exists afterwards.

        Args:
            df: Dataframe to persist.
            filename: Target file name, including its path.
        """
        try:
            df.to_csv(filename,index=False)
            print(f"File {filename} persisted successfully")
        except Exception as e:
            print(e)
            print(f"File serialization for {filename} failed")

    def generate_statistics(self,df):
        """
        Generate and visualize tfdv statistics for a dataframe.

        Args:
            df: Dataframe to compute statistics for.

        Returns:
            The statistics object returned by
            ``tfdv.generate_statistics_from_dataframe``.
        """
        stats = tfdv.generate_statistics_from_dataframe(df)
        tfdv.visualize_statistics(stats)
        return stats

    def inferSchema(self,stats):
        """
        Infer and display a tfdv schema from previously generated statistics.

        Args:
            stats: Statistics as returned by ``generate_statistics``.

        Returns:
            The schema returned by ``tfdv.infer_schema``.
        """
        schema = tfdv.infer_schema(statistics=stats)
        tfdv.display_schema(schema=schema)
        return schema

    def compare_statistics(self,lhs,rhs):
        """
        Visualize two sets of statistics side by side.

        The left-hand side is labelled ``TEST_DATASET`` and the right-hand
        side ``TRAIN_DATASET`` in the rendered comparison.

        Args:
            lhs: Statistics of the test/evaluation data.
            rhs: Statistics of the training data.
        """
        # Compare evaluation data with training data
        tfdv.visualize_statistics(lhs_statistics=lhs, rhs_statistics=rhs,
                                  lhs_name='TEST_DATASET', rhs_name='TRAIN_DATASET')


    def check_for_anomalies(self,testable_stats,ref_schema):
        """
        Validate statistics against a reference schema and report anomalies.

        The anomalies are displayed with ``tfdv.display_anomalies``; when at
        least one is present it is additionally logged at ERROR level.

        Args:
            testable_stats: Statistics of the data to check.
            ref_schema: Schema the statistics are validated against.

        Returns:
            True if at least one anomaly was found, otherwise False.
        """
        # Imported locally so the method does not depend on a logger being
        # defined somewhere else in the notebook.
        import logging

        logger = logging.getLogger(__name__)

        anomalies = tfdv.validate_statistics(statistics=testable_stats, schema=ref_schema)
        tfdv.display_anomalies(anomalies)
        if len(anomalies.anomaly_info.items()) > 0:
            logger.error("Anomalies found in dataset...")
            # Log the local result; the instance has no `anomalies` attribute.
            logger.error(str(anomalies.anomaly_info.items()))
            return True
        else:
            return False

###  Split Data into Train and Eval Splits to Check for Consistency

In [9]:
# Assign roughly 80% of the rows to the training split; the rest is used for evaluation
classvalidate = Datavalidation(mask_per=0.8, dataframe=gcr_df)

training_data, testing_data = classvalidate.split_data()

No. of training examples: 3995
No. of testing examples: 1005


## Generate Statistics on Both Splits

In [10]:
# Compute (and visualize) statistics for the training split first, then the test split
train_stats, test_stats = (
    classvalidate.generate_statistics(split_df)
    for split_df in (training_data, testing_data)
)

## Infer Data Schemas

In [11]:
# Infer (and display) one schema per split from its statistics
train_schema, test_schema = (
    classvalidate.inferSchema(split_stats)
    for split_stats in (train_stats, test_stats)
)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CheckingStatus',STRING,required,,'CheckingStatus'
'LoanDuration',INT,required,,-
'CreditHistory',STRING,required,,'CreditHistory'
'LoanPurpose',STRING,required,,'LoanPurpose'
'LoanAmount',INT,required,,-
'ExistingSavings',STRING,required,,'ExistingSavings'
'EmploymentDuration',STRING,required,,'EmploymentDuration'
'InstallmentPercent',INT,required,,-
'Sex',STRING,required,,'Sex'
'OthersOnLoan',STRING,required,,'OthersOnLoan'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'CheckingStatus',"'0_to_200', 'greater_200', 'less_0', 'no_checking'"
'CreditHistory',"'all_credits_paid_back', 'credits_paid_to_date', 'no_credits', 'outstanding_credit', 'prior_payments_delayed'"
'LoanPurpose',"'appliances', 'business', 'car_new', 'car_used', 'education', 'furniture', 'other', 'radio_tv', 'repairs', 'retraining', 'vacation'"
'ExistingSavings',"'100_to_500', '500_to_1000', 'greater_1000', 'less_100', 'unknown'"
'EmploymentDuration',"'1_to_4', '4_to_7', 'greater_7', 'less_1', 'unemployed'"
'Sex',"'female', 'male'"
'OthersOnLoan',"'co-applicant', 'guarantor', 'none'"
'OwnsProperty',"'car_other', 'real_estate', 'savings_insurance', 'unknown'"
'InstallmentPlans',"'bank', 'none', 'stores'"
'Housing',"'free', 'own', 'rent'"


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'CheckingStatus',STRING,required,,'CheckingStatus'
'LoanDuration',INT,required,,-
'CreditHistory',STRING,required,,'CreditHistory'
'LoanPurpose',STRING,required,,'LoanPurpose'
'LoanAmount',INT,required,,-
'ExistingSavings',STRING,required,,'ExistingSavings'
'EmploymentDuration',STRING,required,,'EmploymentDuration'
'InstallmentPercent',INT,required,,-
'Sex',STRING,required,,'Sex'
'OthersOnLoan',STRING,required,,'OthersOnLoan'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'CheckingStatus',"'0_to_200', 'greater_200', 'less_0', 'no_checking'"
'CreditHistory',"'all_credits_paid_back', 'credits_paid_to_date', 'no_credits', 'outstanding_credit', 'prior_payments_delayed'"
'LoanPurpose',"'appliances', 'business', 'car_new', 'car_used', 'education', 'furniture', 'other', 'radio_tv', 'repairs', 'retraining', 'vacation'"
'ExistingSavings',"'100_to_500', '500_to_1000', 'greater_1000', 'less_100', 'unknown'"
'EmploymentDuration',"'1_to_4', '4_to_7', 'greater_7', 'less_1', 'unemployed'"
'Sex',"'female', 'male'"
'OthersOnLoan',"'co-applicant', 'guarantor', 'none'"
'OwnsProperty',"'car_other', 'real_estate', 'savings_insurance', 'unknown'"
'InstallmentPlans',"'bank', 'none', 'stores'"
'Housing',"'free', 'own', 'rent'"


## Compare Eval and Train Data 

In [12]:
# Render the test-split statistics (left) next to the training-split statistics (right).
classvalidate.compare_statistics(lhs=test_stats,rhs=train_stats)

## Check For Data Anomalies 

### Check eval data for errors by validating the eval data stats using the previously inferred schema.

In [13]:
# Validate the test-split statistics against the schema inferred from the training split
anomaly_status = classvalidate.check_for_anomalies(
    testable_stats=test_stats,
    ref_schema=train_schema,
)
anomaly_status

False

## Save Train and Test Data for Data Preparation Stage

In [14]:
# TODO: Replace with Db2/filesystem
# Persist both splits only when the anomaly check came back clean
if not anomaly_status:
    for split_df, split_path in (
        (training_data, vars_and_utils.train_data_path),
        (testing_data, vars_and_utils.test_data_path),
    ):
        classvalidate.save_data_in_filesystem(df=split_df, filename=split_path)

File data/train_gcr.csv persisted successfully
File data/test_gcr.csv persisted successfully


## Check if the validation steps were successful
This checks if anomalies were found and if the data was successfully split into train and eval splits and stored as files.

In [15]:
def validation_successfull(train_data_path, test_data_path, anomalies_found=None):
    """
    Report whether the data validation stage completed successfully.

    The stage counts as successful when no anomalies were found and both the
    train and the test data files exist on the filesystem.

    Args:
        train_data_path: Path of the persisted training split.
        test_data_path: Path of the persisted test split.
        anomalies_found: Result of the anomaly check (True = anomalies exist).
            When omitted, the notebook-level ``anomaly_status`` variable is
            used, which keeps existing two-argument calls working.

    Returns:
        True if validation succeeded, otherwise False.
    """
    if anomalies_found is None:
        # Backward-compatible fallback to the value computed earlier in the notebook.
        anomalies_found = anomaly_status

    if anomalies_found: # anomalies were detected -> not successful
        return False
    elif not os.path.exists(train_data_path): # train data file is missing
        return False
    elif not os.path.exists(test_data_path): # test data file is missing
        return False
    else:
        print ("validation of the data successfull")
        return True
    
# Run the check on the configured train/test paths; the returned boolean is shown as the cell output.
validation_successfull(vars_and_utils.train_data_path, vars_and_utils.test_data_path)

validation of the data successfull


True

## Register the output variables for the next pipeline stage
Every notebook outputs a boolean success flag; in this notebook it is stored under the key `was_succesfull` (spelled exactly as in the code below). The logic behind this flag is different for every notebook and can be altered to fit the needs of the project.
If needed, additional variables can be created here, but they also need to be registered as output variables in the Watson Pipelines UI.

In [16]:
# Collect the output variables of this stage and hand them to Watson Studio Pipelines
validation_params = {
    'was_succesfull': validation_successfull(
        vars_and_utils.train_data_path,
        vars_and_utils.test_data_path,
    ),
}

pipelines_client = WSPipelines.from_token(TOKEN)
pipelines_client.store_results(validation_params)

validation of the data successfull
Running outside of Watson Studio Pipeline - storing results in the local filesystem for testing purposes...

  output paths:
    - "was_succesfull": .ibm_watson_studio_pipelines/results/was_succesfull


<ibm_cloud_sdk_core.detailed_response.DetailedResponse at 0x7f866329cb80>