In [8]:
# Importing libraries
import pandas as pd
import numpy as np
from datetime import date

# MIS Cleaner
Each cell below is dedicated to cleaning the data of one MIS. If ShiftPay ties up with other financial organizations in the future, we will need to define a new cleaning function for each of them, based on its success-event columns.

Which columns need to be cleaned?\
Check the payin_payout_rules.csv file: for every product type, it defines the columns that identify a successful sales event.

In [25]:
# Data cleaning for mis_bank_A
def mis_bank_A_cleaner(df):
    """Clean the raw Bank A MIS frame and return a new, de-duplicated frame.

    Steps, in order:
      1. Drop rows where TrackingId, AccountOpeningDate or InvestedAmount
         is missing.
      2. Derive LeadId from the last 8 characters of TrackingId, parse
         AccountOpeningDate with the '%d %m %Y' format and cast
         InvestedAmount to float.
      3. Fill missing SourceType / MediumType / ProductType with defaults.
      4. Remove rows that are exact duplicates across all columns and
         renumber the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns TrackingId, AccountOpeningDate,
        InvestedAmount, SourceType, MediumType and ProductType.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0..n-1 index; the frame passed in is not modified
        because all work happens on the copy produced by dropna/reset_index.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If a date does not match '%d %m %Y' or an amount is not numeric.
    """
    # Dropping records in case of NULL/blanks.
    # Note: dropna only removes NaN/None values; empty strings are kept.
    df = (
        df
        .dropna(subset=['TrackingId', 'AccountOpeningDate', 'InvestedAmount'], how='any')
        .reset_index(drop=True)
        )

    # Ensuring correct column data types
    df['LeadId'] = df['TrackingId'].str[-8:] # Extracting last 8 characters from TrackingId
    df['AccountOpeningDate'] = pd.to_datetime(df['AccountOpeningDate'], format='%d %m %Y')
    df['InvestedAmount'] = df['InvestedAmount'].astype(float)

    # Handling NULLs and missing values
    df['SourceType'] = df['SourceType'].fillna('Bank_A')
    df['MediumType'] = df['MediumType'].fillna('Medium_A')
    df['ProductType'] = df['ProductType'].fillna('Savings Account')

    # drop_duplicates returns a new frame; the result must be kept, otherwise
    # the duplicates silently survive. Re-index so the output stays 0..n-1.
    return df.drop_duplicates().reset_index(drop=True)


In [1]:
# Data cleaning for mis_bank_B
def mis_bank_B_cleaner(df):
    """Clean the raw Bank B MIS frame and return a new, de-duplicated frame.

    Steps, in order:
      1. Drop rows where TrackingId, DisbursedAmount or LoanDisbursedDate
         is missing.
      2. Derive LeadId from the last 8 characters of TrackingId, parse
         LoanDisbursedDate with the '%d %m %Y' format and cast
         DisbursedAmount to float.
      3. Fill missing SourceType / MediumType / ProductType with defaults.
      4. Remove rows that are exact duplicates across all columns and
         renumber the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns TrackingId, DisbursedAmount,
        LoanDisbursedDate, SourceType, MediumType and ProductType.

    Returns
    -------
    pandas.DataFrame
        A new frame with a 0..n-1 index; the frame passed in is not modified
        because all work happens on the copy produced by dropna/reset_index.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    ValueError
        If a date does not match '%d %m %Y' or an amount is not numeric.
    """
    # Dropping records in case of NULL/blanks.
    # Note: dropna only removes NaN/None values; empty strings are kept.
    df = (
        df
        .dropna(subset=['TrackingId', 'DisbursedAmount', 'LoanDisbursedDate'], how='any')
        .reset_index(drop=True)
        )

    # Ensuring correct column data types
    df['LeadId'] = df['TrackingId'].str[-8:] # Extracting last 8 characters from TrackingId
    df['LoanDisbursedDate'] = pd.to_datetime(df['LoanDisbursedDate'], format='%d %m %Y')
    df['DisbursedAmount'] = df['DisbursedAmount'].astype(float)

    # Handling NULLs and missing values
    # NOTE(review): these defaults are identical to the Bank A cleaner
    # ('Bank_A', 'Medium_A', 'Savings Account') although this MIS carries loan
    # disbursement columns -- looks like a copy-paste; confirm the intended
    # Bank B defaults before changing them.
    df['SourceType'] = df['SourceType'].fillna('Bank_A')
    df['MediumType'] = df['MediumType'].fillna('Medium_A')
    df['ProductType'] = df['ProductType'].fillna('Savings Account')

    # drop_duplicates returns a new frame; the result must be kept, otherwise
    # the duplicates silently survive. Re-index so the output stays 0..n-1.
    return df.drop_duplicates().reset_index(drop=True)


# Supportive Files Cleaner

We'll perform some data cleaning operations on the supporting dataframes so that no errors occur during processing.

In [2]:
# Data cleaning for supportive files
def rules_cleaner(df):
    """Normalise the payin/payout rules frame.

    The KPI percentage/amount columns and MinInvestedAmountBound have missing
    values replaced by 0 and are cast to float; KPIStartDate and KPIEndDate
    are parsed with the '%d %m %Y' format. These columns are overwritten on
    the frame that was passed in. The returned frame is a new object without
    the SuccessCriteria column.

    Parameters
    ----------
    df : pandas.DataFrame
        Rules frame containing the columns listed below plus SuccessCriteria.

    Returns
    -------
    pandas.DataFrame
        The cleaned rules without the SuccessCriteria column.
    """
    # Processed in this exact order so a missing column fails at the same step.
    zero_filled_floats = (
        'KPI1PayinPercentage',
        'KPI1PayoutPercentage',
        'KPI1PayinAmount',
        'KPI1PayoutAmount',
        'MinInvestedAmountBound',
        'KPI2PayinAmount',
        'KPI2PayoutAmount',
    )
    for column in zero_filled_floats:
        df[column] = df[column].fillna(0).astype(float)

    for column in ('KPIStartDate', 'KPIEndDate'):
        df[column] = pd.to_datetime(df[column], format='%d %m %Y')

    return df.drop(columns='SuccessCriteria')

def leads_cleaner(df):
    """Reduce LeadCreatedAt to calendar dates.

    The column is parsed with pandas.to_datetime and replaced by the
    corresponding datetime.date values. The frame passed in is updated and
    the same object is returned.
    """
    created_at = pd.to_datetime(df['LeadCreatedAt'])
    df['LeadCreatedAt'] = created_at.dt.date
    return df

def errors_cleaner(df):
    """Reduce CreatedAt to calendar dates and keep only CreatedAt and MIS.

    CreatedAt is parsed with pandas.to_datetime and overwritten, on the frame
    passed in, by the corresponding datetime.date values. The returned frame
    is a two-column selection (CreatedAt, MIS) of that frame.
    """
    kept_columns = ['CreatedAt', 'MIS']
    created_at = pd.to_datetime(df['CreatedAt'])
    df['CreatedAt'] = created_at.dt.date
    return df[kept_columns]