# Loan Default Predictor Model

##  Data Exploration and Pre-processing:

### Import libraries and datasets:

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn import preprocessing 
import seaborn as sns
import xgboost as xgb # type: ignore
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split,GridSearchCV
import joblib
from sklearn.metrics import f1_score
%matplotlib inline

In [13]:
merchantDetails = pd.DataFrame({

})

loanScheduleDetails = pd.DataFrame({
 
})

loanLedgerDetails = pd.DataFrame({
 
})

transactionDetails = pd.DataFrame({

})

### Formatting

In [None]:
# Rename columns with same name

merchantDetails.rename(columns={'id': 'merchant_code', 'created_at': 'merchant_created_at', 'updated_at':'merchant_updated_at'}, inplace=True)

loanScheduleDetails.rename(columns={'id': 'loan_schedule_id', 'created_at': 'loan_schedule_created_at', 'updated_at':'loan_schedule_updated_at', 'paid_date':'loan_schedule_paid_date'}, inplace=True)

loanLedgerDetails.rename(columns={'id': 'loan_id', 'created_at': 'loan_created_at', 'updated_at':'loan_updated_at', 'deleted_at':'loan_deleted_at', 'transaction_date':'loan_transaction_date', 'transaction_type':'loan_transaction_type', 'description':'loan_description'}, inplace=True)

transactionDetails.rename(columns={'id': 'transaction_id', 'created_at': 'transaction_created_at'}, inplace=True)

# Convert date columns to datetime format

merchantDetails[['merchant_created_at', 'merchant_updated_at']] = merchantDetails.apply(pd.to_datetime)

loanScheduleDetails[['loan_schedule_created_at', 'loan_schedule_updated_at', 'loan_schedule_paid_date', 'schedule_date']] = loanScheduleDetails.apply(pd.to_datetime)

loanLedgerDetails[['loan_created_at', 'loan_updated_at', 'loan_deleted_at']] = merchantDetails.apply(pd.to_datetime)

transactionDetails[['transaction_datetime', 'transaction_created_at']] = transactionDetails.apply(pd.to_datetime)

# Use loans only older than 3 months

# Calculate the cutoff date for loans older than 3 months
cutoff_date = pd.Timestamp.today() - pd.DateOffset(months=3)

loanSchedule = loanScheduleDetails[loanScheduleDetails['schedule_date'] < cutoff_date]

loanLedger = loanLedgerDetails[loanLedgerDetails['loan_created_at'] < cutoff_date]

# Merge the data on 'merchant_code' to get only merchants who have or have had loans

merchants = pd.merge(merchantDetails, loanSchedule[['merchant_code']], on='merchant_code', how='inner')

# Filter transactionDetails to only include merchants who have taken loans and transactions prior to loan disbursement

transactions = pd.merge(transactionDetails, loanSchedule[['merchant_code', 'loan_schedule_created_at']], on='merchant_code', how='inner')

transactions = transactions[transactions['transaction_created_at'] < transactions['loan_schedule_created_at']]

### Examining Data:

#### Columns and data types:

In [14]:
merchants.head()
loans.head()
transactions.head()



Unnamed: 0,loan_request_id,merchant_code,outstanding_amount,paid,created_at,schedule_rank
0,1001,M123,500.0,200.0,2023-07-15,1
1,1002,M456,1200.0,600.0,2023-08-01,2
2,1003,M789,800.0,300.0,2023-08-10,1
3,1004,M101,300.0,100.0,2023-08-15,1
4,1005,M202,1500.0,800.0,2023-09-01,2


In [None]:
merchants.info()
loans.info()
transactions.info()

#### Dropping non-required columns:

In [None]:
merchants=merchants.drop([''],axis=1)
loans=loans.drop([''],axis=1)
transactions=transactions.drop([''],axis=1)

#### Checking and handling missing values:

In [None]:
missing_info = merchants.isnull().sum()
missing_info = loans.isnull().sum()
missing_info = transactions.isnull().sum()

### Feature engineering:

In [None]:
df = pd.DataFrame()
df['merchant_code'] = merchants['merchant_code']

#### Loan Size

In [None]:
# Filter loan ledger where transaction type is 'DISBURSEMENT'

disburse_df = loanLedger[loanLedger['loan_transaction_type'] == 'DISBURSEMENT']

df = pd.merge(df, disburse_df[['merchant_code', 'debit']], on='merchant_code', how='left')

#### Account age at disbursement

In [None]:
# Filter loan ledger where transaction type is 'DISBURSEMENT'

disbursement_df = loanLedger[loanLedger['loan_transaction_type'] == 'DISBURSEMENT']

# Create a mapping from merchants to map merchant_created_at by merchant_code

merchant_creation_map = merchants.set_index('merchant_code')['merchant_created_at'].to_dict()

# Map merchant_created_at to disbursement_df based on merchant_code

disbursement_df['merchant_created_at'] = disbursement_df['merchant_code'].map(merchant_creation_map)

# Calculate the difference in days between merchant_created_at and transaction_date

disbursement_df['relationage'] = (disbursement_df['loan_transaction_date'] - disbursement_df['merchant_created_at']).dt.days

# Select only relevant columns: 'merchant_code' and 'age_at_disbursement'
disbursement_age_df = disbursement_df[['merchant_code', 'relationage']]

# Merge 'age_at_disbursement' into the existing DataFrame
df = pd.merge(df, disbursement_age_df, on='merchant_code', how='left')

#### Unique Customers

In [None]:
# Group by 'merchant_code' and count unique 'pan' (customers)

unique_customers_df = transactions.groupby('merchant_code')['pan'].nunique().reset_index()

unique_customers_df.rename(columns={'pan': 'unique_customers'}, inplace=True)

df = pd.merge(df, unique_customers_df, on='merchant_code', how='left')

#### Transaction Characteristics

In [None]:
def calculate_trans_char(transactions):

    # Total
    total_freq = transactions.groupby('merchant_code').size().reset_index(name='total_transactions')
    total_value = transactions.groupby('merchant_code')['amount'].sum().reset_index(name='total_value')
    sales_avg = transactions.groupby('merchant_code')['amount'].mean().reset_index(name='sales_avg')

        # Filter transactions for 60 and 90 days before disbursement
    disbursement_dates = loanLedger[loanLedger['loan_transaction_type'] == 'DISBURSEMENT']

    trans_disburse = pd.merge(transactions, disbursement_dates[['merchant_code', 'loan_transaction_date']], on='merchant_code', how='left')

    transactions_30_days = trans_disburse[
        (trans_disburse['transaction_datetime'] >= (trans_disburse['disbursement_date'] - pd.Timedelta(days=30))) &
        (trans_disburse['transaction_datetime'] < trans_disburse['disbursement_date'])
    ]

    transactions_60_days = trans_disburse[
        (trans_disburse['transaction_datetime'] >= (trans_disburse['disbursement_date'] - pd.Timedelta(days=60))) &
        (trans_disburse['transaction_datetime'] < trans_disburse['disbursement_date'])
    ]

    transactions_90_days = trans_disburse[
        (trans_disburse['transaction_datetime'] >= (trans_disburse['disbursement_date'] - pd.Timedelta(days=90))) &
        (trans_disburse['transaction_datetime'] < trans_disburse['disbursement_date'])
    ]
    
    transactions_60_90_days = transactions[
        (trans_disburse['transaction_datetime'] >= (trans_disburse['disbursement_date'] - pd.Timedelta(days=90))) &
        (trans_disburse['transaction_datetime'] < (trans_disburse['disbursement_date'] - pd.Timedelta(days=60)))
]

    # Calculate frequency for 60 days before disbursement
    freq_60_days = transactions_60_days.groupby('merchant_code').size().reset_index(name='freq_60_days')
    value_60_days = transactions_60_days.groupby('merchant_code')['amount'].sum().reset_index(name='value_60_days')
    sales_avg_60_days = transactions_60_days.groupby('merchant_code')['amount'].mean().reset_index(name='sales_avg_60_days')

    # Calculate frequency for 90 days before disbursement
    freq_90_days = transactions_90_days.groupby('merchant_code').size().reset_index(name='freq_90_days')
    value_90_days = transactions_90_days.groupby('merchant_code')['amount'].sum().reset_index(name='value_90_days')
    sales_avg_90_days = transactions_90_days.groupby('merchant_code')['amount'].mean().reset_index(name='sales_avg_90_days')
    
    std_dev_90_days = transactions_90_days.groupby('merchant_code')['amount'].std().reset_index(name='std_dev_90_days')
    
    variability = pd.merge(std_dev_90_days, sales_avg_90_days, on='merchant_code', how='left')
    
    variability['cv_90_days'] = (variability['std_dev_90_days'] / variability['mean_90_days']) * 100

    
    sales_30_days = transactions_30_days.groupby('merchant_code')['amount'].sum().reset_index(name='sales_30_days')
    sales_60_90_days = transactions_60_90_days.groupby('merchant_code')['amount'].sum().reset_index(name='sales_60_90_days')
    sales_growth = pd.merge(sales_30_days, sales_60_90_days, on='merchant_code', how='outer')
    
    sales_growth.fillna(0, inplace=True)
    
    sales_growth['sales_growth'] = np.where(
    sales_growth['sales_60_90_days'] != 0,
    (sales_growth['sales_30_days'] - sales_growth['sales_60_90_days']) / sales_growth['sales_60_90_days'] * 100,
    np.nan
    )

    # Merge all the averages into a single DataFrame
    trans_char = pd.merge(total_freq, total_value, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, sales_avg, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, freq_60_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, value_60_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, sales_avg_60_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, freq_90_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, value_90_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, sales_avg_90_days, on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, sales_growth[['merchant_code', 'sales_growth']], on='merchant_code', how='left')
    trans_char = pd.merge(trans_char, variability[['merchant_code', 'cv_90_days']], on='merchant_code', how='left')

    return trans_char


transaction_char_df = calculate_trans_char(transactions)

df = pd.merge(df, transaction_char_df, on='merchant_code', how='left')

df['avg_size'] = df['total_value'] / df['total_freq']

#### Sales Growth

In [None]:
df['loan_sales_ratio'] = df['debit'] / df['value_90_days']
df['loan_sales_ratio'] = df['loan_sales_ratio'].replace([np.inf, -np.inf], np.nan)



In [None]:
pf = df['merchant_code', 'loan_sales_ratio', 'sales_growth']

### Checking for outliers and removing them:


Features: total transaction volume, number of transactions, average transaction size, transaction frequency, transaction amount variability, sales growth rate, decline in transaction volume, weekend transaction percentage, peak sales hours, transaction type distribution, average transaction growth, sales variability, sales seasonality, number of unique customers, customer retention rate, Herfindahl-Hirschman Index (HHI) of customer concentration, decline in transaction frequency, refund percentage, days since last transaction, percent of high-ticket transactions, transaction patterns on holidays or off-business hours, merchant performance relative to peers, merchant market share in district, post-loan sales growth, transaction volume volatility post-loan, change in average transaction size post-loan, sales volatility post-loan, percentage of refunded transactions post-loan, business type, business location, length of relationship, merchant size

Balancing the dataset:

## Notes: