In [None]:
# Lakehouse folder holding this notebook's CSV inputs and outputs: /lakehouse/default/Files/Bronze


In [1]:
# Install & import required packages
!pip install faker --quiet

import pandas as pd
import numpy as np
from faker import Faker
import random

# Pin every random source (Faker, stdlib random, NumPy) to the same seed so reruns match
faker = Faker()
for _seed_fn in (Faker.seed, random.seed, np.random.seed):
    _seed_fn(42)


StatementMeta(, 6cc9d46f-af06-42ec-b711-bcea117ca260, 3, Finished, Available, Finished)

In [None]:
# Sample code used to generate csv files
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
import os

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Constants: dataset size and the ID pools that loans are sampled from
NUM_LOANS = 10000
CUSTOMER_IDS = [f"C{n:05d}" for n in range(1, 3001)]  # 3000 customers, C00001..C03000
BRANCH_IDS = [f"B{n:03d}" for n in range(1, 51)]      # 50 branches, B001..B050
PRODUCT_IDS = [f"P{n:03d}" for n in range(1, 11)]     # 10 products, P001..P010
LOAN_TYPES = ['2W', '4W', 'Commercial', 'Tractor', 'Used']

def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days, staying before ``end``.

    Args:
        start: Earliest possible result (datetime-like).
        end: Exclusive upper bound (datetime-like).

    Returns:
        ``start + timedelta(days=k)`` with ``k`` drawn from ``[0, (end - start).days)``
        using NumPy's global RNG, or ``start`` unchanged when the span is shorter
        than one day (no random draw is consumed in that case).

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")
    span_days = (end - start).days
    if span_days <= 0:
        # np.random.randint(0, 0) would raise "low >= high"; a sub-day span has one valid answer.
        return start
    # int() keeps a NumPy integer type from ever reaching timedelta(days=...).
    return start + timedelta(days=int(np.random.randint(0, span_days)))

# Disbursement window; random_date never returns the end date itself
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)

# Generate Loan Data
# Columns are evaluated top to bottom, which also fixes the order of draws from
# NumPy's global RNG — reordering entries changes the generated values.
loan_data = dict(
    LoanID=[f"L{n:05d}" for n in range(1, NUM_LOANS + 1)],
    CustomerID=np.random.choice(CUSTOMER_IDS, size=NUM_LOANS),
    BranchID=np.random.choice(BRANCH_IDS, size=NUM_LOANS),
    ProductID=np.random.choice(PRODUCT_IDS, size=NUM_LOANS),
    DisbursedDate=[random_date(start_date, end_date) for _ in range(NUM_LOANS)],
    LoanAmount=np.random.uniform(50000, 1500000, NUM_LOANS).round(2),
    TenureMonths=np.random.choice([6, 12, 18, 24, 36, 48, 60], size=NUM_LOANS),
    # Roughly 2% of interest rates are left missing (NaN)
    InterestRate=np.where(
        np.random.rand(NUM_LOANS) < 0.02,
        np.nan,
        np.random.uniform(7.5, 15.0, NUM_LOANS),
    ).round(2),
    Status=np.random.choice(['Active', 'Closed', 'Default'], size=NUM_LOANS, p=[0.7, 0.25, 0.05]),
    # Roughly 3% of risk scores are left missing (NaN)
    RiskScore=np.where(
        np.random.rand(NUM_LOANS) < 0.03,
        np.nan,
        np.random.uniform(60, 99, NUM_LOANS).round(2),
    ),
    LoanType=np.random.choice(LOAN_TYPES, size=NUM_LOANS),
)

df_loans = pd.DataFrame(loan_data)

# Derive ClosureDate and EarlyClosureFlag from each loan's Status
def _closure_date(row):
    """Random date from disbursement up to (not including) 2024-06-30 for non-Active loans; NaT if Active."""
    if row.Status == 'Active':
        return pd.NaT
    return random_date(row.DisbursedDate, datetime(2024, 6, 30))


def _early_closure_flag(row):
    """True with probability 0.1 (else False) for Closed loans; NaN for every other status."""
    if row.Status != 'Closed':
        return np.nan
    return np.random.choice([True, False], p=[0.1, 0.9])


# Row-wise apply keeps one RNG draw per qualifying row, in row order
df_loans["ClosureDate"] = df_loans.apply(_closure_date, axis=1)
df_loans["EarlyClosureFlag"] = df_loans.apply(_early_closure_flag, axis=1)

# Save as CSV
# The target must be a file, not a folder: this is the exact path the forecast cell
# reads back with pd.read_csv, including the /lakehouse/default mount prefix.
output_path = "/lakehouse/default/Files/Bronze/Loans.csv"
df_loans.to_csv(output_path, index=False)
output_path


StatementMeta(, 182f734a-8fb5-4a60-8d0a-d728d957c7dd, 6, Finished, Available, Finished)

In [2]:
import pandas as pd
import numpy as np

# Load Loan & EMI Data
loans = pd.read_csv("/lakehouse/default/Files/Bronze/Loans.csv")
emis = pd.read_csv("/lakehouse/default/Files/Bronze/EMIs.csv")

# Step 1: Overdue EMI count and overdue amount per LoanID
emis['IsOverdue'] = emis['Status'] == 'Overdue'
# Zero out instalments that are not overdue so a plain per-loan sum yields the overdue
# amount; this avoids a Python lambda per group that looks rows up in the global frame.
overdue_summary = (
    emis.assign(OverdueAmount=emis['EMIAmount'].where(emis['IsOverdue'], 0))
    .groupby('LoanID')
    .agg(
        OverdueCount=('IsOverdue', 'sum'),
        OverdueAmount=('OverdueAmount', 'sum'),
    )
    .reset_index()
)

# Step 2: Merge Loan Risk Score
# LoanAmount is carried through the merge so Step 3 divides row-for-row within one frame
# instead of relying on forecast_df and loans happening to share the same index.
forecast_df = loans[['LoanID', 'RiskScore', 'LoanAmount']].merge(overdue_summary, on='LoanID', how='left')
# Loans without any EMI rows come out of the left join as NaN; count them as nothing overdue
forecast_df = forecast_df.fillna({'OverdueCount': 0, 'OverdueAmount': 0})

# Step 3: Simulate Forecast Fields
# randint's upper bound is exclusive: the uplift is 500..4999 and the score drift is -3..+4
forecast_df['Forecast_OverdueAmount'] = forecast_df['OverdueAmount'] + np.random.randint(500, 5000, size=len(forecast_df))
forecast_df['Forecast_NPA_Percent'] = ((forecast_df['Forecast_OverdueAmount'] / forecast_df['LoanAmount']) * 100).round(2)
# A missing RiskScore stays missing here (NaN + integer is NaN)
forecast_df['PredictedRiskScore'] = forecast_df['RiskScore'] + np.random.randint(-3, 5, size=len(forecast_df))

# Step 4: Priority Recovery Flag
# 'Yes' when the predicted score is at least 80 and more than one EMI is overdue.
# Computed column-wise; a missing PredictedRiskScore compares False and is flagged 'No'.
forecast_df['PriorityRecoveryFlag'] = np.where(
    (forecast_df['PredictedRiskScore'] >= 80) & (forecast_df['OverdueCount'] > 1),
    'Yes',
    'No',
)

# Keep only the published forecast columns, in output order
final_forecast = forecast_df.loc[
    :,
    [
        'LoanID',
        'Forecast_OverdueAmount',
        'Forecast_NPA_Percent',
        'PredictedRiskScore',
        'PriorityRecoveryFlag',
    ],
]

# Write the forecast into the same Bronze folder as the input CSVs, without the index column
final_forecast.to_csv("/lakehouse/default/Files/Bronze/ForecastData.csv", index=False)


StatementMeta(, 182f734a-8fb5-4a60-8d0a-d728d957c7dd, 4, Finished, Available, Finished)

In [3]:
import pandas as pd
# Read the saved forecast CSV back into a pandas DataFrame and render it in the notebook
forecast_csv_path = "/lakehouse/default/Files/Bronze/ForecastData.csv"
df = pd.read_csv(forecast_csv_path)
display(df)


StatementMeta(, 182f734a-8fb5-4a60-8d0a-d728d957c7dd, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f0eba21d-40b7-4008-ab80-a2ce7153fda0)