In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ------------------------------
# Parameters
# ------------------------------
SEED = 42      # fixed seed so a fresh "Restart & Run All" regenerates the same dataset
n_rows = 1000  # number of debt records to generate
business_ids = [f"SMB{100+i}" for i in range(1, 51)]  # 50 SMEs: SMB101 .. SMB150
lenders = ["Bank A", "Bank B", "NBFC LoanCo", "Private Lender", "Bank C"]
loan_types = ["Working Capital", "Term Loan", "Credit Line", "Equipment Loan"]
statuses = ["Current", "Overdue", "Defaulted", "Closed"]
strategies = ["Snowball", "Avalanche", "Refinancing", "Consolidation"]

# The generator draws from both the stdlib `random` module and NumPy's global
# RNG, so both must be seeded for the output to be reproducible.
random.seed(SEED)
np.random.seed(SEED)

# ------------------------------
# Generate synthetic debt dataset
# ------------------------------
# Each record appended to `data` is a 13-field list, in this order:
#   debt id, business id, lender, loan type, principal, outstanding,
#   interest rate, monthly installment, due date (YYYY-MM-DD string),
#   status, collateral, risk score, optimization strategy.
data = []
start_date = datetime(2024, 1, 1)

for i in range(1, n_rows+1):
    debt_id = f"D{i:04d}"
    business_id = random.choice(business_ids)
    lender = random.choice(lenders)
    loan_type = random.choice(loan_types)

    # Principal from 50K up to (but not including) 1M:
    # np.random.randint's upper bound is exclusive.
    principal = np.random.randint(50000, 1000000)

    # Outstanding from 1,000 up to AND including the principal.
    # The upper bound is exclusive, hence the +1 to honour "less or equal".
    outstanding = np.random.randint(1000, principal + 1)

    # Interest rate between 5% and 25%
    interest_rate = round(np.random.uniform(5, 25), 2)

    # Due date: 0 to 400 days (inclusive) after start_date
    due_date = start_date + timedelta(days=random.randint(0, 400))

    # Initial status; may be overridden by the outlier rules below
    status = random.choice(statuses)

    # Collateral: Yes for big loans, else random
    collateral = "Yes" if principal > 300000 else random.choice(["Yes", "No"])

    # Baseline risk score in 20..79 (upper bound exclusive)
    risk_score = np.random.randint(20, 80)

    # Add outliers / risky cases
    if random.random() < 0.05:  # 5% chance
        interest_rate = round(np.random.uniform(25, 40), 2)  # abnormal high rate
        risk_score = np.random.randint(80, 100)
        status = "Overdue"

    if random.random() < 0.03:  # 3% chance
        outstanding = principal  # unpaid loan
        risk_score = 95
        status = "Defaulted"

    # A closed loan has nothing left to repay. The drawn balance is never
    # below 1,000, so "Closed" must zero the balance rather than the other
    # way round; this is applied after the overrides so the final status wins.
    if status == "Closed":
        outstanding = 0

    # EMI: around 2–10% of outstanding. Computed only once the balance is
    # final, so it matches the stored outstanding amount (and is 0 when closed).
    monthly_installment = int(outstanding * np.random.uniform(0.02, 0.1))

    # Optimization strategy based on status/risk
    if status in ("Overdue", "Defaulted"):
        strategy = random.choice(["Consolidation", "Refinancing"])
    elif interest_rate > 20:
        strategy = "Refinancing"
    elif risk_score > 70:
        strategy = "Avalanche"
    else:
        strategy = random.choice(strategies)

    data.append([debt_id, business_id, lender, loan_type, principal, outstanding,
                 interest_rate, monthly_installment, due_date.strftime("%Y-%m-%d"),
                 status, collateral, risk_score, strategy])

# ------------------------------
# Assemble the records into a DataFrame
# ------------------------------
# Column labels, in the same positional order as the fields of each row in `data`.
column_names = [
    "Debt_ID",
    "Business_ID",
    "Lender",
    "Loan_Type",
    "Principal_Amount",
    "Outstanding_Amount",
    "Interest_Rate(%)",
    "Monthly_Installment",
    "Due_Date",
    "Status",
    "Collateral",
    "Risk_Score",
    "Optimization_Strategy",
]
df = pd.DataFrame(data, columns=column_names)

# ------------------------------
# Persist to CSV (row index is not written)
# ------------------------------
csv_filename = "sme_debt_dataset.csv"
df.to_csv(csv_filename, index=False)

print(f"✅ Dataset generated and saved as {csv_filename} with {len(df)} rows.")

# ------------------------------
# Sanity check: preview the first 5 rows
# ------------------------------
df.head()


✅ Dataset generated and saved as sme_debt_dataset.csv with 1000 rows.


Unnamed: 0,Debt_ID,Business_ID,Lender,Loan_Type,Principal_Amount,Outstanding_Amount,Interest_Rate(%),Monthly_Installment,Due_Date,Status,Collateral,Risk_Score,Optimization_Strategy
0,D0001,SMB144,Bank A,Equipment Loan,751766,599099,10.35,57145,2024-05-19,Current,Yes,51,Refinancing
1,D0002,SMB112,Bank B,Term Loan,712126,712126,6.4,32830,2024-11-23,Defaulted,Yes,95,Refinancing
2,D0003,SMB119,Bank B,Credit Line,584670,541462,20.87,19870,2025-01-18,Defaulted,Yes,34,Consolidation
3,D0004,SMB133,Bank A,Credit Line,426044,242100,15.56,12106,2024-06-25,Current,Yes,32,Consolidation
4,D0005,SMB135,Bank B,Credit Line,549501,385487,9.86,34917,2024-01-17,Overdue,Yes,47,Consolidation
