In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ------------------------------
# Parameters
# ------------------------------
n_rows = 1000
business_ids = [f"SMB{100+i}" for i in range(1, 51)]  # 50 SMEs
categories = ["Sales Revenue", "Inventory Purchase", "Payroll", "Rent", "Loan Repayment", "Miscellaneous"]
payment_methods = ["Bank Transfer", "Credit", "Cash", "Online Wallet"]

# ------------------------------
# Generate synthetic cashflow dataset
# ------------------------------
data = []
start_date = datetime(2024, 1, 1)

for i in range(1, n_rows+1):
    txn_id = f"TXN{i:04d}"
    date = start_date + timedelta(days=random.randint(0, 400))
    business_id = random.choice(business_ids)
    category = random.choice(categories)
    
    # Base inflow/outflow
    if category == "Sales Revenue":
        cash_in = np.random.randint(5000, 50000)
        cash_out = 0
    elif category in ["Inventory Purchase", "Payroll", "Rent", "Loan Repayment"]:
        cash_in = 0
        cash_out = np.random.randint(2000, 30000)
    else:  # Miscellaneous
        if random.random() > 0.5:
            cash_in = np.random.randint(500, 10000)
            cash_out = 0
        else:
            cash_in = 0
            cash_out = np.random.randint(500, 10000)
    
    # Introduce outliers (rare extreme values)
    if random.random() < 0.02:  # 2% chance
        if cash_in > 0:
            cash_in *= np.random.randint(5, 20)
        if cash_out > 0:
            cash_out *= np.random.randint(5, 20)
    
    net_cashflow = cash_in - cash_out
    
    # Introduce cashflow problems (forced negative situations)
    if random.random() < 0.05:  # 5% chance
        net_cashflow -= np.random.randint(20000, 100000)  # extra debt pressure
        cash_out += abs(net_cashflow)
    
    payment_method = random.choice(payment_methods)
    note = ""
    if net_cashflow < 0:
        note = "Warning: Negative cashflow"
    elif cash_in > 50000:
        note = "Large inflow spike"
    elif cash_out > 40000:
        note = "Unusual expense spike"
    else:
        note = random.choice(["Normal transaction", "Seasonal effect", "Market fluctuation"])
    
    data.append([txn_id, date.strftime("%Y-%m-%d"), business_id, category,
                 cash_in, cash_out, net_cashflow, payment_method, note])

# ------------------------------
# Create DataFrame
# ------------------------------
df = pd.DataFrame(data, columns=[
    "Transaction_ID", "Date", "Business_ID", "Category",
    "Cash_Inflow", "Cash_Outflow", "Net_Cashflow",
    "Payment_Method", "Notes"
])

# ------------------------------
# Save to CSV
# ------------------------------
csv_filename = "sme_cashflow_dataset.csv"
df.to_csv(csv_filename, index=False)

print(f"✅ Dataset generated and saved as {csv_filename} with {len(df)} rows.")

# ------------------------------
# Quick check: Show first 5 rows
# ------------------------------
df.head()


✅ Dataset generated and saved as sme_cashflow_dataset.csv with 1000 rows.


Unnamed: 0,Transaction_ID,Date,Business_ID,Category,Cash_Inflow,Cash_Outflow,Net_Cashflow,Payment_Method,Notes
0,TXN0001,2024-05-29,SMB123,Rent,0,21341,-21341,Credit,Warning: Negative cashflow
1,TXN0002,2024-07-06,SMB111,Inventory Purchase,0,28373,-28373,Credit,Warning: Negative cashflow
2,TXN0003,2024-07-18,SMB145,Sales Revenue,39919,11005,11005,Cash,Seasonal effect
3,TXN0004,2025-01-02,SMB134,Sales Revenue,20418,0,20418,Cash,Seasonal effect
4,TXN0005,2024-10-26,SMB140,Miscellaneous,0,6628,-6628,Bank Transfer,Warning: Negative cashflow
