In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# ------------------------------
# Parameters
# ------------------------------
SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same dataset
n_rows = 1000
business_ids = [f"SMB{200+i}" for i in range(1, 51)]  # 50 IDs: SMB201 .. SMB250
categories = ["Sales Revenue", "Inventory Purchase", "Payroll", "Rent", "Loan Repayment", "Miscellaneous"]
payment_methods = ["Bank Transfer", "Credit", "Cash", "Online Wallet"]

# The generator draws from BOTH the stdlib `random` module and NumPy's legacy
# global RNG, so both have to be seeded for the output to be reproducible.
random.seed(SEED)
np.random.seed(SEED)

# ------------------------------
# Generate synthetic cashflow dataset
# ------------------------------
# Each row is one transaction:
#   [Transaction_ID, Date, Business_ID, Category,
#    Cash_Inflow, Cash_Outflow, Net_Cashflow, Payment_Method, Notes]
# Invariant maintained for every row: Net_Cashflow == Cash_Inflow - Cash_Outflow.
data = []
start_date = datetime(2024, 6, 1)

for i in range(1, n_rows+1):
    txn_id = f"TXN_NEW{i:05d}"  # zero-padded sequential ID, e.g. TXN_NEW00001
    # random.randint is inclusive on both ends: 0..450 days after start_date.
    date = start_date + timedelta(days=random.randint(0, 450))
    business_id = random.choice(business_ids)
    category = random.choice(categories)

    # Base inflow/outflow. Note np.random.randint excludes the upper bound.
    if category == "Sales Revenue":
        cash_in = np.random.randint(6000, 60000)
        cash_out = 0
    elif category in ["Inventory Purchase", "Payroll", "Rent", "Loan Repayment"]:
        cash_in = 0
        cash_out = np.random.randint(2500, 35000)
    else:  # Miscellaneous: either a small inflow or a small outflow, 50/50
        if random.random() > 0.5:
            cash_in = np.random.randint(1000, 12000)
            cash_out = 0
        else:
            cash_in = 0
            cash_out = np.random.randint(1000, 12000)

    # Introduce outliers (rare extreme values): ~2% of rows get their non-zero
    # side multiplied by a factor in 10..24.
    if random.random() < 0.02:
        if cash_in > 0:
            cash_in *= np.random.randint(10, 25)
        if cash_out > 0:
            cash_out *= np.random.randint(10, 25)

    net_cashflow = cash_in - cash_out

    # Introduce cashflow problems: ~5% of rows receive an extra outflow shock.
    # The shock is booked on cash_out and the net is recomputed from the two
    # sides, so the row stays internally consistent. (Previously the whole
    # post-shock net was added to cash_out, which broke net == in - out.)
    # The result is usually, but not necessarily, negative: a large outlier
    # inflow can still exceed the shock.
    if random.random() < 0.05:
        shock = np.random.randint(30000, 120000)
        cash_out += shock
        net_cashflow = cash_in - cash_out

    payment_method = random.choice(payment_methods)

    # Notes: first matching rule wins, so a negative net takes priority over
    # the spike labels.
    if net_cashflow < 0:
        note = "Warning: Negative cashflow"
    elif cash_in > 60000:
        note = "Large inflow spike"
    elif cash_out > 50000:
        note = "Unusual expense spike"
    else:
        note = random.choice(["Normal transaction", "Seasonal effect", "Market fluctuation", "Customer delay"])

    data.append([txn_id, date.strftime("%Y-%m-%d"), business_id, category,
                 cash_in, cash_out, net_cashflow, payment_method, note])

# ------------------------------
# Create DataFrame
# ------------------------------
df = pd.DataFrame(data, columns=[
    "Transaction_ID", "Date", "Business_ID", "Category",
    "Cash_Inflow", "Cash_Outflow", "Net_Cashflow",
    "Payment_Method", "Notes"
])

# ------------------------------
# Save to CSV
# ------------------------------
# Written relative to the current working directory; index is omitted so the
# file contains only the nine named columns.
csv_filename = "sme_cashflow_dataset_v2.csv"
df.to_csv(csv_filename, index=False)

print(f"✅ New dataset generated and saved as {csv_filename} with {len(df)} rows.")

# ------------------------------
# Quick check: Show first 5 rows
# ------------------------------
df.head()


✅ New dataset generated and saved as sme_cashflow_dataset_v2.csv with 1000 rows.


Unnamed: 0,Transaction_ID,Date,Business_ID,Category,Cash_Inflow,Cash_Outflow,Net_Cashflow,Payment_Method,Notes
0,TXN_NEW00001,2024-07-27,SMB241,Payroll,0,27282,-27282,Credit,Warning: Negative cashflow
1,TXN_NEW00002,2024-07-11,SMB240,Rent,0,9543,-9543,Online Wallet,Warning: Negative cashflow
2,TXN_NEW00003,2025-07-20,SMB217,Payroll,0,28006,-28006,Credit,Warning: Negative cashflow
3,TXN_NEW00004,2024-12-18,SMB237,Payroll,0,6706,-6706,Cash,Warning: Negative cashflow
4,TXN_NEW00005,2024-09-16,SMB229,Miscellaneous,5322,0,5322,Credit,Market fluctuation
