In [1]:
import pandas as pd
import numpy as np
import random

# Seed both generators (stdlib `random` and NumPy's global state) so that
# re-running this cell from a fresh kernel reproduces the same dataset.
random.seed(42)
np.random.seed(42)

# How many synthetic customers to create
num_records = 1000

# Customer identifiers: 1 .. num_records
customer_id = range(1, num_records + 1)

# Income: normal draws (mean 60,000, sd 20,000) truncated to integers,
# then floored at 20,000.
income = np.maximum(
    np.random.normal(60000, 20000, num_records).astype(int),
    20000,
)

# Age: uniform integers; the upper bound is exclusive, so values are 20..64.
age = np.random.randint(low=20, high=65, size=num_records)

# Per-purpose uplift applied to the base loan (e.g. 'Home' scales it by 1.8).
loan_amount_factors = {
    'Home': 0.8, 'Car': 0.2, 'Education': 0.1,
    'Debt Consolidation': 0.3, 'Other': 0.15,
}

# Purpose is drawn uniformly from the factor table's keys
# (a dict iterates its keys in insertion order).
loan_purpose = random.choices(list(loan_amount_factors), k=num_records)

# Loan amount: one normal draw per customer (mean 15,000, sd 7,000), scaled by
# (1 + purpose factor), truncated to int and floored at 2,000.
loan_amount = []
for purpose in loan_purpose:
    base_loan = np.random.normal(15000, 7000)
    scaled_loan = int(base_loan * (1 + loan_amount_factors[purpose]))
    loan_amount.append(max(2000, scaled_loan))

# Credit score: normal draws (mean 650, sd 80) truncated to integers and
# bounded to the 300..850 FICO range.
credit_score = np.random.normal(650, 80, num_records).astype(int).clip(300, 850)

# Employment status: weighted draw.
# NOTE(review): 'Employed' is listed three times, each with weight 0.6, so its
# effective share is 1.8 / 2.0 = 90% (Student 5%, Unemployed 2.5%, Retired 2.5%)
# -- confirm this is the intended distribution.
employment_status_choices = ['Employed', 'Employed', 'Employed', 'Student', 'Unemployed', 'Retired']
employment_status_weights = [0.6, 0.6, 0.6, 0.1, 0.05, 0.05]
employment_status = random.choices(
    employment_status_choices,
    weights=employment_status_weights,
    k=num_records,
)


# Simulate the binary 'default' label with a hand-written additive heuristic
# (a stand-in for a real model). Risk rises for low income, young non-student
# borrowers, weak credit scores, unemployment, students borrowing for anything
# other than education, and loans that are large relative to income.
def default_probability(inc, yrs, amount, score, status, purpose):
    """Return the heuristic default probability for a single customer.

    Starts from a 5% base rate and adds a fixed increment for every risk
    condition that holds. Increments are applied in the order listed so the
    floating-point sum is accumulated the same way on every call.
    """
    risk_adjustments = (
        (inc < 40000, 0.1),                                     # low income
        (yrs < 25 and status != 'Student', 0.08),               # young and not a student
        (score < 600, 0.2),                                     # poor credit
        (600 <= score < 650, 0.05),                             # below-average credit
        (status == 'Unemployed', 0.3),                          # no employment
        (status == 'Student' and purpose != 'Education', 0.1),  # student, non-education loan
        (amount > inc * 0.3, 0.15),                             # loan exceeds 30% of income
    )

    prob = 0.05  # 5% base default rate
    for applies, bump in risk_adjustments:
        if applies:
            prob += bump

    # Keep the result inside [0, 1]
    return max(0, min(1, prob))


# One uniform draw per customer, in record order; the customer defaults (1)
# when the draw lands below their heuristic probability, otherwise 0.
default = [
    1 if random.random() < default_probability(inc, yrs, amount, score, status, purpose) else 0
    for inc, yrs, amount, score, status, purpose in zip(
        income, age, loan_amount, credit_score, employment_status, loan_purpose
    )
]

# Assemble the generated columns into a single table, one row per customer
df_large = pd.DataFrame(
    data={
        'customer_id': customer_id,
        'income': income,
        'age': age,
        'loan_amount': loan_amount,
        'credit_score': credit_score,
        'employment_status': employment_status,
        'loan_purpose': loan_purpose,
        'default': default,
    }
)

# Structural overview: column dtypes and non-null counts.
# info() emits its report itself, so it is not wrapped in print().
print("--- Generated DataFrame Info ---")
df_large.info()

# Leading rows as plain text
print("\n--- First 5 Rows ---", df_large.head(5), sep="\n")

# Share of each class in the target column
default_share = df_large['default'].value_counts(normalize=True)
print("\n--- Default Distribution ---", default_share, sep="\n")

# Summary statistics of the credit scores
credit_score_summary = df_large['credit_score'].describe()
print("\n--- Credit Score Distribution ---", credit_score_summary, sep="\n")

--- Generated DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   customer_id        1000 non-null   int64 
 1   income             1000 non-null   int64 
 2   age                1000 non-null   int32 
 3   loan_amount        1000 non-null   int64 
 4   credit_score       1000 non-null   int64 
 5   employment_status  1000 non-null   object
 6   loan_purpose       1000 non-null   object
 7   default            1000 non-null   int64 
dtypes: int32(1), int64(5), object(2)
memory usage: 58.7+ KB

--- First 5 Rows ---
   customer_id  income  age  loan_amount  credit_score employment_status  \
0            1   69934   63        10614           678          Employed   
1            2   57234   59        15755           677          Employed   
2            3   72953   30        20982           483          Employed   
3    

In [3]:
# Persist the generated dataset as CSV in the working directory,
# leaving out the DataFrame's row index.
df_large.to_csv(path_or_buf='credit_data.csv', index=False)