In [1]:
import numpy as np 
import pandas as pd
from sklearn.datasets import make_classification

In [2]:
#Set random seed for reproducibility
np.random.seed(42)

In [4]:
#Generate synthetic data

n_samples = 10000
X, y = make_classification(n_samples=n_samples, n_features=8, n_informative=5,
                          n_redundant=2, n_classes=2, weights=[0.7, 0.3],
                          random_state=42)

In [5]:
#Create a DataFrame

df = pd.DataFrame(X, columns=['income', 'age', 'employment_length', 'debt_to_income',
                             'credit_score', 'num_credit_lines', 'num_late_payments',
                             'loan_amount'])

In [7]:
#Add some noise and scale features to make them more realistic

df['income'] = np.exp(df['income'] * 2) *1000 + 20000
df['age'] = df['age'] * 10 + 40
df['employment_length'] = np.abs(df['employment_length'] * 5 + 2).astype(int)
df['debt_to_income'] = np.abs(df['debt_to_income'] * 0.2 + 0.3)
df['credit_score'] = np.clip(df['credit_score'] * 100 + 650, 300,850).astype(int)
df['num_credit_lines'] = np.abs(df['num_credit_lines'] *3 + 5).astype(int)
df['num_late_payments'] = np.abs(df['num_late_payments'] * 2).astype(int)
df['loan_amount'] = np.exp(df['loan_amount']) * 1000 + 5000

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
#Add target variable (0 for repaid, 1 for default)

df['loan_status'] = y

In [10]:
print(df.head())
print("\nDataset Info:")
print(df.info())

   income         age  employment_length  debt_to_income  credit_score  \
0     inf  485.564294                  5        0.258730           522   
1     inf  546.890627                  0        0.183107           850   
2     inf  401.933287                  0        0.534357           701   
3     inf  427.923534                  1        0.089264           850   
4     inf  677.152125                  8        0.082136           533   

   num_credit_lines  num_late_payments  loan_amount  loan_status  
0                10                  2  5073.069530            0  
1                 7                  5  6084.382345            0  
2                 3                  0  7249.086480            0  
3                13                  6  5564.358483            0  
4                 0                  2  5176.617390            0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Nul

In [13]:
#Save to CSV

df.to_csv('credit_scoring_dataset.csv', index=False)
print("\nDataset saved 'credit_scoring_dataset.csv'")


Dataset saved 'credit_scoring_dataset.csv'
