In [3]:
import os

import loan_default_data_processor as lp


In [4]:
# Data-preparation pipeline for the loan-default dataset:
# load -> split features/target -> train/test split -> scale -> balance -> persist.
# All heavy lifting is delegated to the project module `lp`; this cell only wires the
# stages together, reports shapes, and checks a few cheap invariants between stages.

# --- Configuration --------------------------------------------------------------
# Everything a reader might tune lives here instead of being scattered through the cell.
# The input location can be overridden with the LOAN_DATA_PATH environment variable so the
# notebook is runnable on machines other than the author's; the original absolute path is
# kept as the default so existing behaviour is unchanged when the variable is not set.
DEFAULT_PROCESSED_DATA_PATH = "/Users/mbq/Desktop/Project_Erdos/Data/processed_loan_data.csv"
TARGET_COLUMN = 'loan_outcome'
TEST_SIZE = 0.30
SEED = 42  # shared by the train/test split and the undersampler for reproducibility
PROCESSED_DIR = 'processed_data'
MODEL_DIR = 'models'

# Step 1: Load Processed Data
# `or` (rather than a .get default) also falls back when the variable is set but empty.
processed_data_input_path = os.environ.get("LOAN_DATA_PATH") or DEFAULT_PROCESSED_DATA_PATH
df = lp.load_processed_data(processed_data_input_path)
print(f"Loaded data with shape: {df.shape}")

# Step 2: Split Features and Target
X, y = lp.split_features_target(df, target_column=TARGET_COLUMN)
assert len(X) == len(y), "features and target must have the same number of rows"
print(f"Features shape: {X.shape}, Target shape: {y.shape}")

# Step 3: Perform Train-Test Split with Stratification
# NOTE(review): no stratify argument is passed here; stratification is presumably done
# inside lp.perform_train_test_split -- confirm against that helper.
X_train, X_test, y_train, y_test = lp.perform_train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
assert len(X_train) + len(X_test) == len(X), "train/test split must partition all rows"
assert TARGET_COLUMN not in X_train.columns, "target column leaked into the feature matrix"
print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")
print(f"Training target distribution:\n{y_train.value_counts()}")
print(f"Testing target distribution:\n{y_test.value_counts()}")

# Step 4: Identify Numerical Columns for Rescaling
# Only 64-bit float/int columns are selected; columns of any other dtype are left unscaled.
# NOTE(review): narrower numeric dtypes (e.g. int32/float32) would be skipped by this
# filter -- confirm that is intended for this dataset.
# NOTE(review): the printed list includes payment/recovery fields (e.g. total_pymnt,
# recoveries, last_pymnt_amnt) that look like post-outcome information -- confirm they do
# not leak the target before modelling.
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(f"Numerical columns to scale: {numerical_cols}")

# Step 5: Rescale Numerical Features
# NOTE(review): the scaler is presumably fit on the training split only -- confirm in
# lp.rescale_features.
X_train_scaled, X_test_scaled, scaler = lp.rescale_features(X_train, X_test, numerical_cols)
print("Rescaling complete.")

# Step 6: Balance the Training Data using Undersampling
# Only the training split is passed to the balancer; the test split is saved below as
# returned by the rescaling step.
X_train_balanced, y_train_balanced, undersampler = lp.balance_training_data(X_train_scaled, y_train, random_state=SEED)
assert len(X_train_balanced) == len(y_train_balanced), "balanced features/target are misaligned"
print(f"Balanced training set shape: {X_train_balanced.shape}, {y_train_balanced.shape}")
print(f"Balanced training target distribution:\n{y_train_balanced.value_counts()}")

# Step 7: Save the Processed Data
lp.save_processed_data(X_train_balanced, X_test_scaled, y_train_balanced, y_test, processed_dir=PROCESSED_DIR)

# Step 8: Save the Scaler and Undersampler Objects
# The fitted objects returned by steps 5 and 6 are handed to lp for saving under MODEL_DIR.
lp.save_scaler_and_sampler(scaler, undersampler, model_dir=MODEL_DIR)

print("Data processing complete.")


Loaded data with shape: (1136033, 180)
Features shape: (1136033, 179), Target shape: (1136033,)
Training set shape: (795223, 179), Testing set shape: (340810, 179)
Training target distribution:
loan_outcome
0    644485
1    150738
Name: count, dtype: int64
Testing target distribution:
loan_outcome
0    276208
1     64602
Name: count, dtype: int64
Numerical columns to scale: ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'loan_term', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_with