In [1]:
import pandas as pd
from pathlib import Path

In [2]:
DATA_DIR= Path(".")
train_path = DATA_DIR / "credit_train.csv"
test_path = DATA_DIR / "credit_test.csv"
out_path = DATA_DIR / "bank_loans_master.csv" 

In [3]:
def load_normalise(path):
    df=pd.read_csv(path)
    df.columns=[c.strip().lower().replace(' ','_') for c in df.columns]
    return df

In [4]:
print("Loading and normalising train...")
train_norm=load_normalise(train_path)
print("Loading and normalising test...")
test_norm=load_normalise(test_path)

Loading and normalising train...
Loading and normalising test...


In [5]:
print("Reading files...")
train = train_norm
test=test_norm

Reading files...


In [6]:
print("Train shape: ",train.shape)
print("Test shape: ",test.shape)

Train shape:  (100514, 19)
Test shape:  (10353, 18)


In [7]:
train_cols= set(train.columns)
test_cols = set(test.columns)

In [8]:
print("Columns only in train: ",sorted(list(train_cols - test_cols)))
print("columns only in the test: ",sorted(list(train_cols-test_cols)))

Columns only in train:  ['loan_status']
columns only in the test:  ['loan_status']


In [9]:
train['__source']='train'
test['__source']='test'

In [10]:
all_cols=list(dict.fromkeys(list(train.columns)+list(test.columns)))
train = train.reindex(columns=all_cols)
test=test.reindex(columns=all_cols)

In [11]:
master = pd.concat([train,test],ignore_index=True)
print("Master shape: ",master.shape)

Master shape:  (110867, 20)


In [12]:
print("\nValue counts for __source:\n", master['__source'].value_counts())
print("\nTop missing counts:\n", master.isna().sum().sort_values(ascending=False).head(20))


Value counts for __source:
 __source
train    100514
test      10353
Name: count, dtype: int64

Top missing counts:
 months_since_last_delinquent    59314
credit_score                    22002
annual_income                   22002
loan_status                     10867
years_in_current_job             5516
bankruptcies                     1093
tax_liens                         878
maximum_open_credit               869
loan_id                           867
current_credit_balance            867
number_of_credit_problems         867
number_of_open_accounts           867
monthly_debt                      867
years_of_credit_history           867
customer_id                       867
purpose                           867
home_ownership                    867
term                              867
current_loan_amount               867
__source                            0
dtype: int64


In [13]:
master.to_csv(out_path,index=False)
print(f"\nSaved combined master csv to {out_path.resolve()}")


Saved combined master csv to /Users/starboy/Documents/Projects/Bank_loan_status/Dataset/bank_loans_master.csv


In [14]:
import os
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
print("CSV files in current directory:")
for f in csv_files:
    print(f"  • {f}")


CSV files in current directory:
  • credit_test.csv
  • bank_loans_master.csv
  • credit_train.csv
