Import libraries

In [1]:
import pandas as pd
import numpy as np

Load the dataset

In [2]:
data_path='credit_train.csv'

Normalise column names

In [3]:
def load_normalize(path):
    df = pd.read_csv(path)
    df.columns = [c.lower().strip().replace(' ','_') for c in df.columns]
    return df

In [4]:
df=load_normalize(data_path)

Basic Exploration

In [5]:
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 5 rows: ")
df.head()

Dataset shape: (100514, 19)
Columns: ['loan_id', 'customer_id', 'loan_status', 'current_loan_amount', 'term', 'credit_score', 'annual_income', 'years_in_current_job', 'home_ownership', 'purpose', 'monthly_debt', 'years_of_credit_history', 'months_since_last_delinquent', 'number_of_open_accounts', 'number_of_credit_problems', 'current_credit_balance', 'maximum_open_credit', 'bankruptcies', 'tax_liens']

First 5 rows: 


Unnamed: 0,loan_id,customer_id,loan_status,current_loan_amount,term,credit_score,annual_income,years_in_current_job,home_ownership,purpose,monthly_debt,years_of_credit_history,months_since_last_delinquent,number_of_open_accounts,number_of_credit_problems,current_credit_balance,maximum_open_credit,bankruptcies,tax_liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


Check Datatypes

In [6]:
print(f"\nData info: {df.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   loan_id                       100000 non-null  object 
 1   customer_id                   100000 non-null  object 
 2   loan_status                   100000 non-null  object 
 3   current_loan_amount           100000 non-null  float64
 4   term                          100000 non-null  object 
 5   credit_score                  80846 non-null   float64
 6   annual_income                 80846 non-null   float64
 7   years_in_current_job          95778 non-null   object 
 8   home_ownership                100000 non-null  object 
 9   purpose                       100000 non-null  object 
 10  monthly_debt                  100000 non-null  float64
 11  years_of_credit_history       100000 non-null  float64
 12  months_since_last_delinquent  46859 non-null

In [7]:
print(f"\nMissing Values: ")
print(df.isnull().sum())


Missing Values: 
loan_id                           514
customer_id                       514
loan_status                       514
current_loan_amount               514
term                              514
credit_score                    19668
annual_income                   19668
years_in_current_job             4736
home_ownership                    514
purpose                           514
monthly_debt                      514
years_of_credit_history           514
months_since_last_delinquent    53655
number_of_open_accounts           514
number_of_credit_problems         514
current_credit_balance            514
maximum_open_credit               516
bankruptcies                      718
tax_liens                         524
dtype: int64


Create a copy for cleaning

In [8]:
dfc = df.copy()

In [9]:
print(f"Null values for loan_status is: {dfc['loan_status'].isnull().sum()}")

Null values for loan_status is: 514


In [10]:
dfc=dfc.dropna(subset=['loan_status'])
print(f"After removing missing loan status: {len(dfc)} records")

After removing missing loan status: 100000 records


In [11]:
print(dfc.isnull().sum())

loan_id                             0
customer_id                         0
loan_status                         0
current_loan_amount                 0
term                                0
credit_score                    19154
annual_income                   19154
years_in_current_job             4222
home_ownership                      0
purpose                             0
monthly_debt                        0
years_of_credit_history             0
months_since_last_delinquent    53141
number_of_open_accounts             0
number_of_credit_problems           0
current_credit_balance              0
maximum_open_credit                 2
bankruptcies                      204
tax_liens                          10
dtype: int64


Data Range Validation

In [12]:
print("Validating data ranges before imputation: ")

print(f"Credit Score Range: {dfc['credit_score'].min():.0f} to {dfc['credit_score'].max():.0f}")
print(f"Loan Amount Range: ${dfc['current_loan_amount'].min():,.0f} to ${dfc['current_loan_amount'].max():,.0f}")
print(f"Annual Income Range: ${dfc['annual_income'].min():,.0f} to ${dfc['annual_income'].max():,.0f}")

Validating data ranges before imputation: 
Credit Score Range: 585 to 7510
Loan Amount Range: $10,802 to $99,999,999
Annual Income Range: $76,627 to $165,557,393


Credit scores must be from 300-850

In [13]:
print(f"\n1. Fixing Credit Scores:")
invalid_credit_count = ((dfc['credit_score'] < 300) | (dfc['credit_score'] > 850)).sum()
print(f"   Invalid credit scores found: {invalid_credit_count}")


1. Fixing Credit Scores:
   Invalid credit scores found: 4551


In [14]:
dfc.loc[dfc['credit_score'] < 300, 'credit_score'] = 300
dfc.loc[dfc['credit_score'] > 850, 'credit_score'] = 850
print(f"   ✓ Capped credit scores to 300-850 range")

   ✓ Capped credit scores to 300-850 range


Fix Loan Amounts (remove 999999999 placeholders)

In [15]:
print(f"\n2. Fixing Loan Amounts:")
placeholder_loans = (dfc['current_loan_amount'] >= 99999999).sum()
print(f"   Placeholder loan amounts found: {placeholder_loans}")


2. Fixing Loan Amounts:
   Placeholder loan amounts found: 11484


replace with realistic median

In [16]:
realistic_loans = dfc[(dfc['current_loan_amount'] > 1000) & (dfc['current_loan_amount'] < 1000000)]['current_loan_amount']
realistic_loan_median = realistic_loans.median()

In [17]:
dfc.loc[dfc['current_loan_amount'] >= 99999999, 'current_loan_amount'] = realistic_loan_median
print(f"   ✓ Replaced {placeholder_loans} placeholder values with ${realistic_loan_median:,.0f}")


   ✓ Replaced 11484 placeholder values with $267,344


Fix extreme incomes

In [18]:
print(f"\n3. Fixing Annual income")
extreme_income_count = (dfc['annual_income'] > 10000000).sum()
if extreme_income_count > 0:
    reasonable_incomes = dfc[(dfc['annual_income'] >= 10000) & (dfc['annual_income'] <= 1000000)]['annual_income']
    reasonable_income_median = reasonable_incomes.median()
    
    dfc.loc[dfc['annual_income'] > 10000000, 'annual_income'] = reasonable_income_median
    print(f"   ✓ Fixed {extreme_income_count} extreme income values")


3. Fixing Annual income
   ✓ Fixed 68 extreme income values


In [19]:
print(f"\nData ranges corrected!")
print(f"New Credit Score Range: {dfc['credit_score'].min():.0f} to {dfc['credit_score'].max():.0f}")
print(f"New Loan Amount Range: ${dfc['current_loan_amount'].min():,.0f} to ${dfc['current_loan_amount'].max():,.0f}")
print(f"New Income Range: ${dfc['annual_income'].min():,.0f} to ${dfc['annual_income'].max():,.0f}")



Data ranges corrected!
New Credit Score Range: 585 to 850
New Loan Amount Range: $10,802 to $789,250
New Income Range: $76,627 to $9,939,280


In [20]:
numerical_cols=[
                'credit_score',
                'annual_income',
                'monthly_debt',
                'years_of_credit_history',
                'months_since_last_delinquent',
                'number_of_open_accounts',
                'number_of_credit_problems',
                'current_credit_balance',
                'maximum_open_credit',
                'bankruptcies',
                'tax_liens']

In [21]:
imputation_cols = [col for col in numerical_cols if col != 'current_loan_amount']

In [22]:
for col in imputation_cols:
    missing_count = dfc[col].isnull().sum()
    if missing_count > 0:
        # Use median of existing values (now that extremes are removed)
        median_val = dfc[col].median()
        dfc[col] = dfc[col].fillna(median_val)
        print(f"   ✓ {col}: Imputed {missing_count:,} values with {median_val}")

print("\nNumerical imputation completed with realistic values!")

   ✓ credit_score: Imputed 19,154 values with 724.0
   ✓ annual_income: Imputed 19,154 values with 1171882.0
   ✓ months_since_last_delinquent: Imputed 53,141 values with 32.0
   ✓ maximum_open_credit: Imputed 2 values with 467874.0
   ✓ bankruptcies: Imputed 204 values with 0.0
   ✓ tax_liens: Imputed 10 values with 0.0

Numerical imputation completed with realistic values!


In [23]:
dfc.isnull().sum()

loan_id                            0
customer_id                        0
loan_status                        0
current_loan_amount                0
term                               0
credit_score                       0
annual_income                      0
years_in_current_job            4222
home_ownership                     0
purpose                            0
monthly_debt                       0
years_of_credit_history            0
months_since_last_delinquent       0
number_of_open_accounts            0
number_of_credit_problems          0
current_credit_balance             0
maximum_open_credit                0
bankruptcies                       0
tax_liens                          0
dtype: int64

Categorical imputation

In [24]:
impute_count = dfc['years_in_current_job'].isnull().sum()
dfc['years_in_current_job'].fillna('Unknown', inplace=True)
print(f"Imputed {impute_count} missing ‘years_in_current_job’ values with 'Unknown'")

Imputed 4222 missing ‘years_in_current_job’ values with 'Unknown'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfc['years_in_current_job'].fillna('Unknown', inplace=True)


In [25]:
print(dfc.isnull().sum())

loan_id                         0
customer_id                     0
loan_status                     0
current_loan_amount             0
term                            0
credit_score                    0
annual_income                   0
years_in_current_job            0
home_ownership                  0
purpose                         0
monthly_debt                    0
years_of_credit_history         0
months_since_last_delinquent    0
number_of_open_accounts         0
number_of_credit_problems       0
current_credit_balance          0
maximum_open_credit             0
bankruptcies                    0
tax_liens                       0
dtype: int64


In [26]:
dfc.to_csv("Cleaned_data.csv",index=False)
print("Cleaned dataset is saved")

Cleaned dataset is saved


In [27]:
dfc.to_csv('credit_cleaned.csv', index=False)
print("Saved fully cleaned data as 'credit_train_fully_cleaned.csv'")

Saved fully cleaned data as 'credit_train_fully_cleaned.csv'
