In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# 1. Load the raw dataset.
# low_memory=False asks pandas to parse the file in a single pass instead of in
# internal chunks, which avoids mixed-type inference within a column.
raw_csv_path = '../data/raw_data.csv'
df = pd.read_csv(raw_csv_path, low_memory=False)
n_rows, n_cols = df.shape
print(f"Dataset Shape: {(n_rows, n_cols)}")

Dataset Shape: (100000, 28)


In [3]:
print("\n--- 1. AGE: Checking for Special Characters & Outliers ---")
# The PDF mentions ages like "23_" and "-500".
# Flag every entry pandas cannot parse as a number, then list the distinct offenders.
age_raw = df['Age']
age_unparseable = pd.to_numeric(age_raw, errors='coerce').isna()
non_numeric_age = age_raw[age_unparseable].unique()
print(f"Examples of dirty 'Age' values:\n{non_numeric_age[:10]}")
# NOTE(review): when the column holds strings, min() compares them
# lexicographically rather than numerically, so treat this as a rough check only.
print(f"Min Age (if strings allowed): {age_raw.min()}")



--- 1. AGE: Checking for Special Characters & Outliers ---
Examples of dirty 'Age' values:
['28_' '34_' '30_' '24_' '33_' '35_' '31_' '40_' '37_' '54_']
Min Age (if strings allowed): -500


In [5]:
print("\n--- 2. ANNUAL_INCOME: Checking for Regex issues ---")
# The PDF mentions "19114.12_" and currency symbols.
# Collect the distinct income entries whose text form carries an underscore.
income_as_text = df['Annual_Income'].astype(str)
income_has_underscore = income_as_text.str.contains('_', na=False)
dirty_income = df.loc[income_has_underscore, 'Annual_Income'].unique()
print(f"Examples of dirty 'Annual_Income' values:\n{dirty_income[:10]}")


--- 2. ANNUAL_INCOME: Checking for Regex issues ---
Examples of dirty 'Annual_Income' values:
['34847.84_' '30689.89_' '35547.71_' '34081.38_' '114838.41_' '88640.24_'
 '54392.16_' '8701.545_' '25546.26_' '92047.08_']


In [6]:
print("\n--- 3. NUM_OF_LOAN: Checking for invalid formats ---")
# The PDF mentions this column is loaded as a string because of stray characters.
# Keep only the entries whose text form contains an underscore.
loan_counts = df['Num_of_Loan']
underscore_rows = loan_counts.astype(str).str.contains('_', na=False)
dirty_loans = loan_counts[underscore_rows].unique()
print(f"Examples of dirty 'Num_of_Loan' values:\n{dirty_loans[:10]}")


--- 3. NUM_OF_LOAN: Checking for invalid formats ---
Examples of dirty 'Num_of_Loan' values:
['0_' '3_' '2_' '5_' '8_' '9_' '4_' '7_' '1_' '6_']


In [7]:
print("\n--- 4. NUM_BANK_ACCOUNTS: Extreme Outliers ---")
# PDF mentions outliers like 1798 accounts.
# Quick check only: anything that does not parse as a number becomes NaN.
accounts = pd.to_numeric(df['Num_Bank_Accounts'], errors='coerce')
max_accounts = accounts.max()
# NaN compares False against 100, so unparseable rows are not counted.
over_100_count = int((accounts > 100).sum())
print(f"Maximum Bank Accounts found: {max_accounts}")
print(f"Count of users with > 100 accounts: {over_100_count}")


--- 4. NUM_BANK_ACCOUNTS: Extreme Outliers ---
Maximum Bank Accounts found: 1798
Count of users with > 100 accounts: 1246


In [15]:
print("\n--- 5. TARGET VARIABLE: Class Balance ---")
# Share of rows per Credit_Score class (fractions, not raw counts).
class_shares = df['Credit_Score'].value_counts(normalize=True)
print(class_shares)


--- 5. TARGET VARIABLE: Class Balance ---
Credit_Score
Standard    0.53174
Poor        0.28998
Good        0.17828
Name: proportion, dtype: float64


In [9]:

import sys
import os


def add_to_sys_path(directory):
    """Append `directory` to sys.path unless it is already listed.

    Re-running the cell therefore does not pile up duplicate entries.

    Args:
        directory: Path string to make importable.

    Returns:
        The same `directory` string, for convenient printing/chaining.
    """
    if directory not in sys.path:
        sys.path.append(directory)
    return directory


# Add the 'src' directory to Python's path.
# Resolved relative to the current working directory: one level up, then into
# 'src' (assumes the kernel was started from a sibling folder such as 'notebooks').
src_dir = add_to_sys_path(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

print(src_dir)  # Should print /.../project_alpha/src

/home/shubham/Documents/projects/ml-zoomcamp/project_alpha/src


In [11]:
from project_alpha.cleaning import RegexCleaner, OutlierCapper

In [12]:
# Take the first 20 rows of the four numeric-looking columns as an independent copy.
numeric_like_cols = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_Bank_Accounts']
sample_df = df.loc[:, numeric_like_cols].head(20).copy()

print("--- Before Cleaning ---")
first_raw_row = sample_df.iloc[0]
print(first_raw_row)  # expected to still be raw, uncleaned values (object dtype)


--- Before Cleaning ---
Age                        23
Annual_Income        19114.12
Num_of_Loan                 4
Num_Bank_Accounts           3
Name: 0, dtype: object


In [13]:
# Run RegexCleaner over the four numeric-looking columns of the sample.
cols_to_clean = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_Bank_Accounts']
cleaner = RegexCleaner(columns=cols_to_clean)
sample_cleaned = cleaner.transform(sample_df)

print("\n--- After Regex Cleaning ---")
# A single row mixes several columns, so pandas reports one common dtype for it;
# the per-column dtype is printed separately below.
print(sample_cleaned.iloc[0])
age_dtype = sample_cleaned['Age'].dtype
print(f"Age Type: {age_dtype}")


--- After Regex Cleaning ---
Age                     23.00
Annual_Income        19114.12
Num_of_Loan              4.00
Num_Bank_Accounts        3.00
Name: 0, dtype: float64
Age Type: int64


In [14]:
# 2. Test OutlierCapper
# Inject known outliers into a COPY so that `sample_cleaned` stays untouched:
# later cells reuse it, and this cell then gives the same result on every re-run.
forced_age = -500
forced_accounts = 1500
outlier_sample = sample_cleaned.copy()
outlier_sample.loc[0, 'Age'] = forced_age
outlier_sample.loc[0, 'Num_Bank_Accounts'] = forced_accounts

capper = OutlierCapper()
# NOTE(review): presumably fit() learns the replacement statistics (e.g. medians);
# confirm against project_alpha.cleaning.
capper.fit(outlier_sample)
final_sample = capper.transform(outlier_sample)

print("\n--- After Outlier Handling ---")
print(f"Old Age: {forced_age} -> New Age: {final_sample.loc[0, 'Age']}")
print(f"Old Accounts: {forced_accounts} -> New Accounts: {final_sample.loc[0, 'Num_Bank_Accounts']}")


--- After Outlier Handling ---
Old Age: -500 -> New Age: 28
Old Accounts: 1500 -> New Accounts: 20


In [15]:
#features

from project_alpha.features import MissingValueImputer, FeatureEngineer

In [16]:
# 1. Test Missing Value Imputation
# Extend a copy of the cleaned sample with dummy occupations and a balance
# column that alternates between missing and 300.0.
print("\n--- Testing Missing Value Imputer ---")
test_missing = sample_cleaned.assign(
    Occupation=['Engineer', 'Teacher'] * 10,      # Dummy occupations
    Monthly_Balance=[np.nan, 300.0] * 10,         # Create holes
)

imputer = MissingValueImputer()
imputer.fit(test_missing)
test_filled = imputer.transform(test_missing)

missing_before = test_missing['Monthly_Balance'].isna().sum()
missing_after = test_filled['Monthly_Balance'].isna().sum()
first_row_balance = test_filled.iloc[0]['Monthly_Balance']
print(f"Original Missing Count: {missing_before}")
print(f"Filled Missing Count: {missing_after}")
print(f"Imputed Value (Engineer): {first_row_balance}")
# Note: every 'Engineer' row is NaN in this tiny sample, so fit() has no
# Engineer balance to learn from; it might fall back to a global median or 0
# depending on the data. Fitting on the real full dataset should work correctly.


--- Testing Missing Value Imputer ---
Original Missing Count: 10
Filled Missing Count: 0
Imputed Value (Engineer): 300.0


In [18]:
# 2. Test Feature Engineering
print("\n--- Testing Financial Ratios ---")
# 'Outstanding_Debt' was not part of the small sample_cleaned frame, so pull the
# required columns straight from the main df and clean them here for the test.
ratio_input_cols = ['Annual_Income', 'Outstanding_Debt', 'Num_Credit_Card', 'Monthly_Inhand_Salary']
raw_ratio_sample = df[ratio_input_cols].head(5).copy()
cleaner_debt = RegexCleaner(columns=ratio_input_cols)
sample_w_debt = cleaner_debt.transform(raw_ratio_sample)

engineer = FeatureEngineer()
sample_engineered = engineer.transform(sample_w_debt)

# Show only the derived ratio columns.
ratio_cols = ['DTI_Ratio', 'Utilization_Proxy', 'Income_Stability']
print(sample_engineered[ratio_cols].head())


--- Testing Financial Ratios ---
   DTI_Ratio  Utilization_Proxy  Income_Stability
0   0.042374           0.040497          0.145644
1   0.042374           0.040497          0.999948
2   0.042374           0.040497          0.999948
3   0.042374           0.040497          0.999948
4   0.042374           0.040497          0.145644
