# Data Setup

## Imports

In [8]:
import pandas as pd
import numpy as np
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub

## Dataset Download & Setup

In [12]:
# --- Directories ---
# Both paths are relative to the notebook's current working directory.
raw_data_dir = Path("raw_data")
processed_data_dir = Path("processed_data")

# Create directories if not exist (parents=True keeps this working if the
# paths above are ever changed to nested locations)
raw_data_dir.mkdir(parents=True, exist_ok=True)
processed_data_dir.mkdir(parents=True, exist_ok=True)

# File path for raw dataset
raw_dataset = raw_data_dir / "loan_approval_dataset.csv"

# Check and download dataset if not already available.
# The local copy acts as a cache, so re-running the cell does not hit Kaggle again.
if raw_dataset.exists():
    print("✔️ Dataset is already downloaded.")
else:
    # The value returned by kagglehub is treated as a local filesystem path.
    dataset_path = Path(kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Handle possible "Data" subfolder
    data_root = dataset_path / "Data" if (dataset_path / "Data").exists() else dataset_path

    # Copy dataset into raw_data folder (top-level files only; subfolders are skipped)
    for item in data_root.iterdir():
        if item.is_file():
            shutil.copy2(item, raw_data_dir / item.name)

    # Fail here, with a descriptive message, rather than later inside pd.read_csv
    # if the download did not contain the expected CSV file.
    if not raw_dataset.exists():
        available_files = sorted(path.name for path in raw_data_dir.iterdir())
        raise FileNotFoundError(
            f"⚠ Expected file '{raw_dataset.name}' was not found after download; "
            f"files in '{raw_data_dir}': {available_files}"
        )

    print("✔️ Dataset successfully downloaded.")


✔️ Dataset is already downloaded.


## Load Dataset

In [7]:
# Read the raw CSV into a DataFrame, then preview its first five rows.
raw_df = pd.read_csv(filepath_or_buffer=raw_dataset)
print("✅ Data loaded successfully!")
raw_df.head(n=5)


✅ Data loaded successfully!


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


## Data Preprocessing

In [10]:
# Builds `processed_df` from `raw_df`: strips column names, drops the id column,
# imputes missing values, encodes the three categorical columns as 0/1 and
# standardizes the numeric features.

# 1. Clean column names
raw_df.columns = raw_df.columns.str.strip()

# 2. Drop non-informative columns
processed_df = raw_df.drop(columns=['loan_id'])

# 3. Handle missing values safely (no inplace warnings)
# Numeric columns get the median; everything else gets the most frequent value.
# is_numeric_dtype is used instead of `dtype == "object"` so that text columns
# stored with a non-object string dtype are not sent to .median().
for col in processed_df.columns:
    if pd.api.types.is_numeric_dtype(processed_df[col]):
        fill_value = processed_df[col].median()
    else:
        fill_value = processed_df[col].mode()[0]
    processed_df[col] = processed_df[col].fillna(fill_value)

# 4. Encode categorical variables
# Labels may carry stray surrounding whitespace (e.g. " Graduate"), so they are
# stripped before mapping; the encoding then works with or without that padding.
category_encodings = {
    'education': {"Graduate": 1, "Not Graduate": 0},
    'self_employed': {"Yes": 1, "No": 0},
    'loan_status': {"Approved": 1, "Rejected": 0},
}
for col, mapping in category_encodings.items():
    labels = processed_df[col].astype(str).str.strip()
    encoded = labels.map(mapping)
    # An unmapped label becomes NaN; raise instead of letting it propagate silently.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(labels[unmapped].unique())
        raise ValueError(f"Unexpected values in column '{col}': {unexpected}")
    processed_df[col] = encoded.astype("int64")

# 5. Standardize numerical features
numerical_features = [
    'no_of_dependents', 'income_annum', 'loan_amount', 'loan_term',
    'cibil_score', 'residential_assets_value', 'commercial_assets_value',
    'luxury_assets_value', 'bank_asset_value'
]

# NOTE(review): the scaler is fit on every row of the dataset. If the data is split
# into train/test afterwards, test-set statistics leak into the scaled features;
# consider fitting the scaler on the training rows only.
scaler = StandardScaler()
processed_df[numerical_features] = scaler.fit_transform(processed_df[numerical_features])

print("✔️ Preprocessing completed (no warnings).")
processed_df.head()




Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,-0.294102,1,0,1.617979,1.633052,0.192617,1.032792,-0.780058,2.877289,0.832028,0.930304,1
1,-1.473548,0,1,-0.34175,-0.324414,-0.508091,-1.061051,-0.733924,-0.631921,-0.694993,-0.515936,0
2,0.295621,1,0,1.439822,1.610933,1.594031,-0.54484,-0.0573,-0.107818,1.99652,2.407316,0
3,0.295621,1,0,1.119139,1.721525,-0.508091,-0.771045,1.649637,-0.381263,0.897943,0.899533,0
4,1.475067,0,1,1.689242,1.002681,1.594031,-1.264055,0.757724,0.735304,1.568075,0.007172,0


## Train/Test Split

In [11]:
# Target vector (y) is the encoded loan status; every other column is a feature (X).
y = processed_df['loan_status']
X = processed_df.drop(columns=['loan_status'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("✔️ Data successfully split:")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples:  {len(X_test)}")


✔️ Data successfully split:
Training samples: 3415
Testing samples:  854


## Save Processed Data

In [13]:
# Write the complete preprocessed table to disk (row index omitted).
full_output_path = processed_data_dir / "loan_approval_preprocessed.csv"
processed_df.to_csv(full_output_path, index=False)

# Re-attach the target column to each feature partition (aligned on the row
# index), then write each partition to its own CSV file.
train_df = X_train.assign(loan_status=y_train)
test_df = X_test.assign(loan_status=y_test)

for split_df, split_filename in ((train_df, "train.csv"), (test_df, "test.csv")):
    split_df.to_csv(processed_data_dir / split_filename, index=False)

print("✔️ Processed dataset and splits saved successfully.")


✔️ Processed dataset and splits saved successfully.


In [14]:
# --- Quick check of saved files ---
# Lists the name of every entry currently present in the processed-data folder.

print("Files saved in:", processed_data_dir)
for saved_entry in processed_data_dir.iterdir():
    entry_name = saved_entry.name
    print(" -", entry_name)


Files saved in: processed_data
 - loan_approval_preprocessed.csv
 - test.csv
 - train.csv
