1️⃣ Setup and Imports

In [1]:
# If running in Google Colab, mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt  # For visualization
import os  # For file operations
import pickle  # For saving and loading Python objects

# Importing Scikit-learn modules
from sklearn.model_selection import train_test_split  # For splitting datasets
from sklearn.base import BaseEstimator, TransformerMixin  # For custom transformers
from sklearn.pipeline import Pipeline  # For creating ML pipelines


Mounted at /content/drive


2️⃣ Download and Load the Dataset

In [2]:
# Download the dataset (if applicable)
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Ejs0yaRm3NxFOVIhwQphoDz8voJl6NQx' -O loanpred_train.csv

# Load the dataset into a Pandas DataFrame
df = pd.read_csv("loanpred_train.csv")

# Display the first few rows to check the data
df.head()


--2025-02-17 22:34:03--  https://docs.google.com/uc?export=download&id=1Ejs0yaRm3NxFOVIhwQphoDz8voJl6NQx
Resolving docs.google.com (docs.google.com)... 173.194.194.139, 173.194.194.138, 173.194.194.113, ...
Connecting to docs.google.com (docs.google.com)|173.194.194.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1Ejs0yaRm3NxFOVIhwQphoDz8voJl6NQx&export=download [following]
--2025-02-17 22:34:04--  https://drive.usercontent.google.com/download?id=1Ejs0yaRm3NxFOVIhwQphoDz8voJl6NQx&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.201.132, 2607:f8b0:4001:c01::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.201.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33679 (33K) [application/octet-stream]
Saving to: ‘loanpred_train.csv’


2025-02-17 22:34:06 (83.7 MB/s) - ‘loanpred_train.csv’ saved [3

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001535,Male,No,0,Graduate,No,3254,0.0,50.0,360.0,1.0,Urban,Y
1,LP001792,Male,Yes,1,Graduate,No,3315,0.0,96.0,360.0,1.0,Semiurban,Y
2,LP002443,Male,Yes,2,Graduate,No,3340,1710.0,150.0,360.0,0.0,Rural,N
3,LP002517,Male,Yes,1,Not Graduate,No,2653,1500.0,113.0,180.0,0.0,Rural,N
4,LP001894,Male,Yes,0,Graduate,No,2620,2223.0,150.0,360.0,1.0,Semiurban,Y


3️⃣ Data Exploration

In [3]:
# Display basic dataset information
df.info()

# Check for missing values
df.isnull().sum()

# Display summary statistics
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            491 non-null    object 
 1   Gender             480 non-null    object 
 2   Married            488 non-null    object 
 3   Dependents         483 non-null    object 
 4   Education          491 non-null    object 
 5   Self_Employed      464 non-null    object 
 6   ApplicantIncome    491 non-null    int64  
 7   CoapplicantIncome  491 non-null    float64
 8   LoanAmount         471 non-null    float64
 9   Loan_Amount_Term   479 non-null    float64
 10  Credit_History     448 non-null    float64
 11  Property_Area      491 non-null    object 
 12  Loan_Status        491 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 50.0+ KB


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,491.0,491.0,471.0,479.0,448.0
mean,5529.997963,1569.537271,147.309979,341.286013,0.850446
std,6457.784318,2789.523475,87.096507,65.855043,0.357032
min,210.0,0.0,9.0,12.0,0.0
25%,2906.0,0.0,100.0,360.0,1.0
50%,3859.0,1032.0,128.0,360.0,1.0
75%,5825.0,2241.0,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


4️⃣ Data Preprocessing

In [5]:
# Filling missing numerical values with the mean
for column in df.select_dtypes(include=np.number):
    # Check if the column has any missing values before attempting to fill them
    if df[column].isnull().any():
        df[column].fillna(df[column].mean(), inplace=True)

# Filling missing categorical values with the mode
for col in df.select_dtypes(include=["object"]).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Convert categorical columns to category data type for better memory usage
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype("category")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


5️⃣ Train-Test Split

In [8]:
# Define features (X) and target variable (y)
X = df.drop(columns=["Loan_Status"])  # Replace 'TargetColumn' with the actual column name
y = df["Loan_Status"]

# Split data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


6️⃣ Custom Transformer and Pipeline

In [9]:
# Define a custom transformer (example: standardize a numeric column)
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(lambda x: (x - x.mean()) / x.std() if x.dtype in [np.float64, np.int64] else x)

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ("custom_transform", CustomTransformer())
])

# Apply the transformation
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_test_transformed = preprocessing_pipeline.transform(X_test)




7️⃣ Save Processed Data for Future Use

In [10]:
# Save the processed dataset for later use
with open("processed_data.pkl", "wb") as f:
    pickle.dump((X_train_transformed, X_test_transformed, y_train, y_test), f)

# Load data from the pickle file (if needed later)
with open("processed_data.pkl", "rb") as f:
    X_train_loaded, X_test_loaded, y_train_loaded, y_test_loaded = pickle.load(f)
