#### Imports

In [None]:
import pandas as pd
import numpy as np
import joblib
import boto3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

#### Load Data & Train-Test Split

In [7]:
#load dataset
df = pd.read_csv('loan-data.csv')

#features and target
X = df.drop('loan_status', axis=1)
y = df['loan_status']

#rain-test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution - Train: {y_train.value_counts().to_dict()}")

Training set: (36000, 17)
Test set: (9000, 17)
Class distribution - Train: {0: 28000, 1: 8000}


#### Feature Groups

In [8]:
numeric_features = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
categorical_ohe = ['person_gender', 'employment_type', 'person_home_ownership', 'loan_intent', 'account_type']
ordinal_features = ['person_education', 'previous_loan_defaults_on_file']
ordinal_categories = [['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate'], ['No', 'Yes']]

#### Preprocessing Pipeline

In [9]:
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features),
    ('cat_ohe', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
    ]), categorical_ohe),
    ('cat_ord', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
    ]), ordinal_features)
], remainder='drop')

print("Preprocessing pipeline created successfully")

Preprocessing pipeline created successfully


#### Train Best Model (Random Forest)

In [10]:
#create full pipeline with preprocessing + model
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight='balanced_subsample',
        random_state=42
    ))
])

#train model
print("Training Random Forest model...")
rf_pipeline.fit(X_train, y_train)
print("Model training completed!")

Training Random Forest model...
Model training completed!


#### Model Evaluation

In [11]:
#predictions on test set
y_pred = rf_pipeline.predict(X_test)

#calculate metrics
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("="*70)
print("MODEL PERFORMANCE ON TEST SET")
print("="*70)
print(f"Precision: {test_precision:.4f} (Primary Metric)")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-Score:  {test_f1:.4f}")
print("="*70)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

MODEL PERFORMANCE ON TEST SET
Precision: 0.9046 (Primary Metric)
Recall:    0.7440
F1-Score:  0.8165

Confusion Matrix:
[[6843  157]
 [ 512 1488]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      7000
           1       0.90      0.74      0.82      2000

    accuracy                           0.93      9000
   macro avg       0.92      0.86      0.88      9000
weighted avg       0.92      0.93      0.92      9000



#### Save Models Locally

In [12]:
#save complete pipeline (preprocessor + model)
joblib.dump(rf_pipeline, 'loan_approval_pipeline.pkl')
print("Saved: loan_approval_pipeline.pkl")
#save preprocessor separately (optional, for flexibility)
joblib.dump(preprocessor, 'loan_preprocessing_pipeline.pkl')
print("Saved: loan_preprocessing_pipeline.pkl")
#save trained model separately (optional)
joblib.dump(rf_pipeline.named_steps['classifier'], 'loan_approval_model.pkl')
print("Saved: loan_approval_model.pkl")

print("\nAll models saved successfully!")

Saved: loan_approval_pipeline.pkl
Saved: loan_preprocessing_pipeline.pkl
Saved: loan_approval_model.pkl

All models saved successfully!


#### Upload to S3

In [None]:
#initialize S3 client
s3 = boto3.client('s3')
bucket_name = '' #change to your bucket name

#upload files to S3
print("Uploading models to S3...")

s3.upload_file('loan_approval_pipeline.pkl', 
               bucket_name, 
               'models/loan_approval_pipeline.pkl')
print(f"Uploaded to s3://{bucket_name}/models/loan_approval_pipeline.pkl")

s3.upload_file('loan_preprocessing_pipeline.pkl', 
               bucket_name, 
               'models/loan_preprocessing_pipeline.pkl')
print(f"Uploaded to s3://{bucket_name}/models/loan_preprocessing_pipeline.pkl")

s3.upload_file('loan_approval_model.pkl', 
               bucket_name, 
               'models/loan_approval_model.pkl')
print(f"Uploaded to s3://{bucket_name}/models/loan_approval_model.pkl")

print("\nAll models successfully uploaded to S3!")

#### Test Model Loading (Verification)

In [None]:
#verify models can be loaded
loaded_pipeline = joblib.load('loan_approval_pipeline.pkl')

#test prediction on a sample
sample_prediction = loaded_pipeline.predict(X_test.iloc[:5])
print("Sample predictions:", sample_prediction)
print("Actual values:    ", y_test.iloc[:5].values)
print("\nModel loading and prediction successful!")