In [1]:
import argparse
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.xgboost
from mlflow.tracking import MlflowClient

In [2]:
# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.options.display.max_columns = None

In [3]:
# Create (or reuse) the local Spark session used by this inference notebook.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("BuildInference")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

# Raise the log threshold to ERROR so Spark warnings do not flood the cell output.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/10 17:06:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Model

In [4]:
def load_model(model_name="credit_model_LR_3", snapshot_date_str="2024-08-01", fallback_dir="/app/airflow/model_bank/"):
    """
    Load a model (and its scaler, when available) from the MLflow Registry,
    falling back to a local pickle file if the registry load fails.

    Args:
        model_name: Name of the model in the MLflow Registry.
        snapshot_date_str: Snapshot date string (e.g., "2024-08-01"); only used
            to build the fallback pickle file name.
        fallback_dir: Directory containing the fallback pickle files
            (with or without a trailing path separator).

    Returns:
        Tuple ``(model, scaler)``. ``scaler`` is ``None`` when no scaler could
        be found alongside the model.

    Raises:
        Exception: If the registry load fails and the pickle file does not exist.
        ValueError: If the pickle is a dict without a 'model' entry, or a tuple
            that does not unpack into exactly (model, scaler).
        Any other error raised while reading the pickle is logged and re-raised.
    """
    snapshot_date = snapshot_date_str.replace("-", "_")

    # Try loading from MLflow Registry
    try:
        mlflow.set_tracking_uri("http://mlflow:5000")
        print(f"üîç Attempting to load model from MLflow Registry: {model_name}")

        model = mlflow.sklearn.load_model(f"models:/{model_name}/latest")
        print(f"‚úÖ Successfully loaded model from MLflow Registry")

        # The scaler is optional: any failure below keeps the registry model
        # and only reports that no scaler is available.
        try:
            client = MlflowClient()
            versions = client.search_model_versions(f"name='{model_name}'")
            if versions:
                # Select the highest version number explicitly rather than
                # relying on the order in which the search returns results.
                latest_version = max(versions, key=lambda mv: int(mv.version))
                scaler_path = mlflow.artifacts.download_artifacts(
                    run_id=latest_version.run_id,
                    artifact_path="preprocessing/temp_scaler.pkl"
                )
                # SECURITY: pickle.load can execute arbitrary code; the artifact
                # store must be trusted.
                with open(scaler_path, 'rb') as f:
                    scaler = pickle.load(f)
                print(f"‚úÖ Successfully loaded scaler from MLflow")
                return model, scaler
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not load scaler from MLflow: {e}")
            return model, None

        # No registered versions were returned, so there is no run to fetch a scaler from.
        return model, None

    except Exception as e:
        print(f"‚ùå Failed to load from MLflow Registry: {e}")

        # Fallback to pickle file.
        # NOTE(review): the file name prefix is fixed to "credit_model_LR_" and
        # does not depend on model_name -- confirm this is intended.
        pickle_filename = f"credit_model_LR_{snapshot_date}.pkl"
        # os.path.join tolerates fallback_dir with or without a trailing slash.
        pickle_path = os.path.join(fallback_dir, pickle_filename)

        try:
            print(f"üîç Attempting to load from pickle: {pickle_path}")
            # SECURITY: pickle.load can execute arbitrary code; only point
            # fallback_dir at files produced by a trusted training job.
            with open(pickle_path, 'rb') as f:
                model_data = pickle.load(f)

            # Handle different pickle formats
            if isinstance(model_data, dict):
                model = model_data.get('model')
                scaler = model_data.get('scaler')
                if model is None:
                    # Fail here with a clear message instead of handing None to the caller.
                    raise ValueError(f"Pickle file has no 'model' entry: {pickle_path}")
                if scaler is not None:
                    print(f"‚úÖ Successfully loaded model and scaler from pickle (dict format)")
                else:
                    # Do not claim a scaler was loaded when the dict has none under 'scaler'.
                    print(f"‚úÖ Successfully loaded model from pickle (dict format); no 'scaler' entry found")
            elif isinstance(model_data, tuple):
                model, scaler = model_data
                print(f"‚úÖ Successfully loaded model and scaler from pickle (tuple format)")
            else:
                model = model_data
                scaler = None
                print(f"‚úÖ Successfully loaded model from pickle (single object)")

            return model, scaler

        except FileNotFoundError:
            print(f"‚ùå Pickle file not found: {pickle_path}")
            raise Exception(f"Could not load model from MLflow or pickle file: {pickle_path}")
        except Exception as e:
            print(f"‚ùå Failed to load from pickle: {e}")
            raise

In [5]:
# Usage example: load the model once and smoke-test it on random input.
# The resulting `model` and `scaler` globals are reused by the inference cells below.
if __name__ == "__main__":
    # Try to load model
    model, scaler = load_model(
        model_name="credit_model_LR_3",
        snapshot_date_str="2024-08-01",
        fallback_dir="/app/airflow/model_bank/"
    )

    print(f"\nModel type: {type(model).__name__}")
    # Explicit None checks: truthiness of an arbitrary estimator object is not
    # a reliable "was it loaded" signal.
    if scaler is not None:
        print(f"Scaler type: {type(scaler).__name__}")

    # Test prediction if model loaded successfully
    if model is not None:
        # Fall back to 10 features when the model does not expose n_features_in_.
        n_features = getattr(model, 'n_features_in_', 10)
        dummy_data = np.random.rand(1, n_features)

        if scaler is not None:
            dummy_data = scaler.transform(dummy_data)

        prediction = model.predict(dummy_data)
        print(f"\n‚úÖ Test prediction: {prediction}")

üîç Attempting to load model from MLflow Registry: credit_model_LR_3
‚ùå Failed to load from MLflow Registry: No such file or directory: '/mlflow/mlruns/6/models/m-300e00dc120a402cbc245d53cd9d574b/artifacts/.'
üîç Attempting to load from pickle: /app/airflow/model_bank/credit_model_LR_2024_08_01.pkl
‚úÖ Successfully loaded model and scaler from pickle (dict format)

Model type: LogisticRegression

‚úÖ Test prediction: [0]


## LOAD DATA

In [6]:
# --- set up config ---
# Shared run settings; the inference cells add the snapshot date entries later.
config = dict(model_name="LR")

In [7]:
# --- load feature store ---
# Read the whole gold feature store as a Spark DataFrame.
FEATURE_DIR = "/app/datamart/gold/feature_store"
features_store_sdf = spark.read.parquet(FEATURE_DIR)

# Report the total row count, then preview a single row.
feature_store_row_count = features_store_sdf.count()
print("row_count:", feature_store_row_count)

features_store_sdf.show(1)

                                                                                

row_count: 12500


                                                                                

+-----------+----------------+-------------+----------------+------------------+-----------+----------------------+-------------------+-------------------------------------------------+------------------------------------------------+-------------------------------------------------+------------------------------------------------+--------------------------------------------------+-------------------------------------------------+-------------------+---------------+--------------+----------------------+--------------------------------+--------------------------+-----------------------------+--------------------------+-------------------------+------------------------------------+------------------------+-------------+----------------+--------------+--------------+--------------+--------------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+-------------+-------------+-------------+-------------+-----------

### LOOP THROUGH ALL SNAPSHOT DATES

In [8]:
# Snapshot dates to score; one predictions CSV is written per date.
snapshot_dates = ['2024-09-01', '2024-10-01', '2024-11-01', '2024-12-01', '2025-01-01']
# Identifier/date columns that are excluded from the model inputs.
non_feature_cols = ['Customer_ID', 'application_date', 'snapshot_date']

# The output directory is the same for every snapshot, so prepare it once.
# Single quotes are used inside the f-string because reusing the outer double
# quotes is a SyntaxError on Python versions before 3.12.
prediction_directory = f"/app/datamart/gold/model_predictions/{config['model_name']}/"
os.makedirs(prediction_directory, exist_ok=True)

for snapshot_date in snapshot_dates:

    config["snapshot_date_str"] = snapshot_date
    config["snapshot_date"] = datetime.strptime(config["snapshot_date_str"], "%Y-%m-%d")

    # Select this snapshot's rows; fall back to application_date when the
    # snapshot_date filter cannot be built.
    date_col = "snapshot_date"
    try:
        features_sdf = features_store_sdf.filter(col(date_col) == config["snapshot_date"])
    except Exception as e:
        print(f"‚ö†Ô∏è Using application_date instead of snapshot_date due to: {e}")
        date_col = "application_date"
        features_sdf = features_store_sdf.filter(col(date_col) == config["snapshot_date"])

    # Collect once and take the row count from the pandas frame, instead of
    # running a separate Spark count() job over the same filter.
    features_pdf = features_sdf.toPandas()
    print("extracted features_sdf", len(features_pdf), config["snapshot_date"])

    if features_pdf.empty:
        # Nothing to score for this snapshot; skip rather than fail inside the model call.
        print(f"WARNING: no feature rows for {snapshot_date}; skipping inference")
        continue

    #---------------------------------------------------
    # PREPROCESS DATA
    #---------------------------------------------------

    # prepare X_inference
    feature_cols = [fe_col for fe_col in features_pdf.columns if fe_col not in non_feature_cols]
    X_features = features_pdf[feature_cols]

    # Apply scaler if it exists
    if scaler is not None:
        print("üîÑ Applying scaler transformation...")
        X_scaled = scaler.transform(X_features)
        print(f"‚úÖ Features scaled: {X_features.shape} -> {X_scaled.shape}")
    else:
        # NOTE(review): the model receives unscaled features on this path --
        # confirm that matches how it was trained.
        print("‚ö†Ô∏è  No scaler found, using raw")
        X_scaled = X_features

    #---------------------------------------------------
    # MAKE INFERENCE & PREPARE OUTPUT
    #---------------------------------------------------

    # Second column of predict_proba, one value per input row.
    y_inference = model.predict_proba(X_scaled)[:, 1]

    # prepare output; when the application_date fallback was used, expose that
    # column under the usual snapshot_date name so the output schema is stable.
    y_inference_pdf = features_pdf[["Customer_ID", date_col]].rename(columns={date_col: "snapshot_date"})
    y_inference_pdf["model_name"] = config["model_name"]
    y_inference_pdf["model_predictions"] = y_inference

    #---------------------------------------------------
    # SAVE TO GOLD LAYER
    #---------------------------------------------------

    snapshot_date_path = config["snapshot_date_str"].replace('-','_')
    print(prediction_directory)

    filepath = os.path.join(prediction_directory, f"predictions_{snapshot_date_path}.csv")
    y_inference_pdf.to_csv(filepath, index=False)

extracted features_sdf 493 2024-09-01 00:00:00
‚ö†Ô∏è  No scaler found, using raw
/app/datamart/gold/model_predictions/LR/




extracted features_sdf 456 2024-10-01 00:00:00
‚ö†Ô∏è  No scaler found, using raw
/app/datamart/gold/model_predictions/LR/




extracted features_sdf 488 2024-11-01 00:00:00
‚ö†Ô∏è  No scaler found, using raw
/app/datamart/gold/model_predictions/LR/




extracted features_sdf 515 2024-12-01 00:00:00
‚ö†Ô∏è  No scaler found, using raw
/app/datamart/gold/model_predictions/LR/




extracted features_sdf 526 2025-01-01 00:00:00
‚ö†Ô∏è  No scaler found, using raw
/app/datamart/gold/model_predictions/LR/




In [9]:
# --- end spark session ---
# NOTE(review): cells further down this notebook still call
# features_store_sdf.filter(...); on a top-to-bottom run they execute after this stop().
spark.stop()

print("\n\n---completed job---\n\n")




---completed job---




In [8]:
# Extract the feature rows for the snapshot date currently held in config.
target_date = config["snapshot_date"]
try:
    features_sdf = features_store_sdf.filter(col("snapshot_date") == target_date)
except Exception as filter_err:
    # Fall back to application_date when the snapshot_date filter cannot be built.
    print(f"‚ö†Ô∏è Using application_date instead of snapshot_date due to: {filter_err}")
    features_sdf = features_store_sdf.filter(col("application_date") == target_date)

print("extracted features_sdf", features_sdf.count(), target_date)

# Bring the filtered rows to the driver as a pandas DataFrame.
features_pdf = features_sdf.toPandas()

extracted features_sdf 493 2024-09-01 00:00:00


## PREPROCESS DATA

In [9]:
# --- preprocess data for modeling ---
# Model inputs are every column except the identifier and date columns;
# the original column order is kept.
excluded_cols = {'Customer_ID', 'application_date', 'snapshot_date'}
feature_cols = [name for name in features_pdf.columns if name not in excluded_cols]
X_features = features_pdf[feature_cols]

In [10]:
# Preview the first five rows of the model inputs.
X_features.head(5)

Unnamed: 0,Annual_Income,Outstanding_Debt,Credit_History_Age,Num_of_Loan,Num_of_Delayed_Payment,Delay_from_due_date,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Large_value_payments,Credit_Mix_Standard,Credit_Mix_Good,Credit_Mix_Bad,Type_of_Loan_Auto_Loan,Type_of_Loan_Credit_Builder_Loan,Type_of_Loan_Personal_Loan,Type_of_Loan_Home_Equity_Loan,Type_of_Loan_Mortgage_Loan,Type_of_Loan_Student_Loan,Type_of_Loan_Debt_Consolidation_Loan,Type_of_Loan_Payday_Loan,is_occu_known,age_band_Unknown,age_band_18_24,age_band_25_34,age_band_35_44,age_band_45_54,age_band_55,fe_1_sum_all,fe_2_sum_all,fe_3_sum_all,fe_4_sum_all,fe_5_sum_all,fe_6_sum_all,fe_7_sum_all,fe_8_sum_all,fe_9_sum_all,fe_10_sum_all,fe_11_sum_all,fe_12_sum_all,fe_13_sum_all,fe_14_sum_all,fe_15_sum_all,fe_16_sum_all,fe_17_sum_all,fe_18_sum_all,fe_19_sum_all,fe_20_sum_all,fe_1,fe_2,fe_3,fe_4,fe_5,fe_6,fe_7,fe_8,fe_9,fe_10,fe_11,fe_12,fe_13,fe_14,fe_15,fe_16,fe_17,fe_18,fe_19,fe_20
0,42153.128906,1027.640015,341.0,3,9,10,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,154868.234375,242.75,359.0,0,3,14,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,33209.269531,743.650024,209.0,2,4,24,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,14937.490234,3699.439941,165.0,7,15,43,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,90894.078125,49.52,239.0,4,13,9,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# Apply scaler if it exists
if scaler is not None:
    print("üîÑ Applying scaler transformation...")
    X_scaled = scaler.transform(X_features)
    print(f"‚úÖ Features scaled: {X_features.shape} -> {X_scaled.shape}")
else:
    # No scaler is fitted or applied on this path: the features are passed
    # through unchanged, so the message says "raw" (same wording as the loop cell).
    print("‚ö†Ô∏è  No scaler found, using raw")
    X_scaled = X_features

# Number of rows that will be scored.
print('X_features', X_features.shape[0])

‚ö†Ô∏è  No scaler found, using StandardScaler
X_features 493


In [29]:
# --- model prediction inference ---
# Keep the second column of predict_proba (one value per input row).
class_probabilities = model.predict_proba(X_scaled)
y_inference = class_probabilities[:, 1]
y_inference



array([0.00000000e+000, 1.71681326e-275, 1.00000000e+000, 0.00000000e+000,
       2.28692236e-283, 7.93198390e-200, 0.00000000e+000, 1.25513932e-276,
       5.96289160e-201, 5.62359156e-304, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 9.99972144e-001, 0.00000000e+000,
       0.00000000e+000, 2.50726778e-069, 0.00000000e+000, 1.68110205e-063,
       1.91593517e-060, 9.99999860e-001, 2.71116846e-137, 1.36993768e-076,
       6.92609659e-063, 0.00000000e+000, 3.83152365e-010, 2.31672666e-023,
       1.00000000e+000, 0.00000000e+000, 1.16097631e-203, 1.14559691e-071,
       9.99999502e-001, 0.00000000e+000, 5.09166590e-126, 5.28449752e-023,
       0.00000000e+000, 7.24890884e-151, 4.07639720e-306, 2.76562427e-164,
       0.00000000e+000, 0.00000000e+000, 5.24073582e-075, 0.00000000e+000,
       0.00000000e+000, 1.77718069e-069, 0.00000000e+000, 1.05665848e-230,
       2.13920490e-299, 1.18591176e-192, 3.41591799e-130, 4.33416447e-266,
       0.00000000e+000, 1

In [15]:
# prepare output: customer id and snapshot date, plus the model name and its prediction
y_inference_pdf = features_pdf[["Customer_ID", "snapshot_date"]].assign(
    model_name=config["model_name"],
    model_predictions=y_inference,
)

In [43]:
# Preview the first five prediction rows.
y_inference_pdf.head(5)

Unnamed: 0,Customer_ID,snapshot_date,model_name,model_predictions
0,CUS_0x1087,2024-09-01,LR,0.0
1,CUS_0x1138,2024-09-01,LR,0.0
2,CUS_0x121b,2024-09-01,LR,1.716813e-275
3,CUS_0x1232,2024-09-01,LR,1.0
4,CUS_0x140e,2024-09-01,LR,0.0


In [53]:
# --- save model inference to datamart gold table ---
# create gold datalake
snapshot_date_path = config["snapshot_date_str"].replace('-','_')
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
# NOTE(review): this cell nests a per-snapshot sub-directory, whereas the loop
# cell writes its CSVs directly under .../model_predictions/<model_name>/ --
# confirm which layout downstream readers expect.
prediction_directory = f"/app/datamart/gold/model_predictions/{config['model_name']}/predictions_{snapshot_date_path}"
print(prediction_directory)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(prediction_directory, exist_ok=True)

/app/datamart/gold/model_predictions/LR/predictions_2024_09_01


In [56]:
# Write the predictions for this snapshot as a CSV without the index column.
csv_name = f"predictions_{snapshot_date_path}.csv"
filepath = os.path.join(prediction_directory, csv_name)
y_inference_pdf.to_csv(filepath, index=False)

In [57]:
# --- end spark session ---
# Shut the Spark session down, then print a completion marker.
spark.stop()

print("\n\n---completed job---\n\n")



---completed job---




## END

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

# Point MLflow at the tracking server and open a client against it.
mlflow.set_tracking_uri("http://mlflow:5000")
client = MlflowClient()

# Fetch the experiments known to the tracking server and print one line per experiment.
experiments = client.search_experiments()

print("Available experiments:")
for experiment in experiments:
    exp_id = experiment.experiment_id
    exp_stage = experiment.lifecycle_stage
    print(f"  ID: {exp_id}, Name: {experiment.name}, Lifecycle: {exp_stage}")

Available experiments:
  ID: 6, Name: credit_model_LR_3, Lifecycle: active
  ID: 5, Name: credit_model_XGB_2, Lifecycle: active
  ID: 4, Name: credit_model_LR_2, Lifecycle: active
  ID: 3, Name: loan_default_prediction, Lifecycle: active
  ID: 0, Name: Default, Lifecycle: active


In [4]:
import mlflow
import numpy as np
from mlflow.tracking import MlflowClient

mlflow.set_tracking_uri("http://mlflow:5000")
client = MlflowClient()

# Run to inspect (hard-coded id).
run_id = "6e7db08463dc4107aab6c6f8f0d8b8ad"

print(f"Checking run: {run_id}\n")

# Top-level artifacts logged under the run.
artifacts = client.list_artifacts(run_id)

if not artifacts:
    print("‚ùå No artifacts found")
else:
    print("‚úÖ Artifacts found:")
    for artifact in artifacts:
        print(f"  - {artifact.path} (is_dir: {artifact.is_dir})")

        # Only directories get one extra level of listing.
        if not artifact.is_dir:
            continue
        sub_artifacts = client.list_artifacts(run_id, artifact.path)
        for sub in sub_artifacts:
            print(f"    - {sub.path}")

    # Load the model logged under this run's "model" artifact path.
    print("\n‚è±Ô∏è  Loading model...")
    model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
    print(f"‚úÖ Model loaded successfully! Type: {type(model).__name__}")

    # Smoke-test the model on one random row of the expected width.
    n_features = model.n_features_in_
    dummy_data = np.random.rand(1, n_features)
    prediction = model.predict(dummy_data)

    print(f"\n‚úÖ Test prediction successful!")
    print(f"   Features: {n_features}")
    print(f"   Prediction: {prediction}")

Checking run: 6e7db08463dc4107aab6c6f8f0d8b8ad

‚ùå No artifacts found


In [5]:
import mlflow
import numpy as np
from mlflow.tracking import MlflowClient

mlflow.set_tracking_uri("http://mlflow:5000")

registry_model_name = "credit_model_LR_3"

# Load the latest version from registry
print("Loading model from registry...")
model = mlflow.sklearn.load_model(f"models:/{registry_model_name}/latest")

# Report the highest version number the registry actually lists instead of a
# hard-coded value, which goes stale as soon as a new version is registered.
registered_versions = MlflowClient().search_model_versions(f"name='{registry_model_name}'")
latest_version = max((int(mv.version) for mv in registered_versions), default="unknown")

print(f"‚úÖ Model loaded! Version: {latest_version}")
print(f"   Features expected: {model.n_features_in_}")


# Test prediction on one random row of the expected width
dummy_data = np.random.rand(1, model.n_features_in_)
prediction = model.predict(dummy_data)
prediction_proba = model.predict_proba(dummy_data)

print(f"\n‚úÖ Prediction test passed!")
print(f"   Prediction: {prediction}")
print(f"   Probability: {prediction_proba}")

Loading model from registry...


OSError: No such file or directory: '/mlflow/mlruns/6/models/m-cb28067221ff4f0bba5a4948ebb01d33/artifacts/.'