In [None]:
import kagglehub
import shutil
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import pickle
import os
import json
from datetime import datetime
import logging
import mlflow
from mlflow.models.signature import infer_signature

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version from iris dataset. Uncomment below.

# path = kagglehub.dataset_download("uciml/iris")
# print("Path to dataset files:", path)

# # Copy the files to data folder
# shutil.copytree(path, "./data", dirs_exist_ok=True)
# print("Dataset saved to ./data directory")

# # Clean up the temporary path
# shutil.rmtree(path)
# print("Temporary files cleaned up")


In [3]:
data = pd.read_csv('./data/Iris.csv')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

mlflow.set_tracking_uri("http://0.0.0.0:5000")  # Use the service name from docker-compose
mlflow.set_experiment("iris-classification-dev")


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1750984050631, experiment_id='1', last_update_time=1750984050631, lifecycle_stage='active', name='iris-classification-dev', tags={}>

In [5]:
# Custom transformer for data validation and cleaning
class DataValidator(BaseEstimator, TransformerMixin):
    def __init__(self, missing_threshold=0.3, variance_threshold=0.01):
        self.missing_threshold = missing_threshold
        self.variance_threshold = variance_threshold
        self.columns_to_drop = None
        self.feature_names = None
        
    def fit(self, X, y=None):
        logger.info("Starting data validation and cleaning...")
        
        # Check for missing values
        missing_counts = X.isnull().sum()
        missing_percentages = (missing_counts / len(X)) * 100
        
        logger.info(f"Missing values per column: {missing_percentages.to_dict()}")
        
        # Store columns to drop
        self.columns_to_drop = missing_percentages[missing_percentages > self.missing_threshold * 100].index
        
        if len(self.columns_to_drop) > 0:
            logger.warning(f"Dropping columns with >{self.missing_threshold*100}% missing values: {list(self.columns_to_drop)}")
        
        # Store feature names for variance check
        numeric_columns = X.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) > 0:
            variance_selector = VarianceThreshold(threshold=self.variance_threshold)
            variance_selector.fit(X[numeric_columns])
            support_mask = variance_selector.get_support()
            if support_mask is not None:
                low_variance_cols = numeric_columns[~support_mask]
                if len(low_variance_cols) > 0:
                    logger.warning(f"Columns with low variance: {list(low_variance_cols)}")
            else:
                logger.warning("Variance selector support mask is None")
        
        self.feature_names = X.columns
        logger.info("Data validation completed")
        return self
    
    def transform(self, X):
        # Drop columns with too many missing values
        if self.columns_to_drop is not None and len(self.columns_to_drop) > 0:
            X = X.drop(columns=self.columns_to_drop)
        return X


In [6]:
MODEL_NAME = "RandomForestClassifier"
MODEL_VERSION = "v0.0.0"
TEST_SIZE = 0.3
RANDOM_STATE = 42

In [7]:
#start mlflow run
mlflow.start_run()

<ActiveRun: >

In [None]:

# Splitting target from features
logger.info("Preparing features and target...")
features = data.drop(['Id', 'Species'], axis=1)
target = data['Species']

# Encode target variable
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Print mapping
print("Mapping dict:",
    {label: idx for idx, label in enumerate(label_encoder.classes_)})

# Create a machine learning pipeline with the following steps:
# 1. Data validation: Check for missing values and low variance features
# 2. Imputation: Fill missing values using mean strategy
# 3. Scaling: Standardize features to have zero mean and unit variance
# 4. Classification: Use Decision Tree classifier for prediction
pipeline = Pipeline([
    ('validator', DataValidator(missing_threshold=0.3, variance_threshold=0.01)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    features, target_encoded, test_size=TEST_SIZE, 
    random_state=RANDOM_STATE, stratify=target_encoded
)

logger.info(f"Training set size: {X_train.shape[0]}")
logger.info(f"Test set size: {X_test.shape[0]}")

# Fit the pipeline
logger.info("Fitting the pipeline...")
pipeline.fit(X_train, y_train)

# Cross-validation
logger.info("Performing cross-validation...")
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
logger.info(f"Cross-validation scores: {cv_scores}")
mean_cv_accuracy = cv_scores.mean()
logger.info(f"Mean CV accuracy: {mean_cv_accuracy:.4f} (+/- {cv_scores.std() * 2:.4f})")

# Test set evaluation
test_accuracy = pipeline.score(X_test, y_test)
logger.info(f"Test set accuracy: {test_accuracy:.4f}")

INFO:__main__:Preparing features and target...
INFO:__main__:Training set size: 105
INFO:__main__:Test set size: 45
INFO:__main__:Fitting the pipeline...


INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Performing cross-validation...
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed


Mapping dict: {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}


INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Cross-validation scores: [0.95238095 0.95238095 0.95238095 0.9047619  0.95238095]
INFO:__main__:Mean CV accuracy: 0.9429 (+/- 0.0381)
INFO:__main__:Test set accuracy: 0.9111


In [9]:
#pack encoder withn model
class ModelWithEncoder(mlflow.pyfunc.PythonModel):
    def __init__(self, model, label_encoder):
        self.model = model
        self.label_encoder = label_encoder

    def predict(self, model_input):
        # Handle string (JSON)
        if isinstance(model_input, str):
            model_input = json.loads(model_input)

        # Handle dict (single row)
        if isinstance(model_input, dict):
            model_input = pd.DataFrame([model_input])

        # Handle Series (single row)
        elif isinstance(model_input, pd.Series):
            model_input = model_input.to_frame().T

        # Handle flat list (single row) or list of lists (multiple rows)
        elif isinstance(model_input, list):
            # Infer feature count from training data (optional)
            if all(isinstance(x, (int, float)) for x in model_input):
                model_input = pd.DataFrame([model_input], columns=self.model.feature_names_in_)
            else:
                model_input = pd.DataFrame(model_input, columns=self.model.feature_names_in_)

        # Already a DataFrame
        elif not isinstance(model_input, pd.DataFrame):
            raise TypeError(f"Unsupported input type: {type(model_input)}")

        # Predict
        encoded_preds = self.model.predict(model_input)
        return self.label_encoder.inverse_transform(encoded_preds)



In [10]:
# Log the model and label encoder
rfc_model = ModelWithEncoder(pipeline, label_encoder)
input_example = pd.DataFrame(X_test[:5], columns=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])
signature = infer_signature(X_train, pipeline.predict(X_train))

# Log the model with MLflow
mlflow.pyfunc.log_model(
    python_model=rfc_model,
    name=MODEL_NAME,
    input_example=input_example,
    signature=signature,
    registered_model_name=MODEL_NAME
)
# Log all parameters and metrics to MLflow
mlflow.log_param("model_name", MODEL_NAME)
mlflow.log_param("model_version", MODEL_VERSION)
mlflow.log_param("test_size", TEST_SIZE)
mlflow.log_param("random_state", RANDOM_STATE)
mlflow.log_param("packed_model", True)
mlflow.log_metric("mean_cv_accuracy", mean_cv_accuracy)
mlflow.log_metric("test_set_accuracy", test_accuracy)

# Ende MlFlow experiment run
mlflow.end_run()

2025/06/27 18:45:23 INFO mlflow.pyfunc: Validating input example against model signature
Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2025/06/27 18:45:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 38
Created version '38' of model 'RandomForestClassifier'.


🏃 View run casual-stag-913 at: http://0.0.0.0:5000/#/experiments/1/runs/63fe3d862b9f40448fe12f23f3ff7ef7
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/1


In [11]:
# # Saving model locally. Uncomment below.

# # Create model directory
# os.makedirs("./model", exist_ok=True)

# # Save the model with timestamp
# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# model_filename = f"./model/decision_tree_{timestamp}.pkl"
# label_encoder_filename = f"./model/label_encoder_{timestamp}.pkl"

# with open(model_filename, 'wb') as f:
#     pickle.dump(pipeline, f)

# with open(label_encoder_filename, 'wb') as f:
#     pickle.dump(label_encoder, f)

# logger.info(f"Model saved to: {model_filename}")
# logger.info(f"Label encoder saved to: {label_encoder_filename}")

# # Save target_encoded for model serving
# target_encoded_filename = f"./model/target_encoded_{timestamp}.pkl"

# with open(target_encoded_filename, 'wb') as f:
#     pickle.dump(target_encoded, f)

# logger.info(f"Target encoded data saved to: {target_encoded_filename}")


In [12]:
# Showing differences between unpacked and packed models

# Unpacked
input = [[0.2,5.1,3.5,1.4]]

result = pipeline.predict(input)
logger.info(f"[Unpacked] Predicted species for input {input}: {result}")
result_species = label_encoder.inverse_transform(result)
logger.info(f"[Unpacked] Predicted species for input {input}: {result_species[0]}")

# Packed
input = {
    "SepalLengthCm": 5.1,
    "SepalWidthCm": 3.5,
    "PetalLengthCm": 1.4,
    "PetalWidthCm": 0.2
}

result_packed = rfc_model.predict(model_input=input)
logger.info(f"[Packed] Predicted species for input {input}: {result_packed[0]}")

INFO:__main__:[Unpacked] Predicted species for input [[0.2, 5.1, 3.5, 1.4]]: [1]
INFO:__main__:[Unpacked] Predicted species for input [[0.2, 5.1, 3.5, 1.4]]: Iris-versicolor
INFO:__main__:[Packed] Predicted species for input {'SepalLengthCm': 5.1, 'SepalWidthCm': 3.5, 'PetalLengthCm': 1.4, 'PetalWidthCm': 0.2}: Iris-setosa


In [13]:
# Import MLflow client for model registry operations
from mlflow import MlflowClient

# Initialize MLflow client to interact with model registry
client = MlflowClient()
# del label_encoder
# Check the production model version using alias
try:
    # Get model version associated with 'production' alias
    model_info = client.get_model_version_by_alias(MODEL_NAME, "production")
    logger.info(f"Successfully retrieved production model version: {model_info.version}")
except Exception as e:
    # Log error if unable to retrieve production model version
    logger.error(f"Failed to retrieve production model version: {e}")

INFO:__main__:Successfully retrieved production model version: 36


In [14]:
print(model_info)

<ModelVersion: aliases=['production'], creation_timestamp=1751039742191, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1751039742191, metrics=None, model_id=None, name='RandomForestClassifier', params=None, run_id='893b7e745bac4159a81e753e25098294', run_link='', source='models:/m-5a66a96ce3c242e6ab72398f70a0d17e', status='READY', status_message=None, tags={}, user_id='', version='36'>


In [15]:
# Load production model from MLflow model registry for inference
logger.info(f"Loading production model from registry: {MODEL_NAME}")
model_uri = f"models:/{MODEL_NAME}@production"
model = mlflow.pyfunc.load_model(model_uri)
logger.info("Production model loaded successfully for inference")

INFO:__main__:Loading production model from registry: RandomForestClassifier


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 11.88it/s]
INFO:__main__:Production model loaded successfully for inference


In [16]:
# Predict and decode
input = {
    "SepalLengthCm": 5.1,
    "SepalWidthCm": 3.5,
    "PetalLengthCm": 1.4,
    "PetalWidthCm": 0.2
}
pred = model.predict(input)

print("Decoded predictions:", pred)

Decoded predictions: ['Iris-setosa']


In [None]:

# Train a more robust model using the existing pipeline structure

# Import additional libraries for more robust models
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

logger.info("Training a more robust RandomForest model...")
mlflow.set_tracking_uri("http://0.0.0.0:5000")  # Use the service name from docker-compose
mlflow.set_experiment("iris-classification-dev")

# Log the robust model with a different name
ROBUST_MODEL_NAME = "RobustRandomForestClassifier"
TEST_SIZE = 0.5

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    features, target_encoded, test_size=TEST_SIZE, 
    random_state=RANDOM_STATE, stratify=target_encoded
)


# Create a heavier RandomForest classifier 
# Forcing an Overfit
robust_classifier = RandomForestClassifier(
    n_estimators=500,  
    max_depth=15,      
    min_samples_split=2,  
    min_samples_leaf=5,   
    max_features='sqrt',  
    bootstrap=True,       
    oob_score=True,       
    random_state=42,
    n_jobs=-1            
)

# Use the existing pipeline structure with the robust classifier
robust_pipeline = Pipeline([
    ('validator', DataValidator(missing_threshold=0.3, variance_threshold=0.01)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', robust_classifier)
])

# Train the robust model
logger.info("Fitting the robust RandomForest pipeline...")
robust_pipeline.fit(X_train, y_train)

# Evaluate the robust model
y_pred_robust = robust_pipeline.predict(X_test)
robust_accuracy = accuracy_score(y_test, y_pred_robust)

# Cross-validation for robust model
cv_scores_robust = cross_val_score(robust_pipeline, X_train, y_train, cv=5)
logger.info(f"Robust model CV scores: {cv_scores_robust}")
logger.info(f"Robust model mean CV accuracy: {cv_scores_robust.mean():.4f} (+/- {cv_scores_robust.std() * 2:.4f})")
logger.info(f"Robust model test accuracy: {robust_accuracy:.4f}")

# Get out-of-bag score if available
if hasattr(robust_classifier, 'oob_score_'):
    logger.info(f"Out-of-bag score: {robust_classifier.oob_score_:.4f}")

# Get predictions from the original simple model for comparison
# Load the original model from MLflow registry
original_model_uri = "models:/RandomForestClassifier/latest"
original_model = mlflow.pyfunc.load_model(original_model_uri)

# Convert X_test to the format expected by the MLflow model (dictionary format)
X_test_dict = []
for i in range(len(X_test)):
    sample = {
        "SepalLengthCm": X_test.iloc[i, 0],
        "SepalWidthCm": X_test.iloc[i, 1], 
        "PetalLengthCm": X_test.iloc[i, 2],
        "PetalWidthCm": X_test.iloc[i, 3]
    }
    X_test_dict.append(sample)

# Get predictions from the original model
y_pred_simple_raw = original_model.predict(X_test_dict)
logger.info(f"Original model predictions shape: {len(y_pred_simple_raw)}")
logger.info(f"Original model predictions sample: {y_pred_simple_raw[:5]}")

# Convert predictions to numeric format for accuracy calculation
# The MLflow model returns string predictions, so we need to convert them back to numeric
y_pred_simple_numeric = []
for pred in y_pred_simple_raw:
    if isinstance(pred, list):
        pred = pred[0]  # Extract from list if needed
    # Convert string prediction back to numeric using label_encoder
    numeric_pred = label_encoder.transform([pred])[0]
    y_pred_simple_numeric.append(numeric_pred)

y_pred_simple_numeric = np.array(y_pred_simple_numeric)
simple_accuracy = accuracy_score(y_test, y_pred_simple_numeric)

# Compare with simple model
logger.info(f"Simple model test accuracy: {simple_accuracy:.4f}")
logger.info(f"Robust model improvement: {robust_accuracy - simple_accuracy:.4f}")

# Reuse the existing ModelWithEncoder class instead of creating a new one
robust_pyfunc_model = ModelWithEncoder(robust_pipeline, label_encoder)

with mlflow.start_run(run_name="robust_randomforest_training"):
    # Log parameters
    mlflow.log_params({
        "model_type": "robust_random_forest",
        "n_estimators": 500,
        "max_depth": 15,
        "min_samples_split": 5,
        "min_samples_leaf": 3,
        "max_features": "sqrt",
        "bootstrap": True,
        "oob_score": True
    })
    
    # Log metrics
    mlflow.log_metrics({
        "test_accuracy": robust_accuracy,
        "cv_mean_accuracy": cv_scores_robust.mean(),
        "cv_std_accuracy": cv_scores_robust.std(),
        "improvement_over_simple": robust_accuracy - simple_accuracy
    })
    
    # Log the robust model using pyfunc with the same strategy as the original model
    mlflow.pyfunc.log_model(
        python_model=robust_pyfunc_model,
        name=ROBUST_MODEL_NAME,
        input_example=input_example,
        signature=signature,
        registered_model_name=ROBUST_MODEL_NAME
    )

logger.info(f"Robust RandomForest model logged as: {ROBUST_MODEL_NAME}")


# Load and test the robust model using pyfunc
robust_model_uri = f"models:/{ROBUST_MODEL_NAME}/latest"
robust_model = mlflow.pyfunc.load_model(robust_model_uri)

# Test predictions
test_inputs = [
    {"SepalLengthCm": 5.1, "SepalWidthCm": 3.5, "PetalLengthCm": 1.4, "PetalWidthCm": 0.2},
    {"SepalLengthCm": 6.3, "SepalWidthCm": 3.3, "PetalLengthCm": 4.7, "PetalWidthCm": 1.6},
    {"SepalLengthCm": 7.2, "SepalWidthCm": 3.0, "PetalLengthCm": 5.8, "PetalWidthCm": 1.6}
]

logger.info("Testing robust RandomForest model predictions:")
for i, test_input in enumerate(test_inputs):
    prediction = robust_model.predict(test_input)
    logger.info(f"Input {i+1}: {test_input} -> Prediction: {prediction}")


# Create a comparison summary
comparison_data = {
    'Model': ['Simple RandomForest', 'Robust RandomForest'],
    'Test Accuracy': [simple_accuracy, robust_accuracy],
    'CV Mean': [cv_scores.mean(), cv_scores_robust.mean()],
    'CV Std': [cv_scores.std(), cv_scores_robust.std()],
    'Model Complexity': ['Low', 'Medium'],
    'Training Time': ['Fast', 'Medium']
}

comparison_df = pd.DataFrame(comparison_data)
print("Model Comparison Summary:")
print(comparison_df.to_string(index=False))


INFO:__main__:Training a more robust RandomForest model...


INFO:__main__:Fitting the robust RandomForest pipeline...
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleaning...
INFO:__main__:Missing values per column: {'SepalLengthCm': 0.0, 'SepalWidthCm': 0.0, 'PetalLengthCm': 0.0, 'PetalWidthCm': 0.0}
INFO:__main__:Data validation completed
INFO:__main__:Starting data validation and cleanin

🏃 View run robust_randomforest_training at: http://0.0.0.0:5000/#/experiments/1/runs/0d5637253a844b6b88435e6820eee436
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/1


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 39.28it/s]  
INFO:__main__:Testing robust RandomForest model predictions:
INFO:__main__:Input 1: {'SepalLengthCm': 5.1, 'SepalWidthCm': 3.5, 'PetalLengthCm': 1.4, 'PetalWidthCm': 0.2} -> Prediction: ['Iris-setosa']
INFO:__main__:Input 2: {'SepalLengthCm': 6.3, 'SepalWidthCm': 3.3, 'PetalLengthCm': 4.7, 'PetalWidthCm': 1.6} -> Prediction: ['Iris-versicolor']
INFO:__main__:Input 3: {'SepalLengthCm': 7.2, 'SepalWidthCm': 3.0, 'PetalLengthCm': 5.8, 'PetalWidthCm': 1.6} -> Prediction: ['Iris-virginica']


Model Comparison Summary:
              Model  Test Accuracy  CV Mean   CV Std Model Complexity Training Time
Simple RandomForest       0.946667 0.942857 0.019048              Low          Fast
Robust RandomForest       0.906667 0.973333 0.032660           Medium        Medium
