# Model Engineering - Art Auction Price Prediction
## ML Pipeline for Training and Evaluating Regression Models

### Import Required Libraries

In [1]:
# !pip install mlflow scikit-learn


In [2]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import json
import os
import warnings
warnings.filterwarnings('ignore')

print("🔄 MLflow Version:", mlflow.__version__)
print("✓ Libraries imported successfully")

models_dir = "../models/"
os.makedirs(models_dir, exist_ok=True)

final_df = pd.read_csv('../data/processed/final_art_dataset.csv')

🔄 MLflow Version: 3.4.0
✓ Libraries imported successfully


### Handling missing values, standardizing numerical features, and encoding categorical variables

In [3]:
# Step 1: Remove records with missing prices
# Since price_usd is our target variable, we cannot use records without prices for training
print("\nHandling Missing Target Values")
print(f"Initial number of records: {len(final_df)}")

# Check for missing prices
missing_prices = final_df['price_usd'].isnull().sum()
print(f"Records with missing prices: {missing_prices}")

# Remove records with missing prices for training data
train_df = final_df.dropna(subset=['price_usd']).copy()
print(f"Records available for training: {len(train_df)}")

# Save records with missing prices for future inference/prediction
inference_df = final_df[final_df['price_usd'].isnull()].copy()
print(f"Records saved for inference: {len(inference_df)}")


Handling Missing Target Values
Initial number of records: 754
Records with missing prices: 0
Records available for training: 754
Records saved for inference: 0


In [4]:
# Step 2: Separate features and target variable
print("\nPreparing Features and Target Variable")

X = train_df.drop('price_usd', axis=1)  # Features
y = train_df['price_usd']               # Target variable

print(f"Features shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# Display feature summary
print("\nFeature summary:")
print(X.info())


Preparing Features and Target Variable
Features shape: (754, 13)
Target variable shape: (754,)

Feature summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754 entries, 0 to 753
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   artist_id          296 non-null    float64
 1   artist             753 non-null    object 
 2   title              754 non-null    object 
 3   creation_year      725 non-null    float64
 4   period             753 non-null    object 
 5   movement           754 non-null    object 
 6   size_category      458 non-null    object 
 7   avg_dimension_cm   458 non-null    float64
 8   lifespan           1 non-null      float64
 9   years_since_death  1 non-null      float64
 10  paintings          1 non-null      float64
 11  signed_binary      754 non-null    int64  
 12  is_living          1 non-null      object 
dtypes: float64(6), int64(1), object(6)
memory usage: 76.7+ K

In [5]:
# Step 3: Define feature types for preprocessing with proper missing value analysis
print("\nStep 3: Defining Feature Types with Missing Value Analysis")

# First, let's analyze the actual availability of each feature
print("Feature availability analysis:")
feature_analysis = []
for feature in X.columns:
    non_null_count = X[feature].notnull().sum()
    non_null_pct = (non_null_count / len(X)) * 100
    feature_analysis.append({
        'feature': feature,
        'non_null_count': non_null_count,
        'non_null_pct': non_null_pct,
        'dtype': X[feature].dtype
    })
    print(f"  {feature}: {non_null_count}/{len(X)} ({non_null_pct:.1f}%) - {X[feature].dtype}")

feature_analysis_df = pd.DataFrame(feature_analysis)

# Select features with sufficient data (>50% non-null values)
usable_numeric_features = feature_analysis_df[
    (feature_analysis_df['dtype'].isin(['float64', 'int64'])) & 
    (feature_analysis_df['non_null_pct'] > 50)
]['feature'].tolist()

usable_categorical_features = feature_analysis_df[
    (feature_analysis_df['dtype'] == 'object') & 
    (feature_analysis_df['non_null_pct'] > 50)
]['feature'].tolist()

# Remove identifier features from modeling features
identifier_features = ['artist_id', 'artist', 'title']
modeling_numeric_features = [f for f in usable_numeric_features if f not in identifier_features]
modeling_categorical_features = [f for f in usable_categorical_features if f not in identifier_features]

print(f"\nUsable numerical features ({len(modeling_numeric_features)}): {modeling_numeric_features}")
print(f"Usable categorical features ({len(modeling_categorical_features)}): {modeling_categorical_features}")

# Check if we have enough features to proceed
if len(modeling_numeric_features) == 0 and len(modeling_categorical_features) == 0:
    print("❌ ERROR: No usable features found for modeling!")
else:
    print("✓ Sufficient features available for modeling")


Step 3: Defining Feature Types with Missing Value Analysis
Feature availability analysis:
  artist_id: 296/754 (39.3%) - float64
  artist: 753/754 (99.9%) - object
  title: 754/754 (100.0%) - object
  creation_year: 725/754 (96.2%) - float64
  period: 753/754 (99.9%) - object
  movement: 754/754 (100.0%) - object
  size_category: 458/754 (60.7%) - object
  avg_dimension_cm: 458/754 (60.7%) - float64
  lifespan: 1/754 (0.1%) - float64
  years_since_death: 1/754 (0.1%) - float64
  paintings: 1/754 (0.1%) - float64
  signed_binary: 754/754 (100.0%) - int64
  is_living: 1/754 (0.1%) - object

Usable numerical features (0): []
Usable categorical features (3): ['period', 'movement', 'size_category']
✓ Sufficient features available for modeling


In [6]:
# Step 4: Create preprocessing pipelines for available features
print("\nCreating Preprocessing Pipelines for Available Features")

if len(modeling_numeric_features) > 0:
    # Numerical pipeline for features with sufficient data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    print(f"Numerical transformer created for {len(modeling_numeric_features)} features")
else:
    numeric_transformer = 'drop'
    print("No numerical features available - numerical transformer set to 'drop'")

if len(modeling_categorical_features) > 0:
    # Categorical pipeline for features with sufficient data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
    ])
    print(f"Categorical transformer created for {len(modeling_categorical_features)} features")
else:
    categorical_transformer = 'drop'
    print("No categorical features available - categorical transformer set to 'drop'")

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, modeling_numeric_features),
        ('cat', categorical_transformer, modeling_categorical_features)
    ],
    remainder='drop'  # Drop features not used in modeling
)

print("✓ Preprocessing pipeline created successfully!")


Creating Preprocessing Pipelines for Available Features
No numerical features available - numerical transformer set to 'drop'
Categorical transformer created for 3 features
✓ Preprocessing pipeline created successfully!


In [7]:
# Step 5: Apply preprocessing transformations
print("\nApplying Preprocessing Transformations")

try:
    # Fit and transform the features
    X_processed = preprocessor.fit_transform(X)
    
    # Get feature names after preprocessing
    feature_names = []
    
    if len(modeling_numeric_features) > 0:
        feature_names.extend(modeling_numeric_features)
    
    if len(modeling_categorical_features) > 0:
        cat_transformer = preprocessor.named_transformers_['cat']
        cat_feature_names = cat_transformer.named_steps['onehot'].get_feature_names_out(modeling_categorical_features)
        feature_names.extend(cat_feature_names)
    
    print(f"Original number of features: {X.shape[1]}")
    print(f"Number of features after preprocessing: {len(feature_names)}")
    print(f"Processed features shape: {X_processed.shape}")
    
    if X_processed.shape[1] == 0:
        print("❌ WARNING: No features were produced after preprocessing!")
        print("This usually happens when most features have too many missing values.")
        
except Exception as e:
    print(f"❌ Error during preprocessing: {e}")
    print("This is likely due to insufficient data in the features.")


Applying Preprocessing Transformations
Original number of features: 13
Number of features after preprocessing: 37
Processed features shape: (754, 37)


### Initialize MLflow Experiment

In [8]:
# Set MLflow tracking URI and experiment name
mlflow.set_tracking_uri("file:../models/mlruns")
mlflow.set_experiment("Art_Auction_Price_Prediction")

print("\nMLflow Experiment: Art_Auction_Price_Prediction")
print("✓ Experiment configured successfully")


MLflow Experiment: Art_Auction_Price_Prediction
✓ Experiment configured successfully


### Step 1: Splitting data into train/test sets

In [9]:
print("\nSplitting Data into Training and Testing Sets")

# Create DataFrame with processed features
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_df, 
    y, 
    test_size=0.2, 
    random_state=42,
    shuffle=True
)

print(f"✓ Training set: {X_train.shape} features, {y_train.shape} target")
print(f"✓ Testing set: {X_test.shape} features, {y_test.shape} target")
print(f"✓ Train/Test split: {len(X_train)}/{len(X_test)} records")


Splitting Data into Training and Testing Sets
✓ Training set: (603, 37) features, (603,) target
✓ Testing set: (151, 37) features, (151,) target
✓ Train/Test split: 603/151 records


### Step 2: Feature Engineering (Additional Transformations)

### Log Transformation Rationale

The logarithmic transformation (`np.log1p`) addresses the highly **right-skewed distribution** typical of art auction prices, where most artworks cluster at lower price points while a few masterpieces reach extreme values. This transformation compresses the scale, reduces the influence of outliers, and creates a more normal distribution, leading to more stable training and percentage-based error interpretation. The reverse transformation (`np.expm1`) converts predictions back to the original dollar scale while maintaining meaningful error metrics.

In [10]:
# Apply logarithmic transformation to target variable (for skewed price distribution)
y_train_log = np.log1p(y_train)  # log(1 + y) to handle zero values
y_test_log = np.log1p(y_test)

print("✓ Applied logarithmic transformation to target variable")

✓ Applied logarithmic transformation to target variable


### Step 3: Model Training with MLflow Tracking

### Model Selection Justification

The **Random Forest Regressor** was chosen for its robustness against overfitting and ability to capture complex non-linear relationships in art pricing data, which often involves intricate interactions between artist reputation, artwork characteristics, and market trends. The **Linear Regression** serves as a simple baseline model to benchmark performance against more complex algorithms. Both models handle the log-transformed target variable effectively.

In [11]:
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
    "LinearRegression": LinearRegression()
}

best_model = None
best_score = float('inf')
best_model_name = ""

for model_name, model in models.items():
    print(f"\n--- Training {model_name} ---")
    
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train_log)
        
        # Make predictions (convert back from log scale)
        y_pred_log = model.predict(X_test)
        y_pred = np.expm1(y_pred_log)  # Reverse log transformation
        
        # Calculate metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        
        cv_scores = cross_val_score(model, X_train, y_train_log, 
                                  cv=5, scoring='neg_mean_squared_error')
        cv_rmse = np.sqrt(-cv_scores.mean())
        
        # Log parameters
        mlflow.log_param("model_type", model_name)
        if model_name == "RandomForest":
            mlflow.log_param("n_estimators", 100)
            mlflow.log_param("max_depth", 10)
        
        # Log metrics
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("cv_rmse", cv_rmse)
        
        # Log model
        mlflow.sklearn.log_model(model, f"{model_name.lower()}_model")
        
        # Log feature importance for RandomForest
        if model_name == "RandomForest":
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            # Save feature importance as artifact
            feature_importance.to_csv(f"{model_name}_feature_importance.csv", index=False)
            mlflow.log_artifact(f"{model_name}_feature_importance.csv")
        
        print(f"✅ {model_name} trained successfully")
        print(f"   MAE: ${mae:,.2f}")
        print(f"   RMSE: ${rmse:,.2f}")
        print(f"   R² Score: {r2:.4f}")
        
        if rmse < best_score:
            best_score = rmse
            best_model = model
            best_model_name = model_name

print(f"\nBest Model: {best_model_name} with RMSE: ${best_score:,.2f}")


--- Training RandomForest ---




✅ RandomForest trained successfully
   MAE: $160.59
   RMSE: $327.93
   R² Score: -0.1412

--- Training LinearRegression ---




✅ LinearRegression trained successfully
   MAE: $159.12
   RMSE: $327.89
   R² Score: -0.1410

Best Model: LinearRegression with RMSE: $327.89


In [12]:
# mlflow.end_run()

### Step 4: Model Packaging and Saving

In [13]:
print("\nModel Packaging and Saving")

# Save the best model as the final artifact
if best_model is not None:
    # Save using MLflow
    with mlflow.start_run(run_name="Final_Model"):
        mlflow.log_param("best_model", best_model_name)
        mlflow.log_metric("best_rmse", best_score)
        
        # Log the best model
        mlflow.sklearn.log_model(best_model, "final_model")
        
        # Also save using joblib for easy access
        import joblib
        model_filename = f"art_auction_price_model_{best_model_name.lower()}.pkl"
        model_filepath = os.path.join(models_dir, model_filename)
        joblib.dump(best_model, model_filepath)
        
        # Log the file as artifact
        mlflow.log_artifact(model_filepath)
        
        print(f"✓ Best model saved as: {model_filepath}")
        print("✓ Model logged to MLflow tracking server")
        
        model_metadata = {
            "model_name": best_model_name,
            "training_date": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            "features_used": list(X_train.columns),
            "performance_metrics": {
                "rmse": float(best_score),
                "r2_score": float(r2_score(y_test, np.expm1(best_model.predict(X_test))))
            },
            "data_dimensions": {
                "training_samples": X_train.shape[0],
                "features_count": X_train.shape[1],
                "test_samples": X_test.shape[0]
            }
        }
        
        metadata_filepath = os.path.join(models_dir, "model_metadata.json")
        with open(metadata_filepath, 'w') as f:
            json.dump(model_metadata, f, indent=2)
        
        mlflow.log_artifact(metadata_filepath)
        print("✓ Model metadata saved and logged")
        
else:
    print("❌ No model was trained successfully")


Model Packaging and Saving




✓ Best model saved as: ../models/art_auction_price_model_linearregression.pkl
✓ Model logged to MLflow tracking server
✓ Model metadata saved and logged


### Step 5: Performance Visualization and Analysis

In [14]:
print("\nModel Performance Analysis")

if best_model is not None:
    y_pred_log = best_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    
    results_df = pd.DataFrame({
        'actual_price': y_test.values,
        'predicted_price': y_pred,
        'absolute_error': np.abs(y_test.values - y_pred),
        'percentage_error': (np.abs(y_test.values - y_pred) / y_test.values) * 100
    })
    
    results_filepath = os.path.join(models_dir, "model_predictions.csv")
    results_df.to_csv(results_filepath, index=False)
    mlflow.log_artifact(results_filepath)
    
    print(f"✓ Prediction results saved: {results_filepath}")
    print(f"\nFinal Model Performance ({best_model_name}):")
    print(f"Mean Absolute Error: ${results_df['absolute_error'].mean():,.2f}")
    print(f"Median Absolute Error: ${results_df['absolute_error'].median():,.2f}")
    print(f"Max Absolute Error: ${results_df['absolute_error'].max():,.2f}")
    print(f"Mean Percentage Error: {results_df['percentage_error'].mean():.2f}%")
    
    # Sample predictions
    print(f"\nSample Predictions:")
    sample_results = results_df.head(10).copy()
    for idx, row in sample_results.iterrows():
        print(f"Actual: ${row['actual_price']:,.2f} | Predicted: ${row['predicted_price']:,.2f} | Error: ${row['absolute_error']:,.2f}")


Model Performance Analysis
✓ Prediction results saved: ../models/model_predictions.csv

Final Model Performance (LinearRegression):
Mean Absolute Error: $159.12
Median Absolute Error: $20.00
Max Absolute Error: $963.89
Mean Percentage Error: 568.45%

Sample Predictions:
Actual: $1.50 | Predicted: $42.08 | Error: $40.58
Actual: $9.50 | Predicted: $22.50 | Error: $13.00
Actual: $1.27 | Predicted: $19.96 | Error: $18.68
Actual: $2.50 | Predicted: $22.50 | Error: $20.00
Actual: $13.00 | Predicted: $9.30 | Error: $3.70
Actual: $4.00 | Predicted: $4.00 | Error: $0.00
Actual: $680.00 | Predicted: $22.50 | Error: $657.50
Actual: $1.50 | Predicted: $42.08 | Error: $40.58
Actual: $1.50 | Predicted: $17.70 | Error: $16.20
Actual: $680.00 | Predicted: $8.19 | Error: $671.81


### Step 6: MLflow Experiment Summary

In [15]:
print("\nMLflow Experiment Summary")

# Display MLflow experiment information
experiment = mlflow.get_experiment_by_name("Art_Auction_Price_Prediction")
if experiment:
    print(f"Experiment ID: {experiment.experiment_id}")
    print(f"Artifact Location: {experiment.artifact_location}")
    
    # List recent runs
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
    if not runs.empty:
        print(f"\nTotal runs in experiment: {len(runs)}")
        print("Recent runs:")
        recent_runs = runs[['run_id', 'tags.mlflow.runName', 'metrics.rmse', 'metrics.r2']].head()
        print(recent_runs.to_string(index=False))

print("\nModel Engineering Completed Successfully!")
print("Generated Artifacts:")
print("  - Trained model files (.pkl)")
print("  - Model metadata (model_metadata.json)")
print("  - Prediction results (model_predictions.csv)")
print("  - Feature importance analysis")
print("  - MLflow experiment tracking data")


MLflow Experiment Summary
Experiment ID: 822850360807149285
Artifact Location: file:///c:/Users/User/Desktop/Forecasting-auction-prices-for-artworks/notebooks/../models/mlruns/822850360807149285

Total runs in experiment: 4
Recent runs:
                          run_id  tags.mlflow.runName  metrics.rmse  metrics.r2
780ca74a722d4a5295d174503adbd974 adventurous-slug-729           NaN         NaN
01431042e8704917937e2d09ebb11b01          Final_Model           NaN         NaN
5a8e0e68f26c4a26a0f299b8dbf35b13     LinearRegression    327.887343   -0.140985
59717f59028f4a469902895b9743a6d3         RandomForest    327.925227   -0.141249

Model Engineering Completed Successfully!
Generated Artifacts:
  - Trained model files (.pkl)
  - Model metadata (model_metadata.json)
  - Prediction results (model_predictions.csv)
  - Feature importance analysis
  - MLflow experiment tracking data
