In [2]:
# =============================================================================
# STEP 1: DATA INTEGRATION
# =============================================================================
import subprocess
import os

# Set project root
script_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(script_dir) if script_dir.endswith('src') else script_dir
if os.getcwd().endswith('src'):
    os.chdir('..')
project_root = os.getcwd()

# Run data integration script
print("Running 01-data_integration.py...")
script_path = os.path.join(project_root, 'src', '01-data_integration.py')
result = subprocess.run(['python', script_path], capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    print(f"Error: {result.stderr}")
    raise RuntimeError("Data integration failed")

print("✅ Data integration complete")

Running 01-data_integration.py...
Data integration complete: learn(50,044 rows, 18.2MB) | test(50,042 rows, 17.8MB)


                      DATASET SUMMARY                       
Metric                                     Learn         Test
------------------------------------------------------------
Observations                              50,044       50,042
Features                                      50           50
Target column                             target          N/A

Column groups for pipeline:
  - Current job cols: 13
  - Retired job cols: 13
  - Pension cols: 1
  - Sport cols: 2

✅ Datasets ready for model building


In [None]:
# =============================================================================
# STEP 2: MODEL BUILDING
# =============================================================================
import subprocess
import os

# Run model building script
print("Running 02-model_building.py...")
print("=" * 60)
script_path = os.path.join(os.getcwd(), 'src', '02-model_building.py')
result = subprocess.run(['python', script_path], capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    print(f"Error: {result.stderr}")
    raise RuntimeError("Model building failed")

print("\n✅ Model building complete")
print("   - Model saved to: models/best_model.joblib")
print("   - Figures saved to: figures/")

In [5]:
# =============================================================================
# STEP 3: DISPLAY PIPELINE DIAGRAM
# =============================================================================
import sys
import os
import warnings
import joblib
from sklearn import set_config

# Add src directory to path for custom transformers
src_dir = os.path.join(os.getcwd(), 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Import custom transformers (required for unpickling)
from __fn__ActivityTypeImputer import ActivityTypeImputer
from __fn__SportImputer import SportImputer
from __fn__AAV2020Encoder import AAV2020Encoder

# Load the trained model
best_model = joblib.load('models/best_model.joblib')

# Display pipeline diagram
set_config(display='diagram')
best_model

0,1,2
,steps,"[('activity_imputer', ...), ('sport_imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,retired_cols,"['ECONOMIC_SECTOR_retired', 'Previous_dep_retired', ...]"
,job_cols,"['job_desc_current', 'Work_condition_current', ...]"
,pension_cols,['RETIREMENT_INCOME']

0,1,2
,sport_cols,"['Sports', 'Categorie']"

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,-1
,min_frequency,
,max_categories,

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,300
,max_leaf_nodes,15
,max_depth,
,min_samples_leaf,10
,l2_regularization,1.0
,max_features,1.0
,max_bins,255


In [7]:
# =============================================================================
# STEP 3BIS: DISPLAY PIPELINE DIAGRAMS (RF BASELINE & RF FULL)
# =============================================================================
import sys
import os
import warnings
import joblib
from sklearn import set_config

# Add src directory to path for custom transformers
src_dir = os.path.join(os.getcwd(), 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Import custom transformers (required for unpickling)
from __fn__ActivityTypeImputer import ActivityTypeImputer
from __fn__SportImputer import SportImputer
from __fn__AAV2020Encoder import AAV2020Encoder

# Load the trained RF Baseline model
rf_baseline_model = joblib.load('models/rf_model_baseline.joblib')

# Load the trained RF Full model
rf_full_model = joblib.load('models/rf_model.joblib')

# Display pipeline diagrams
set_config(display='diagram')
print("Random Forest Baseline Pipeline:")
rf_baseline_model
print("Random Forest Full Pipeline:")
rf_full_model

Random Forest Baseline Pipeline:
Random Forest Full Pipeline:


0,1,2
,steps,"[('activity_imputer', ...), ('sport_imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,retired_cols,"['ECONOMIC_SECTOR_retired', 'Previous_dep_retired', ...]"
,job_cols,"['job_desc_current', 'Work_condition_current', ...]"
,pension_cols,['RETIREMENT_INCOME']

0,1,2
,sport_cols,"['Sports', 'Categorie']"

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,-1
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'n/a'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# =============================================================================
# DISPLAY PIPELINE DIAGRAM: RANDOM FOREST BASELINE (02bis)
# =============================================================================
import joblib
from sklearn import set_config
from sklearn.utils import estimator_html_repr

set_config(display='diagram')
rf_baseline_model = joblib.load('models/rf_model_baseline.joblib')

with open("figures/pipeline.html", "w") as f:
    f.write(estimator_html_repr(rf_baseline_model))

import imgkit

imgkit.from_file('figures/pipeline.html', 'figures/pipeline.png')  

rf_baseline_model

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
# =============================================================================
# DISPLAY PIPELINE DIAGRAM: RANDOM FOREST FULL (02ter)
# =============================================================================
import joblib
from sklearn import set_config
set_config(display='diagram')
rf_full_model = joblib.load('models/rf_model.joblib')
rf_full_model

0,1,2
,steps,"[('activity_imputer', ...), ('sport_imputer', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,retired_cols,"['ECONOMIC_SECTOR_retired', 'Previous_dep_retired', ...]"
,job_cols,"['job_desc_current', 'Work_condition_current', ...]"
,pension_cols,['RETIREMENT_INCOME']

0,1,2
,sport_cols,"['Sports', 'Categorie']"

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,-1
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'n/a'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# =============================================================================
# DISPLAY PIPELINE DIAGRAM: BEST MODEL (HGBR or main)
# =============================================================================
import joblib
from sklearn import set_config
set_config(display='diagram')
best_model = joblib.load('models/best_model.joblib')
best_model

In [None]:
# =============================================================================
# STEP 2BIS: MODEL BUILDING (RF BASELINE & RF FULL)
# =============================================================================
import subprocess
import os

# Run Random Forest Baseline model building script
print("Running 02bis-model_building_RF_baseline.py...")
print("=" * 60)
script_path_baseline = os.path.join(os.getcwd(), 'src', '02bis-model_building_RF_baseline.py')
result_baseline = subprocess.run(['python', script_path_baseline], capture_output=True, text=True)
print(result_baseline.stdout)
if result_baseline.returncode != 0:
    print(f"Error: {result_baseline.stderr}")
    raise RuntimeError("RF Baseline model building failed")

print("\n✅ RF Baseline model building complete")
print("   - Model saved to: models/rf_model_baseline.joblib")
print("   - Figures saved to: figures/")

# Run Random Forest Full model building script
print("\nRunning 02ter-model_building_RF.py...")
print("=" * 60)
script_path_full = os.path.join(os.getcwd(), 'src', '02ter-model_building_RF.py')
result_full = subprocess.run(['python', script_path_full], capture_output=True, text=True)
print(result_full.stdout)
if result_full.returncode != 0:
    print(f"Error: {result_full.stderr}")
    raise RuntimeError("RF Full model building failed")

print("\n✅ RF Full model building complete")
print("   - Model saved to: models/rf_model_2ter.joblib")
print("   - Figures saved to: figures/")