In [2]:
import sys
import os
from pathlib import Path

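# NOTE: the sys.path tweak below is intentionally left commented out. As written it
# would fail if re-enabled, because `__file__` is not defined inside a notebook kernel.
# The `model_pipeline` imports therefore presumably rely on the package being importable
# from the kernel's working directory (or installed) — TODO confirm.
# sys.path.append(str(Path(__file__).parent.parent))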

from model_pipeline.config import get_config
from model_pipeline.pipeline import run_full_training, run_preprocessing_only

## Part 1: load configuration

In [3]:
# Load the shared pipeline configuration
config = get_config()

# Echo the settings that drive this run, one "Label: value" line per entry
for _label, _config_key in (
    ("Input CSV", "input_csv_path"),
    ("Date Column", "date_column"),
    ("Target Column", "target_column"),
    ("Features", "feature_columns"),
):
    print(f"{_label}: {config[_config_key]}")

✓ Configuration validated successfully!
  - Input: ../alert_analysis/data/main_data_2022/df_main_active_adult_renamed_clean_sample_10pct.csv
  - Features: 6 columns
  - Data split: 70% train, 15% eval, 15% test
  - Output: model_pipeline/outputs
Input CSV: ../alert_analysis/data/main_data_2022/df_main_active_adult_renamed_clean_sample_10pct.csv
Date Column: time_prescribing_order
Target Column: alert_status_binary
Features: ['age', 'gender', 'hospital_days', 'charlson_score', 'shift_type', 'unit_category']


In [5]:
# Pull the input CSV location out of the config. The bare name on the last
# line makes the notebook echo the value as the cell's result.
file_path = config['input_csv_path']
file_path

'../alert_analysis/data/main_data_2022/df_main_active_adult_renamed_clean_sample_10pct.csv'

## Part 2: run pre-processing

- Split data
- Prepare data
- Run profiling

In [6]:
# Run the preprocessing stage only, driven by `config`.
# The result is unpacked into three data splits (train / eval / test) plus
# `artifacts`, which later cells index by 'profile_path' and 'run_directory'.
# NOTE(review): per the recorded output this also writes a profile report to
# disk under a timestamped run directory, so re-running the cell presumably
# creates a new output folder — confirm against the pipeline code.

train_df, eval_df, test_df, artifacts = run_preprocessing_only(config)

Running preprocessing pipeline...
Loading data...
✓ Data loaded: 250,658 rows, 8 columns

Splitting data...
✓ Data split: Train=175,460, Eval=37,598, Test=37,600

Preprocessing data...
  Preprocessing configuration:
    - Impute numeric: True
    - Scale numeric: False
    - Rare category threshold: 0.01
  Fitting preprocessor on training data...
  Preprocessing details:
    - Numeric columns: 3
    - Categorical columns: 3
    - Numeric imputation applied to: age, hospital_days, charlson_score
    - No missing values found in numeric columns
    - Categorical processing applied to: gender, shift_type, unit_category
      - gender: 3 levels preserved
      - shift_type: 4 levels preserved
      - unit_category: 8 levels preserved
  Transforming data...
  Final processed shapes:
    - Train: (175460, 7)
    - Eval: (37598, 7)
    - Test: (37600, 7)

Generating profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 104.14it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Profile report saved to: model_pipeline/outputs/20250818_185550/train_profile_report.html

✓ Preprocessing completed successfully!


In [7]:
# Preview the first rows of the training split returned by preprocessing

train_df.head()

Unnamed: 0,age,gender,hospital_days,charlson_score,shift_type,unit_category,alert_status_binary
0,71.0,FEMALE,1,0,night,Gynecology,0
1,74.0,FEMALE,2,2,night,Internal,0
2,74.0,FEMALE,2,2,night,Internal,0
3,90.0,MALE,1,5,night,Internal,0
4,69.0,MALE,16,6,night,Surgery,0


In [8]:
# Preview the first rows of the evaluation split returned by preprocessing

eval_df.head()

Unnamed: 0,age,gender,hospital_days,charlson_score,shift_type,unit_category,alert_status_binary
175460,88.0,FEMALE,1,5,afternoon,Internal,0
175461,40.0,MALE,1,0,afternoon,Emergency,0
175462,85.0,FEMALE,1,6,afternoon,Internal,0
175463,85.0,FEMALE,1,6,afternoon,Internal,0
175464,85.0,FEMALE,1,6,afternoon,Internal,0


In [12]:
# Look up where the preprocessing run stored its outputs, then show both paths
profile_path = artifacts['profile_path']
run_directory = artifacts['run_directory']
for _artifact_location in (profile_path, run_directory):
    print(_artifact_location)

model_pipeline/outputs/20250818_185550/train_profile_report.html
model_pipeline/outputs/20250818_185550
