In [1]:
import sys
import os
from pathlib import Path

# Work from the repository root: the config paths printed further down
# (input CSV, output dir) are relative, and the `model_pipeline` imports
# below are presumably resolved from the working directory.
# Set the ALERT_FATIGUE_ROOT environment variable to point at a different
# checkout; the default keeps the original author's location.
PROJECT_ROOT = Path(
    os.environ.get(
        "ALERT_FATIGUE_ROOT",
        r'C:\Users\hibaa\Documents\GitHub\alert-fatigue',
    )
)
os.chdir(PROJECT_ROOT)


from model_pipeline.config import get_config
from model_pipeline.pipeline import run_full_training, run_preprocessing_only
from model_pipeline.models.statsmodels_logit import StatsmodelsLogitModel
from model_pipeline.reporting.coefficients import (
    coefficients_to_or, create_coefficient_summary, filter_significant_coefficients,
    sort_coefficients_by_importance
)

In [2]:
# Optional one-time setup, kept disabled: !pip install scikit-learn
# Display the current working directory to confirm the notebook runs from the project root.
os.getcwd()
# Alternative kept disabled: sys.path.append(str(Path.cwd()))

'C:\\Users\\hibaa\\Documents\\GitHub\\alert-fatigue'

In [3]:
# Load the centralized pipeline configuration.
config = get_config()

# Echo the settings this notebook relies on, one per line.
print(
    f"Input CSV: {config['input_csv_path']}",
    f"Date Column: {config['date_column']}",
    f"Target Column: {config['target_column']}",
    f"Features: {config['feature_columns']}",
    f"output dir: {config['output_dir']}",
    sep="\n",
)

✓ Configuration validated successfully!
  - Input: alert_analysis/data/main_data_2022/df_main_active_adult_renamed_clean_sample_10pct.csv
  - Features: 7 columns
  - Data split: 70% train, 15% eval, 15% test
  - Output: model_pipeline\outputs
Input CSV: alert_analysis/data/main_data_2022/df_main_active_adult_renamed_clean_sample_10pct.csv
Date Column: time_prescribing_order
Target Column: alert_status_binary
Features: ['age', 'gender', 'hospital_days', 'charlson_score', 'shift_type', 'unit_category', 'drug_atc']
output dir: model_pipeline\outputs


In [4]:
# Pull the input CSV location and the date column name out of the config,
# then show the date column name.
file_path, date_column = config['input_csv_path'], config['date_column']
print(date_column)

time_prescribing_order


## Part 2: run pre-processing

- Split data
- Prepare data
- run profiling 

In [5]:
# Run only the preprocessing stage of the pipeline (no model training).
# Per the log it prints, this loads the data, splits it into train/eval/test,
# fits the preprocessor on the training data, and saves a profile report.
# `artifacts` is indexed by string keys later in the notebook
# ('profile_path', 'run_directory').

train_df, eval_df, test_df, artifacts = run_preprocessing_only(config)

Running preprocessing pipeline...
Loading data...
✓ Data loaded: 250,658 rows, 9 columns

Splitting data...
✓ Data split: Train=175,460, Eval=37,598, Test=37,600

Preprocessing data...
  Preprocessing configuration:
    - Impute numeric: True
    - Scale numeric: False
    - Rare category threshold: 0.01
  Fitting preprocessor on training data...
  Preprocessing details:
    - Numeric columns: 3
    - Categorical columns: 4
    - Numeric imputation applied to: age, hospital_days, charlson_score
    - No missing values found in numeric columns
    - Categorical processing applied to: gender, shift_type, unit_category, drug_atc
      - gender: 3 levels preserved
      - shift_type: 4 levels preserved
      - unit_category: 8 levels preserved
      - drug_atc: 23 levels preserved
  Transforming data...
  Final processed shapes:
    - Train: (175460, 8)
    - Eval: (37598, 8)
    - Test: (37600, 8)

Generating profile report...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                            | 0/8 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 50.52it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Profile report saved to: model_pipeline\outputs\20250824_221607\train_profile_report.html

✓ Preprocessing completed successfully!


In [6]:
# Preview the first rows of the preprocessed training split.

train_df.head()

Unnamed: 0,age,gender,hospital_days,charlson_score,shift_type,unit_category,drug_atc,alert_status_binary
0,71.0,FEMALE,1,0,night,Gynecology,Other,0
1,74.0,FEMALE,2,2,night,Internal,Other,0
2,74.0,FEMALE,2,2,night,Internal,N02BE01,0
3,90.0,MALE,1,5,night,Internal,Other,0
4,69.0,MALE,16,6,night,Surgery,N02BE01,0


In [7]:
# Preview the first rows of the preprocessed evaluation split.

eval_df.head()

Unnamed: 0,age,gender,hospital_days,charlson_score,shift_type,unit_category,drug_atc,alert_status_binary
175460,88.0,FEMALE,1,5,afternoon,Internal,Other,0
175461,40.0,MALE,1,0,afternoon,Emergency,A03FA01,0
175462,85.0,FEMALE,1,6,afternoon,Internal,Other,0
175463,85.0,FEMALE,1,6,afternoon,Internal,Other,0
175464,85.0,FEMALE,1,6,afternoon,Internal,Other,0


In [8]:
# Output locations recorded by the preprocessing run: the profiling report
# and the run's output directory.
profile_path, run_directory = artifacts['profile_path'], artifacts['run_directory']
print(profile_path, run_directory, sep="\n")

model_pipeline\outputs\20250824_221607\train_profile_report.html
model_pipeline\outputs\20250824_221607


## Run model training

In [9]:
# Fit the logistic regression on the training split.
print("Training logistic regression model...")
model = StatsmodelsLogitModel(use_glm=config['use_glm'])

# Predictors and outcome, both taken from the training split using the
# column names held in the config.
train_features, train_target = (
    train_df[config['feature_columns']],
    train_df[config['target_column']],
)
model.fit(train_features, train_target)

# Keep the fitted model's summary and its AIC/BIC values for later cells.
model_summary, aic_bic = model.get_model_summary(), model.get_aic_bic()

Training logistic regression model...




In [10]:
# Print the summary captured from the fitted model in the training cell.
print(model_summary)

                  Generalized Linear Model Regression Results                  
Dep. Variable:     alert_status_binary   No. Observations:               175460
Model:                             GLM   Df Residuals:                   175424
Model Family:                 Binomial   Df Model:                           35
Link Function:                   Logit   Scale:                          1.0000
Method:                           IRLS   Log-Likelihood:                -29977.
Date:                 Sun, 24 Aug 2025   Deviance:                       59954.
Time:                         22:17:53   Pearson chi2:                 1.73e+05
No. Iterations:                     26   Pseudo R-squ. (CS):            0.01379
Covariance Type:             nonrobust                                         
                                                                                                                                                                                                        

In [11]:
 # Get coefficients and convert to odds ratios
print("Analyzing coefficients...")

# Coefficient table from the fitted model, then its odds-ratio form.
coef_df = model.get_coefficients()
or_df = coefficients_to_or(coef_df)

# Three views derived from the odds-ratio table, evaluated in this order:
# ranked by odds ratio, restricted to significant terms, and an aggregate summary.
or_df_sorted, significant_coef, coef_summary = (
    sort_coefficients_by_importance(or_df, method='odds_ratio'),
    filter_significant_coefficients(or_df),
    create_coefficient_summary(or_df),
)

Analyzing coefficients...


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
# Display the coefficient summary built in the previous cell.
coef_summary

{'total_features': 42,
 'significant_features': np.int64(25),
 'non_significant_features': np.int64(17),
 'significance_breakdown': {'***': 22, 'ns': 17, '**': 3},
 'odds_ratio_stats': {'min': 1.7196755521618243e-11,
  'max': 1.3969401671200836,
  'mean': 0.6683749137192656,
  'median': 0.9191201497977162},
 'factor_types': {'protective_factors': 29, 'risk_factors': 11},
 'p_value_stats': {'min': 0.0,
  'max': 0.9984812668819361,
  'mean': 0.25801137949784075,
  'median': 3.849432457350164e-05}}