In [1]:
import json
from copy import deepcopy

import pandas as pd
from pathlib import Path
import sys

sys.path.insert(0, str(Path.cwd().parent) + "/common/")  # puts <cwd's parent>/common/ first on the import path so `audit` (imported below) can be found

from audit import ModelNotebookRenderer

%load_ext autoreload
%autoreload 2

In [4]:
# Resolve the workspace relative to the project root (the parent of the notebook's
# working directory, the same anchor used for the `common/` import path) instead of
# hard-coding one user's absolute home-directory path.
WORKSPACE_DIR = Path.cwd().parent / "data" / "workspaces" / "ul_post_2000_vbt_rd_2"

# The renderer was originally given a plain string path, so keep passing a str.
rmd_maker = ModelNotebookRenderer(str(WORKSPACE_DIR))

# print() (rather than a bare expression) so the multi-line Rmd text is shown with
# real line breaks instead of an escaped string repr.
print(rmd_maker.render())

---
title: "Model Output"
author: "MMAgent"
date: "2025-11-29"
output: html_document
---

```{r setup, include=FALSE}

install_if_missing <- function(all_pkgs) { 
  for (pkg_name in all_pkgs) {
    if (!require(pkg_name, character.only = T)) {
      install.packages(pkg_name)
      require(pkg_name, character.only = T)
    }  
  }
}

# these packages will be installed if they are missing,
# and then loaded into the RSession
install_if_missing(c(
  "glmnet",
  "recipes",
  "butcher",
  "carrier",
  "rpart",
  "rpart.plot",
  "rsample",
  "arrow",
  "splines",
  "splines2",
  "tidyverse"
))

knitr::opts_chunk$set(echo = TRUE)

```


## Setup Data

Should contain both a test and train split.

```{r data-setup}

# SELECT
#   DATASET,
#   NUMBER_OF_DEATHS,
#   EXPDEATHQX2015VBTWMI_BYPOL,
#   ATTAINED_AGE,
#   GENDER,
#   SMOKER_STATUS,
#   PREFERRED_CLASS,
#   FACE_AMOUNT_BAND,
#   NUMBER_OF_PREFERRED_CLASSES
# FROM V_FUW_UL
# WHERE
#   DATASET = 'TRAIN' AND EXPDEATHQX2015VBTWMI_BYPOL > 0
m

In [2]:
# Resolve the workspace relative to the project root (the parent of the notebook's
# working directory) instead of hard-coding one user's absolute path.
WORKSPACE_DIR = Path.cwd().parent / "data" / "workspaces" / "ul_post_2000_vbt_rd_2"

# Read with an explicit encoding so the result does not depend on the platform's
# default locale encoding.
with open(WORKSPACE_DIR / "final.json", "r", encoding="utf-8") as fh:
    model_data = json.load(fh)

model_data

{'workspace_id': '1d1f14ef-cb11-4b8c-901f-4dd67ba1bed5',
 'sql_log': [{'success': True,
   'desc': 'Count rows overall and by DATASET',
   'sql': 'select DATASET, count(*) as n_rows from V_FUW_UL group by DATASET order by DATASET limit 1000'},
  {'success': True,
   'desc': 'Count rows with non-positive exposure',
   'sql': 'select sum(case when EXPDEATHQX2015VBTWMI_BYPOL <= 0 or EXPDEATHQX2015VBTWMI_BYPOL is null then 1 else 0 end) as n_bad_exposure from V_FUW_UL limit 1000'},
  {'success': True,
   'desc': 'Overall totals and AE',
   'sql': 'select sum(NUMBER_OF_DEATHS) as total_deaths, sum(EXPDEATHQX2015VBTWMI_BYPOL) as total_exposure, sum(NUMBER_OF_DEATHS)/sum(EXPDEATHQX2015VBTWMI_BYPOL) as overall_qx from V_FUW_UL limit 1000'},
  {'success': True,
   'desc': 'Null counts for key columns',
   'sql': 'select \n  sum(case when NUMBER_OF_DEATHS is null then 1 else 0 end) as n_null_deaths,\n  sum(case when EXPDEATHQX2015VBTWMI_BYPOL is null then 1 else 0 end) as n_null_exposure,\n  sum

In [3]:
# Pull out the final model's audit log. As the output below shows, it is a chain of
# nested dicts: every node has a "type" and a "next" link (None on the last node),
# and the type-2 nodes shown also carry an "entry" describing one tool call
# (tool_name, args, result).
final_model_data = model_data["final_model_log"]
final_model_data

{'type': 1,
 'next': {'type': 2,
  'next': {'type': 2,
   'next': {'type': 2,
    'next': {'type': 2,
     'next': {'type': 2,
      'next': {'type': 2,
       'next': {'type': 2,
        'next': {'type': 2,
         'next': {'type': 2,
          'next': None,
          'entry': {'last_workspace_id': '0b241350-c0b9-47c5-90dc-dcce554effee',
           'workspace_id': '6761fda5-f91c-4849-8d1b-49dcba0b3f87',
           'tool_name': 'cmd_rpart',
           'args': ['model_data_test_preds_v5',
            ['ATTAINED_AGE',
             'GENDER',
             'SMOKER_STATUS',
             'PREFERRED_CLASS',
             'FACE_AMOUNT_BAND'],
            'MODEL_PRED',
            'NUMBER_OF_DEATHS',
            4,
            0.001],
           'result': {'success': True,
            'result': 'n= 9282 \n\nnode), split, n, deviance, yval\n      * denotes terminal node\n\n 1) root 9282 6289.44900 1.0206540  \n   2) PREFERRED_CLASS=1,2,3 7721 5483.95600 1.0147290  \n     4) FACE_AMOUNT_BAND=10000

In [36]:
# Inspect a single node of the audit log: follow `depth` "next" links from the root
# and display that node without the rest of the chain hanging off it.
depth = 6
curr_node = final_model_data

for hop in range(depth):
    curr_node = curr_node["next"]
    if curr_node is None:
        # Fail with a clear message instead of an opaque "'NoneType' object is not
        # subscriptable" when the requested depth is past the end of the chain.
        raise IndexError(
            f"audit log ends after {hop} 'next' link(s); cannot inspect depth {depth}"
        )

# Copy every field except the "next" link. This keeps the displayed node detached
# from the original log (as before) without first deep-copying the entire remaining
# chain just to throw it away.
curr_node = {
    key: deepcopy(value)
    for key, value in curr_node.items()
    if key != "next"
}

curr_node


{'type': 2,
 'entry': {'last_workspace_id': 'e7e5cca2-b10f-4f30-9ec8-ce63e9c06603',
  'workspace_id': 'd3bb5e7b-a609-451d-b5af-d7c1af4e5ab8',
  'tool_name': 'cmd_run_inference',
  'args': ['model_data_train', 'model_data_train_preds_v5'],
  'result': {'success': True,
   'result': 'created dataset model_data_train_preds_v5 with predictions contained in MODEL_PRED column.'}},
 'workspace_id': 'd3bb5e7b-a609-451d-b5af-d7c1af4e5ab8'}

In [37]:
import sqlglot

def pretty_sql(raw_sql):
    """Pretty-print DuckDB SQL and return it as ``# ``-prefixed comment lines.

    The SQL is reformatted by ``sqlglot.transpile`` (DuckDB in, DuckDB out,
    ``pretty=True``) and every resulting line is prefixed with ``"# "`` so the
    text can be embedded as a comment block.

    Args:
        raw_sql: SQL text containing one or more statements.

    Returns:
        A single string of comment lines joined by newlines. When the input
        holds several statements they are all kept, separated by ``";"`` at
        the end of each statement but the last. A single statement is returned
        exactly as before (no ``";"`` appended). If nothing was produced the
        result is the lone prefix ``"# "``.
    """
    statements = sqlglot.transpile(raw_sql, read="duckdb", write="duckdb", pretty=True)

    # Keep every non-empty statement rather than only the first one, so no SQL
    # silently disappears from the rendered output.
    formatted = ";\n".join(stmt for stmt in statements if stmt)

    return "\n".join("# " + line for line in formatted.split("\n"))

In [42]:
# AuditLogReader supplies the node-type constant used below but was never imported
# anywhere in this notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumes AuditLogReader is exported by the same `audit` module as
# ModelNotebookRenderer -- confirm.
from audit import AuditLogReader

# Walk the audit log (a chain of dicts linked through "next") and sort every
# recorded tool call into the buckets that the Rmd template consumes.
curr_node = final_model_data

# rparts before glmnet() are var_imp, after are model_validation
is_var_imp = True
datasets = []          # cmd_create_dataset calls: {"name", "sql"}
var_imp = []           # cmd_rpart calls seen before the cmd_glmnet call
cmd_model = None       # the cmd_glmnet call (the last one wins if there are several)
model_validation = []  # cmd_rpart calls seen after the cmd_glmnet call
model_inference = []   # cmd_run_inference calls: {"in_dataset", "out_dataset"}

while curr_node is not None and "next" in curr_node:

    is_child_node = curr_node["type"] == AuditLogReader.NODE_TYPE_CHILD

    # Only child nodes that actually carry an "entry" describe a tool call.
    if is_child_node and "entry" in curr_node:
        node_entry = curr_node["entry"]
        cmd_name = node_entry["tool_name"]
        node_args = node_entry["args"]  # positional arguments of the tool call

        if cmd_name == "cmd_create_dataset":
            datasets.append({
                "name": node_args[0],
                "sql": pretty_sql(node_args[1]),
            })
        elif cmd_name == "cmd_rpart":
            cmd = {
                "dataset": node_args[0],
                "x_vars": node_args[1],
                "offset_var": node_args[2],
                "y_var": node_args[3],
                "max_depth": node_args[4],
                "cp": node_args[5],
            }
            # The same tool serves two purposes; which one depends only on whether
            # the model fit has been seen yet.
            if is_var_imp:
                var_imp.append(cmd)
            else:
                model_validation.append(cmd)
        elif cmd_name == "cmd_run_inference":
            model_inference.append({
                "in_dataset": node_args[0],
                "out_dataset": node_args[1],
            })
        elif cmd_name == "cmd_glmnet":
            # From here on, every rpart call counts as model validation.
            is_var_imp = False
            cmd_model = {
                "dataset": node_args[0],
                "x_vars": node_args[1],
                "design_matrix_vars": node_args[2],
                "factor_vars_levels": node_args[3],
                "num_var_clip": node_args[4],
                "offset_var": node_args[5],
                "y_var": node_args[6],
                "lambda_strat": node_args[7],
            }
        # Any other tool name is intentionally ignored.

    curr_node = curr_node["next"]


In [43]:
# Document-level metadata handed to the template as `doc`.
doc = dict(title="Model Output", date="2024-06-12")

# Tool calls collected from the audit log, handed to the template as `cmds`.
cmds = dict(
    datasets=datasets,
    cmd_model=cmd_model,
    var_imp=var_imp,
    model_validation=model_validation,
    model_inference=model_inference,
)

In [45]:
from jinja2 import Environment, FileSystemLoader, select_autoescape

# Resolve locations relative to the project root (the parent of the notebook's
# working directory, the same anchor used for the `common/` import path) instead of
# hard-coding one user's absolute paths.
PROJECT_ROOT = Path.cwd().parent
TEMPLATE_DIR = PROJECT_ROOT / "common" / "templates"
SAMPLE_RMD_PATH = PROJECT_ROOT / "notebooks" / "sample_model.Rmd"

env = Environment(
    loader=FileSystemLoader(TEMPLATE_DIR),
    autoescape=select_autoescape()
)
template = env.get_template("model_output.Rmd")

# Fill the template with the document metadata and the parsed audit-log commands.
tmpl_out = template.render(
    doc=doc,
    cmds=cmds,
)

# Write with an explicit encoding so the generated file does not depend on the
# platform's default locale encoding.
SAMPLE_RMD_PATH.write_text(tmpl_out, encoding="utf-8")

In [33]:
# NOTE(review): leftover inspection. The attribute is evaluated without being called,
# so this cell only displays the bound-method repr and renders nothing. Candidate for
# removal before sharing the notebook.
template.render

<bound method Template.render of <Template 'model_output.Rmd'>>