In [13]:
import pandas as pd
from pathlib import Path
import sys

# Make the sibling "common" and "mcp" folders importable for the project imports
# below. Each insert goes to the front of sys.path, so "mcp" (inserted last)
# ends up ahead of "common".
for _sibling_dir in ("common", "mcp"):
    sys.path.insert(0, str(Path.cwd().parent / _sibling_dir))

from agent import AssumptionsAgent
from prompt import ModelingPrompt
from app_shared import Database
from ilec_mcp_server import create_REnv
from ilec_r_lib import AgentRCommands as RCmd

# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Create the agent whose prompt() method is called by the canary and modeling cells.
agent = AssumptionsAgent()

## Canary

In [14]:
# Canary check: ask the agent a simple question about the data and print its
# reply so that line breaks in the answer render as a list.
print(agent.prompt("How many different insurance plan types are there, list them"))

There are 7 plan types:
- Other
- Perm
- Term
- UL
- ULSG
- VL
- VLSG


## Modeling

### Prepare Modeling Data

In [20]:
# Fetch the column listing of ILEC_DATA. Each row is a tuple whose element at
# index 1 is the column name and whose element at index 2 is the column type.
with Database.get_duckdb_conn() as conn:
    res = conn.execute("PRAGMA table_info(ILEC_DATA)").fetchall()
# Display the rows; `res` is read again when the predictor list is built.
res

[(0, 'Observation_Year', 'INTEGER', False, None, False),
 (1, 'Preferred_Indicator', 'VARCHAR', False, None, False),
 (2, 'Gender', 'VARCHAR', False, None, False),
 (3, 'Smoker_Status', 'VARCHAR', False, None, False),
 (4, 'Insurance_Plan', 'VARCHAR', False, None, False),
 (5, 'Issue_Age', 'INTEGER', False, None, False),
 (6, 'Duration', 'INTEGER', False, None, False),
 (7, 'Attained_Age', 'INTEGER', False, None, False),
 (8, 'Age_Basis', 'VARCHAR', False, None, False),
 (9, 'Face_Amount_Band', 'VARCHAR', False, None, False),
 (10, 'Issue_Year', 'INTEGER', False, None, False),
 (11, 'Number_Of_Preferred_Classes', 'DECIMAL(11,1)', False, None, False),
 (12, 'Preferred_Class', 'VARCHAR', False, None, False),
 (13, 'SOA_Anticipated_Level_Term_Period', 'VARCHAR', False, None, False),
 (14, 'SOA_Guaranteed_Level_Term_Period', 'VARCHAR', False, None, False),
 (15, 'SOA_Post_level_Term_Indicator', 'VARCHAR', False, None, False),
 (16, 'Select_Ultimate_Indicator', 'VARCHAR', False, None, False

In [26]:
# Take the column names (tuple position 1) of schema rows 1 through 12, i.e.
# Preferred_Indicator .. Preferred_Class, as the candidate model features.
# NOTE(review): the slice is positional, so it silently shifts if the column
# order of ILEC_DATA changes. It also includes Insurance_Plan, which is constant
# in a view filtered to Insurance_Plan = 'UL' — confirm that is intended.
predictors = [column_row[1] for column_row in res[1:13]]

# Comma-separated form for interpolation into SQL.
predictor_columns = ",".join(predictors)

# Response column and offset column passed to the modeling prompt.
target_column = "Number_Of_Deaths"
offset_column = "ExpDeathQx2015VBTwMI_byPol"

predictor_columns

'Preferred_Indicator,Gender,Smoker_Status,Insurance_Plan,Issue_Age,Duration,Attained_Age,Age_Basis,Face_Amount_Band,Issue_Year,Number_Of_Preferred_Classes,Preferred_Class'

In [27]:
# SQL that defines the UL_MODEL_DATA view: the predictor, target and offset
# columns of ILEC_DATA rows with Insurance_Plan = 'UL', plus a DATASET label
# ('TRAIN' when Observation_Year < 2016, otherwise 'TEST').
# The column lists are interpolated from predictor_columns, target_column and
# offset_column, which must already be defined in the kernel.
MODEL_DATA_QUERY = f"""
create or replace view UL_MODEL_DATA as (
    select
        {predictor_columns},
        {target_column},
        {offset_column},
        case 
            when Observation_Year < 2016 then 'TRAIN'
            else 'TEST'
        end as DATASET
    from ILEC_DATA
    where Insurance_Plan = 'UL'
)
"""

# Create (or replace) the view. The statement yields no rows, so the value
# displayed for vw_res is an empty list.
with Database.get_duckdb_conn() as conn:
    vw_res = conn.execute(MODEL_DATA_QUERY).fetchall()
vw_res

[]

In [29]:
# Read a single row back from UL_MODEL_DATA to inspect the view's columns and
# values. Note that this rebinds vw_res, replacing the result of the
# view-creation statement.
with Database.get_duckdb_conn() as conn:
    vw_res = conn.execute("select * from UL_MODEL_DATA limit 1").fetchall()
vw_res

[('0',
  'Female',
  'NonSmoker',
  'UL',
  0,
  1,
  0,
  'ALB',
  '25000-49999',
  2009,
  Decimal('0.0'),
  None,
  0.0,
  0.0220545389421079,
  'TRAIN')]

### Run Modeling Prompt

In [36]:
# Build the modeling prompt from the view name, the predictor list, the target
# column and the offset column, then display the rendered text.
# NOTE(review): the rendered prompt has no space after sentence-ending periods
# (e.g. "'UL_MODEL_DATA'.First") and says "target or y_var" where "target or
# offset" seems intended — check the template inside ModelingPrompt.
modeling_prompt = ModelingPrompt(
    "UL_MODEL_DATA",
    predictors,
    target_column,
    offset_column
)

str(modeling_prompt)

'The goal is to create a model to predict mortality on the sql table \'UL_MODEL_DATA\'.First, call sql_schema() with \'UL_MODEL_DATA\' as the table_name argument.Use the column \'Number_Of_Deaths\' as the target (y_var) and column \'ExpDeathQx2015VBTwMI_byPol\' as the offset (offset_var), including in calls to cmd_rpart() and cmd_glmnet().If either the target or y_var columns are not present in \'UL_MODEL_DATA\', fail and report your findings.Perform exploratory data analysis on UL_MODEL_DATA using sql_query(). Use the EDA results in model design when possible.These columns in UL_MODEL_DATA are all valid model features: Preferred_Indicator,Gender,Smoker_Status,Insurance_Plan,Issue_Age,Duration,Attained_Age,Age_Basis,Face_Amount_Band,Issue_Year,Number_Of_Preferred_Classes,Preferred_Class.You may perform basic feature engineering via binning continuous variables as categorical or ordinal, but nothing else.Ensure any basic feature engineering tasks are included in sql argument for cmd_cre

In [None]:
# Hand the rendered modeling prompt to the agent; as the last expression of the
# cell, the agent's return value becomes the cell output.
agent.prompt(str(modeling_prompt))

In [2]:
# Training-set extraction query, kept one SQL line per Python line and joined
# with newlines. It selects the model columns from UL_MODEL_DATA, normalises
# null/unknown smoker status, casts the preferred-class count to integer, fills
# missing Preferred_Class with '0', and keeps only TRAIN rows with a positive
# offset and in-range age/duration values.
sql_query = "\n".join((
    "select",
    "  Preferred_Indicator,",
    "  Gender,",
    "  case when Smoker_Status is null or upper(Smoker_Status)='UNKNOWN' then 'Unknown' else Smoker_Status end as Smoker_Status,",
    "  Insurance_Plan,",
    "  Issue_Age,",
    "  Duration,",
    "  Attained_Age,",
    "  Age_Basis,",
    "  Face_Amount_Band,",
    "  Issue_Year,",
    "  cast(Number_Of_Preferred_Classes as integer) as Number_Of_Preferred_Classes,",
    "  case when Preferred_Indicator='0' and Preferred_Class is null then '0' else coalesce(Preferred_Class, '0') end as Preferred_Class,",
    "  Number_Of_Deaths,",
    "  ExpDeathQx2015VBTwMI_byPol",
    "from UL_MODEL_DATA",
    "where DATASET='TRAIN'",
    "  and ExpDeathQx2015VBTwMI_byPol > 0",
    "  and Number_Of_Deaths >= 0",
    "  and Issue_Age between 0 and 97",
    "  and Attained_Age between 0 and 120",
    "  and Duration between 1 and 95",
))
print(sql_query)

select
  Preferred_Indicator,
  Gender,
  case when Smoker_Status is null or upper(Smoker_Status)='UNKNOWN' then 'Unknown' else Smoker_Status end as Smoker_Status,
  Insurance_Plan,
  Issue_Age,
  Duration,
  Attained_Age,
  Age_Basis,
  Face_Amount_Band,
  Issue_Year,
  cast(Number_Of_Preferred_Classes as integer) as Number_Of_Preferred_Classes,
  case when Preferred_Indicator='0' and Preferred_Class is null then '0' else coalesce(Preferred_Class, '0') end as Preferred_Class,
  Number_Of_Deaths,
  ExpDeathQx2015VBTwMI_byPol
from UL_MODEL_DATA
where DATASET='TRAIN'
  and ExpDeathQx2015VBTwMI_byPol > 0
  and Number_Of_Deaths >= 0
  and Issue_Age between 0 and 97
  and Attained_Age between 0 and 120
  and Duration between 1 and 95


In [14]:
# Run the dataset-creation command directly against an R environment created
# from an existing workspace id.
# NOTE(review): the workspace id is hard-coded, and `sql` repeats the query text
# assigned to sql_query in the earlier cell — keep the two in sync.
workspace_id = "02916c65-7b41-40b6-a577-dd62f01f8026"
dataset_name = "model_data_train"
sql = "\n".join((
    "select",
    "  Preferred_Indicator,",
    "  Gender,",
    "  case when Smoker_Status is null or upper(Smoker_Status)='UNKNOWN' then 'Unknown' else Smoker_Status end as Smoker_Status,",
    "  Insurance_Plan,",
    "  Issue_Age,",
    "  Duration,",
    "  Attained_Age,",
    "  Age_Basis,",
    "  Face_Amount_Band,",
    "  Issue_Year,",
    "  cast(Number_Of_Preferred_Classes as integer) as Number_Of_Preferred_Classes,",
    "  case when Preferred_Indicator='0' and Preferred_Class is null then '0' else coalesce(Preferred_Class, '0') end as Preferred_Class,",
    "  Number_Of_Deaths,",
    "  ExpDeathQx2015VBTwMI_byPol",
    "from UL_MODEL_DATA",
    "where DATASET='TRAIN'",
    "  and ExpDeathQx2015VBTwMI_byPol > 0",
    "  and Number_Of_Deaths >= 0",
    "  and Issue_Age between 0 and 97",
    "  and Attained_Age between 0 and 120",
    "  and Duration between 1 and 95",
))

# The environment reports its own workspace id, which is captured separately
# because it can differ from the id passed in (as the displayed result shows).
r_env = create_REnv(workspace_id)
new_workspace_id = r_env.workspace_id

# Positional arguments for cmd_create_dataset are passed as one tuple.
command_args = (dataset_name, sql)
dataset_res = RCmd.run_command(RCmd.cmd_create_dataset, command_args, r_env)

# Cell output: the workspace the command ran in, plus the command's result.
{"workspace_id": new_workspace_id, "result": dataset_res}

INFO:ilec_r_lib:setting up environment
INFO:ilec_r_lib:rsync /home/mike/workspace/soa-ilec/soa-ilec/mcp_agent_work/workspace_02916c65-7b41-40b6-a577-dd62f01f8026->/home/mike/workspace/soa-ilec/soa-ilec/mcp_agent_work/workspace_3a70b9f3-c6b7-4f82-871b-df363b47baa9
  
  
  
Attaching package: ‘dplyr’

  

    filter, lag

  

    intersect, setdiff, setequal, union

  
Attaching package: ‘recipes’

  

    update

  

    step

  
  method                 from    
  as.character.dev_topic generics
  
Attaching package: ‘arrow’

  

    timestamp

  


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.1     ✔ readr     2.1.5
✔ ggplot2   4.0.0     ✔ stringr   1.5.2
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::duration() masks arrow::duration()
✖ tidyr::expand()       masks Matrix::expand()
✖ dplyr::filter()       masks stats::filter()
✖ stringr::fixed()      masks recipes::fixed()
✖ dplyr::lag()          masks stats::lag()
✖ tidyr::pack()         masks Matrix::pack()
✖ tidyr::unpack()       masks Matrix::unpack()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


INFO:ilec_r_lib:tearing down environment
INFO:rpy2.rinterface_lib.embedded:Embedded R ended.


{'workspace_id': '3a70b9f3-c6b7-4f82-871b-df363b47baa9',
 'result': {'success': True, 'result': {'n_rows': 4129417.0}}}