# project.yaml

# Pipeline format version; quoted so it is read as a string, not a float.
version: '3.0'

# Settings for expectations (dummy) data.
# NOTE(review): population_size is understood to be the number of dummy
# patients generated — confirm against the OpenSAFELY pipeline docs.
expectations:
  population_size: 1000

actions:

# study cohort

  # Extract the study cohort defined by study_definition_covid_admission.
  generate_study_population_covid_admission:
    run: >-
      cohortextractor:latest generate_cohort
      --study-definition study_definition_covid_admission
    outputs:
      highly_sensitive:
        cohort: output/input_covid_admission.csv

  # Run analysis/process_1.R on the extracted cohort; it writes the case and
  # control CSVs listed below.
  process_1:
    run: r:latest analysis/process_1.R
    needs:
      - generate_study_population_covid_admission
    outputs:
      highly_sensitive:
        case: output/case_covid_icu_death.csv
        # case2: output/case_covid_icu_death_2.csv
        control: output/control_covid_hosp.csv
        # control2: output/control_covid_hosp_2.csv

  # Render analysis/check_process_1.Rmd; produces an HTML report and a CSV.
  check_process_1:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_process_1.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - generate_study_population_covid_admission
    outputs:
      moderately_sensitive:
        html: output/check_process_1.html
        csv: output/check_process_1.csv


# matching

  # R MatchIt matching with replacement.
  # Renders analysis/matching.Rmd; the HTML report is reviewable, the matched
  # and unmatched patient files are kept highly sensitive.
  matching:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/matching.Rmd",
      knit_root_dir = "/workspace",
      output_dir="/workspace/output")'
    needs:
      - process_1
    outputs:
      moderately_sensitive:
        html: output/matching.html
      highly_sensitive:
        rds1: output/matched_patients.rds
        rds2: output/unmatched_cases.rds
        csv: output/matched_patients_id.csv
        
  # Render analysis/check_unmatched.Rmd to an HTML report; runs after matching.
  check_unmatched:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_unmatched.Rmd",
      knit_root_dir = "/workspace",
      output_dir="/workspace/output")'
    needs:
      - matching
    outputs:
      moderately_sensitive:
        html: output/check_unmatched.html

  # Confounders: extract the cohort defined by study_definition_outcome.
  extract_variables:
    run: >-
      cohortextractor:latest generate_cohort
      --study-definition study_definition_outcome
      --with-end-date-fix
    needs:
      - matching
    outputs:
      highly_sensitive:
        cohort: output/input_outcome.csv

  # Confounders: run analysis/process_Rmatching.R on the outcome extract and
  # the matching outputs.
  process_Rmatching:
    run: r:latest analysis/process_Rmatching.R
    needs:
      - extract_variables
      - matching
    outputs:
      highly_sensitive:
        cohort1: output/matched_outcome.rds
        # filter died & de-registered again
        cohort2: output/matched_outcome_check.rds
        rds1: output/abtype79.rds
        rds2: output/comor17.rds

# extract ab for RF
  # Exposure variables: extract the cohort defined by study_definition_ab_time.
  extract_variables_ab_time:
    run: >-
      cohortextractor:latest generate_cohort
      --study-definition study_definition_ab_time
      --with-end-date-fix
    needs:
      - matching
    outputs:
      highly_sensitive:
        cohort: output/input_ab_time.csv

  # Exposures: merge ab time with matched patients.
  # Renders analysis/process_ab_time.Rmd and writes the merged data set.
  process_ab_time:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/process_ab_time.Rmd",
      knit_root_dir = "/workspace",
      output_dir="/workspace/output")'
    needs:
      - extract_variables_ab_time
      - process_Rmatching
    outputs:
      moderately_sensitive:
        html: output/process_ab_time.html
      highly_sensitive:
        rds: output/matched_ab.rds

  # Merge 79 types of ab, then split into train and valid sets.
  # Renders analysis/model_RF_process.Rmd.
  model_RF_process:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_process.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
      - process_Rmatching
    outputs:
      moderately_sensitive:
        html: output/model_RF_process.html
      highly_sensitive:
        rds1: output/train_X.rds
        rds2: output/train_Y.rds
        rds3: output/valid_X.rds
        rds4: output/valid_Y.rds
        rds5: output/abtype.rds


# train model

  # Pick variables for model training.
  # Renders analysis/model_RandomForest.Rmd; only the HTML report is placed in
  # the reviewable (moderately_sensitive) tier.
  model_RandomForest:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RandomForest.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RandomForest.html
        # csv1: output/var_tree.csv
      highly_sensitive:
        # Saved R object read by the model_RandomForest_check/_tree/_decile
        # actions; it is an intermediate, not a report for release.
        # NOTE(review): presumably the fitted model embeds row-level training
        # data — confirm before ever listing it as moderately_sensitive.
        rds: output/model_RandomForest.rds

  # Check performance: renders analysis/model_RandomForest_check.Rmd to HTML.
  model_RandomForest_check:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RandomForest_check.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
      - model_RandomForest
    outputs:
      moderately_sensitive:
        html: output/model_RandomForest_check.html

  # Check tree: renders analysis/model_RandomForest_tree.Rmd to HTML.
  model_RandomForest_tree:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RandomForest_tree.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
      - model_RandomForest
    outputs:
      moderately_sensitive:
        html: output/model_RandomForest_tree.html

  # Create decile groups for probabilities.
  # Renders analysis/model_RandomForest_decile.Rmd; only the HTML report is
  # placed in the reviewable (moderately_sensitive) tier.
  model_RandomForest_decile:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RandomForest_decile.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
      - model_RandomForest
      - process_ab_time
      - process_Rmatching
    outputs:
      moderately_sensitive:
        html: output/model_RandomForest_decile.html
      highly_sensitive:
        # Intermediate data sets read by the model and model_clogit actions.
        # NOTE(review): presumably one row per patient — confirm; row-level
        # data must not sit in the moderately_sensitive tier.
        rds1: output/development.rds
        rds2: output/validation.rds

  # Conditional logistic regression for decile groups.
  # Renders analysis/model.Rmd to HTML.
  model:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RandomForest_decile
    outputs:
      moderately_sensitive:
        html: output/model.html

# distinct  
  # Pick variables for model training, using distinct patients.
  # Renders analysis/model_RF_distinct.Rmd; only the HTML report is placed in
  # the reviewable (moderately_sensitive) tier.
  model_RF_distinct:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_distinct.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RF_distinct.html
      highly_sensitive:
        # Saved R object read by model_RF_distinct_check; an intermediate, not
        # a report for release.
        # NOTE(review): presumably the fitted model embeds row-level training
        # data — confirm before ever listing it as moderately_sensitive.
        rds: output/model_RF_distinct.rds

  # Render analysis/model_RF_distinct_check.Rmd to HTML; depends on
  # model_RF_process and model_RF_distinct.
  model_RF_distinct_check:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_distinct_check.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
      - model_RF_distinct
    outputs:
      moderately_sensitive:
        html: output/model_RF_distinct_check.html

# ab users 
  # Pick variables for model training (ab users).
  # NOTE(review): the previous inline comment said "distinct patients", which
  # looks copied from model_RF_distinct — confirm the intended population.
  # Renders analysis/model_RF_ab.Rmd; only the HTML report is placed in the
  # reviewable (moderately_sensitive) tier.
  model_RF_ab:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_ab.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RF_ab.html
      highly_sensitive:
        # Saved R object read by model_RF_ab_check; an intermediate, not a
        # report for release.
        # NOTE(review): presumably the fitted model embeds row-level training
        # data — confirm before ever listing it as moderately_sensitive.
        rds: output/model_RF_ab.rds

  # Render analysis/model_RF_ab_check.Rmd to HTML; depends on model_RF_process
  # and model_RF_ab.
  model_RF_ab_check:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_ab_check.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
      - model_RF_ab
    outputs:
      moderately_sensitive:
        html: output/model_RF_ab_check.html

# conditional logistic regression
  # Conditional logistic regression for expo variables.
  # Renders analysis/model_clogit.Rmd to HTML.
  model_clogit:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_clogit.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RandomForest_decile
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_clogit.html


  # main analysis
 
  # Run analysis/table1.R; writes the unmatched, matched and random table 1
  # CSVs.
  table1_round:
    run: r:latest analysis/table1.R
    needs:
      - process_1
      - process_Rmatching
    outputs:
      moderately_sensitive:
        csv1: output/table1_unmatched.csv
        csv2: output/table1_matched.csv
        csv3: output/table1_random.csv

  # Run analysis/table2.R; writes the matched and random table 2 CSVs.
  table2_round:
    run: r:latest analysis/table2.R
    needs:
      - process_Rmatching
    outputs:
      moderately_sensitive:
        csv1: output/table2_matched.csv
        csv3: output/table2_random.csv
  
  # Baseline table of exposure variables (training & validation).
  # Runs analysis/table3.R.
  table3_round:
    run: r:latest analysis/table3.R
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        csv1: output/table3_train.csv
        csv2: output/table3_valid.csv
        csv3: output/table3_all.csv

        
# variables check
  # Check input: renders analysis/check_variables.Rmd to HTML.
  check_variables:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_variables.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/check_variables.html

# test each variables

  # Length mean: renders analysis/model_RF_length.Rmd to HTML.
  model_RF_length:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_length.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RF_length.html

  # Length cv: renders analysis/model_RF_length_CV.Rmd to HTML.
  model_RF_length_CV:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_length_CV.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RF_length_CV.html

  # Interval mean: renders analysis/model_RF_interval.Rmd to HTML.
  model_RF_interval:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_interval.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_RF_interval.html


  # Render analysis/model_tuneRF.Rmd to HTML; depends on model_RF_process.
  model_tuneRF:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_tuneRF.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - model_RF_process
    outputs:
      moderately_sensitive:
        html: output/model_tuneRF.html

  # Render analysis/check_ab_time.Rmd to HTML; depends on process_ab_time.
  check_ab_time:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_ab_time.Rmd",
      knit_root_dir = "/workspace",
      output_dir="/workspace/output")'
    needs:
      - process_ab_time
    outputs:
      moderately_sensitive:
        html: output/check_ab_time.html
      # highly_sensitive:
      #   rds: output/matched_patients_monthly_ab.rds

  # Render analysis/check_RF_grid.Rmd to HTML; depends on process_ab_time.
  check_RF_grid:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_RF_grid.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
    outputs:
      moderately_sensitive:
        html: output/check_RF_grid.html

  # Render analysis/check_RF.Rmd to HTML; depends on process_ab_time.
  check_RF:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/check_RF.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
    outputs:
      moderately_sensitive:
        html: output/check_RF.html

  # Render analysis/model_RF.Rmd to HTML; depends on process_ab_time.
  model_RF:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
    outputs:
      moderately_sensitive:
        html: output/model_RF.html

  # Random sampling by subclass.
  # Renders analysis/model_RF_process_subclass.Rmd to HTML.
  model_RF_process_subclass:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_process_subclass.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
    outputs:
      moderately_sensitive:
        html: output/model_RF_process_subclass.html

  # Check sample method.
  # Renders analysis/model_RF_process_check_sample.Rmd to HTML.
  model_RF_process_check_sample:
    run: >-
      r:latest -e
      'rmarkdown::render("analysis/model_RF_process_check_sample.Rmd",
      knit_root_dir = "/workspace",
      output_dir = "output")'
    needs:
      - process_ab_time
      - process_Rmatching
    outputs:
      moderately_sensitive:
        html: output/model_RF_process_check_sample.html

# #check

#   process_filter_ab: # filter ab users
#     run: r:latest -e 'rmarkdown::render("analysis/process_filter_ab.Rmd", knit_root_dir = "/workspace", output_dir="/workspace/output")'
#     needs: [process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/process_filter_ab.html
#       highly_sensitive: 
#         csv: output/matched_patients_id_ab.csv

#   extract_variables_ab_yr1: 
#     run: cohortextractor:latest generate_cohort --study-definition study_definition_ab_yr1 --with-end-date-fix
#     needs: [process_filter_ab]
#     outputs:
#       highly_sensitive:
#         cohort: output/input_ab_yr1.csv

#   extract_variables_ab_yr2: 
#     run: cohortextractor:latest generate_cohort --study-definition study_definition_ab_yr2 --with-end-date-fix
#     needs: [process_filter_ab]
#     outputs:
#       highly_sensitive:
#         cohort: output/input_ab_yr2.csv

#   extract_variables_ab_yr3: 
#     run: cohortextractor:latest generate_cohort --study-definition study_definition_ab_yr3 --with-end-date-fix
#     needs: [process_filter_ab]
#     outputs:
#       highly_sensitive:
#         cohort: output/input_ab_yr3.csv

#   extract_variables_ab_yr3_15d: 
#     run: cohortextractor:latest generate_cohort --study-definition study_definition_ab_yr3_15d --with-end-date-fix
#     needs: [process_filter_ab]
#     outputs:
#       highly_sensitive:
#         cohort: output/input_ab_yr3_15d.csv


#   process_merge_ab: # merge 1-2-3 year ab 
#     run: r:latest -e 'rmarkdown::render("analysis/process_merge_ab.Rmd", knit_root_dir = "/workspace", output_dir="/workspace/output")'
#     needs: [process_Rmatching,extract_variables_ab_yr3_15d, extract_variables_ab_yr3,extract_variables_ab_yr2,extract_variables_ab_yr1]
#     outputs:
#       moderately_sensitive:
#         html: output/process_merge_ab.html
#       highly_sensitive: 
#         rds: output/matched_patients_monthly_ab.rds

#   check_ab_yr1:
#     run: r:latest -e 'rmarkdown::render("analysis/check_ab_yr1.Rmd", knit_root_dir = "/workspace", output_dir="/workspace/output")'
#     needs: [extract_variables_ab_yr1,matching,process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_ab_yr1.html

#   check_ab_yr3:
#     run: r:latest -e 'rmarkdown::render("analysis/check_ab_yr3.Rmd", knit_root_dir = "/workspace", output_dir="/workspace/output")'
#     needs: [process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_ab_yr3.html
 
#   check_abtype:
#     run: r:latest -e 'rmarkdown::render("analysis/check_abtype.Rmd", knit_root_dir = "/workspace", output_dir="/workspace/output")'
#     needs: [process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_abtype.html

#   check_process_1: 
#     run: r:latest -e 'rmarkdown::render("analysis/check_process_1.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
#     needs: [generate_study_population_covid_primarycare,generate_study_population_covid_SGSS,generate_study_population_covid_admission]
#     outputs:
#       moderately_sensitive:
#         html: output/check_process_1.html

#   check_RF: 
#     run: r:latest -e 'rmarkdown::render("analysis/check_RF.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
#     needs: [process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_RF.html
  
#   check_RF_grid: 
#     run: r:latest -e 'rmarkdown::render("analysis/check_RF_grid.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
#     needs: [process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_RF_grid.html
  
#   check_RF_yr1: 
#     run: r:latest -e 'rmarkdown::render("analysis/check_RF_yr1.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
#     needs: [extract_variables_ab_yr1,matching,process_Rmatching]
#     outputs:
#       moderately_sensitive:
#         html: output/check_RF_yr1.html

  # model_RF_clust: # use proximity
  #   run: r:latest -e 'rmarkdown::render("analysis/model_RF_clust.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
  #   needs: [model_RandomForest_decile,model_RF_process]
  #   outputs:
  #     moderately_sensitive:
  #       html: output/model_RF_clust.html
  # #      csv1: output/var_tree.csv
  #       rds: output/model_clust.rds


  # model_tuneRF: #mtry, 
  #   run: r:latest -e 'rmarkdown::render("analysis/model_tuneRF.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
  #   needs: [model_RF_process]
  #   outputs:
  #     moderately_sensitive:
  #       html: output/model_tuneRF.html

  # model_RF_training: #
  #   run: r:latest -e 'rmarkdown::render("analysis/model_RF_training.Rmd", knit_root_dir = "/workspace", output_dir = "output")'
  #   needs: [model_RF_process]
  #   outputs:
  #     moderately_sensitive:
  #       html: output/model_RF_training.html