In [5]:
library(tidyverse)
library(MASS)
library(dplyr)
library(ggplot2)

In [2]:
library(glue)

## Haojia's data cleaning

In [3]:
data_dir <- '/Users/u1001626/Documents/PhD/Statistical Practice/Projects/Project2/phs7050_neoNateCHD'
nnns0 <- read.csv(glue("{data_dir}/NNNS_score_data.csv"))

In [6]:
# clean up variable names
# remove the dots at the end of variable names
colnames(nnns0) <- gsub("\\.+$", "", colnames(nnns0))
# replace all the dots in variable names with underscores
colnames(nnns0) <- gsub("\\.+", "_", colnames(nnns0))

nnns <- nnns0 |> 
  
  filter(
    # exclude neurologic or airway anomaly
    Neurologic_Complication == 0, AirwayAnomalyYN == 0,
    # include infants from birth to 4 weeks old
    # there are two outliers with age at surgery > 30 days
    Age_at_Surgery_days <= 30
  ) |>
  
  mutate(
    
    # some binary variables have values of 1/2 or Y/N, recode them to 0/1
    Female = as.integer(sex_1_M_2_F == 2),
    Premature = as.integer(Premature == 1),
    Extubation_failure = as.integer(Extubation_failure == "Y"),
    
    # for model building purposes, combine the 2 levels w/o arch obstruction in cardiac anatomy
    Cardiac_Anatomy = factor(case_when(
      Cardiac_Anatomy %in% c(1,3) ~ 1,
      Cardiac_Anatomy == 2 ~ 2,
      Cardiac_Anatomy == 4 ~ 3
    ), levels = 1:3, labels = c("W/o arch obstruction", "Single ventricle w/ arch obstruction", "Two ventricle w/ arch obstruction"))
    
  ) |>
  
  # convert date variables to date class
  mutate_at(
    vars("Date_PO_feeds_started", "Date_Reaching_Full_PO", "Date_Identified_as_not_yet_full_PO"), 
    as_date, format = "%m/%d/%Y"
  ) |> 
  
  # drop unnecessary variables
  dplyr::select(!c(
    "sex_1_M_2_F", # use Female instead
    # "Intubated_Pre_operatively", "bypass_used", "bypass_time_min", # not of interest 
    "Neurologic_Complication", "AirwayAnomalyYN" # already excluded
  )) 

# dichotomize attention scores
nnns <- nnns |>
  mutate(Pre_op_attention_2cat = ifelse(is.na(Pre_Op_NNNS_attention_score) | Pre_Op_NNNS_attention_score <= 3, 0, 1),
         Post_op_attention_2cat = ifelse(is.na(Post_Op_NNNS_attention_score) | Post_Op_NNNS_attention_score <= 4, 0, 1))

saveRDS(nnns, "nnns_attention_2cat.rds")

In [9]:
# variables with missing data
data.frame(nmiss = colSums(is.na(nnns |> select(-starts_with("Date"))))) |>
  # only keep variables with missing data
  filter(nmiss > 0) |>
  # add percentage
  mutate(perc = round(nmiss / nrow(nnns) * 100, 1)) |>
  # add variable names
  rownames_to_column("variable") |>
  # match variable names with labels
  mutate(variable = dict_nnns$label[match(variable, dict_nnns$name)]) |>
  # sort by percentage
  arrange(desc(perc)) |>
  kable() |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

ERROR: Error in kable_styling(kable(arrange(mutate(rownames_to_column(mutate(filter(data.frame(nmiss = colSums(is.na(select(nnns, : could not find function "kable_styling"
