In [2]:
library(tidyverse)
library(dplyr)
#library(knitr)
#library(kableExtra)
# handle missing data
#library(VIM) # visualization
#library(mice) # imputation

In [3]:
library(glue)

In [4]:
# Directory that holds the project data files.
# Defaults to the original author's local path; set the NNNS_DATA_DIR
# environment variable to run the notebook on another machine without editing
# the code.
data_dir <- Sys.getenv(
  "NNNS_DATA_DIR",
  unset = '/Users/u1001626/Documents/PhD/Statistical Practice/Projects/Project2/phs7050_neoNateCHD'
)

# Raw NNNS score data, exactly as read from disk (cleaned in the next cell).
nnns0 <- read.csv(file.path(data_dir, "NNNS_score_data.csv"))

In [5]:
# ---- clean up variable names ----
# The raw column names contain runs of dots; turn them into tidy snake-ish
# names in two passes:
# 1. drop any run of dots at the very end of a name
colnames(nnns0) <- gsub("\\.+$", "", colnames(nnns0))
# 2. collapse every remaining run of dots into a single underscore
colnames(nnns0) <- gsub("\\.+", "_", colnames(nnns0))

# summary of the data
# summary(nnns0)

# ---- build the analysis data set ----
# nnns = nnns0 restricted to the study population, with binary variables
# recoded to 0/1, cardiac anatomy collapsed to three levels, date columns
# parsed, and columns that are no longer needed removed.
nnns <- nnns0 |>

  filter(
    # exclude neurologic or airway anomaly
    # (rows where either flag is NA are dropped as well, since filter() only
    # keeps rows whose condition is TRUE)
    Neurologic_Complication == 0, AirwayAnomalyYN == 0,
    # include infants from birth to 4 weeks old
    # there are two outliers with age at surgery > 30 days
    # NOTE(review): 4 weeks is 28 days but the cutoff used here is 30 days;
    # confirm which one the protocol intends.
    Age_at_Surgery_days <= 30
  ) |>

  mutate(

    # some binary variables have values of 1/2 or Y/N, recode them to 0/1
    Female = as.integer(sex_1_M_2_F == 2),
    Premature = as.integer(Premature == 1),
    Extubation_failure = as.integer(Extubation_failure == "Y"),

    # for model building purposes, combine the 2 levels w/o arch obstruction
    # in cardiac anatomy: original codes 1 and 3 -> level 1, code 2 -> level 2,
    # code 4 -> level 3. Any other code matches no case_when() branch and
    # becomes NA.
    Cardiac_Anatomy = factor(
      case_when(
        Cardiac_Anatomy %in% c(1, 3) ~ 1,
        Cardiac_Anatomy == 2 ~ 2,
        Cardiac_Anatomy == 4 ~ 3
      ),
      levels = 1:3,
      labels = c(
        "W/o arch obstruction",
        "Single ventricle w/ arch obstruction",
        "Two ventricle w/ arch obstruction"
      )
    )

  ) |>

  # convert date variables (stored as month/day/4-digit-year text) to date
  # class; across() replaces the superseded mutate_at()
  mutate(across(
    all_of(c(
      "Date_PO_feeds_started",
      "Date_Reaching_Full_PO",
      "Date_Identified_as_not_yet_full_PO"
    )),
    \(x) as_date(x, format = "%m/%d/%Y")
  )) |>

  # drop unnecessary variables
  select(!all_of(c(
    "sex_1_M_2_F", # use Female instead
    "Intubated_Pre_operatively", "bypass_used", "bypass_time_min", # not of interest
    "Neurologic_Complication", "AirwayAnomalyYN" # already excluded
  )))

In [7]:
# preview the first rows of the cleaned data set
nnns |> head()

Unnamed: 0_level_0,Age_at_Surgery_days,Premature,Genetic_Syndrome_or_Chromosomal_Abnormality,Cardiac_Anatomy,GI_Complication,Length_of_Stay_days,Length_of_intubation_days,Extubation_failure,Pre_Op_NNNS_habituation_score,Pre_Op_NNNS_attention_score,⋯,Post_Op_NNNS_Regulation_Score,Post_Op_NNNS_Non_Optimal_Reflexes_Score,Post_Op_NNNS_Stress_Score,Post_Op_NNNS_Arousal_Score,Post_Op_NNNS_Hypertonic_Score,Post_Op_NNNS_Hypotonic_Score,Post_Op_NNNS_Asymmetry_Score,Post_Op_NNNS_Excitability_Score,Post_Op_NNNS_Lethargy_Score,Female
Unnamed: 0_level_1,<int>,<int>,<int>,<fct>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,⋯,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>
1,9,1,0,W/o arch obstruction,0,23,1.3,0,,,⋯,5.4,2,0.04081633,4.0,1,0,1,2,5,1
2,8,1,0,W/o arch obstruction,0,46,5.8,1,,3.566833,⋯,5.0,0,0.0,3.428571,0,0,1,3,4,0
3,4,1,0,Single ventricle w/ arch obstruction,0,19,5.8,0,,3.418106,⋯,5.714286,3,0.06122449,3.714286,0,0,1,3,3,0
4,14,1,0,Two ventricle w/ arch obstruction,0,25,4.8,0,,,⋯,5.428571,3,0.08163265,3.142857,0,0,1,2,4,1
5,7,1,1,Two ventricle w/ arch obstruction,0,37,3.1,0,,3.946078,⋯,3.384615,2,0.02040816,3.285714,0,0,0,5,5,0
6,7,1,0,Single ventricle w/ arch obstruction,0,18,2.8,0,,,⋯,4.4,3,0.16326531,3.428571,0,0,5,3,7,0


In [8]:
# Data dictionary: one row per analysis variable, pairing the column name used
# in the data (`name`) with the text shown in tables and figures (`label`).
# Written as a single name = label map so each pair sits on one line; the
# helper vector stays local and only `dict_nnns` is created.
dict_nnns <- local({

  labels_by_name <- c(
    # primary outcome
    "Percent_of_feeds_taken_by_mouth_at_discharge" = "Percentage of oral feed at discharge",

    # predictor of interest
    "Pre_Op_NNNS_attention_score" = "Pre-op attention",
    "Post_Op_NNNS_attention_score" = "Post-op attention",

    # 8 infant/surgery characteristics
    "Age_at_Surgery_days" = "Age at surgery in days",
    "Female" = "Female",
    "Genetic_Syndrome_or_Chromosomal_Abnormality" = "Genetic Syndrome / Chromosomal Abnormality",
    "Cardiac_Anatomy" = "Cardiac Anatomy",
    "GI_Complication" = "Gastrointestinal Complication",
    "Length_of_Stay_days" = "Length of Stay in days",
    "Length_of_intubation_days" = "Length of intubation in days",
    "Extubation_failure" = "Extubation failure",

    # 12 pre-op non-attention NNNS scores
    "Pre_Op_NNNS_habituation_score" = "Pre-op habituation",
    "Pre_Op_NNNS_handling_score" = "Pre-op handling",
    "Pre_Op_NNNS_Quality_of_Movement_Score" = "Pre-op quality of movement",
    "Pre_Op_NNNS_Regulation_Score" = "Pre-op regulation",
    "Pre_Op_NNNS_Non_Optimal_Reflexes_Score" = "Pre-op non-optimal reflexes",
    "Pre_Op_NNNS_Stress_Score" = "Pre-op stress",
    "Pre_Op_NNNS_Arousal_Score" = "Pre-op arousal",
    "Pre_Op_NNNS_Hypertonic_Score" = "Pre-op hypertonic",
    "Pre_Op_NNNS_Hypotonic_Score" = "Pre-op hypotonic",
    "Pre_Op_NNNS_Asymmetry_Score" = "Pre-op asymmetry",
    "Pre_Op_NNNS_Excitability_Score" = "Pre-op excitability",
    "Pre_Op_NNNS_Lethargy_Score" = "Pre-op lethargy",

    # 12 post-op non-attention NNNS scores
    "Post_Op_NNNS_habituation_score" = "Post-op habituation",
    "Post_Op_NNNS_handling_score" = "Post-op handling",
    "Post_Op_NNNS_Quality_of_Movement_Score" = "Post-op quality of movement",
    "Post_Op_NNNS_Regulation_Score" = "Post-op regulation",
    "Post_Op_NNNS_Non_Optimal_Reflexes_Score" = "Post-op non-optimal reflexes",
    "Post_Op_NNNS_Stress_Score" = "Post-op stress",
    "Post_Op_NNNS_Arousal_Score" = "Post-op arousal",
    "Post_Op_NNNS_Hypertonic_Score" = "Post-op hypertonic",
    "Post_Op_NNNS_Hypotonic_Score" = "Post-op hypotonic",
    "Post_Op_NNNS_Asymmetry_Score" = "Post-op asymmetry",
    "Post_Op_NNNS_Excitability_Score" = "Post-op excitability",
    "Post_Op_NNNS_Lethargy_Score" = "Post-op lethargy"
  )

  # unname() keeps the variable names out of the data frame's row names
  data.frame(
    name = names(labels_by_name),
    label = unname(labels_by_name)
  )
})

# Named groups of variables, so later code can refer to a whole set at once.

# infant / surgery characteristics
basevar <- c(
  "Age_at_Surgery_days",
  "Female",
  "Genetic_Syndrome_or_Chromosomal_Abnormality",
  "Cardiac_Anatomy",
  "GI_Complication",
  "Length_of_Stay_days",
  "Length_of_intubation_days",
  "Extubation_failure"
)

# every dictionary entry whose name contains the pre-op / post-op NNNS prefix
# (this includes the attention scores)
preop_nnns <- grep("Pre_Op_NNNS", dict_nnns$name, value = TRUE)
postop_nnns <- grep("Post_Op_NNNS", dict_nnns$name, value = TRUE)

# Replace the column names of `data` with their display labels.
#
# Arguments:
#   data  object whose colnames() should be relabelled (typically a data frame)
#   dict  lookup table with a `name` column (column names as they appear in the
#         data) and a `label` column (display text); defaults to the notebook's
#         dict_nnns, so existing one-argument calls keep working
#
# Returns `data` with every column name found in dict$name replaced by the
# corresponding dict$label; names with no dictionary entry are left unchanged.
label_data <- function(data, dict = dict_nnns) {
  current <- colnames(data)
  # position of each column name in the dictionary, NA when it has no entry
  position <- match(current, dict$name)
  colnames(data) <- ifelse(is.na(position), current, dict$label[position])
  data
}

In [11]:
# Table of variables with missing data: one row per non-date column of `nnns`
# that has at least one NA, giving the count (nmiss) and the percentage of
# rows (perc), sorted from most to least missing.
missing_table <- data.frame(
  nmiss = colSums(is.na(nnns |> select(-starts_with("Date"))))
) |>
  # only keep variables with missing data
  filter(nmiss > 0) |>
  # add percentage
  mutate(perc = round(nmiss / nrow(nnns) * 100, 1)) |>
  # add variable names
  rownames_to_column("variable") |>
  # match variable names with labels; a column that has no entry in dict_nnns
  # keeps its raw name instead of turning into NA, so its row stays
  # identifiable
  mutate(variable = coalesce(
    dict_nnns$label[match(variable, dict_nnns$name)],
    variable
  )) |>
  # sort by percentage
  arrange(desc(perc))

In [14]:
# save the missing-data table to the working directory, without row names
write.csv(
  missing_table,
  file = "missing_table.csv",
  row.names = FALSE
)