# R workflow for mice imputation

In [4]:
import pandas as pd

Set directory

# Relative path: it is resolved against the directory the R session is currently in.
setwd("Dropbox/Healthcare work Patric")

require mice package

# library() stops with an error when mice is not installed; require() would only
# warn and return FALSE, letting the script fail later at the mice() call.
library(mice)

load dataset

# Read the raw data; empty fields become NA.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps text
# columns as character by default, whereas the imputation methods recorded later
# in this notebook (logreg / polyreg) are the ones mice assigns to factor columns.
dataset <- read.csv(
  "imputation_set_24012018.csv",
  header = TRUE,
  na.strings = "",
  stringsAsFactors = TRUE
)

get overview of dataset incl. number of NAs per variable

# Prints a per-column summary of the data frame.
summary(dataset)

load VIM library for missing data visualization

library(VIM)

# Aggregated missing-value plot. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned, which would silently change the call.
aggr(dataset, prop = FALSE, numbers = TRUE)

<img src="Dropbox/Healthcare work Patric/aggr(dataset, prop = F, numbers = T).png">

# Matrix plot of the dataset, non-interactive. FALSE is spelled out because F is
# an ordinary variable that can be reassigned.
matrixplot(dataset, interactive = FALSE)

<img src="Dropbox/Healthcare work Patric/matrixplot(dataset, interactive = F).png">

# Margin plot restricted to the two columns IK and IDH_TERT.
marginplot(
  dataset[, c("IK", "IDH_TERT")]
)

<img src="Dropbox/Healthcare work Patric/marginplot(dataset[c(IK,IDH_TERT).png">

code IDH_TERT and X1p19q_codel as factors so that R/MICE doesn't treat them as continuous

# Convert the two coded columns to factors, one column at a time.
dataset[["IDH_TERT"]] <- as.factor(dataset[["IDH_TERT"]])

dataset[["X1p19q_codel"]] <- as.factor(dataset[["X1p19q_codel"]])

run mice imputation with 20 iterations

# Only maxit is set here; every other mice() argument is left at its default.
# NOTE(review): no seed is passed, so repeated runs presumably produce different
# imputations — confirm whether a fixed seed is wanted for reproducibility.
imp <- mice(dataset, maxit = 20)

inspect methods used for imputing each variable

# Prints the `method` element of the mice result: one entry per column.
# An empty string presumably marks a column that was not imputed — confirm.
imp$method

In [9]:
#Gender      Tumor_type     Tumor_grade        Gene_P53       Gene_Mgmt       Gene_Egfr       Gene_Mdm2 
#    ""              ""              ""       "polyreg"        "logreg"        "logreg"        "logreg" 
#     Gene_Cdk4        Gene_P16   Gene_Ihc_Atrx      Gene_Ch10Q       Gene_Ch9P  Tumor_Location  Tumor_Position 
#       "logreg"        "logreg"       "polyreg"        "logreg"       "polyreg"       "polyreg"       "polyreg" 
#   Surgery_type     Age_surgery life_expectancy             IDH            TERT        IDH_TERT    X1p19q_codel 
#      "polyreg"              ""              ""       "polyreg"        "logreg"       "polyreg"        "logreg" 
#             IK 
#          "pmm" 

Inspect convergence

# Default plot of the mice result, used here to judge convergence.
plot(imp)

<img src="Dropbox/Healthcare work Patric/plot(imp).png">

# Strip plot of the mice result; pch and cex set the point symbol and point size.
stripplot(imp, pch = 20, cex = 1.2)

<img src="Dropbox/Healthcare work Patric/stripplot(imp, pch = 20, cex = 1.2).png">

export the imputed dataset

# NOTE(review): complete() is called without an `action` argument, so presumably
# only the first of the imputed datasets is exported — confirm this is intended.
# NOTE(review): the output file name carries no ".csv" extension.
write.csv(complete(imp), file = "imputed_dataset_no_censoring_16022018")

# Amelia imputation

# The path has to be a quoted string; left unquoted, this line does not parse.
# The path is relative, so it is resolved against the session's current directory.
setwd("Dropbox/Healthcare work Patric")

# Read the uncensored dataset; empty fields become NA.
dataset <- read.csv(
  "imputation_dataset_no_censoring_24022018",
  header = TRUE,
  na.strings = ""
)

load Amelia

library(Amelia)

run imputation

noms = categorical variables

ords = ordinal variables

idvars = identification variables that are kept in the output but left out of the imputation model

If Amelia is run on the full dataset, it throws an error; you need to remove either Tumor_type or Tumor_grade.
Since removing Tumor_type converges faster, I removed Tumor_type by adding it to the idvars option.
Since neither Tumor_type nor Tumor_grade has missing data, this does not hinder us at all.

# Single imputation (m = 1) with Amelia.
#   noms   - columns declared as nominal
#   ords   - columns declared as ordinal
#   idvars - columns passed as identification variables
a.out <- amelia(
  dataset,
  m = 1,
  noms = c(
    "Gender", "Tumor_grade", "Gene_P53", "Gene_Mgmt", "Gene_Egfr",
    "Gene_Mdm2", "Gene_Cdk4", "Gene_P16", "Gene_Ihc_Atrx", "Gene_Ch10Q",
    "Gene_Ch9P", "Tumor_Location", "Tumor_Position", "Surgery_type",
    "IDH", "TERT", "X1p19q_codel"
  ),
  ords = c("IDH_TERT"),
  idvars = c("RX", "CHEM", "Tumor_type")
)

get key messages post imputation, including "Normal EM convergence"

# Auto-prints the object returned by amelia().
a.out

get full summary of output dataset

# Prints the summary of the amelia() result.
summary(a.out)

Post imputation diagnostic graphs

# Diagnostic plots limited to variables 4 through 15.
plot(a.out, which.vars = 4:15)

# Overimputation diagnostic for IDH_TERT.
overimpute(a.out, var = "IDH_TERT")

# Overimputation diagnostic for IK.
overimpute(a.out, var = "IK")

export dataset. Here a.out contains only one imputed dataset. If it contained 5, this would export each of them in a csv 
named file.stem+1, file.stem+2...

# Write the imputed data to disk; file.stem is the base name of the output file(s).
write.amelia(
  obj = a.out,
  file.stem = "imputed_dataset_no_censoring_24022018_Amelia"
)

We repeat the same with the censored dataset. However, this time Amelia throws a collinearity error again, even when run
with the same parameters as for the non-censored dataset.
As such, we pass "life_expectancy" as an idvars entry to avoid the collinearity issue.

# Load the censored dataset first. Without this, `dataset` still holds the
# uncensored data read earlier, so the run would impute the wrong data and the
# result would then be exported under a "with_censoring" name.
dataset <- read.csv(
  "imputation_dataset_with_censoring_24022018",
  header = TRUE,
  na.strings = ""
)

# Single imputation (m = 1) with Amelia on the censored data.
# life_expectancy is passed in idvars together with Tumor_type.
# NOTE(review): unlike the uncensored run, "RX" and "CHEM" are not listed in
# idvars here — confirm that is intended.
a.out <- amelia(
  dataset,
  m = 1,
  noms = c(
    "Gender", "Tumor_grade", "Gene_P53", "Gene_Mgmt", "Gene_Egfr",
    "Gene_Mdm2", "Gene_Cdk4", "Gene_P16", "Gene_Ihc_Atrx", "Gene_Ch10Q",
    "Gene_Ch9P", "Tumor_Location", "Tumor_Position", "Surgery_type",
    "IDH", "TERT", "X1p19q_codel"
  ),
  ords = c("IDH_TERT"),
  idvars = c("Tumor_type", "life_expectancy")
)

and export it

# Write the imputed data to disk; file.stem is the base name of the output file(s).
write.amelia(
  obj = a.out,
  file.stem = "imputed_dataset_with_censoring_26022018_Amelia"
)

## KNN Imputation

# kNN imputation of the uncensored dataset.
dataset <- read.csv(
  "imputation_dataset_no_censoring_24022018",
  header = TRUE,
  na.strings = ""
)

# Code the two columns as factors before imputing.
dataset$IDH_TERT <- as.factor(dataset$IDH_TERT)

dataset$X1p19q_codel <- as.factor(dataset$X1p19q_codel)

# k = 10 nearest neighbours.
imp <- kNN(dataset, k = 10)

# The input is the no_censoring file, so the output is named to match. It was
# previously written under a "with_censoring" name, which mislabelled the result.
write.csv(imp, "imputed_dataset_no_censoring_16.02.2018_kNN.csv")

Same but for censored dataset

# kNN imputation of the censored dataset.
dataset <- read.csv(
  "imputation_dataset_with_censoring_24022018",
  header = TRUE,
  na.strings = ""
)

# Code the two columns as factors before imputing.
dataset[["IDH_TERT"]] <- as.factor(dataset[["IDH_TERT"]])
dataset[["X1p19q_codel"]] <- as.factor(dataset[["X1p19q_codel"]])

# k = 10 nearest neighbours.
imp <- kNN(dataset, k = 10)

write.csv(imp, "imputed_dataset_with_censoring_26022018_kNN.csv")