In [8]:
#install.packages('stringr')

In [9]:
#Set your working directory
setwd("/home/mxenoc/workspace/multiomics-benchmark/data/RData")

#Import libraries
library(data.table)
library(stringr)
library(dplyr)

#Load your data
load('PreE_Omics_Data_Final.rda')

In [10]:
#These are a bit different for each omic
#Give the columns sensible names and get the sample IDs
rn = as.data.frame(t(cfrna_dataFINAL[,3:ncol(cfrna_dataFINAL)]))
colnames(rn) = c(paste0('rna', 1:ncol(rn)))
rn$IDs = rownames(rn)

lipid = t(lipidome_dataFINAL)
colnames(lipid) = lipid[1,]
lipid = as.data.frame(lipid[-1,])
colnames(lipid) = c(paste0('lipid', 1:ncol(lipid)))
lipid$IDs = rownames(lipid)

metabolome_plasma_dataFINAL = metabolome_plasma_dataFINAL[ , 
                              grepl( "X" , names(metabolome_plasma_dataFINAL) ) ]
plasma = as.data.frame(t(metabolome_plasma_dataFINAL))
colnames(plasma) = c(paste0('plasma', 1:ncol(plasma)))
plasma$IDs = rownames(plasma)

metabolome_urine_dataFINAL = metabolome_urine_dataFINAL[ , 
                              grepl( "X" , names(metabolome_urine_dataFINAL) ) ]
urine = as.data.frame(t(metabolome_urine_dataFINAL))
colnames(urine) = c(paste0('urine', 1:ncol(urine)))
urine$IDs = rownames(urine)

somalog = as.data.frame(somalogic_dataFINAL)
somalog = somalog[,-c(2:5)]
colnames(somalog) = c(paste0('somalog', 1:ncol(somalog)))
colnames(somalog)[1] = "IDs"
somalog$IDs = paste0("X", somalog$IDs) 

microb = microbiome_dataFINAL
microb$sample_id = paste0("X", str_extract(microb$sample_id, "[^.]+"))
microb = microb[,-c(2:32)]
colnames(microb) = c(paste0('microb', 1:ncol(microb)))
colnames(microb)[1] = "IDs"

In [11]:
#Get your datasets, their old names and the new names you will assign to them
omics <- list(rn, lipid, plasma, urine, somalog, microb)
omic_names <- c('rn', 'lipid', 'plasma', 'urine', 'somalog', 'microb')
new_omic_names <- c('rna', 'lipidome', 'metabolome_plasma', 'metabolome_urine', 
                    'somalogic', 'microbiome')

#Extract patients, ga, and add a column with patient number and ga
for (i in 1:length(omics)){
  omics[[i]]$Patients <- substr(omics[[i]]$IDs, 1, 6)
  omics[[i]]$ga <- as.numeric(substr(omics[[i]]$IDs, 9, 10))
  omics[[i]]$Patient_ga <- paste0(substr(omics[[i]]$IDs, 0, 6), omics[[i]]$ga)
}

In [12]:
#Give some leeway, +-1 week, for matching samples taken at different times
for (i in 1:length(omics)){
  
  omic_set_m1 <- omics[[i]]
  omic_set_m1$Patient_ga <- paste0(omics[[i]]$Patients, omics[[i]]$ga-1)
  
  omic_set_p1 <- omics[[i]]
  omic_set_p1$Patient_ga <- paste0(omics[[i]]$Patients, omics[[i]]$ga+1)
  
  all_omic_sets = do.call("rbind", list(omics[[i]], omic_set_m1, omic_set_p1))
  
  #Assign the new names 
  assign(new_omic_names[[i]], all_omic_sets)
}

#Merge all datasets 
all = Reduce(function(x, y) merge(x, y, by = "Patient_ga",all = FALSE), 
             list(rna, lipidome, metabolome_plasma, metabolome_urine, 
                  somalogic, microbiome))

“column names ‘IDs.x’, ‘Patients.x’, ‘ga.x’, ‘IDs.y’, ‘Patients.y’, ‘ga.y’ are duplicated in the result”
“column names ‘IDs.x’, ‘Patients.x’, ‘ga.x’, ‘IDs.y’, ‘Patients.y’, ‘ga.y’ are duplicated in the result”
“column names ‘IDs.x’, ‘Patients.x’, ‘ga.x’, ‘IDs.y’, ‘Patients.y’, ‘ga.y’, ‘IDs.x’, ‘Patients.x’, ‘ga.x’, ‘IDs.y’, ‘Patients.y’, ‘ga.y’ are duplicated in the result”


In [13]:
microbiome_dataFINAL$Patient <- paste0('X', microbiome_dataFINAL$individual)
all$Patient <- all$Patients.x

#Get rid of the columns you messed up earlier since they don't mean much anymore
columns.to.remove <- paste(c('IDs', 'ga'), collapse = '|')
all_data <- all[!grepl(columns.to.remove, colnames(all))]

#And now get rid of all the duplicated rows you created 
all_data <- unique(all_data)

save(all_data, file = "PreE.RData")