# RNASeq phenotypes sample matching using MatchIt
#### Author: Mihail Mihov

In [1]:
#Load necessary libraries
library(data.table)
library(broom)
options(repr.matrix.max.cols = 2000)

# Read the count table: the first column holds the gene identifiers, every
# remaining column holds the values of one sample
counts <- fread("~/Your_directory/data.txt", data.table = FALSE)

# Make the column names syntactically valid R names
colnames(counts) <- make.names(colnames(counts))

# Keep the gene identifiers (first column) both in `genes` and as row names,
# then drop that column so only the sample columns remain.
# drop = FALSE keeps a table even if only one sample column is left
genes <- counts[, 1]
rownames(counts) <- genes
counts <- counts[, -1, drop = FALSE]

# Transform expression values to integer, as this is required by DESeq2.
# The whole matrix is coerced in one step instead of calling as.integer()
# once per cell; row and column names are preserved
counts <- as.matrix(counts)
storage.mode(counts) <- "integer"

# Read the phenotype table (one row per sample, identified by `sample`)
phenotypes <- fread("~/your_directory/Phenotype.txt", data.table = FALSE)

# Fix the misspelled "contrtol" label in the Tobacco phenotype; the two
# table() calls show the label counts before and after the correction.
# %in% is used instead of == so that a missing Tobacco value gives FALSE
# rather than NA (NA subscripts are rejected in data-frame assignments)
table(phenotypes$Tobacco)
phenotypes[phenotypes$Tobacco %in% "contrtol", "Tobacco"] <- "control"
table(phenotypes$Tobacco)

# Use the median of the observed BMI values as imputation for missing BMI
phenotypes[is.na(phenotypes$BMI), "BMI"] <- median(phenotypes$BMI, na.rm = TRUE)

# Scale BMI and AGE to have a mean of 0 and a standard deviation of 1.
# scale() returns a one-column matrix, so it is converted back to a plain
# numeric vector before being stored in the data frame.
# NOTE(review): scaling and imputation use all rows of the phenotype file,
# i.e. they happen before the table is reduced to the samples with counts
phenotypes$BMI <- as.numeric(scale(phenotypes$BMI))
phenotypes$AGE <- as.numeric(scale(phenotypes$AGE))

# Align phenotypes and counts. Count samples without a phenotype record would
# otherwise produce all-NA phenotype rows, so they are dropped (with a
# warning) from the count data.
# NOTE(review): count column names went through make.names() while
# phenotypes$sample did not - confirm that the identifiers still agree
has_phenotype <- colnames(counts) %in% phenotypes$sample
if (!all(has_phenotype)) {
  warning(
    sum(!has_phenotype),
    " count sample(s) have no phenotype record and were dropped",
    call. = FALSE
  )
  counts <- counts[, has_phenotype, drop = FALSE]
}

# Reduce phenotypes to the samples present in the count data and put them in
# the same order as the count columns
phenotypes <- phenotypes[match(colnames(counts), phenotypes$sample), ]
head(phenotypes)

In [None]:
# Use the sample identifiers as row names, then remove the first column
# (expected to be the now-redundant `sample` column) and preview the result
row.names(phenotypes) <- phenotypes$sample
phenotypes <- phenotypes[, -1]
head(phenotypes)

In [None]:
# Recode the categorical phenotypes from text labels to numeric codes.

# Replace every value of `x` found in names(codes) by the corresponding code.
# Values that are not listed - including NA - are returned unchanged, so a
# missing phenotype no longer aborts the recoding (an NA in a logical row
# index is rejected in data-frame assignments).
#   x:     vector of labels
#   codes: named character vector, names = old labels, values = new codes
# Returns a character vector of the same length as `x`.
recode_labels <- function(x, codes) {
  x <- as.character(x)
  hit <- match(x, names(codes))
  known <- !is.na(hit)
  x[known] <- unname(codes[hit[known]])
  x
}

# Binary phenotypes are coded 0/1, the age groups 1/2/3
phenotypes$Tobacco <- recode_labels(
  phenotypes$Tobacco, c(control = "0", smokes = "1")
)
phenotypes$SEX <- recode_labels(
  phenotypes$SEX, c(male = "0", female = "1")
)
phenotypes$AGE_groups <- recode_labels(
  phenotypes$AGE_groups, c(early = "1", mid = "2", late = "3")
)
phenotypes$BMI_groups <- recode_labels(
  phenotypes$BMI_groups, c(no = "0", yes = "1")
)

# Transform the fields to numeric values. Labels that were not recoded above
# become NA here (with a coercion warning). BMI_groups is left as text codes,
# as in the original workflow
numeric_cols <- c("Tobacco", "SEX", "AGE_groups", "BMI", "AGE")
phenotypes[numeric_cols] <- lapply(phenotypes[numeric_cols], as.numeric)

In [None]:
# Convert the grouping variables used in the matching step into factors
factor_cols <- c("AGE_groups", "SEX", "Tobacco", "BMI_groups")
phenotypes[factor_cols] <- lapply(phenotypes[factor_cols], as.factor)

In [None]:
#load the MatchIt package
library("MatchIt")

# Step 1: match the maternal smoking groups (Tobacco) on embryonal sex and age
# group. Nearest-neighbour matching on a glm-estimated distance, with exact
# matching on both covariates and one control per treated sample.
# (`ratio` was previously misspelled `ration`, so the argument was never
# passed to matchit() under its real name.)
m.out0 <- matchit(
  Tobacco ~ SEX + AGE_groups,
  data = phenotypes,
  method = "nearest",
  exact = c("SEX", "AGE_groups"),
  distance = "glm",
  ratio = 1
)

# Check the summary statistics of the first matching
summary(m.out0)

# Store the matched data for use in the next matching step
p <- match.data(m.out0)

# Step 2: within the samples kept by step 1, match the embryonal sex groups
# (SEX) on maternal smoking and age group, with the same settings.
# Only columns 1-7 of `p` are passed on.
# NOTE(review): this presumably drops the columns match.data() appended in
# step 1 and assumes the phenotype table has exactly 7 columns - confirm
m.out1 <- matchit(
  SEX ~ Tobacco + AGE_groups,
  data = p[, 1:7],
  method = "nearest",
  exact = c("Tobacco", "AGE_groups"),
  distance = "glm",
  ratio = 1
)

# Check the summary statistics of the second matching
summary(m.out1)

# Store the final matched data
p1 <- match.data(m.out1)

# Check the quality of the matching by cross-tabulating the columns that
# remain after removing columns 1-3 and 7-10.
# NOTE(review): assumes the remaining columns are the matching variables
# - confirm against the column order of the phenotype file
table(p1[, -c(1:3, 7:10)])
head(p1)

In [None]:
# Save the matched phenotype table, including its row names, as a
# tab-separated file to be used in later analyses.
# NOTE(review): this path starts at the filesystem root, unlike the "~/..."
# input paths used when reading the data - confirm the intended location
write.table(
  x = p1,
  file = "/Your_directory/pheno_matched_MatchIt.txt",
  sep = "\t",
  row.names = TRUE
)