# Computing Lifestyle Factor - MS correlations
### Author: Mihail Mihov

In [None]:
#load the necessary libraries for the project
library(data.table)
suppressMessages(library(tidyverse))
library(broom)

# Widen the notebook table display so that every column of the phenotype
# table is printed.
options(repr.matrix.max.cols = 1000)

# Load the UK Biobank phenotype table as a plain data.frame.
pheno <- fread(
  file = "/home//mmihov/Projects/Maternal_Smoking/MS_TWAS_PheWAS_heritability/MS_Cox_PheWAS_Cor/data/V2/UoA_UKB_phenotypes_2021-19-11.txt",
  data.table = FALSE
)

# Coerce the analysis columns to numeric. Entries that cannot be parsed as a
# number become NA (R emits a coercion warning when that happens).
numeric_cols <- c(
  "maternal_smoking", "sex", "x_agebase", "education", "alcohol",
  "lack_of_PA", "deprv_index", "packyears", "smoking_status"
)
pheno[numeric_cols] <- lapply(pheno[numeric_cols], as.numeric)

# Keep only records with no sex-chromosome aneuploidy flag, no genetic
# exclusion flag, a non-missing gen_ethnicity value and a non-missing maternal
# smoking record. maternal_smoking is numeric at this point, so a non-numeric
# code such as "missing" has already been turned into NA by the coercion above.
keep <- is.na(pheno$sex_aneu) &
  is.na(pheno$gen_exclude) &
  !is.na(pheno$gen_ethnicity) &
  !is.na(pheno$maternal_smoking)
pheno <- pheno[keep, ]

# Recode NA personal smoking status as "missing" so that the logical
# subscripts built from smoking_status below never contain NA.
# NOTE: when at least one value is replaced, the column becomes character.
pheno[is.na(pheno$smoking_status), "smoking_status"] <- "missing"

# Impute missing deprivation index and pack-years with the cohort median.
pheno[is.na(pheno$deprv_index), "deprv_index"] <-
  median(pheno$deprv_index, na.rm = TRUE)
pheno[is.na(pheno$packyears), "packyears"] <-
  median(pheno$packyears, na.rm = TRUE)

# People with smoking status 0 get zero pack-years. This runs after the
# imputation so that an imputed median is overwritten for them.
pheno[pheno$smoking_status == 0, "packyears"] <- 0

# Standardise the continuous variables to mean 0 and SD 1. scale() returns a
# one-column matrix; as.numeric() stores a plain vector in the data frame.
pheno$x_BMI <- as.numeric(scale(pheno$x_BMI))
pheno$deprv_index <- as.numeric(scale(pheno$deprv_index))
pheno$packyears <- as.numeric(scale(pheno$packyears))

head(pheno)

### Fit regression models between the chosen variables and maternal smoking while accounting for confounders

In [None]:
# Association between sex and maternal smoking (logistic regression), adjusted
# for the remaining lifestyle factors.
fit.a <- glm(
  sex ~ maternal_smoking + x_agebase + x_BMI + packyears + education +
    alcohol + lack_of_PA + deprv_index,
  family = "binomial",
  data = pheno
)

# tidy() returns the model's coefficient table. Row 2 is the first predictor
# in the formula (maternal_smoking) and columns 1-2 hold its term name and
# estimate; this single row starts the table of betas used for plotting.
sex_coefficients <- as.data.frame(tidy(fit.a))
df_beta <- sex_coefficients[2, 1:2]

# Inspect the table so far.
head(df_beta)

In [None]:
# Repeat the maternal-smoking association for the remaining lifestyle factors:
# linear regression for continuous outcomes, logistic regression for binary
# outcomes. Each model adjusts for all the other factors.

# Fit one model and return a one-row data frame (term, estimate) holding the
# maternal_smoking coefficient.
#   model_formula: outcome ~ maternal_smoking + covariates
#   data:          data frame the model is fitted on
#   family:        NULL for lm(); a glm family (e.g. "binomial") for glm()
ms_beta <- function(model_formula, data, family = NULL) {
  fit <- if (is.null(family)) {
    lm(model_formula, data = data)
  } else {
    glm(model_formula, data = data, family = family)
  }
  coefs <- as.data.frame(tidy(fit))
  # Select the coefficient by name rather than by row position.
  ms_row <- coefs[coefs$term == "maternal_smoking", c("term", "estimate")]
  stopifnot(nrow(ms_row) == 1)
  ms_row
}

# One entry per outcome. The order here is the row order of df_beta after the
# sex row that is already present.
model_specs <- list(
  # age
  list(
    formula = x_agebase ~ maternal_smoking + sex + x_BMI + packyears +
      education + alcohol + lack_of_PA + deprv_index,
    family = NULL
  ),
  # bmi
  list(
    formula = x_BMI ~ maternal_smoking + x_agebase + sex + packyears +
      education + alcohol + lack_of_PA + deprv_index,
    family = NULL
  ),
  # packyears
  list(
    formula = packyears ~ maternal_smoking + x_agebase + x_BMI + sex +
      education + alcohol + lack_of_PA + deprv_index,
    family = NULL
  ),
  # education
  list(
    formula = education ~ maternal_smoking + x_agebase + x_BMI + packyears +
      sex + alcohol + lack_of_PA + deprv_index,
    family = "binomial"
  ),
  # alcohol
  list(
    formula = alcohol ~ maternal_smoking + x_agebase + x_BMI + packyears +
      education + sex + lack_of_PA + deprv_index,
    family = "binomial"
  ),
  # PA
  list(
    formula = lack_of_PA ~ maternal_smoking + x_agebase + x_BMI + packyears +
      education + alcohol + sex + deprv_index,
    family = "binomial"
  ),
  # deprv inx
  list(
    formula = deprv_index ~ maternal_smoking + x_agebase + x_BMI + packyears +
      education + alcohol + lack_of_PA + sex,
    family = NULL
  )
)

beta_rows <- lapply(
  model_specs,
  function(spec) ms_beta(spec$formula, data = pheno, family = spec$family)
)

# Append all new rows in a single rbind instead of growing the table one
# model at a time.
df_beta <- do.call(rbind, c(list(df_beta), beta_rows))

# `x` previously held the most recent coefficient row; it stays defined in
# case later code still refers to it.
x <- beta_rows[[length(beta_rows)]]

head(df_beta)

In [None]:
# Name the estimate column of the results table after the analysis it holds;
# this first set of estimates is the overall ("All") analysis. The rename has
# to target df_beta (the accumulated table), not the temporary row `x`.
colnames(df_beta)[2] <- "All"

In [None]:
# Every row currently carries the same term, so overwrite the first column
# with the outcome each row belongs to. The labels must follow the order in
# which the models were fitted.
outcome_labels <- c(
  "sex", "age", "bmi", "packyears",
  "education", "alcohol", "pa", "di"
)
df_beta[, 1] <- outcome_labels

In [None]:
# Create female, male, smoker and non-smoker subgroups of the data.
# %in% never returns NA, so rows with a missing value are dropped rather than
# turned into all-NA rows.
pheno.f <- pheno[pheno$sex %in% 0, ]
pheno.m <- pheno[pheno$sex %in% 1, ]

# smoking_status can be a character column here (NA values are recoded to
# "missing" during cleaning). A `> 0` test would then be a string comparison,
# and "missing" sorts after "0", so it would be counted as a smoker. Matching
# the smoker codes explicitly keeps those records out of both subgroups.
# NOTE(review): assumes codes 1 and 2 are the only smoker categories - confirm
# against the phenotype coding.
pheno.smokers <- pheno[pheno$smoking_status %in% c(1, 2), ]
pheno.nonsmokers <- pheno[pheno$smoking_status %in% 0, ]

#Repeat the analysis for each subgroup following the above steps

#Combine all data frames into a final dataframe by term column using the following general code
# NOTE(review): Beta_frame1 and Beta_frame2 are placeholders for the
# per-subgroup result tables; they are not created anywhere in the code shown,
# so this line is a template rather than runnable code.
Beta_frame=left_join(Beta_frame1,Beta_frame2,by="term")

#Repeat all steps to generate a combined dataframe of P-values to be used for plotting as well

### Plotting

In [None]:
# Display labels for the eight outcomes, in the same order as the rows of the
# result tables (sex, age, bmi, packyears, education, alcohol, pa, di).
plot_row_labels <- c(
  "Male sex", "Age at baseline", "BMI at baseline", "Packyears",
  "University degree vs. other", "High alcohol consumption",
  "Lack of physical activity", "Deprivation Index"
)

# Turn a results table (a `term` column plus one numeric column per analysis)
# into a purely numeric data frame whose row names are the display labels.
# This works on the columns directly instead of transposing through a
# character matrix, which would format the numbers to about 7 significant
# digits and collapse a single-analysis table to a vector.
prepare_for_plot <- function(results, row_labels) {
  stopifnot(nrow(results) == length(row_labels))
  values <- results[, colnames(results) != "term", drop = FALSE]
  values[] <- lapply(values, as.numeric)
  rownames(values) <- row_labels
  values
}

# df_p is the table of p-values built with the same steps as df_beta.
df_beta <- prepare_for_plot(df_beta, plot_row_labels)
df_p <- prepare_for_plot(df_p, plot_row_labels)

In [None]:
# Plot the estimates with the 'corrplot' package.
library(corrplot)
# RColorBrewer supplies the diverging colour palette.
library(RColorBrewer)

# Notebook plot size/resolution and a white background.
options(repr.plot.width = 10, repr.plot.height = 6, repr.plot.res = 240)
par(bg = "white")

# Analyses are drawn as rows and lifestyle factors as columns (hence t()).
# is.corr = FALSE because the values are regression estimates, not
# correlations. Cells whose p-value is below sig.level are labelled with "*".
corrplot(
  t(df_beta),
  is.corr = FALSE,
  method = "circle",
  outline = TRUE,
  addgrid.col = "grey",
  col = rev(brewer.pal(10, "RdBu")),
  col.lim = c(-1.5, 1.5),
  na.label = "square",
  na.label.col = rgb(200, 200, 200, max = 255, alpha = 255),
  tl.cex = 1.5, tl.col = "black", tl.srt = 45,
  p.mat = t(df_p),
  sig.level = 0.05,
  insig = "label_sig",
  pch = "*", pch.cex = 2.2, pch.col = "black",
  cl.cex = 1.5, cl.align.text = "l", cl.ratio = 0.2,
  mar = c(1, 1, 0, 3)
)

# Export the figure to PDF, then close the current graphics device.
dev.copy2pdf(width=20, height=10, file = "~/Your_Directory/Corrplot_Lifestyle_Factors.pdf", out.type="pdf")
dev.off()