# Adjust gene expression (TPM) for significant cofactors using MLR residuals.
- 'Age' and 'Ischemic minutes' were found to be correlated with the expression of many genes across tissues.
- Using the processed and annotated gene expression dataset (in TPM), we fit a multiple linear regression (MLR) for each gene/tissue combination and use its residuals as the adjusted gene expression.

### Load R packages/dependencies.

In [None]:
# Load R libraries.
library(tidyr)
library(dplyr)
library(patchwork)
library(ggplot2)
library(reshape2)
library(ggpubr)
library(rstatix)
#library(gginnards)

In [9]:
# Tissues and mtDNA genes that the per-tissue/per-gene regressions iterate over.
list_tissues <- c(
    "Muscle - Skeletal",
    "Esophagus - Muscularis",
    "Artery - Tibial",
    "Nerve - Tibial",
    "Whole Blood",
    "Heart - Left Ventricle",
    "Heart - Atrial Appendage"
)

# Alternative gene list that still includes ND5 and ND6 (disabled):
#list_mtdna = c('ND1','ND2','CO1','CO2','ATP8','ATP6','CO3','ND3','ND4L','ND4','CYB','ND5','ND6')

# Active gene list: ND5 and ND6 are excluded.
list_mtdna <- c(
    "ATP6", "ATP8", "CO1", "CO2", "CO3", "CYB",
    "ND1", "ND2", "ND3", "ND4", "ND4L"
)

# Alternative gene list with genes joined into bicistronic transcripts (disabled):
###list_mtdna = c('ATP8/ATP6','CO1','CO2','CO3','CYB','ND1','ND2','ND3','ND4L/ND4')

## Import the annotated gene expression (TPM) file.

In [None]:
# GTEx dataset with genotype data: tab-separated file whose first line holds
# the column names.
df_tpm <- read.table(
    file = "gtex_v8_tpm_annotated.mt",
    sep = "\t",
    header = TRUE
)

## Import available phenotypes (requires dbGaP access).
- Includes: 'Cohort', 'Sex', 'Age', 'Ischemic minutes', and 'BMI'

In [3]:
# Read the tab-separated phenotype table. The first line of the file is
# skipped (skip = 1) and the eight columns are named explicitly instead.
df_phen <- read.table(
    file = "gtex_phenotypes.txt",
    sep = "\t",
    skip = 1,
    col.names = c(
        "short_ID", "Race", "Cohort", "Sex",
        "Age", "Ethnicity", "Ischemic_minutes", "BMI"
    )
)

## Annotate the gene expression dataframe with phenotypes.

In [5]:
# Join the expression table with the phenotype table on 'short_ID'
# (merge() keeps only IDs present in both tables by default).
df_tpm_phen <- merge(x = df_tpm, y = df_phen, by = "short_ID")

# Restrict the merged table to the listed columns, in this order.
df_tpm_phen <- select(
    df_tpm_phen,
    "short_ID", "GTEX_ID", "mtDNA_haplo", "mt_ancestry", "mitonucl_discord",
    "self_rep_race", "Sex", "Cohort", "Age", "Ischemic_minutes",
    "Tissue", "Gene", "TPM"
)

## `Test`: Create MLR with Expression vs. Age + Ischemic time for a single gene and tissue

In [None]:
# Test: fit the MLR for one gene/tissue pair (default: ND1 in skeletal muscle)
# and return the filtered rows with the residuals attached.
#
# Args:
#   df:     data frame with at least the columns 'short_ID', 'Tissue', 'Gene',
#           'Age', 'Ischemic_minutes' and 'TPM'. Defaults to the notebook-level
#           `df_tpm_phen`, so calling test_get_residuals() still works.
#   tissue: value of the 'Tissue' column to keep.
#   gene:   value of the 'Gene' column to keep.
#
# Returns:
#   The six columns above for the matching rows, plus 'Residuals_AgeIsch':
#   the residual of `TPM ~ Age + Ischemic_minutes`, or NA for rows where TPM,
#   Age or Ischemic_minutes is missing.
test_get_residuals = function(df = df_tpm_phen,
                              tissue = 'Muscle - Skeletal',
                              gene = 'ND1'){
    # Base-R subsetting keeps `tissue`/`gene` unambiguous (inside dplyr::filter
    # a data column of the same name would shadow the argument). which() drops
    # rows whose Tissue or Gene is NA.
    keep_cols = c('short_ID','Tissue','Gene','Age','Ischemic_minutes','TPM')
    keep_rows = which(df$Tissue == tissue & df$Gene == gene)
    data_test = df[keep_rows, keep_cols, drop = FALSE]
    rownames(data_test) = NULL

    # lm() silently drops rows with a missing value, so residuals(fit) can be
    # shorter than the data and cannot be assigned as a column directly.
    # Fit on the complete rows only and write the residuals back by position.
    usable = complete.cases(data_test[, c('TPM','Age','Ischemic_minutes'), drop = FALSE])
    adjusted = rep(NA_real_, nrow(data_test))
    if (any(usable)) {
        # Multiple Linear Regression.
        fit <- lm( TPM ~ Age + Ischemic_minutes , data=data_test[usable, , drop = FALSE] )
        adjusted[usable] = residuals(fit)
    }

    # Add residuals to the df.
    data_test$Residuals_AgeIsch = adjusted
    data_test
}

# Run the single gene/tissue check; as a bare top-level call, the data frame it
# returns is shown as the cell output.
test_get_residuals()

## `Function`: MLR for each tissue and gene, to use residuals as adjusted TPM.
Gets residuals from: `TPM ~ Age + Ischemic_minutes`

Age and Ischemic_minutes were shown to be correlated with many genes in many tissues (though not in all).

`e.g. An MLR is fitted to the subset of rows for the ND1 gene in skeletal muscle.`

In [None]:
# Fit one MLR (`TPM ~ Age + Ischemic_minutes`) per tissue/gene pair and attach
# the residuals as adjusted expression.
#
# Args:
#   df:      data frame with at least the columns 'Tissue', 'Gene', 'TPM',
#            'Age' and 'Ischemic_minutes'.
#   tissues: tissues to iterate over (default: notebook-level `list_tissues`).
#   genes:   genes to iterate over (default: notebook-level `list_mtdna`).
#
# Returns:
#   The rows of `df` that belong to the requested tissue/gene pairs (all
#   original columns, grouped by tissue then gene) plus a numeric column
#   'Residuals_AgeIsch'. Rows with a missing TPM, Age or Ischemic_minutes are
#   kept with an NA residual. Pairs with no rows are skipped.
#
# NOTE(review): the earlier version was noted to succeed only for skeletal
# muscle; missing values or absent tissue/gene pairs are the likely cause
# (both made the loop stop with an error) -- confirm against the data.
get_residuals_all = function(df, tissues = list_tissues, genes = list_mtdna){
    model_cols = c('TPM','Age','Ischemic_minutes')
    absent = setdiff(c('Tissue','Gene',model_cols), names(df))
    if (length(absent) > 0) {
        stop("get_residuals_all: missing column(s): ",
             paste(absent, collapse = ", "), call. = FALSE)
    }

    # One slot per tissue/gene pair; collecting pieces and binding once avoids
    # re-copying the growing result on every iteration.
    pieces = vector("list", length(tissues) * length(genes))
    slot = 0
    # Iterate over tissue and gene to get residuals.
    for (tissue in tissues) {
        for (gene in genes) {
            slot = slot + 1
            # which() drops rows whose Tissue or Gene is NA.
            rows = which(df$Tissue == tissue & df$Gene == gene)
            if (length(rows) == 0) next
            data = df[rows, , drop = FALSE]

            # lm() drops rows with missing values, so residuals(fit) may be
            # shorter than `data`. Fit on complete rows and fill by position.
            usable = complete.cases(data[, model_cols, drop = FALSE])
            adjusted = rep(NA_real_, nrow(data))
            if (any(usable)) {
                fit <- lm( TPM ~ Age + Ischemic_minutes , data=data[usable, , drop = FALSE] )
                adjusted[usable] = residuals(fit)
            }
            data$Residuals_AgeIsch = adjusted
            pieces[[slot]] = data
        }
    }

    # Drop the slots of skipped pairs.
    pieces = Filter(Negate(is.null), pieces)
    if (length(pieces) == 0) {
        # Nothing matched: empty dataframe keeping column headers.
        out_df = df[FALSE, , drop = FALSE]
        out_df$Residuals_AgeIsch = numeric()
        return(out_df)
    }

    # Return results for all tissues and genes.
    out_df = do.call(rbind, pieces)
    rownames(out_df) = NULL
    out_df
}


# Adjusted-expression table: result of get_residuals_all() on the
# phenotype-annotated expression data.
df_adjTPM <- get_residuals_all(df = df_tpm_phen)

## Export the df annotated with residuals (adjusted TPM).

In [16]:
# Create a tab-separated table without quotes(""), and without index ('row.names').
# Export the already-computed `df_adjTPM` rather than calling
# get_residuals_all() again, which would refit every model a second time.
write.table( df_adjTPM,
            "residuals_adjTPM.txt",
            sep='\t', quote=FALSE, row.names = FALSE )