In [1]:
# Read the GATA4-TBX5 interactome gene list; the file has no header, so the
# gene symbols land in the auto-named first column (V1).
interactome <- read.table(
  "../intermediate/interactome_lists/GATA4-TBX5_genes.txt",
  stringsAsFactors = FALSE
)
interactome_genes <- interactome[["V1"]]

## Load and format variants

In [2]:
# De novo variants (DNVs) for cases and controls
dnv_cases <- read.csv("../data/variants/DNV_cases.csv", stringsAsFactors = FALSE)
dnv_ctrls <- read.csv("../data/variants/DNV_ctrls.csv", stringsAsFactors = FALSE)

# Split synonymous ("syn") from all other variant classes. which() is kept on
# purpose: rows whose Variant.Class is NA fall into neither table.
dnv_case_syn <- dnv_cases[which(dnv_cases$Variant.Class == "syn"), ]
dnv_ctrl_syn <- dnv_ctrls[which(dnv_ctrls$Variant.Class == "syn"), ]
dnv_cases <- dnv_cases[which(dnv_cases$Variant.Class != "syn"), ]
dnv_ctrls <- dnv_ctrls[which(dnv_ctrls$Variant.Class != "syn"), ]

# Inherited loss-of-function (LoF) variants for cases and controls
inh_lof_cases <- read.csv("../data/variants/LoF_cases.csv", stringsAsFactors = FALSE)
inh_lof_ctrls <- read.csv("../data/variants/LoF_ctrls.csv", stringsAsFactors = FALSE)

In [3]:
# Columns kept for de novo variants (also used for the spiked-in references)
dnv_cols <- c("Blinded.ID", "Cardiac.Category", "EM", "NDD", "CHR", "POS",
              "REF", "ALT", "Gene", "Variant.Class", "AA.change", "HHE.Rank")
# Columns kept for inherited LoF variants once the class column is renamed
lof_cols <- c("Blinded.ID", "CHR", "POS", "REF", "ALT", "Gene",
              "Variant.Class", "AA.change")

# Format de novo variants
m_dnv <- dnv_cases[, dnv_cols]

# Format inherited LoF variants: ExonicFunc.refGene is renamed to
# Variant.Class so both tables share the same column name
m_lof <- inh_lof_cases[, c("Blinded.ID", "CHR", "POS", "REF", "ALT", "Gene",
                           "ExonicFunc.refGene", "AA.change")]
m_lof$POS <- as.numeric(as.character(m_lof$POS))
names(m_lof)[names(m_lof) == "ExonicFunc.refGene"] <- "Variant.Class"

# Spike in reference variants: append them to the de novo table ...
spiked <- read.csv("../data/variants/reference_variants.csv",
                   stringsAsFactors = FALSE)
names(spiked)[names(spiked) == "ExonicFunc.refGene"] <- "Variant.Class"
d_add <- spiked[, dnv_cols]
m_dnv <- rbind(m_dnv, d_add)

# ... and to the inherited LoF table
spiked$ProbandGT <- "0/1"
spiked$Func <- "misD"
l_add <- spiked[, lof_cols]
m_lof <- rbind(m_lof, l_add)


# Create a combined lof + dnv table, tagging each row with its origin
m_lof$inh_type <- "lof"
m_dnv$inh_type <- "dnv"
cols <- c("Blinded.ID", "CHR", "POS", "REF", "ALT", "Gene", "Variant.Class",
          "AA.change", "inh_type")
comb_muts <- rbind(m_lof[, cols], m_dnv[, cols])
# Every row with Blinded.ID "OLIGO" is labelled as "lof"
comb_muts[comb_muts$Blinded.ID %in% "OLIGO", "inh_type"] <- "lof"

# We focus mainly on de novo variants within interactome genes; rows with the
# IDs GATA4 / OLIGO / NKX25 are always kept
mut_table <- m_dnv[m_dnv$Gene %in% interactome_genes |
                     m_dnv$Blinded.ID %in% c("GATA4", "OLIGO", "NKX25"), ]

## Tally DNV+rare loss-of-function mutations in each gene

In [4]:
# Count how many combined (DNV + inherited LoF) mutations fall in each gene.
# The resulting columns are Var1 (gene) and Freq (count).
freq_table <- as.data.frame(table(comb_muts$Gene), stringsAsFactors = FALSE)

## Mutations per kilobase

In [5]:
# Gene coordinates; gene length is taken as end minus start
exon_coords <- read.table("../data/databases/gene_start_stop.txt", sep = "\t",
                          header = TRUE, stringsAsFactors = FALSE)
exon_coords$CDS.Length <- NULL
exon_coords$geneLength <- with(exon_coords, Gene.end..bp. - Gene.start..bp.)
exon_coords <- unique(exon_coords)

# Attach the length of each tallied gene (first matching row; NA if absent)
gene_row <- match(freq_table$Var1, exon_coords$Gene.name)
freq_table$geneLength <- exon_coords$geneLength[gene_row]
# Mutations per kilobase of gene length
freq_table$mutperkb <- 1000 * (freq_table$Freq / freq_table$geneLength)

# Carry the per-gene rate over to the variant table
mut_table$mutperkb <- freq_table$mutperkb[match(mut_table$Gene, freq_table$Var1)]

## Tissue-specific expression score

In [6]:
# GTEx median TPM per gene and tissue; the first two lines of the .gct file
# are skipped before the header row
gtex <- read.table(
  "../data/databases/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
  sep = "\t", stringsAsFactors = FALSE, skip = 2, header = TRUE
)
# alias issue: add these symbols so the interactome genes can be matched
interactome_genes <- append(interactome_genes, c("RFWD2", "TMEM55B", "MSH6"))
# Genes of interest: the interactome plus every gene in the variant table
focal_genes <- unique(c(interactome_genes, mut_table$Gene))

In [7]:
# GTEx column name -> display label, in the order the columns are kept
tissue_labels <- c(
  "Heart...Atrial.Appendage" = "Adult Atrium",
  "Heart...Left.Ventricle"   = "Adult Left Ventricle",
  "Artery...Aorta"           = "Aortic Artery",
  "Artery...Coronary"        = "Coronary Artery",
  "Brain...Amygdala"         = "Amygdala",
  "Brain...Cortex"           = "Cortex",
  "Brain...Cerebellum"       = "Cerebellum",
  "Brain...Hypothalamus"     = "Hypothalamus",
  "Bladder"                  = "Bladder",
  "Breast...Mammary.Tissue"  = "Mammary Tissue",
  "Colon...Sigmoid"          = "Sigmoid Colon",
  "Esophagus...Muscularis"   = "Espohagus Muscularis",
  "Fallopian.Tube"           = "Fallopian Tube",
  "Adrenal.Gland"            = "Adrenal Gland",
  "Kidney...Cortex"          = "Kidney",
  "Liver"                    = "Liver",
  "Lung"                     = "Lung",
  "Pancreas"                 = "Pancreas",
  "Prostate"                 = "Prostate",
  "Spleen"                   = "Spleen",
  "Thyroid"                  = "Thyroid",
  "Whole.Blood"              = "Whole Blood"
)

# Keep only the focal genes and the selected tissues, then relabel
gtex <- gtex[gtex$Description %in% focal_genes,
             c("Description", names(tissue_labels))]
names(gtex) <- c("Gene", unname(tissue_labels))

# Index rows by gene symbol and drop the now-redundant Gene column
row.names(gtex) <- gtex$Gene
gtex <- gtex[, -1]

In [8]:
# Heart specificity = mean heart expression / summed expression in all other
# tissues.
#
# Tissue columns are selected by NAME. The previous positional form
# gtex[, c(-1, -2, -23)] was evaluated a second time after `other_avg` had been
# appended as a new last column, so the denominator silently included that
# derived column as well. Excluding the derived columns by name also makes the
# cell safe to re-run.
heart_tissues <- c("Adult Atrium", "Adult Left Ventricle")
derived_cols <- c("heart_avg", "other_avg", "heart_specificity")
other_tissues <- setdiff(names(gtex), c(heart_tissues, derived_cols))

gtex$heart_avg <- rowMeans(gtex[, heart_tissues])
gtex$other_avg <- rowMeans(gtex[, other_tissues])
gtex$heart_specificity <- gtex$heart_avg / rowSums(gtex[, other_tissues])

# Look up each variant's gene by row name; genes absent from gtex get NA
mut_table$specificity_score <- gtex$heart_specificity[match(mut_table$Gene, row.names(gtex))]

## CADD scores and connectivity
These values were previously calculated in variant_comparison.ipynb

In [9]:
# Per-variant characteristics computed earlier; align the chromosome column
# name with the other tables
variant_characteristics <- read.table(
  "../manuscript/tables/variant_characteristics.tsv",
  sep = "\t", stringsAsFactors = FALSE, header = TRUE
)
names(variant_characteristics)[names(variant_characteristics) == "CHROM"] <- "CHR"

# append data for reference variants: CADD scores joined onto the spiked-in
# variants (all spiked rows are kept, matched on the shared columns)
ref_cadd <- read.table("../data/cadd/cadd_reference_scores.tsv", sep = "\t",
                       stringsAsFactors = FALSE, header = FALSE)
names(ref_cadd) <- c("CHR", "POS", "REF", "ALT", "CADD.score", "PHRED.score")
ref_cadd <- merge(ref_cadd, spiked, all.y = TRUE)
names(ref_cadd)[names(ref_cadd) == "pLI.Score"] <- "pLI"

# Gene-level values reused from the variant characteristics table
gene_lookup <- unique(variant_characteristics[, c("Gene",
                                                  "corrected_dnv_node_degree",
                                                  "corrected_chd_node_degree",
                                                  "oe")])
ref_cadd <- merge(ref_cadd, gene_lookup, all.x = TRUE)

# Hard-coded gene-level values for the reference genes.
# See variant_comparison.ipynb for this calculation
manual_values <- data.frame(
  Gene                      = c("NKX2-5", "MYH7", "MKL2"),
  corrected_dnv_node_degree = c(0.3333333, 0.1052632, 0),
  corrected_chd_node_degree = c(1, 0, 0),
  oe                        = c(0.95896, 0.66794, 0.93905),
  stringsAsFactors = FALSE
)
for (k in seq_len(nrow(manual_values))) {
  rows <- which(ref_cadd$Gene == manual_values$Gene[k])
  for (field in c("corrected_dnv_node_degree", "corrected_chd_node_degree", "oe")) {
    ref_cadd[rows, field] <- manual_values[k, field]
  }
}

In [10]:
# Stack the reference-variant rows under the study variants, then attach the
# characteristics to each variant in mut_table (variants without a match keep
# NA in the added columns)
cols <- c("CHR", "POS", "REF", "ALT", "pLI", "oe", "corrected_dnv_node_degree",
          "corrected_chd_node_degree", "CADD.score")
variant_characteristics <- rbind(variant_characteristics[, cols],
                                 ref_cadd[, cols])
gene_data <- merge(mut_table, variant_characteristics[, cols],
                   by = c("CHR", "POS", "REF", "ALT"), all.x = TRUE)

## Known protein domain

In [11]:
# Score 1 when the variant hits a known protein domain OR is a LoF variant
# (we expect these proteins to not do their job); score 0.5 when the domain
# annotation says no/unknown domain and the variant is not LoF.

domains <- read.csv("../data/databases/domain_annotations.csv",
                    stringsAsFactors = FALSE)
variant_key <- c("CHR", "POS", "REF", "ALT")
domain_fields <- c("Known.interaction", "NEARBY.VARIANTS", "BAIT",
                   "Variant.protein.domain", "Protein.domains.affected")
gene_data <- merge(gene_data, domains[, c(variant_key, domain_fields)],
                   by = variant_key, all.x = TRUE)

# Annotation strings that mean "not in a known domain"
neg_str <- c("no specific domain", "non specific domain", "not known domain", "")
outside_domain <- gene_data$Protein.domains.affected %in% neg_str
is_lof <- gene_data$Variant.Class %in% c("non", "frameshift", "startloss",
                                         "splice", "stoploss")
# Variants with no domain annotation at all (NA) keep the score 1
gene_data$protein_domain_or_lof <- ifelse(outside_domain & !is_lof, 0.5, 1)
gd <- gene_data

## Proband-level data

In [12]:
# Score each variant by the OTHER mutations carried by the same proband
# (no_known_muts); the first matching rule below wins:
#   DNV in a known CHD gene, LoF or misD                    = 0.1 (avoids 0 values)
#   DNV in a known CHD gene, other missense                 = 0.25
#   DNV in an interactome (not known) gene, LoF or misD     = 0.5
#   DNV in an interactome (not known) gene + inherited LoF
#     in a known CHD gene                                   = 0.5
#   DNV in an interactome (not known) gene, other missense  = 0.75
#   Inherited LoF in a known CHD gene                       = 0.5
#   Inherited LoF in an interactome (not known) gene        = 0.75
#   No other mutation in a known or interactome gene        = 1
#
# The names of the other mutated genes are recorded as ";"-separated strings.

#### NOTE: This DOES NOT include whether the variant itself occurs in a known gene
known <- read.table("../data/databases/known_CHD_genes.txt", stringsAsFactors = FALSE)
known_genes <- known$V1

unknown_interactors <- interactome_genes[which(!interactome_genes %in% known_genes)]

# Variant classes counted as damaging (LoF or damaging missense)
damaging_classes <- c("misD", "frameshift", "stoploss", "startloss", "non", "splice")

gd$no_known_muts <- 1
gd$known_dnv_gene_list <- ""
gd$known_lof_gene_list <- ""
gd$interactome_dnv_gene_list <- ""
gd$interactome_lof_gene_list <- ""

for (i in seq_len(nrow(gd))) {

  # All mutations of this proband, minus the variant being scored
  muts <- comb_muts[which(comb_muts$Blinded.ID == gd$Blinded.ID[i]), ]
  muts <- muts[which(!(muts$CHR == gd$CHR[i] & muts$POS == gd$POS[i])), ]

  if (any(muts$Gene %in% known_genes) || any(muts$Gene %in% unknown_interactors)) {

    dnv_k <- muts[which(muts$inh_type == "dnv" & muts$Gene %in% known_genes), ]
    lof_k <- muts[which(muts$inh_type == "lof" & muts$Gene %in% known_genes), ]

    dnv_i <- muts[which(muts$inh_type == "dnv" & muts$Gene %in% unknown_interactors), ]
    lof_i <- muts[which(muts$inh_type == "lof" & muts$Gene %in% unknown_interactors), ]

    # The class lives in Variant.Class: comb_muts has no `Func` column, so the
    # earlier `muts$Func %in% ...` test was always FALSE and the damaging
    # branches could never fire. The class is checked on the de novo subset the
    # rule refers to.
    if (nrow(dnv_k) >= 1) {
      if (any(dnv_k$Variant.Class %in% damaging_classes)) {
        gd$no_known_muts[i] <- 0.1
      } else {
        gd$no_known_muts[i] <- 0.25
      }
    } else if (nrow(dnv_i) >= 1) {
      if (any(dnv_i$Variant.Class %in% damaging_classes)) {
        gd$no_known_muts[i] <- 0.5
      } else if (nrow(lof_k) >= 1) {
        gd$no_known_muts[i] <- 0.5
      } else {
        gd$no_known_muts[i] <- 0.75
      }
    } else if (nrow(lof_k) >= 1) {
      gd$no_known_muts[i] <- 0.5
    } else if (nrow(lof_i) >= 1) {
      gd$no_known_muts[i] <- 0.75
    } else {
      # A matching gene whose inh_type is neither "dnv" nor "lof"
      print("known gene weirdness")
    }

    gd$known_dnv_gene_list[i] <- paste(dnv_k$Gene, collapse = ";")
    gd$known_lof_gene_list[i] <- paste(lof_k$Gene, collapse = ";")
    gd$interactome_dnv_gene_list[i] <- paste(dnv_i$Gene, collapse = ";")
    gd$interactome_lof_gene_list[i] <- paste(lof_i$Gene, collapse = ";")
  }
}

In [13]:
# For each scored variant, list the interactome genes in which the same proband
# carries another de novo ("dnv") or inherited LoF ("lof") variant, as
# ";"-separated strings (empty string when there are none).
gd$interactome_genes_w_dnvs <- ""
gd$interactome_genes_w_lofs <- ""
# seq_len() rather than 1:nrow(gd), so an empty table yields zero iterations
for (i in seq_len(nrow(gd))) {
  muts <- comb_muts[which(comb_muts$Blinded.ID == gd$Blinded.ID[i]), ]
  muts <- muts[which(!(muts$CHR == gd$CHR[i] & muts$POS == gd$POS[i])), ] # don't include the variant itself
  in_interactome <- muts$Gene %in% interactome_genes
  if (any(in_interactome)) {
    dnv_ <- muts[which(muts$inh_type == "dnv" & in_interactome), ]
    lof_ <- muts[which(muts$inh_type == "lof" & in_interactome), ]

    gd$interactome_genes_w_dnvs[i] <- paste(dnv_$Gene, collapse = ";")
    gd$interactome_genes_w_lofs[i] <- paste(lof_$Gene, collapse = ";")
  }
}

In [14]:
# Flag whether the gene carrying the variant is in the known CHD gene list
gd$known_gene <- ifelse(gd$Gene %in% known_genes, "known", "unknown")

## Rank normalize and sum

In [15]:
# Median-impute missing data: replace every NA in one column with the median
# of that column's non-missing values.
#   dataframe: data frame to modify
#   col_name:  name of the column to impute
# Returns the data frame with the column filled in.
medify <- function(dataframe, col_name) {
  values <- dataframe[, col_name]
  is_missing <- is.na(values)
  fill <- median(as.numeric(values[!is_missing]))
  values[which(is_missing)] <- fill
  dataframe[, col_name] <- values
  dataframe
}

# Fill missing specificity scores first, then missing CADD scores
gene_data <- medify(medify(gd, "specificity_score"), "CADD.score")

In [16]:
# Add a "<column>_rank" column for each requested column.
#   dataframe:     data frame holding the columns to rank
#   col_name_list: names of the columns to rank
# Values are ranked in ascending order, except "oe", which is negated first so
# that smaller oe values receive larger ranks.
# Returns the data frame with the rank columns appended.
rank_col <- function(dataframe, col_name_list) {
  for (col_name in col_name_list) {
    values <- dataframe[, col_name]
    if (col_name == "oe") {
      values <- -values
    }
    dataframe[, paste0(col_name, "_rank")] <- rank(values)
  }
  dataframe
}

# Rank every scored feature (rank columns are appended in this order)
ranked_features <- c("mutperkb", "specificity_score", "CADD.score",
                     "corrected_chd_node_degree", "corrected_dnv_node_degree",
                     "pLI", "oe")
rank_data <- rank_col(gene_data, ranked_features)

# Gene-level score: mean of the gene-level ranks
gene_rank_cols <- c("mutperkb_rank", "specificity_score_rank", "oe_rank",
                    "corrected_chd_node_degree_rank",
                    "corrected_dnv_node_degree_rank", "pLI_rank")
rank_data$avg_gene_rank <- rowMeans(rank_data[, gene_rank_cols])
# Residue-level score: the CADD rank on its own
rank_data$avg_residue_rank <- rank_data$CADD.score_rank
# Final score: gene-level plus residue-level
rank_data$rank_sum <- rowSums(rank_data[, c("avg_gene_rank", "avg_residue_rank")])

In [17]:
# Columns written to the output tables, in output order
cols <- c("CHR", "POS", "REF", "ALT", "Gene", "known_gene", "Variant.Class",
          "AA.change", "Blinded.ID", "Cardiac.Category", "NDD", "CADD.score",
          "pLI", "HHE.Rank", "oe", "mutperkb", "specificity_score",
          "corrected_chd_node_degree", "corrected_dnv_node_degree",
          "no_known_muts", "known_dnv_gene_list", "known_lof_gene_list",
          "interactome_genes_w_dnvs", "interactome_genes_w_lofs",
          "protein_domain_or_lof", "mutperkb_rank", "oe_rank",
          "specificity_score_rank", "CADD.score_rank",
          "corrected_chd_node_degree_rank", "corrected_dnv_node_degree_rank",
          "pLI_rank", "avg_gene_rank", "avg_residue_rank", "rank_sum")

# Table 1: variants ordered by the plain rank sum, highest first
f1 <- rank_data[order(-rank_data$rank_sum), cols]

# Table 2: rank sum scaled by the proband-level and domain/LoF multipliers,
# highest first
rank_data$binary_multiplied <- with(
  rank_data, rank_sum * no_known_muts * protein_domain_or_lof
)
gd <- rank_data[, c(cols, "binary_multiplied")]
f2 <- gd[order(-gd$binary_multiplied), ]


# Save out
write.csv(f1, "../manuscript/tables/GATA4-TBX5_interactors_simple_rank_sum.csv",
          quote = FALSE, row.names = FALSE)
write.csv(f2, "../manuscript/tables/GATA4-TBX5_interactors_binary_multiplied.csv",
          quote = FALSE, row.names = FALSE)
