In [1]:
library(RCurl)
library(XML)
library(biomaRt)
library(data.table)

Loading required package: bitops
Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


In [2]:
DNV_cases = read.csv('../data/variants/DNV_cases_v10v11.csv', stringsAsFactors = FALSE)
DNV_ctrls = read.csv('../data/variants/DNV_ctrls.csv', stringsAsFactors = FALSE)
DNV_cases$variant_type = ifelse(DNV_cases$Variant.Class == "syn", "Case Synonymous", "Case Non-synonymous")
DNV_ctrls$variant_type = ifelse(DNV_ctrls$Variant.Class == "syn", "Control Synonymous", "Control Non-synonymous")

unique(DNV_cases$Variant.Class)

In [3]:
names(DNV_cases)[which(names(DNV_cases) == "CHR")] <- "CHROM"
cols  = c("Blinded.ID","CHROM","POS",'REF','ALT','Variant.Class','variant_type','Gene')
combined = rbind(DNV_cases[,cols], DNV_ctrls[,cols])
head(combined)
nrow(combined)

Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene
1-05022,12,112915523,A,G,misD,Case Non-synonymous,PTPN11
1-04539,10,120789851,A,G,mis,Case Non-synonymous,NANOS1
1-04539,1,32256494,G,A,loss of function,Case Non-synonymous,SPOCD1
1-04957,1,186301443,G,A,mis,Case Non-synonymous,TPR
1-11668,5,180056757,C,G,misD,Case Non-synonymous,FLT4
1-12471,16,57760088,C,T,mis,Case Non-synonymous,DRC7


In [4]:
gt_df = read.table('../intermediate/interactome_lists/GATA4-TBX5_genes.txt', stringsAsFactors=FALSE)
gt_interactome = gt_df$V1

combined$Interactome = ifelse(combined$Gene %in% gt_interactome, "Interactome", "Non-interactome")
combined$variant_type_2 = paste0(combined$variant_type, '_', combined$Interactome)
head(combined)
nrow(combined)

Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene,Interactome,variant_type_2
1-05022,12,112915523,A,G,misD,Case Non-synonymous,PTPN11,Non-interactome,Case Non-synonymous_Non-interactome
1-04539,10,120789851,A,G,mis,Case Non-synonymous,NANOS1,Non-interactome,Case Non-synonymous_Non-interactome
1-04539,1,32256494,G,A,loss of function,Case Non-synonymous,SPOCD1,Non-interactome,Case Non-synonymous_Non-interactome
1-04957,1,186301443,G,A,mis,Case Non-synonymous,TPR,Non-interactome,Case Non-synonymous_Non-interactome
1-11668,5,180056757,C,G,misD,Case Non-synonymous,FLT4,Non-interactome,Case Non-synonymous_Non-interactome
1-12471,16,57760088,C,T,mis,Case Non-synonymous,DRC7,Non-interactome,Case Non-synonymous_Non-interactome


## CADD score

In [5]:
cadd_df = read.table('../data/cadd/cadd_scores.tsv', stringsAsFactors = FALSE, sep="\t", 
                     col.names = c('CHROM','POS','REF','ALT','CADD.score','PHRED.score'))

df = merge(combined, cadd_df, all.x = TRUE)
nrow(df)

## gnomAD pLI and observed/expected score 

In [6]:
gnomad = read.table('../data/databases/gnomad_constraint.txt', sep = "\t", stringsAsFactors=FALSE, header=TRUE)
gnomad = unique(gnomad[!duplicated(gnomad$gene),])
df$Gene[which(!df$Gene %in% gnomad$gene)]

In [7]:
df$alias = df$Gene

df$alias[which(df$Gene == "9-Sep")] <- 'SEPT9'
df$alias[which(df$Gene == "CFAP46")] <- 'TTC40'
df$alias[which(df$Gene == "SRPRA")] <- 'SRPR'
df$alias[which(df$Gene =="JMJD7-PLA2G4B\\x3bPLA2G4B")] <- 'PLA2G4B'
df$alias[which(df$Gene =="INTS14")] <- "VWA9"
df$alias[which(df$Gene =="CRAMP1")] <- "CRAMP1L"
df$alias[which(df$Gene =="DRC7")] <- "CCDC135"
df$alias[which(df$Gene == "C2CD6")] <- 'ALS2CR11'
df$alias[which(df$Gene == "ASB3\\x3bGPR75-ASB3")] <- 'GPR75-ASB3'
df$alias[which(df$Gene == "USF3")] <- 'KIAA2018'
df$alias[which(df$Gene == "PXYLP1")] <- 'ACPL2'
df$alias[which(df$Gene == "SMIM29")] <- 'C6orf1'
df$alias[which(df$Gene == "C7orf55-LUC7L2\\x3bLUC7L2")] <- 'LUC7L2'

df$Gene[which(!df$alias %in% gnomad$gene)] # These three genes do not appear to have aliases in the gnomad DB



In [8]:
df = unique(merge(df, gnomad[,c('gene','oe_mis','oe_lof','oe_syn','pLI','cds_length')], 
           by.x='alias', by.y='gene', all.x = TRUE))
nrow(df)

In [9]:
# Create column with relevant oe score (oe_mis for missense variants, oe_lof for loss-of-function variants)
df$oe = NA

for(i in c(1:nrow(df))){
    if (df$Variant.Class[i] %in% c("mis","misD")){
        df$oe[i] = df$oe_mis[i]
    } else if(df$Variant.Class[i] %in% c("non","frameshift","startloss","splice","stoploss")){
        df$oe[i] = df$oe_lof[i]
    } else if(df$Variant.Class[i] == 'syn'){
        df$oe[i] = df$oe_syn[i]
    }
}
    
nrow(df)

## Heart expression percentile rank

In [10]:
hhe_df = read.table('../data/databases/hhe_genes_rank.txt', sep="\t", stringsAsFactors = FALSE, header=TRUE)
names(hhe_df) <- c("Gene", "Heart.Expression.Percentile.Rank")
df = merge(df, hhe_df, by.x='alias', by.y='Gene', all.x = TRUE)
nrow(df)

## Node degree: other DNV genes, known CHD genes

In [11]:
iref = read.table('../data/databases/mammalian_iRefIndex.txt', sep='\t', stringsAsFactors=F, header=T)

In [12]:
nonsyn_dnvs = unique(DNV_cases[which(DNV_cases$Variant.Class != "syn"), 'Gene'])

known = read.table('../data/databases/known_CHD_genes.txt', stringsAsFactors = FALSE)
known_genes = known$V1

In [13]:
# We're interested in any genes with a DNV, all interactome genes, and reference variant genes
genes_of_interest = unique(c(DNV_cases$Gene, DNV_ctrls$Gene, gt_interactome, df$alias,
                             c("NKX2-5","TBX5","MKL2","MYH7")))

In [14]:
get_degree <- function(gene, subnetwork){
    
    lines = subnetwork[which(subnetwork$aliasA == gene | subnetwork$aliasB == gene),]
    lines = lines[which(lines$aliasA != lines$aliasB),c('aliasA','aliasB')]
    swap_lines = lines[,c('aliasB','aliasA')]
    names(swap_lines) = c('aliasA','aliasB')
    both = rbind(lines, swap_lines)
    both = unique(both)
    degree = nrow(both)/2
    
    return(degree)
}

In [17]:
gene_conn = data.frame(genes_of_interest)
names(gene_conn) <- 'Gene'
gene_conn$total_connections = 0
gene_conn$dnv_node_degree = 0
gene_conn$corrected_dnv_node_degree = 0
gene_conn$chd_node_degree = 0
gene_conn$corrected_chd_node_degree = 0


for (i in c(1:nrow(gene_conn))){
    
    gene = gene_conn$Gene[i]
    
    # Total number of connections in iref network
    degree = get_degree(gene, iref)
    gene_conn$total_connections[i] = degree
    
    # Number of connections the gene has to genes with a nonsynonymous DNV
    dnv_iref = iref[which(iref$aliasA %in% nonsyn_dnvs | iref$aliasB %in% nonsyn_dnvs),]
    dnv_degree = get_degree(gene, dnv_iref)
    gene_conn$dnv_node_degree[i] = dnv_degree
    gene_conn$corrected_dnv_node_degree[i] = dnv_degree/degree
    
    # Number of connections the gene has to known CHD genes
    chd_iref = iref[which(iref$aliasA %in% known_genes | iref$aliasB %in% known_genes),]
    chd_degree = get_degree(gene, chd_iref)
    gene_conn$chd_node_degree[i] = chd_degree
    gene_conn$corrected_chd_node_degree[i] = chd_degree/degree
}

In [18]:
write.csv(gene_conn, "../intermediate/iref_gene_connections.csv", row.names=FALSE)

In [19]:
df = merge(df, gene_conn, by.x='alias', by.y='Gene', all.x = TRUE)
nrow(df)

## Haploinsufficiency

In [20]:
haploinsufficiency = read.table("../data/databases/huang-et-al_haploinsufficiency.txt", sep = "\t",
                               stringsAsFactors=FALSE)
names(haploinsufficiency) <- c("CHR","START","STOP","Gene||","p_haploinsufficient","","")
head(haploinsufficiency)

CHR,START,STOP,Gene||,p_haploinsufficient,Unnamed: 5,Unnamed: 6,NA,NA.1
chr1,850392,869824,SAMD11|0.085|79.5%,0.085,.,850393,869824,502050
chr1,869458,884494,NOC2L|0.230|38.2%,0.23,.,869459,884494,1501050
chr1,885829,890958,KLHL17|0.084|80.2%,0.084,.,885830,890958,252300
chr1,891739,900339,PLEKHN1|0.170|49.2%,0.17,.,891740,900339,1251300
chr1,900446,907336,C1orf170|0.061|90.0%,0.061,.,900447,907336,252300
chr1,924207,925333,HES4|0.177|47.7%,0.177,.,924208,925333,1251300


In [21]:
haploinsufficiency$Gene <- sub('\\|.*', "",haploinsufficiency$`Gene||`)
df = merge(df, haploinsufficiency[,c('Gene','p_haploinsufficient')], all.x=TRUE)
nrow(df)

## Mutations per kilobase

In [22]:
case_lof = read.csv('../data/variants/LoF_cases_v10v11.csv', stringsAsFactors=F)
case_lof$Gene[which(case_lof$Gene == "1-Sep")] <- 'SEPT1'
case_lof$Gene[which(case_lof$Gene == "9-Sep")] <- 'SEPT9'
case_lof$Gene[which(case_lof$Gene == "14-Sep")] <- 'SEPT14'

case_lof$Gene[which(case_lof$Gene == "10-Mar")] <- 'MARCH10'
case_lof$Gene[which(case_lof$Gene == "11-Mar")] <- 'MARCH11'
case_lof$Gene[which(case_lof$Gene == "9-Mar")] <- 'MARCH9'

dnvs = DNV_cases[which(DNV_cases$variant_type=='Case Non-synonymous'),]
cols = c('Blinded.ID','Gene','AA.change')
vars = rbind(DNV_cases[,cols], case_lof[,cols])
gene_counts = as.data.frame(table(vars$Gene))
names(gene_counts) = c('Gene','n_mutations')

In [23]:
df = merge(df, gene_counts, all.x=TRUE)
df$mutperkb = (df$n_mutations / df$cds_length) * 1000
nrow(df)

## Heart expression specificity

In [24]:
gtex = read.table("../data/databases/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",
                  sep = "\t", stringsAsFactors=F, header=T, skip=2, fill=T)
# Some genes have multiple transcripts
gtex = aggregate(.~Description, data=gtex[,c(-1)], mean)
head(gtex)

Description,Adipose...Subcutaneous,Adipose...Visceral..Omentum.,Adrenal.Gland,Artery...Aorta,Artery...Coronary,Artery...Tibial,Bladder,Brain...Amygdala,Brain...Anterior.cingulate.cortex..BA24.,...,Skin...Not.Sun.Exposed..Suprapubic.,Skin...Sun.Exposed..Lower.leg.,Small.Intestine...Terminal.Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole.Blood
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5S_rRNA,0.03274744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01952531,0.02435181,0.02203625,0.06317431,0.012659,0.03525225,0.02120125,0.03329181,0.02251006,0.01484906
7SK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01897867,0.0,0.0,0.0,0.0,0.39720833,0.0,0.0,0.0,0.0
A1BG,3.10956,3.27881,1.40566,3.37156,7.40513,6.43856,2.17151,4.02207,3.55573,...,2.3401,2.19279,2.80474,6.32484,1.35086,1.54219,4.53652,7.84693,5.49158,1.69172
A1BG-AS1,1.61339,1.95934,1.02482,2.10953,5.08509,3.53623,1.25874,1.00217,1.86325,...,1.50524,1.25731,2.50539,5.42682,1.00689,0.640786,3.71696,6.28329,3.2736,1.28071
A1CF,0.00738411,0.00561853,0.0187032,0.0163877,0.0122843,0.0158928,0.00884071,0.0,0.00468671,...,0.0112339,0.0143977,6.02867,0.00963833,0.239152,0.0475573,0.0163917,0.00701341,0.0108762,0.00572623


In [25]:
g <- gtex[which(gtex$Description%in% df$Gene),c('Description','Heart...Atrial.Appendage',
                                                               'Heart...Left.Ventricle','Artery...Aorta',
                                                               'Artery...Coronary',
                                                               'Brain...Amygdala','Brain...Cortex','Brain...Cerebellum','Brain...Hypothalamus','Bladder',
                                                               'Breast...Mammary.Tissue','Colon...Sigmoid','Esophagus...Muscularis','Fallopian.Tube',
                                                               'Adrenal.Gland','Kidney...Cortex','Liver','Lung','Pancreas','Prostate','Spleen',
                                                               'Thyroid','Whole.Blood')]
names(g) = c("Gene","Adult Atrium","Adult Left Ventricle","Aortic Artery","Coronary Artery","Amygdala",
                  "Cortex","Cerebellum",
                  "Hypothalamus","Bladder","Mammary Tissue","Sigmoid Colon","Espohagus Muscularis","Fallopian Tube",
                  "Adrenal Gland","Kidney","Liver","Lung","Pancreas","Prostate","Spleen","Thyroid","Whole Blood")
  
row.names(g) <- g$Gene
g <- g[,c(-1)]
  
g$heart_avg = rowMeans(g[,c("Adult Atrium","Adult Left Ventricle")])
g$other_avg = rowMeans(g[,c(-1,-2,-23)])
g$heart_specificity = g$heart_avg / rowSums(g[,c(-1,-2,-23)])
df$specificity_score = g$heart_specificity[match(df$Gene, row.names(g))]
nrow(df)

## Save out table of characteristics for manuscript

In [26]:
df

Gene,alias,CHROM,POS,REF,ALT,Blinded.ID,Variant.Class,variant_type,Interactome,...,Heart.Expression.Percentile.Rank,total_connections,dnv_node_degree,corrected_dnv_node_degree,chd_node_degree,corrected_chd_node_degree,p_haploinsufficient,n_mutations,mutperkb,specificity_score
9-Sep,SEPT9,17,75484879,C,T,SSC05717,misD,Control Non-synonymous,Non-interactome,...,92.364685,25,4,0.16000000,0,0.00000000,,,,
A2M,A2M,12,9265970,A,G,14397,mis,Control Non-synonymous,Non-interactome,...,33.295628,87,16,0.18390805,1,0.01149425,0.264,2,0.4522840,0.016150049
A3GALT2,A3GALT2,1,33777757,C,G,1-07571,mis,Case Non-synonymous,Non-interactome,...,,0,0,,0,,0.090,1,0.9803922,0.009409438
AAAS,AAAS,12,53701325,G,C,1-02674,mis,Case Non-synonymous,Non-interactome,...,80.148182,47,47,1.00000000,0,0.00000000,0.619,4,2.4420024,0.022944931
AACS,AACS,12,125621409,C,T,GT04016012,mis,Case Non-synonymous,Non-interactome,...,45.834512,0,0,,0,,0.159,2,0.9920635,0.022971941
AAK1,AAK1,2,69748004,C,T,1-06459,mis,Case Non-synonymous,Non-interactome,...,83.530343,7,7,1.00000000,0,0.00000000,0.085,2,0.6937218,0.031645830
AAMP,AAMP,2,219130141,T,A,11984,mis,Control Non-synonymous,Non-interactome,...,93.648549,12,1,0.08333333,0,0.00000000,0.423,,,0.023836931
AASS,AASS,7,121769559,C,T,1-00100,syn,Case Synonymous,Non-interactome,...,23.923986,1,1,1.00000000,0,0.00000000,0.450,2,0.7199424,0.024173806
AATK,AATK,17,79094043,G,T,1-08754,syn,Case Synonymous,Non-interactome,...,30.733556,2,0,0.00000000,0,0.00000000,0.213,2,0.4852014,0.006143793
ABAT,ABAT,16,8870306,C,T,1-04638,mis,Case Non-synonymous,Non-interactome,...,86.567502,1,1,1.00000000,0,0.00000000,0.369,1,0.6666667,0.006440546


In [27]:
names(df)
df <- df[,c('Gene','CHROM','POS','REF','ALT','Blinded.ID','Variant.Class','variant_type',
            'Interactome','variant_type_2','CADD.score','PHRED.score','oe','pLI','cds_length',
            'Heart.Expression.Percentile.Rank', 'total_connections','dnv_node_degree','corrected_dnv_node_degree',
            'chd_node_degree','corrected_chd_node_degree','p_haploinsufficient','mutperkb','specificity_score')]

In [28]:
write.table(df, file = "../manuscript/tables/variant_characteristics.tsv", sep="\t",
           quote=FALSE, row.names = FALSE)

## Properties of reference variants

In [29]:
ref = read.csv('../data/variants/reference_variants.csv', stringsAsFactors = FALSE)

In [30]:
ref = ref[,c('Gene','CHR','POS','REF','ALT','Blinded.ID','ExonicFunc.refGene','CADD.score','PHRED.score')]
names(ref)[which(names(ref)=="CHR")] <- "CHROM"
names(ref)[which(names(ref)=='ExonicFunc.refGene')] <- "Variant.Class"

In [31]:
ref

Gene,CHROM,POS,REF,ALT,Blinded.ID,Variant.Class,CADD.score,PHRED.score
GATA4,8,11607722,G,A,GATA4,misD,4.027798,28.3
NKX2-5,5,172660192,C,A,OLIGO,misD,1.651977,16.52
MKL2,16,14341127,G,T,OLIGO,misD,3.082313,23.8
MYH7,20,23898536,G,A,OLIGO,misD,0.023524,2.992
NKX2-5,5,172660387,T,G,NKX25,misD,0.476938,9.166
TBX5,12,114839635,C,T,TBX5,misD,4.471066,32.0


In [32]:
ref$Interactome = ifelse(ref$Gene %in% gt_interactome, "Interactome", "Non-interactome")
ref = merge(ref, gnomad[,c('gene','oe_mis','oe_lof','oe_syn','pLI','cds_length')], 
            by.x='Gene', by.y='gene', all.x = TRUE)
ref = merge(ref, hhe_df, by.x='Gene', by.y='Gene', all.x = TRUE)
ref = merge(ref, gene_conn, by.x='Gene', by.y='Gene', all.x = TRUE)

In [33]:
# Create column with relevant oe score (oe_mis for missense variants, oe_lof for loss-of-function variants)
ref$oe = NA

for(i in c(1:nrow(ref))){
    if (ref$Variant.Class[i] %in% c("mis","misD")){
        ref$oe[i] = ref$oe_mis[i]
    } else if(ref$Variant.Class[i] %in% c("non","frameshift","startloss","splice","stoploss")){
        ref$oe[i] = ref$oe_lof[i]
    } else if(ref$Variant.Class[i] == 'syn'){
        ref$oe[i] = ref$oe_syn[i]
    }
}

In [34]:
ref = merge(ref, haploinsufficiency[,c('Gene','p_haploinsufficient')])
ref = merge(ref, gene_counts, all.x=TRUE)
ref$mutperkb = (ref$n_mutations / ref$cds_length) * 1000
ref$specificity_score = g$heart_specificity[match(ref$Gene, row.names(g))]

In [35]:
ref <- ref[,c('Gene','CHROM','POS','REF','ALT','Blinded.ID','Variant.Class',
            'Interactome','CADD.score','PHRED.score','oe','pLI','cds_length',
            'Heart.Expression.Percentile.Rank', 'total_connections','dnv_node_degree','corrected_dnv_node_degree',
            'chd_node_degree','corrected_chd_node_degree','p_haploinsufficient','mutperkb','specificity_score')]

write.table(ref, file = "../intermediate/reference_variant_characteristics.tsv", sep="\t",
           quote=FALSE, row.names = FALSE)