In [1]:
library(RCurl)
library(XML)
library(biomaRt)
library(data.table)

Loading required package: bitops
Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


In [2]:
# Load the de novo variant (DNV) tables for cases and controls.
DNV_cases <- read.csv("../data/variants/DNV_cases.csv", stringsAsFactors = FALSE)
DNV_ctrls <- read.csv("../data/variants/DNV_ctrls.csv", stringsAsFactors = FALSE)

# Label every variant with its cohort and whether its class is "syn".
DNV_cases$variant_type <- ifelse(
  DNV_cases$Variant.Class == "syn",
  "Case Synonymous",
  "Case Non-synonymous"
)
DNV_ctrls$variant_type <- ifelse(
  DNV_ctrls$Variant.Class == "syn",
  "Control Synonymous",
  "Control Non-synonymous"
)

# Rename the case table's "CHR" column to "CHROM" so both tables can be
# subset with the same column names below.
names(DNV_cases)[names(DNV_cases) == "CHR"] <- "CHROM"

# Keep the shared columns only and stack cases on top of controls.
cols <- c(
  "Blinded.ID", "CHROM", "POS", "REF", "ALT",
  "Variant.Class", "variant_type", "Gene", "HHE.Rank"
)
combined <- rbind(DNV_cases[, cols], DNV_ctrls[, cols])
head(combined)

Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene,HHE.Rank
GT04011952,19,8491545,C,G,mis,Case Non-synonymous,MARCH2,70.2
1-00788,10,94070938,G,A,mis,Case Non-synonymous,MARCH5,75.1
1-01469,2,242275458,C,T,mis,Case Non-synonymous,SEPT2,98.1
1-06549,22,42382110,G,A,mis,Case Non-synonymous,SEPT3,50.4
1-05720,17,56604065,A,C,mis,Case Non-synonymous,SEPT4,54.4
1-07417,5,132097207,C,G,misD,Case Non-synonymous,SEPT8,86.4


In [3]:
# Export the variants as a tab-separated file with VCF-style columns
# (CHROM, POS, ID, REF, ALT); ID is filled with the "." placeholder.
# NOTE(review): the column-name row is written without a leading '#';
# confirm the downstream consumer accepts that header.
combined$ID <- "."
cadd_input <- combined[, c("CHROM", "POS", "ID", "REF", "ALT")]
write.table(
  cadd_input,
  file = "../intermediate/cadd_upload.vcf",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

In [4]:
# Load the GATA4-TBX5 interactome gene list (first column of the file).
gt_df <- read.table("../intermediate/interactome_lists/GATA4-TBX5_genes.txt",
                    stringsAsFactors = FALSE)
gt_interactome <- gt_df$V1

# An entry may hold several gene symbols joined by ';'. Split those entries
# and flatten the result so every symbol can be matched on its own.
genes <- strsplit(gt_interactome, ";")
gene_list <- unlist(genes, recursive = FALSE)

# Flag variants whose gene is in the interactome. Membership is tested against
# the split symbols (gene_list); testing against the raw entries would never
# match a gene that only appears inside a ';'-joined entry.
combined$Interactome <- ifelse(combined$Gene %in% gene_list,
                               "Interactome", "Non-interactome")
combined$variant_type_2 <- paste0(combined$variant_type, "_", combined$Interactome)
head(combined)


Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene,HHE.Rank,ID,Interactome,variant_type_2
GT04011952,19,8491545,C,G,mis,Case Non-synonymous,MARCH2,70.2,.,Non-interactome,Case Non-synonymous_Non-interactome
1-00788,10,94070938,G,A,mis,Case Non-synonymous,MARCH5,75.1,.,Non-interactome,Case Non-synonymous_Non-interactome
1-01469,2,242275458,C,T,mis,Case Non-synonymous,SEPT2,98.1,.,Non-interactome,Case Non-synonymous_Non-interactome
1-06549,22,42382110,G,A,mis,Case Non-synonymous,SEPT3,50.4,.,Non-interactome,Case Non-synonymous_Non-interactome
1-05720,17,56604065,A,C,mis,Case Non-synonymous,SEPT4,54.4,.,Non-interactome,Case Non-synonymous_Non-interactome
1-07417,5,132097207,C,G,misD,Case Non-synonymous,SEPT8,86.4,.,Non-interactome,Case Non-synonymous_Non-interactome


In [5]:
# List the distinct variant_type labels present in the combined table.
unique(combined$variant_type)

## CADD score

In [6]:
# Read the CADD score table; the file is read without a header row, so the
# column names are supplied here.
cadd_columns <- c("CHROM", "POS", "REF", "ALT", "CADD.score", "PHRED.score")
cadd_df <- read.table(
  "../data/cadd/cadd_scores.tsv",
  sep = "\t",
  col.names = cadd_columns,
  stringsAsFactors = FALSE
)

# Left-join the scores onto the variants using every column name the two
# tables share, keeping variants that have no score.
shared_columns <- intersect(names(combined), names(cadd_df))
df <- merge(combined, cadd_df, by = shared_columns, all.x = TRUE)

## gnomAD pLI and observed/expected score 

In [7]:
# Load the gnomAD constraint table (tab-separated, with a header row).
gnomad <- read.table(
  "../data/databases/gnomad_constraint.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Show the gene symbols in df that have no entry in gnomad$gene.
df$Gene[!(df$Gene %in% gnomad$gene)]

In [8]:
# Replace the symbol "9-Sep" with "SEPT9" before joining.
# NOTE(review): presumably a spreadsheet date conversion of SEPT9 - confirm.
df$Gene[df$Gene %in% "9-Sep"] <- "SEPT9"

# Left-join the gnomAD observed/expected and pLI columns by gene symbol.
constraint_columns <- c("gene", "oe_mis", "oe_lof", "oe_syn", "pLI")
df <- merge(
  df,
  gnomad[, constraint_columns],
  by.x = "Gene",
  by.y = "gene",
  all.x = TRUE
)

In [9]:
# Create a column with the observed/expected (oe) score that matches each
# variant's class: oe_mis for missense classes, oe_lof for loss-of-function
# classes and oe_syn for synonymous variants. Rows with any other class, or
# with a missing class, keep NA.
mis_classes <- c("mis", "misD")
lof_classes <- c("non", "frameshift", "startloss", "splice", "stoploss")

# rep() keeps this valid even when df has zero rows.
df$oe <- rep(NA_real_, nrow(df))

# %in% never returns NA, so a missing Variant.Class simply selects nothing
# instead of raising an error in a scalar `if`.
is_mis <- df$Variant.Class %in% mis_classes
is_lof <- df$Variant.Class %in% lof_classes
is_syn <- df$Variant.Class %in% "syn"

df$oe[is_mis] <- df$oe_mis[is_mis]
df$oe[is_lof] <- df$oe_lof[is_lof]
df$oe[is_syn] <- df$oe_syn[is_syn]

## Node degree: other DNV genes

In [10]:
# Load the iRefIndex interaction table (tab-separated, with a header row).
iref <- read.table(
  "../data/databases/mammalian_iRefIndex.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [11]:
# Distinct genes carrying at least one case variant whose class is not "syn".
nonsyn_dnvs <- unique(DNV_cases$Gene[which(DNV_cases$Variant.Class != "syn")])

In [12]:
# Count the distinct interaction partners of `gene` in `network`.
#
# `network` is expected to have character columns aliasA and aliasB.
# Self-loops (aliasA equal to aliasB) are ignored, and an interaction that is
# listed more than once or in both orientations (A-B and B-A) counts once.
# Returns a single numeric value; 0 when the gene has no interactions.
count_partners <- function(gene, network) {
  touching <- which(network$aliasA == gene | network$aliasB == gene)
  edges <- network[touching, c("aliasA", "aliasB")]

  # Remove self-loops
  edges <- edges[which(edges$aliasA != edges$aliasB), ]

  # Every remaining row pairs `gene` with exactly one other symbol, so the
  # degree is the number of distinct symbols other than `gene` itself.
  as.numeric(length(setdiff(c(edges$aliasA, edges$aliasB), gene)))
}

# Create data table to save the overall connectivity (degree in the full iref
# table) of each gene in df. Degrees are computed once per unique gene rather
# than once per variant row.
variant_genes <- unique(df$Gene)
gene_conn <- data.frame(Gene = variant_genes, stringsAsFactors = FALSE)
gene_conn$total_connections <- vapply(
  variant_genes, count_partners, numeric(1),
  network = iref, USE.NAMES = FALSE
)

# Connectivity to other DNV genes: degree within the subnetwork of
# interactions that involve at least one case non-synonymous DNV gene.
# NOTE(review): for a gene that is itself in nonsyn_dnvs, all of its
# interactions are part of first_interactome, so its dnv_node_degree equals
# total_connections and the corrected value is 1 - confirm this is intended.
first_interactome <- iref[which(iref$aliasA %in% nonsyn_dnvs |
                                  iref$aliasB %in% nonsyn_dnvs), ]
dnv_degree_by_gene <- vapply(
  variant_genes, count_partners, numeric(1),
  network = first_interactome, USE.NAMES = FALSE
)

# Map the per-gene values back onto the variant rows. A gene with
# total_connections of 0 yields 0/0 = NaN in the corrected column.
gene_idx <- match(df$Gene, gene_conn$Gene)
df$dnv_node_degree <- dnv_degree_by_gene[gene_idx]
df$corrected_dnv_node_degree <-
  dnv_degree_by_gene[gene_idx] / gene_conn$total_connections[gene_idx]

## Node degree - known CHD genes

In [13]:
known = read.table('../data/databases/known_CHD_genes.txt', stringsAsFactors = FALSE)
known_genes = known$V1

In [14]:
# Count the distinct interaction partners of `gene` in `network`.
#
# `network` is expected to have character columns aliasA and aliasB.
# Self-loops (aliasA equal to aliasB) are ignored, and an interaction that is
# listed more than once or in both orientations (A-B and B-A) counts once.
# Returns a single numeric value; 0 when the gene has no interactions.
count_partners <- function(gene, network) {
  touching <- which(network$aliasA == gene | network$aliasB == gene)
  edges <- network[touching, c("aliasA", "aliasB")]

  # Remove self-loops
  edges <- edges[which(edges$aliasA != edges$aliasB), ]

  # Every remaining row pairs `gene` with exactly one other symbol, so the
  # degree is the number of distinct symbols other than `gene` itself.
  as.numeric(length(setdiff(c(edges$aliasA, edges$aliasB), gene)))
}

# Connectivity to known CHD genes: degree within the subnetwork of
# interactions that involve at least one gene from known_genes.
# NOTE(review): for a gene that is itself in known_genes, all of its
# interactions are part of first_interactome, so its chd_node_degree equals
# its total_connections - confirm this is intended.
first_interactome <- iref[which(iref$aliasA %in% known_genes |
                                  iref$aliasB %in% known_genes), ]

# Compute the degree once per unique gene, then map it back onto the rows.
variant_genes <- unique(df$Gene)
chd_degree_by_gene <- vapply(
  variant_genes, count_partners, numeric(1),
  network = first_interactome, USE.NAMES = FALSE
)
gene_idx <- match(df$Gene, variant_genes)
df$chd_node_degree <- chd_degree_by_gene[gene_idx]

# Normalise by the gene's overall degree stored in gene_conn. A gene with
# total_connections of 0 yields 0/0 = NaN.
total_idx <- match(df$Gene, gene_conn$Gene)
df$corrected_chd_node_degree <-
  chd_degree_by_gene[gene_idx] / gene_conn$total_connections[total_idx]

In [15]:
# Display the annotated variant table.
df

Gene,CHROM,POS,REF,ALT,Blinded.ID,Variant.Class,variant_type,HHE.Rank,ID,...,PHRED.score,oe_mis,oe_lof,oe_syn,pLI,oe,dnv_node_degree,corrected_dnv_node_degree,chd_node_degree,corrected_chd_node_degree
A2M,12,9265970,A,G,14397,mis,Control Non-synonymous,33.3,.,...,24.300,0.81065,0.405260,0.87995,4.5229e-11,0.81065,14,0.16091954,1,0.011494253
AAAS,12,53701325,G,C,1-02674,mis,Case Non-synonymous,80.1,.,...,22.400,1.05910,0.707340,0.97190,2.4356e-13,1.05910,47,1.00000000,0,0.000000000
AACS,12,125621409,C,T,GT04016012,mis,Case Non-synonymous,45.8,.,...,33.000,0.97211,0.798010,0.93551,9.1275e-17,0.97211,0,,0,
AAK1,2,69748004,C,T,1-06459,mis,Case Non-synonymous,83.5,.,...,31.000,0.67376,0.041108,0.96954,1.0000e+00,0.67376,7,1.00000000,0,0.000000000
AAMP,2,219130141,T,A,11984,mis,Control Non-synonymous,93.6,.,...,24.200,0.69331,0.179410,0.97252,7.6172e-01,0.69331,1,0.08333333,0,0.000000000
AASS,7,121769559,C,T,1-00100,syn,Case Synonymous,23.9,.,...,11.330,0.88950,0.583280,0.96105,2.3101e-12,0.96105,0,0.00000000,0,0.000000000
ABAT,16,8870306,C,T,1-04638,mis,Case Non-synonymous,86.6,.,...,22.600,0.90591,0.324760,1.07510,6.3216e-03,0.90591,1,1.00000000,0,0.000000000
ABCA12,2,215818801,G,T,11405,mis,Control Non-synonymous,24.2,.,...,20.800,0.87450,0.377260,1.03200,2.9220e-15,0.87450,0,0.00000000,0,0.000000000
ABCA12,2,215917234,C,T,12994,mis,Control Non-synonymous,24.2,.,...,13.460,0.87450,0.377260,1.03200,2.9220e-15,0.87450,0,0.00000000,0,0.000000000
ABCA13,7,48315282,C,A,11513,mis,Control Non-synonymous,14.2,.,...,0.040,1.08650,0.915340,1.04120,8.7753e-114,1.08650,2,1.00000000,0,0.000000000


## Save out table of characteristics for manuscript

In [16]:
# Write the annotated variant table as a tab-separated file, without quoting
# and without row names.
write.table(
  df,
  file = "../manuscript/tables/variant_characteristics.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

## Calculating connectivity of reference genes (e.g. NKX2-5, which is not in the interactome)

In [19]:
# Node degree of `gene` in `subnetwork`.
#
# `subnetwork` is a data frame with columns aliasA and aliasB, one row per
# interaction. Rows whose two aliases are equal (self-loops) are dropped, and
# an interaction listed in both orientations or more than once counts once.
# Returns the number of distinct interactions that involve `gene`.
get_degree <- function(gene, subnetwork) {
    touches_gene <- subnetwork$aliasA == gene | subnetwork$aliasB == gene
    edges <- subnetwork[which(touches_gene), c("aliasA", "aliasB")]

    # Drop self-loops.
    edges <- edges[which(edges$aliasA != edges$aliasB), ]

    # Add the mirror image of every edge; after de-duplication each
    # interaction is present exactly twice, hence the division by two.
    mirrored <- data.frame(aliasA = edges$aliasB, aliasB = edges$aliasA,
                           stringsAsFactors = FALSE)
    nrow(unique(rbind(edges, mirrored))) / 2
}

In [21]:
# Create a data table with the connectivity of a few reference genes:
# degree in the full iref table, degree within the DNV-gene subnetwork and
# degree within the known-CHD-gene subnetwork, plus the last two divided by
# the overall degree.
reference_genes <- c("NKX2-5", "MKL2", "MYH7")

# Neither subnetwork depends on the gene being scored, so build each once
# instead of once per gene.
dnv_interactome <- iref[which(iref$aliasA %in% nonsyn_dnvs |
                                iref$aliasB %in% nonsyn_dnvs), ]
chd_interactome <- iref[which(iref$aliasA %in% known_genes |
                                iref$aliasB %in% known_genes), ]

# stringsAsFactors = FALSE keeps Gene a character column on every R version.
gene_conn <- data.frame(Gene = reference_genes, stringsAsFactors = FALSE)

gene_conn$total_connections <- vapply(
  reference_genes, get_degree, numeric(1),
  subnetwork = iref, USE.NAMES = FALSE
)

gene_conn$dnv_node_degree <- vapply(
  reference_genes, get_degree, numeric(1),
  subnetwork = dnv_interactome, USE.NAMES = FALSE
)
# A gene with total_connections of 0 yields 0/0 = NaN here and below.
gene_conn$corrected_dnv_node_degree <-
  gene_conn$dnv_node_degree / gene_conn$total_connections

gene_conn$chd_node_degree <- vapply(
  reference_genes, get_degree, numeric(1),
  subnetwork = chd_interactome, USE.NAMES = FALSE
)
gene_conn$corrected_chd_node_degree <-
  gene_conn$chd_node_degree / gene_conn$total_connections

gene_conn

Gene,total_connections,dnv_node_degree,corrected_dnv_node_degree,chd_node_degree,corrected_chd_node_degree
NKX2-5,9,3,0.3333333,9,1
MKL2,6,0,0.0,0,0
MYH7,19,2,0.1052632,0,0
