In [1]:
library(RCurl)
library(XML)
library(biomaRt)
library(data.table)

Loading required package: bitops
Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp


In [11]:
# Read the de novo variant (DNV) tables: one for cases, one for controls.
cases <- read.csv("~/lab_notebooks/outer_join_dnvs.csv",
                  stringsAsFactors = FALSE)
ctrls <- read.csv("../data/variants/DNV_ctrls.csv",
                  stringsAsFactors = FALSE)

# Rename the case columns by position so they carry the names used below.
colnames(cases) <- c(
  "Blinded.ID", "CHROM", "POS", "REF", "ALT",
  "Gene", "Variant.Class", "AA_change"
)

# Tag every variant with its cohort and whether its class is "syn"
# (synonymous); any other class is labelled non-synonymous.
cases$variant_type <- ifelse(
  cases$Variant.Class == "syn",
  "Case Synonymous",
  "Case Non-synonymous"
)
ctrls$variant_type <- ifelse(
  ctrls$Variant.Class == "syn",
  "Control Synonymous",
  "Control Non-synonymous"
)

# Keep the same columns, in the same order, from both tables and stack them.
cols <- c(
  "Blinded.ID", "CHROM", "POS", "REF", "ALT",
  "Variant.Class", "variant_type", "Gene", "AA_change"
)
combined <- rbind(cases[cols], ctrls[cols])

Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene,AA_change
GT04011952,19,8491545,C,G,mis,Case Non-synonymous,MARCH2,p.L77V
1-00788,10,94070938,G,A,mis,Case Non-synonymous,MARCH5,p.E28K
1-01469,2,242275458,C,T,mis,Case Non-synonymous,SEPT2,p.R106C
1-06549,22,42382110,G,A,mis,Case Non-synonymous,SEPT3,p.G109R
1-05720,17,56604065,A,C,mis,Case Non-synonymous,SEPT4,p.L127R
1-07417,5,132097207,C,G,misD,Case Non-synonymous,SEPT8,p.R302P


In [12]:
# Read the gene list from GATA4-TBX5_genes.txt; with no header the first
# column is named V1 and is used as the set of interactome gene symbols.
gt_df <- read.table(
  "../intermediate/interactome_lists/GATA4-TBX5_genes.txt",
  stringsAsFactors = FALSE
)
gt_interactome <- gt_df$V1

# Flag each variant by whether its gene appears in the interactome list.
combined$Interactome <- ifelse(
  combined$Gene %in% gt_interactome,
  "Interactome",
  "Non-interactome"
)
# Composite label: "<variant_type>_<Interactome>".
combined$variant_type_2 <- paste(
  combined$variant_type,
  combined$Interactome,
  sep = "_"
)
head(combined)

Blinded.ID,CHROM,POS,REF,ALT,Variant.Class,variant_type,Gene,AA_change,Interactome,variant_type_2
GT04011952,19,8491545,C,G,mis,Case Non-synonymous,MARCH2,p.L77V,Non-interactome,Case Non-synonymous_Non-interactome
1-00788,10,94070938,G,A,mis,Case Non-synonymous,MARCH5,p.E28K,Non-interactome,Case Non-synonymous_Non-interactome
1-01469,2,242275458,C,T,mis,Case Non-synonymous,SEPT2,p.R106C,Non-interactome,Case Non-synonymous_Non-interactome
1-06549,22,42382110,G,A,mis,Case Non-synonymous,SEPT3,p.G109R,Non-interactome,Case Non-synonymous_Non-interactome
1-05720,17,56604065,A,C,mis,Case Non-synonymous,SEPT4,p.L127R,Non-interactome,Case Non-synonymous_Non-interactome
1-07417,5,132097207,C,G,misD,Case Non-synonymous,SEPT8,p.R302P,Non-interactome,Case Non-synonymous_Non-interactome


### CADD

In [13]:
# Read the first tab-separated CADD table and name its columns.
# fill = TRUE pads rows that have fewer fields instead of raising an error.
dat <- setNames(
  read.table(
    "/pollard/home/mpittman/ctf-apms/data/cadd/v11_variants.tsv",
    sep = "\t",
    stringsAsFactors = FALSE,
    fill = TRUE
  ),
  c("CHROM", "POS", "REF", "ALT", "CADD.score", "Phred.score")
)

In [17]:
# Read the second tab-separated CADD table and give it the same six
# column names. fill = TRUE pads rows that have fewer fields.
cdf1 <- setNames(
  read.table(
    "../data/cadd/cadd_scores.tsv",
    sep = "\t",
    stringsAsFactors = FALSE,
    fill = TRUE
  ),
  c("CHROM", "POS", "REF", "ALT", "CADD.score", "Phred.score")
)

In [27]:
# Stack both CADD tables and drop rows that are exact duplicates.
cadd <- rbind(cdf1, dat)
cadd <- unique(cadd)

# Inner join of variants to scores on the columns the two tables share;
# variants with no matching CADD row are dropped by merge()'s defaults.
df <- merge(combined, cadd, by = c("CHROM", "POS", "REF", "ALT"))

# Turn the literal text "\x3b" in gene names back into a semicolon.
df$Gene <- gsub(pattern = "\\\\x3b", replacement = ";", x = df$Gene)

### gnomAD pLI scores

In [28]:
# Read the tab-separated gnomAD constraint table (first line is the header).
gnomad <- read.table(
  "../data/databases/gnomad_constraint.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# Show the gene names in df that have no entry in the gnomAD `gene` column.
df$Gene[!(df$Gene %in% gnomad$gene)]

In [31]:
is.element("HCG25371", gnomad$gene)

is.element("MFSD4", gnomad$gene)

### Haploinsufficiency

### Expression specificity

### Node degree