In [1]:
# Load list of interactome genes
interactome = read.csv("../intermediate/interactome_lists/GATA4-TBX5_genes.txt", stringsAsFactors=F,
                      header = F)
interactome_genes = interactome$V1
interactome_genes = c(interactome_genes,"FRG1")
interactome_genes = c(interactome_genes,"H4-16")
interactome_genes = c(interactome_genes,'RFWD2')
interactome_genes = trimws(interactome_genes)

In [2]:
# Load in rna-seq data to identify CP-expressed genes not in the interactome
rnaseq = read.csv("../data/rnaseq/log2_cpm_results_gata4_ko_study.csv", stringsAsFactors=F)
head(rnaseq)

non_expr = interactome_genes[which(!interactome_genes %in% rnaseq$hgnc_symbol)]

EnsemblIDs,ensembl_gene_id,hgnc_symbol,WTC11_DR4.1,WTC11_DR4.2,WTC11_DR4.3,WTC11_DR4.4,WTC11_DR4.5,gko.1,gko.2,gko.3,gko.4,gko.5,gko.6
ENSG00000223972,ENSG00000223972,DDX11L1,-0.246438,-0.3818135,0.08900291,0.78382265,0.3917166,1.4230111,1.3126161,1.81041023,1.1633844,0.965871,1.4299702
ENSG00000227232,ENSG00000227232,WASH7P,3.2867177,3.4193737,3.62444559,3.68051742,3.320722,3.6519681,4.1997256,4.02832584,3.8583636,3.7940827,4.0173132
ENSG00000243485,ENSG00000243485,MIR1302-2HG,-1.7975716,-1.2148499,-2.06460937,-1.42242351,-1.740225,-0.691343,-0.8491803,-0.73714287,-1.792218,-1.8937911,-0.8706978
ENSG00000237613,ENSG00000237613,FAM138A,-2.6059938,-4.2630771,-2.37152808,-2.20106409,-4.437045,-0.1512308,-2.1866962,-0.84220942,-1.0064382,-1.8937911,-0.681354
ENSG00000238009,ENSG00000238009,,1.6299828,2.1330723,2.48512938,1.82257191,2.0087581,1.9460318,1.6954012,2.00470674,1.8613446,1.7562918,2.3425641
ENSG00000239945,ENSG00000239945,,-0.3002277,0.6862612,0.80800437,-0.08368036,0.267185,0.5488229,0.389406,0.03040981,-0.3649395,0.2338211,0.8881017


In [3]:
# load in gnomad constraint
constraint = read.csv("../data/databases/gnomad_constraint.txt", sep="\t", stringsAsFactors=F)
constraint <- constraint[,c('gene','oe_lof','lof_z')]
head(constraint)

gene,oe_lof,lof_z
MED13,0.0,9.1935
NIPBL,0.0066527,11.286
SMC3,0.0,8.2618
CNOT1,0.0079978,10.279
RLF,0.0,7.9294
PCF11,0.0,8.0014


In [4]:
# Load in mutability table from https://doi.org/10.1038/ng.3050
mut_table = read.csv('../data/databases/mutability_table.csv',stringsAsFactors=FALSE)
head(mut_table)

hgncID,hgncSymbol,enstID,ensgID,geneName,syn,mis,non,splice,frameshift,lof,prot,all
658,ARF5,ENST00000000233,ENSG00000004059,ARF5,2.704295e-06,6.234164e-06,2.016875e-07,2.207269e-07,1.05904e-06,1.481454e-06,7.715618e-06,1.041991e-05
6752,M6PR,ENST00000000412,ENSG00000003056,M6PR,2.877349e-06,7.767629e-06,5.370605e-07,3.469603e-07,7.57183e-08,9.597391e-07,8.727368e-06,1.160472e-05
3720,FKBP4,ENST00000001008,ENSG00000004478,FKBP4,5.608786e-06,1.346427e-05,7.503486e-07,3.996519e-07,4.035024e-08,1.190351e-06,1.465462e-05,2.026341e-05
20581,CYP26B1,ENST00000001146,ENSG00000003137,CYP26B1,1.013847e-05,2.091603e-05,4.151766e-07,2.143277e-07,6.023546e-07,1.231859e-06,2.214789e-05,3.228636e-05
28816,NDUFAF7,ENST00000002125,ENSG00000003509,NDUFAF7,5.079037e-06,1.208114e-05,7.784025e-07,3.826662e-07,5.197949e-07,1.680864e-06,1.376201e-05,1.884104e-05
4008,FUCA2,ENST00000002165,ENSG00000001036,FUCA2,5.445799e-06,1.222216e-05,8.668046e-07,2.605275e-07,1.396694e-06,2.524026e-06,1.474619e-05,2.019199e-05


In [5]:
# Merge expression data with mutability table
t = merge(mut_table, rnaseq, by.x = "geneName", by.y = "hgnc_symbol")
tab = merge(t, constraint, by.x="geneName", by.y="gene")
tab

geneName,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,...,WTC11_DR4.4,WTC11_DR4.5,gko.1,gko.2,gko.3,gko.4,gko.5,gko.6,oe_lof,lof_z
A1BG,5,A1BG,ENST00000263100,ENSG00000121410,8.997970e-06,1.738961e-05,5.763794e-07,2.639868e-07,6.532817e-07,...,0.6941654,0.26718503,0.2969615,0.6811068,0.3495099,0.16981114,1.05851869,-0.36406049,0.784570,0.87287
A1CF,24086,A1CF,ENST00000373995,ENSG00000148584,6.365968e-06,1.535849e-05,1.437698e-06,4.480121e-07,8.024687e-07,...,-0.4418889,-0.07354546,-3.2690262,-2.4922970,-1.5265737,-3.64960321,-1.89379115,-2.05833831,0.605370,2.10190
A2M,7,A2M,ENST00000318602,ENSG00000175899,1.543159e-05,3.545894e-05,1.960148e-06,1.477263e-06,4.616751e-07,...,5.9297063,6.43431815,6.4698272,5.4565275,4.9155455,6.48351956,5.41823491,4.94325022,0.405260,4.89730
A2ML1,23336,A2ML1,ENST00000299698,ENSG00000166535,1.778759e-05,3.993763e-05,1.996995e-06,1.668077e-06,4.371224e-07,...,-0.3436603,-1.05844331,-1.0628825,-0.9624933,-0.4612849,-0.02100296,-1.32681094,-0.51400193,0.771710,1.91140
AAAS,13666,AAAS,ENST00000209873,ENSG00000094914,6.285383e-06,1.616090e-05,1.249937e-06,6.651967e-07,1.192104e-06,...,3.9940251,4.14820473,3.9822487,3.8260502,3.9867508,3.89936297,3.97135856,4.04439026,0.707340,1.61230
AACS,21298,AACS,ENST00000316519,ENSG00000081760,1.119456e-05,2.157404e-05,7.599163e-07,7.486510e-07,3.136301e-06,...,2.5279440,2.78270655,2.6954841,2.8607689,2.6908755,2.40757799,2.75415498,3.02501670,0.798010,1.12840
AADACL2,24427,AADACL2,ENST00000356517,ENSG00000197953,3.844588e-06,1.025116e-05,9.549878e-07,1.714647e-07,1.050620e-06,...,-1.8924649,-6.64385619,-1.7798938,-2.4922970,-3.4069782,-1.00643821,-6.64385619,-3.52786631,1.038400,-0.12089
AADACL3,32037,AADACL3,ENST00000359318,ENSG00000188984,4.516789e-06,1.040186e-05,3.757272e-07,8.896304e-08,5.319836e-08,...,-4.0146326,-4.43704500,-2.3402955,-4.2672665,-1.3612743,-1.00643821,-1.89379115,-6.64385619,0.740910,0.48312
AADAT,17929,AADAT,ENST00000337664,ENSG00000109576,4.201216e-06,1.068241e-05,4.576510e-07,5.174663e-07,3.083867e-06,...,2.9902292,3.34659816,2.9616233,3.6298902,3.6467231,3.13585099,3.56059315,3.31913681,0.257590,3.32030
AAGAB,25662,AAGAB,ENST00000261880,ENSG00000103591,3.096023e-06,7.354511e-06,5.012149e-07,4.673855e-07,9.366351e-07,...,2.5279440,3.05169763,3.0043823,2.9338371,2.8256056,3.13585099,2.86091744,3.01135864,0.652480,1.43740


In [6]:
# Find mutability of interactome genes
im = merge(interactome, mut_table, by.x="V1", by.y="geneName", how="left", all.x=T)
interactome_mutability = merge(im, constraint, by.x="V1", by.y="gene",how="left", all.x=T)
#missingdf <- interactome_mutability[rowSums(is.na(interactome_mutability)) > 0,]
#missingdf
interactome_mutability

V1,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,lof,prot,all,oe_lof,lof_z
HIST1H4A,,,,,,,,,,,,,,
HIST1H4B,,,,,,,,,,,,,,
HIST1H4C,,,,,,,,,,,,,,
HIST1H4D,,,,,,,,,,,,,,
HIST1H4E,,,,,,,,,,,,,,
HIST1H4F,,,,,,,,,,,,,,
HIST1H4I,,,,,,,,,,,,,,
HIST1H4J,,,,,,,,,,,,,,
HIST1H4K,,,,,,,,,,,,,,
HIST1H4L,,,,,,,,,,,,,,


In [7]:
'RFWD2' %in% mut_table$geneName
interactome_mutability <- interactome_mutability[complete.cases(interactome_mutability), ]

In [8]:
interactome_mutability$mutscore_to_compare <- lapply(interactome_mutability$all,round,5)
interactome_mutability$lofscore_to_compare <- lapply(interactome_mutability$oe_lof,round,2)

tab$mutscore_to_compare <- lapply(tab$all, round, 5)
tab$lofscore_to_compare <- lapply(tab$oe_lof, round, 2)

head(tab)
head(interactome_mutability)

geneName,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,...,gko.1,gko.2,gko.3,gko.4,gko.5,gko.6,oe_lof,lof_z,mutscore_to_compare,lofscore_to_compare
A1BG,5,A1BG,ENST00000263100,ENSG00000121410,8.99797e-06,1.738961e-05,5.763794e-07,2.639868e-07,6.532817e-07,...,0.2969615,0.6811068,0.3495099,0.16981114,1.058519,-0.3640605,0.78457,0.87287,3e-05,0.78
A1CF,24086,A1CF,ENST00000373995,ENSG00000148584,6.365968e-06,1.535849e-05,1.437698e-06,4.480121e-07,8.024687e-07,...,-3.2690262,-2.492297,-1.5265737,-3.64960321,-1.893791,-2.0583383,0.60537,2.1019,2e-05,0.61
A2M,7,A2M,ENST00000318602,ENSG00000175899,1.543159e-05,3.545894e-05,1.960148e-06,1.477263e-06,4.616751e-07,...,6.4698272,5.4565275,4.9155455,6.48351956,5.418235,4.9432502,0.40526,4.8973,5e-05,0.41
A2ML1,23336,A2ML1,ENST00000299698,ENSG00000166535,1.778759e-05,3.993763e-05,1.996995e-06,1.668077e-06,4.371224e-07,...,-1.0628825,-0.9624933,-0.4612849,-0.02100296,-1.326811,-0.5140019,0.77171,1.9114,6e-05,0.77
AAAS,13666,AAAS,ENST00000209873,ENSG00000094914,6.285383e-06,1.61609e-05,1.249937e-06,6.651967e-07,1.192104e-06,...,3.9822487,3.8260502,3.9867508,3.89936297,3.971359,4.0443903,0.70734,1.6123,3e-05,0.71
AACS,21298,AACS,ENST00000316519,ENSG00000081760,1.119456e-05,2.157404e-05,7.599163e-07,7.48651e-07,3.136301e-06,...,2.6954841,2.8607689,2.6908755,2.40757799,2.754155,3.0250167,0.79801,1.1284,4e-05,0.8


Unnamed: 0,V1,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,lof,prot,all,oe_lof,lof_z,mutscore_to_compare,lofscore_to_compare
14,ACACA,84,ACACA,ENST00000353139,ENSG00000132142,2.471055e-05,6.977824e-05,4.436066e-06,2.696757e-06,2.763303e-06,9.896126e-06,7.967437e-05,0.0001043849,0.14874,9.1472,0.0001,0.15
15,ACTL6A,24124,ACTL6A,ENST00000429709,ENSG00000136518,4.453527e-06,1.14806e-05,4.601103e-07,6.858729e-07,3.482313e-06,4.628296e-06,1.61089e-05,2.056242e-05,0.078124,4.3223,2e-05,0.08
16,ADNP,15766,ADNP,ENST00000396029,ENSG00000101126,1.142904e-05,2.835871e-05,1.91803e-06,8.454175e-08,2.060842e-07,2.208656e-06,3.056737e-05,4.19964e-05,0.025868,5.6124,4e-05,0.03
17,ADNP2,23803,ADNP2,ENST00000262198,ENSG00000101544,1.476806e-05,3.131705e-05,1.324894e-06,8.753835e-08,3.241618e-07,1.736594e-06,3.305364e-05,4.78217e-05,0.19856,4.0825,5e-05,0.2
18,AHCTF1,24618,AHCTF1,ENST00000326225,ENSG00000153207,2.145e-05,5.454373e-05,3.963252e-06,1.470944e-06,1.233824e-05,1.777244e-05,7.231617e-05,9.376617e-05,0.071387,9.1094,9e-05,0.07
19,AKAP8L,29857,AKAP8L,ENST00000397410,ENSG00000011243,9.805068e-06,2.186063e-05,1.070648e-06,5.554298e-07,3.012469e-07,1.927325e-06,2.378796e-05,3.359302e-05,0.27593,4.0392,3e-05,0.28


In [9]:
colname = "lofscore_to_compare"
gene_matches = c()

for (i in c(1:nrow(interactome_mutability))){
    gene = interactome_mutability[i,'V1']
    score = interactome_mutability[i,colname]
    df = tab[which(as.numeric(tab[,colname]) == as.numeric(score)),]
    if (nrow(df)==0){
        print("No matches!")
        print(gene)
        print(score)
    } else {
        print(paste(nrow(df), "matches"))
    }
        
}

[1] "185 matches"
[1] "168 matches"
[1] "103 matches"
[1] "176 matches"
[1] "195 matches"
[1] "140 matches"
[1] "927 matches"
[1] "70 matches"
[1] "204 matches"
[1] "192 matches"
[1] "927 matches"
[1] "176 matches"
[1] "168 matches"
[1] "176 matches"
[1] "137 matches"
[1] "152 matches"
[1] "185 matches"
[1] "169 matches"
[1] "927 matches"
[1] "159 matches"
[1] "185 matches"
[1] "183 matches"
[1] "927 matches"
[1] "142 matches"
[1] "204 matches"
[1] "204 matches"
[1] "176 matches"
[1] "127 matches"
[1] "39 matches"
[1] "927 matches"
[1] "138 matches"
[1] "927 matches"
[1] "178 matches"
[1] "156 matches"
[1] "181 matches"
[1] "190 matches"
[1] "176 matches"
[1] "151 matches"
[1] "103 matches"
[1] "70 matches"
[1] "142 matches"
[1] "142 matches"
[1] "176 matches"
[1] "146 matches"
[1] "185 matches"
[1] "21 matches"
[1] "181 matches"
[1] "103 matches"
[1] "125 matches"
[1] "184 matches"
[1] "176 matches"
[1] "162 matches"
[1] "169 matches"
[1] "126 matches"
[1] "140 matches"
[1] "192 match

In [10]:
#hist(gene_matches[which(gene_matches < 500)])

In [11]:
names(interactome_mutability)
names(rnaseq)
names(tab)


In [12]:
rnaseq$wt_avg = rowMeans(subset(rnaseq, select = c('WTC11_DR4.1','WTC11_DR4.2','WTC11_DR4.3','WTC11_DR4.4')), 
                         na.rm = TRUE)
rnaseq$rna_to_compare <- lapply(rnaseq$wt_avg,round,1)


tab$wt_avg = rowMeans(subset(tab, select = c('WTC11_DR4.1','WTC11_DR4.2','WTC11_DR4.3','WTC11_DR4.4')), 
                         na.rm = TRUE)
tab$rna_to_compare <- lapply(tab$wt_avg,round,1)

head(tab)
head(rnaseq)

geneName,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,...,gko.3,gko.4,gko.5,gko.6,oe_lof,lof_z,mutscore_to_compare,lofscore_to_compare,wt_avg,rna_to_compare
A1BG,5,A1BG,ENST00000263100,ENSG00000121410,8.99797e-06,1.738961e-05,5.763794e-07,2.639868e-07,6.532817e-07,...,0.3495099,0.16981114,1.058519,-0.3640605,0.78457,0.87287,3e-05,0.78,0.5246414,0.5
A1CF,24086,A1CF,ENST00000373995,ENSG00000148584,6.365968e-06,1.535849e-05,1.437698e-06,4.480121e-07,8.024687e-07,...,-1.5265737,-3.64960321,-1.893791,-2.0583383,0.60537,2.1019,2e-05,0.61,-0.5977267,-0.6
A2M,7,A2M,ENST00000318602,ENSG00000175899,1.543159e-05,3.545894e-05,1.960148e-06,1.477263e-06,4.616751e-07,...,4.9155455,6.48351956,5.418235,4.9432502,0.40526,4.8973,5e-05,0.41,5.7870653,5.8
A2ML1,23336,A2ML1,ENST00000299698,ENSG00000166535,1.778759e-05,3.993763e-05,1.996995e-06,1.668077e-06,4.371224e-07,...,-0.4612849,-0.02100296,-1.326811,-0.5140019,0.77171,1.9114,6e-05,0.77,-0.8990438,-0.9
AAAS,13666,AAAS,ENST00000209873,ENSG00000094914,6.285383e-06,1.61609e-05,1.249937e-06,6.651967e-07,1.192104e-06,...,3.9867508,3.89936297,3.971359,4.0443903,0.70734,1.6123,3e-05,0.71,4.1666222,4.2
AACS,21298,AACS,ENST00000316519,ENSG00000081760,1.119456e-05,2.157404e-05,7.599163e-07,7.48651e-07,3.136301e-06,...,2.6908755,2.40757799,2.754155,3.0250167,0.79801,1.1284,4e-05,0.8,2.7151819,2.7


EnsemblIDs,ensembl_gene_id,hgnc_symbol,WTC11_DR4.1,WTC11_DR4.2,WTC11_DR4.3,WTC11_DR4.4,WTC11_DR4.5,gko.1,gko.2,gko.3,gko.4,gko.5,gko.6,wt_avg,rna_to_compare
ENSG00000223972,ENSG00000223972,DDX11L1,-0.246438,-0.3818135,0.08900291,0.78382265,0.3917166,1.4230111,1.3126161,1.81041023,1.1633844,0.965871,1.4299702,0.06114351,0.1
ENSG00000227232,ENSG00000227232,WASH7P,3.2867177,3.4193737,3.62444559,3.68051742,3.320722,3.6519681,4.1997256,4.02832584,3.8583636,3.7940827,4.0173132,3.50276361,3.5
ENSG00000243485,ENSG00000243485,MIR1302-2HG,-1.7975716,-1.2148499,-2.06460937,-1.42242351,-1.740225,-0.691343,-0.8491803,-0.73714287,-1.792218,-1.8937911,-0.8706978,-1.62486358,-1.6
ENSG00000237613,ENSG00000237613,FAM138A,-2.6059938,-4.2630771,-2.37152808,-2.20106409,-4.437045,-0.1512308,-2.1866962,-0.84220942,-1.0064382,-1.8937911,-0.681354,-2.86041578,-2.9
ENSG00000238009,ENSG00000238009,,1.6299828,2.1330723,2.48512938,1.82257191,2.0087581,1.9460318,1.6954012,2.00470674,1.8613446,1.7562918,2.3425641,2.01768911,2.0
ENSG00000239945,ENSG00000239945,,-0.3002277,0.6862612,0.80800437,-0.08368036,0.267185,0.5488229,0.389406,0.03040981,-0.3649395,0.2338211,0.8881017,0.27758938,0.3


In [13]:
interactome_mutability = merge(interactome_mutability, rnaseq, by.x = "V1", by.y = "hgnc_symbol")

In [14]:
interactome_mutability$equivalent_rna = 'NA'
gene_matches = c()

for (i in c(1:nrow(interactome_mutability))){
    gene = interactome_mutability[i,'V1']
    score = interactome_mutability[i,'rna_to_compare']
    df = rnaseq[which(as.numeric(rnaseq$rna_to_compare) == as.numeric(score)),]
    if (nrow(df)==0){
        print("No matches!")
        print(gene)
        print(score)
        
    } else {
        gene_matches = c(gene_matches, nrow(df))
        #interactome_mutability[i,'equivalent_rna'] = sample(df$geneName,1)
    }
}

In [15]:
#hist(gene_matches[which(gene_matches < 50)])

In [16]:
names(interactome_mutability)
names(tab)

In [17]:
tab[is.na(tab$geneName),]

“number of rows of result is not a multiple of vector length (arg 2)”

geneName,hgncID,hgncSymbol,enstID,ensgID,syn,mis,non,splice,frameshift,...,gko.3,gko.4,gko.5,gko.6,oe_lof,lof_z,mutscore_to_compare,lofscore_to_compare,wt_avg,rna_to_compare


In [18]:
equivalent_matches = c()
conservative_matches = c()
genes_w_few_swaps = c()
few_swaps = c()
no_match = c()

for (i in c(1:nrow(interactome_mutability))){
    gene = interactome_mutability[i,'V1']
    mutscore = interactome_mutability[i,'mutscore_to_compare']
    rnascore = interactome_mutability[i,'rna_to_compare']
    
    df = tab[which(as.numeric(tab$mutscore_to_compare) == as.numeric(mutscore)),]
    df = df[which(as.numeric(df$rna_to_compare) == as.numeric(rnascore)),]
    df = df[which(!is.na(df$geneName)),]
    df = df[which(!df$geneName %in% interactome_genes),]
    
    cdf = tab[which(as.numeric(tab$mutscore_to_compare) >= as.numeric(mutscore)),]
    cdf = cdf[which(as.numeric(cdf$rna_to_compare) == as.numeric(rnascore)),]
    cdf = cdf[which(!is.na(cdf$geneName)),]
    cdf = cdf[which(!cdf$geneName %in% interactome_genes),]
    
    conservative_matches = c(conservative_matches,nrow(cdf))
    equivalent_matches = c(equivalent_matches, nrow(df))
    if (nrow(df) < 10){
        genes_w_few_swaps = c(genes_w_few_swaps,gene)
        few_swaps = c(few_swaps, df$hgncSymbol)
    }

}

In [19]:
length(genes_w_few_swaps)

## Complete table with inherited LoF counts

In [20]:
# Load in lists of variants
dnv_cases = read.csv('../data/variants/DNV_cases.csv', stringsAsFactors=F)
lof_cases = read.csv('../data/variants/LoF_cases.csv', stringsAsFactors=F)

In [21]:
# Combine relevant columns of these variant types
names(dnv_cases)
names(lof_cases)

to_keep = c('Gene','Variant.Class','cohort')

lof_cases$Variant.Class = "inherited_lof"

dnv_cases$cohort = "case"
lof_cases$cohort = "case"


In [22]:
var_df = rbind(dnv_cases[,to_keep], lof_cases[,to_keep])
head(var_df)

Gene,Variant.Class,cohort
MARCH2,mis,case
MARCH5,mis,case
SEPT2,mis,case
SEPT3,mis,case
SEPT4,mis,case
SEPT8,misD,case


In [23]:
known = read.table('../data/databases/known_CHD_genes.txt', stringsAsFactors=F)
known_genes = known$V1

In [24]:
var_df$interactome = ifelse(var_df$Gene %in% interactome_genes, 1, 0)
var_df$known = ifelse(var_df$Gene %in% known_genes, 1, 0)

In [25]:
var_df <- var_df[which(var_df$cohort=="case"),]

In [28]:
names(interactome_mutability)

In [48]:
interactome_mutability[which(interactome_mutability$V1=='DNMT3A'),
                       c('V1','rna_to_compare','mutscore_to_compare','lofscore_to_compare')]

i=45
mutscore = interactome_mutability[i,'mutscore_to_compare']
rnascore = interactome_mutability[i,'rna_to_compare']
lofscore = interactome_mutability[i,'lofscore_to_compare']

inh_df = tab[which(as.numeric(tab$lofscore_to_compare) == as.numeric(lofscore)),]
inh_df = inh_df[which(as.numeric(inh_df$rna_to_compare) == as.numeric(rnascore)),]
inh_df = inh_df[which(!is.na(inh_df$geneName)),]
inh_df = inh_df[which(!inh_df$geneName %in% interactome_genes),]
        
if (nrow(inh_df) < 100){
    lof_min = as.numeric(lofscore) - 0.05
    lof_max = as.numeric(lofscore) + 0.05
    rna_min = as.numeric(rnascore) - .5
    rna_max = as.numeric(rnascore) + .5
    
    print(lof_min)
    print(lof_max)

    cdf = tab[which(as.numeric(tab$lofscore_to_compare) >= lof_min & 
                    as.numeric(tab$lofscore_to_compare) <= lof_max &
                    as.numeric(tab$rna_to_compare) >= rna_min &
                    as.numeric(tab$rna_to_compare) <= rna_max),]

    cdf1 = cdf[which(!is.na(cdf$geneName)),]
    inh_df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
    print(nrow(inh_df))
}

if (nrow(inh_df) < 10){
    cdf = tab[which(as.numeric(tab$lofscore_to_compare) >= lof_min - 0.05 & 
                    as.numeric(tab$lofscore_to_compare) <= lof_max + 0.05 &
                    as.numeric(tab$rna_to_compare) >= rna_min - .5 &
                    as.numeric(tab$rna_to_compare) <= rna_max + .5),]
    cdf1 = cdf[which(!is.na(cdf$geneName)),]
    inh_df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
    print(nrow(inh_df))
}

Unnamed: 0,V1,rna_to_compare,mutscore_to_compare,lofscore_to_compare
45,DNMT3A,5.8,5e-05,1.25


[1] 1.2
[1] 1.3
[1] 4
[1] 13


In [52]:
n_bootstraps = 10
bootstrap_df = data.frame(matrix(ncol = 3, nrow = n_bootstraps))
names(bootstrap_df) = c("synonymous_dnv","nonsynonymous_dnv","inherited_lof")

n_syn_all = c()
n_dnv_all = c()
n_inh_all = c()
n_dnv_unknown = c()
n_inh_unknown= c()
n_syn_unknown = c()

for (k in c(1:n_bootstraps)){
    
    print(k)
    random_genes = c()
    inh_random_genes = c()
    
    for (i in c(1:nrow(interactome_mutability))){
        gene = interactome_mutability[i,'V1']
        mutscore = interactome_mutability[i,'mutscore_to_compare']
        rnascore = interactome_mutability[i,'rna_to_compare']
        lofscore = interactome_mutability[i,'lofscore_to_compare']

        df = tab[which(as.numeric(tab$mutscore_to_compare) == as.numeric(mutscore)),]
        df = df[which(as.numeric(df$rna_to_compare) == as.numeric(rnascore)),]
        df = df[which(!is.na(df$geneName)),]
        df = df[which(!df$geneName %in% interactome_genes),]
        
        inh_df = tab[which(as.numeric(tab$lofscore_to_compare) == as.numeric(lofscore)),]
        inh_df = inh_df[which(as.numeric(inh_df$rna_to_compare) == as.numeric(rnascore)),]
        inh_df = inh_df[which(!is.na(inh_df$geneName)),]
        inh_df = inh_df[which(!inh_df$geneName %in% interactome_genes),]
        
        if (nrow(df) < 100){
            mut_min = as.numeric(mutscore) - 0.00001
            mut_max = as.numeric(mutscore) + 0.00001
            rna_min = as.numeric(rnascore) - .5
            rna_max = as.numeric(rnascore) + .5
            
            cdf = tab[which(as.numeric(tab$mutscore_to_compare) >= mut_min & 
                            as.numeric(tab$mutscore_to_compare) <= mut_max &
                            as.numeric(tab$rna_to_compare) >= rna_min &
                            as.numeric(tab$rna_to_compare) <= rna_max),]
            
            cdf1 = cdf[which(!is.na(cdf$geneName)),]
            df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
        }
            
        if (nrow(df) < 10){
            cdf = tab[which(as.numeric(tab$mutscore_to_compare) >= mut_min - 0.00001 & 
                            as.numeric(tab$mutscore_to_compare) <= mut_max + 0.00001 &
                            as.numeric(tab$rna_to_compare) >= rna_min - .5 &
                            as.numeric(tab$rna_to_compare) <= rna_max + .5),]
            cdf1 = cdf[which(!is.na(cdf$geneName)),]
            df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
        }
        
        if (nrow(inh_df) < 100){
            lof_min = as.numeric(lofscore) - 0.1
            lof_max = as.numeric(lofscore) + 0.1
            rna_min = as.numeric(rnascore) - .5
            rna_max = as.numeric(rnascore) + .5
            
            cdf = tab[which(as.numeric(tab$lofscore_to_compare) >= lof_min & 
                            as.numeric(tab$lofscore_to_compare) <= lof_max &
                            as.numeric(tab$rna_to_compare) >= rna_min &
                            as.numeric(tab$rna_to_compare) <= rna_max),]
            
            cdf1 = cdf[which(!is.na(cdf$geneName)),]
            inh_df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
        }

        if (nrow(inh_df) < 10){
            cdf = tab[which(as.numeric(tab$lofscore_to_compare) >= lof_min - 0.1 & 
                            as.numeric(tab$lofscore_to_compare) <= lof_max + 0.1 &
                            as.numeric(tab$rna_to_compare) >= rna_min - .5 &
                            as.numeric(tab$rna_to_compare) <= rna_max + .5),]
            cdf1 = cdf[which(!is.na(cdf$geneName)),]
            inh_df = cdf1[which(!cdf1$geneName %in% interactome_genes),]
            print(paste(gene, nrow(inh_df)))
            print(inh_df)
        }
        
        random_genes = c(random_genes, sample(df$geneName,1))
        inh_random_genes = c(inh_random_genes, sample(inh_df$geneName,1))

    }
    
    var_df$random_selection = ifelse(var_df$Gene %in% random_genes, 1, 0)
    var_df$inh_selection = ifelse(var_df$Gene %in% inh_random_genes, 1, 0)
    
    syn_df = var_df[which(var_df$Variant.Class == "syn" & var_df$random_selection==1),]
    inh_df = var_df[which(var_df$Variant.Class == 'inherited_lof' & var_df$inh_selection==1),]
    dnv_df = var_df[which(!var_df$Variant.Class %in% c('syn','inherited_lof' ) & var_df$random_selection==1),]
    
    n_syn_all = c(n_syn_all, nrow(syn_df))
    n_inh_all = c(n_inh_all, nrow(inh_df))
    n_dnv_all = c(n_dnv_all, nrow(dnv_df))
    
    bootstrap_df[k,"synonymous_dnv"] = nrow(syn_df)
    bootstrap_df[k,"nonsynonymous_dnv"] = nrow(dnv_df)
    bootstrap_df[k,"inherited_lof"] = nrow(inh_df)
    
}

[1] 1
[1] "DNMT3A 26"
       geneName hgncID hgncSymbol          enstID          ensgID          syn
154        ACP1    122       ACP1 ENST00000272067 ENSG00000143727 1.919322e-06
1898      CBWD3  18519      CBWD3 ENST00000360171 ENSG00000196873 8.221670e-07
2861      CPNE3   2316      CPNE3 ENST00000521271 ENSG00000085719 5.377842e-06
3104     CYB5D1  26516     CYB5D1 ENST00000332439 ENSG00000182224 4.545539e-06
3529    DNAJC15  20325    DNAJC15 ENST00000379221 ENSG00000120675 1.809795e-06
3977       EMP2   3334       EMP2 ENST00000359543 ENSG00000213853 2.938981e-06
3994       ENO3   3354       ENO3 ENST00000323997 ENSG00000108515 6.787244e-06
4527      FDFT1   3629      FDFT1 ENST00000220584 ENSG00000079459 4.836612e-06
5177    GOLGA8B  31973    GOLGA8B ENST00000267731 ENSG00000215252 1.986501e-06
5643  HIST1H2BC   4757  HIST1H2BC ENST00000314332 ENSG00000180596 2.879371e-06
5723  HNRNPA1L2  27067  HNRNPA1L2 ENST00000342657 ENSG00000139675 3.443807e-06
5946     IFITM2   5413     IFI

In [54]:
'HIST1H2BD' %in% var_df$Gene
'HIST1H1A' %in% var_df$Gene

# This swap should therefore not skew results

In [None]:
# bootstrap_df$category = "bootstrap"
true_row = c(nrow(var_df[which(var_df$Variant.Class == "syn" & var_df$interactome==1),]),
             nrow(var_df[which(!var_df$Variant.Class %in% c("syn","inherited_lof") & var_df$interactome==1),]),
             nrow(var_df[which(var_df$Variant.Class == "inherited_lof" & var_df$interactome==1),]), 'true')
names(true_row) <- c('synonymous_dnv','nonsynonymous_dnv','inherited_lof','category')

bootstrap_df <- rbind(bootstrap_df, true_row)
bootstrap_df

In [None]:
write.csv(bootstrap_df,"../intermediate/testnoninteractome_comparison.csv", quote=F, row.names=F)

In [None]:
bootstrap_df = read.csv("../intermediate/noninteractome_comparison.csv", stringsAsFactors=F)
bootstrap_df

In [None]:
for (test in c('inherited_lof','synonymous_dnv','nonsynonymous_dnv')){
    perm_list = bootstrap_df[1:1000,test]
    true_n = bootstrap_df[which(bootstrap_df$category=="true"), test]
    print(paste("Mean: ", mean(perm_list)))
    print(paste("SD:", sd(perm_list)))
    print(paste("True:", true_n))
    
    lower_x_bd = min(perm_list)-10
    upper_x_bd = max(perm_list)+20
    
    if (test=="inherited_lof"){
        upper_y_bd = 300
    } else{
        upper_y_bd = 250
    }
    
    pdf(paste0("../manuscript/figures/noninteractome_selection/",test,".pdf"))
    hist(perm_list, main=paste0(test, " mutations in GT-PPIN"), 
         xlab="Number of mutations in CHD probands",
         xlim=c(lower_x_bd,upper_x_bd), ylim = c(0,upper_y_bd))
    
    abline(v = true_n, lty="dotted", lwd=7, col="red")
    dev.off()
}

In [None]:
perm_list = bootstrap_df$inherited_lof
true_n = bootstrap_df[which(bootstrap_df$category=="true"), 'inherited_lof']
mut_type = "inherited_lof"

nGreater = length(perm_list[which(perm_list >= true_n)])
pval = (nGreater/(length(perm_list)+1))

par(lwd=2, cex=2)
pdf("../manuscript/figures/noninteractome_selection/extended-inherited_lof.pdf")
h = hist(perm_list)
plot(h, main=paste0(mut_type, " mutations in GT-PPIN"), 
     sub = paste0("p: ", pval),
     xlab = "Odds Ratio", cex=2, lwd=2)
abline(v = true_n, lty="dotted", lwd=7, col="red")
dev.off()

In [None]:
out_path = "../manuscript/figures/noninteractome_selection"

perm_viz <- function(perm_list, true_n, mut_type){
    nGreater = length(perm_list[which(perm_list >= true_n)])
    pval = (nGreater/(length(perm_list)+1))
    if(nGreater == 0 ){
        pval = "< 0.001"
    }
    print(paste(mut_type, "number of mutations:", true_n))
    print(paste("pval:", pval))
    
    par(lwd=2, cex=2)
    h = hist(perm_list)
    pdf(paste0(out_path, "/", mut_type, ".pdf"))
    plot(h, main=paste0(mut_type, " mutations in GT-PPIN"), 
         sub = paste0("p: ", pval),
         xlab = "Odds Ratio", cex=2, lwd=2)
    abline(v = true_n, lty="dotted", lwd=7, col="red")
    dev.off()
  
  #df = as.data.frame(perm_list)
  #img <- ggplot(data=df, aes(df$perm_list)) + geom_histogram() + 
  #  geom_vline(xintercept = true_OR, col = "red", linetype="dashed") +
  #  labs(title=paste0("Permutation of ",mut_type," mutations in ", int_type, " interactome"),
  #       subtitle = paste0("p = ", pval),
  #       x="Odds Ratio", y = "Frequency")

  #return(img)
}


In [None]:
perm_viz(n_syn_all, 
         nrow(var_df[which(var_df$Variant.Class == "syn" & var_df$interactome==1),]), 
         "all_synonymous_dnvs")

perm_viz(n_dnv_all, 
         nrow(var_df[which(!var_df$Variant.Class %in% c("syn","inherited_lof") & var_df$interactome==1),]), 
         "all_nonsynonymous_dnvs")

perm_viz(n_inh_all, 
         nrow(var_df[which(var_df$Variant.Class == "inherited_lof" & var_df$interactome==1),]), 
         "all_rare-inherited_lofs")


#### HEADS UP THERE'S A BUG HERE
# Unknown
perm_viz(n_syn_unknown, 
         nrow(var_df[which(var_df$Variant.Class == "syn" & var_df$interactome==1 & var_df$known==0),]), 
         "unknown_synonymous_dnvs")

perm_viz(n_dnv_unknown, 
         nrow(var_df[which(!var_df$Variant.Class %in% c("syn","inherited_lof") 
                           & var_df$interactome==1 & var_df$known==0),]), 
         "unknown_nonsynonymous_dnvs")

perm_viz(n_inh_unknown, 
         nrow(var_df[which(var_df$Variant.Class == "inherited_lof" 
                           & var_df$interactome==1 & var_df$known==0),]), 
         "unknown_rare-inherited_lofs")

In [None]:
var_df$Variant.Class <- gsub("misD", "mis", var_df$Variant.Class)
var_df$Variant.Class <- gsub("non", "lof", var_df$Variant.Class)
var_df$Variant.Class <- gsub("frameshift", "lof", var_df$Variant.Class)
var_df$Variant.Class <- gsub("startloss", "lof", var_df$Variant.Class)
var_df$Variant.Class <- gsub("stoploss", "lof", var_df$Variant.Class)

In [None]:
var_df$known = ifelse(var_df$Gene %in% known_genes, 1, 0)

In [None]:
head(var_df)

## Iterate through df, adding proper counts

In [None]:
var_types = c('syn','mis','lof','splice','inherited_lof')
cols = c("int_case","int_ctrl","nonint_case","nonint_ctrl")

create_df <- function(){
    df = data.frame(matrix(ncol = 4, nrow = 5))
    names(df) <- cols
    row.names(df) <- var_types
    return(df)
}

all_int_df <- create_df()
nonchd_int_df <- create_df()
all_rand_df <- create_df()
nonchd_rand_df <- create_df()

In [None]:
for (var_type in var_types){
    
    all_variants = var_df[which(var_df$Variant.Class==var_type),]
    
    # all_int_df
    ints = all_variants[which(all_variants$interactome == 1),]
    nonints = all_variants[which(all_variants$interactome == 0),]
    all_int_df[var_type,'int_case'] = nrow(ints[which(ints$cohort=="case"),])
    all_int_df[var_type,'int_ctrl'] = nrow(ints[which(ints$cohort=="ctrl"),])
    all_int_df[var_type,'nonint_case'] = nrow(nonints[which(nonints$cohort=="case"),])
    all_int_df[var_type,'nonint_ctrl'] = nrow(nonints[which(nonints$cohort=="ctrl"),])
    
    # nonchd_int_df
    ints = all_variants[which(all_variants$interactome == 1 & all_variants$known==0),]
    nonints = all_variants[which(all_variants$interactome == 0 & all_variants$known==0),]
    nonchd_int_df[var_type,'int_case'] = nrow(ints[which(ints$cohort=="case"),])
    nonchd_int_df[var_type,'int_ctrl'] = nrow(ints[which(ints$cohort=="ctrl"),])
    nonchd_int_df[var_type,'nonint_case'] = nrow(nonints[which(nonints$cohort=="case"),])
    nonchd_int_df[var_type,'nonint_ctrl'] = nrow(nonints[which(nonints$cohort=="ctrl"),])
    
    # all_rand_df
    ints = all_variants[which(all_variants$random_selection == 1),]
    nonints = all_variants[which(all_variants$random_selection == 0),]
    all_rand_df[var_type,'int_case'] = nrow(ints[which(ints$cohort=="case"),])
    all_rand_df[var_type,'int_ctrl'] = nrow(ints[which(ints$cohort=="ctrl"),])
    all_rand_df[var_type,'nonint_case'] = nrow(nonints[which(nonints$cohort=="case"),])
    all_rand_df[var_type,'nonint_ctrl'] = nrow(nonints[which(nonints$cohort=="ctrl"),])
    
    # nonchd_rand_df
    ints = all_variants[which(all_variants$random_selection == 1 & all_variants$known==0),]
    nonints = all_variants[which(all_variants$random_selection == 0 & all_variants$known==0),]
    nonchd_rand_df[var_type,'int_case'] = nrow(ints[which(ints$cohort=="case"),])
    nonchd_rand_df[var_type,'int_ctrl'] = nrow(ints[which(ints$cohort=="ctrl"),])
    nonchd_rand_df[var_type,'nonint_case'] = nrow(nonints[which(nonints$cohort=="case"),])
    nonchd_rand_df[var_type,'nonint_ctrl'] = nrow(nonints[which(nonints$cohort=="ctrl"),])
}

In [None]:
calc_or <- function(df, suffix){
    df$OR = (df$int_case * df$nonint_ctrl)/(df$int_ctrl * df$nonint_case)
    nam = paste0(row.names(df), suffix)
    row.names(df) <- nam
    return(df)
}

# Calculate OR for each dataframe
all_int_df = calc_or(all_int_df, "")
nonchd_int_df = calc_or(nonchd_int_df, " (minus CHD genes)")
true_df = rbind(all_int_df, nonchd_int_df)
true_df

all_rand_df = calc_or(all_rand_df, "")
nonchd_rand_df = calc_or(nonchd_rand_df, " (minus CHD genes)")
random_df = rbind(all_rand_df, nonchd_rand_df)
names(random_df) <- c('simulated_int_case', 'simulated_int_ctrl', 'simulated_nonint_case', 
                      'simulated_nonint_ctrl', 'OR')
random_df

In [None]:
write.csv(true_df, "../manuscript/tables/true_count_tables.csv", quote=F)
write.csv(random_df, "../manuscript/tables/representative_simulation_count_tables.csv", quote=F)

In [None]:
true = read.csv("../manuscript/tables/true_count_tables.csv", row.names = 1, stringsAsFactors=F)
random = read.csv("../manuscript/tables/representative_simulation_count_tables.csv", row.names = 1, stringsAsFactors=F)

In [None]:
true_syn_corr = 1 / true['syn','OR']
true_syn_nonchd_corr = 1 / true['syn (minus CHD genes)','OR']
true$correctedOR = 0

for (i in c(1:nrow(true))){
    if(i<6){
        true[i,'correctedOR'] = true[i,'OR']*true_syn_corr
    }else{
        true[i,'correctedOR'] = true[i,'OR']*true_syn_nonchd_corr
    }
}

true

In [None]:
true_syn_corr = 1 / true['syn','OR']
true_syn_nonchd_corr = 1 / true['syn (minus CHD genes)','OR']
true$correctedOR = 0

for (i in c(1:nrow(true))){
    if(i<6){
        true[i,'correctedOR'] = true[i,'OR']*true_syn_corr
    }else{
        true[i,'correctedOR'] = true[i,'OR']*true_syn_nonchd_corr
    }
}

true

## Create a similar table, but with the randomly-selected genes

In [None]:
for (fname in c("DNV_cases","DNV_ctrls", "LoF_cases","LoF_ctrls")){
    
    var_df = read.csv(paste0('../data/variants/',fname,'.csv'), stringsAsFactors=F)
    
    # For each variant type, calculate number in each group
    splice_df = var_df[which(var_df$Func.refGene=="splicing"),]
    lof_df = var_df[which(var_df$ExonicFunc.refGene %in% c(""))]
    
    # Split by interactome/non-interactome
    int_df = var_df[which(var_df$Gene %in% random_genes),]
    non_int_df = var_df[which(!var_df$Gene %in% random_genes),]
    
    
    # Remove known CHD genes and repeat
    
}

In [None]:
'DNV Splice' 
'DNV Loss of Function (LoF)' 
'DNV Missense' 
'Rare inherited LoF' 
'DNV Splice (minus CHD Genes)' 
'DNV LoF (minus CHD Genes)' 
'DNV Missense (minus CHD Genes)' 
'Rare inherited LoF (minus CHD Genes)'