# Plot LMO2 Dotplot

LMO2 is an oncogene locus whose enhancer pair has a significant interaction term among the 330 surveyed enhancer-enhancer pairs in which each enhancer has a significant effect. This notebook contains the code needed to generate a dotplot showing how the interaction term differs from the other enhancer coefficients.

Author: Karthik Guruvayurappan

In [2]:
# read in necessary packages for h5 files and GLMs
library(rhdf5)
library(MASS)

In [4]:
# load the covariates table and the cell barcodes from the processed
# Gasperini HDF5 file, then join the covariates onto the barcodes
print('reading in covariates!')
covariates <- h5read(
    '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    'covariates'
)
cell.barcodes <- h5read(
    '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    'cell.barcodes'
)

# join key: the barcode vector on the left, the 'cell' column of the
# covariates table on the right; the result is not sorted by key
# NOTE(review): merge() documents the row order as unspecified when
# sort = FALSE -- confirm alignment with the barcodes if later steps rely on it
covariates <- merge(
    x = data.frame(cell.barcodes),
    y = covariates,
    by.x = 'cell.barcodes',
    by.y = 'cell',
    sort = FALSE
)

# preview the first rows
head(covariates)

[1] "reading in covariates!"


Unnamed: 0_level_0,cell.barcodes,guide_count,prep_batch,percent.mito,s.score,g2m.score
Unnamed: 0_level_1,<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>
1,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,67,prep_batch_1,0.058786706,0.110732311,-0.1319208
2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,26,prep_batch_1,0.036086518,-0.010290919,-0.1535426
3,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,61,prep_batch_1,0.069823051,-0.17586013,-0.3084879
4,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,39,prep_batch_1,0.026186508,0.003057281,-0.1574859
5,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,37,prep_batch_1,0.007991318,-0.144480961,-0.2362154
6,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,57,prep_batch_1,0.022356681,0.026418076,-0.1462899


In [5]:
# read in the tab-separated table mapping target sites to spacer sequences
# and reformat the enhancer names
print('reading in enhancer-to-spacer table!')
enhancer.to.spacer.table <- read.table(
    '/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_grna_groups.at_scale.txt',
    sep = '\t'
)
colnames(enhancer.to.spacer.table) <- c('target.site', 'spacer.sequence')

# target sites that start with 'chr' keep only the text before the first
# underscore; every other target site (e.g. 'SH3BGRL3_TSS') is left unchanged.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# character vector with one element per row, even when the table is empty
# (sapply() would silently return a list in that case).
enhancer.to.spacer.table$target.site <- vapply(
    enhancer.to.spacer.table$target.site,
    FUN = function(x) {
        if (startsWith(x, 'chr')) {
            strsplit(x, '_')[[1]][1]
        } else {
            x
        }
    },
    FUN.VALUE = character(1)
)

# preview the first rows
head(enhancer.to.spacer.table)

[1] "reading in enhancer-to-spacer table!"


Unnamed: 0_level_0,target.site,spacer.sequence
Unnamed: 0_level_1,<chr>,<chr>
1,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
3,FAM83A_TSS,AACACACCACGGAGGAGTGG
4,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
5,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG
6,TIPRL_TSS,AACGGCTCGGAAGCCTAGGG


In [6]:
# load the 'guidescan.output' table (guide efficiency information) from the
# processed Gasperini HDF5 file
print('reading in guide efficiencies!')
guide.efficiencies.table <- h5read(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    name = 'guidescan.output'
)

# spacer = gRNA string with its last three characters removed
# (in the previewed rows those three characters are the trailing 'NGG')
guide.efficiencies.table$spacer <- with(
    guide.efficiencies.table,
    substring(gRNA, first = 1, last = nchar(gRNA) - 3)
)

# preview the first rows
head(guide.efficiencies.table)

[1] "reading in guide efficiencies!"


Unnamed: 0_level_0,Index,gRNA,Chromosome,Start,End,Strand,Num.Off.Targets,Off.Target.Summary,Specificity,Cutting.Efficiency,spacer
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>
1,0,CTAAAGCATTGGCTGAGAAGNGG,chr8,23911081,23911103,-,41,2:2 | 3:39,0.136228,0.56501,CTAAAGCATTGGCTGAGAAG
2,1,GTAGTTCACATAATCCCTGTNGG,chr4,25698193,25698215,-,55,2:0 | 3:55,0.165929,0.572492,GTAGTTCACATAATCCCTGT
3,2,AAGTTGACTCTACATAGCAGNGG,chr8,23912565,23912587,+,22,2:1 | 3:21,0.341067,0.636691,AAGTTGACTCTACATAGCAG
4,3,AATATTCTCCCTCATTCTGGNGG,chr5,12539360,12539382,-,803,2:26 | 3:777,0.00274364,0.6198,AATATTCTCCCTCATTCTGG
5,4,AATCCTCTAATGGACGAAGANGG,chr8,23913057,23913079,-,24,2:0 | 3:24,0.334415,0.602272,AATCCTCTAATGGACGAAGA
6,5,AGATACCTATGGCCATATAGNGG,chr5,12540099,12540121,+,14,2:0 | 3:14,0.351723,0.531946,AGATACCTATGGCCATATAG


In [7]:
# load the cell-guide matrix and label its columns with the guide spacers
print('reading in cell-guide matrix!')
cell.guide.matrix <- h5read(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    name = 'cell.guide.matrix'
)
guide.spacers <- h5read(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    name = 'guide.spacers'
)

# one column per guide spacer sequence
colnames(cell.guide.matrix) <- guide.spacers

# preview the first rows
head(cell.guide.matrix)

[1] "reading in cell-guide matrix!"


AGAAAGCTCCTCCAGTTCAC,TGATCGCTTTGACTGTGACA,ACAATAAAGAACAGAACACA,GTAAATTGAGACCTCAGGAG,TCTTCCCCCCACCAATAACA,GAGAAAAAAACAATTCAGGC,TCTTAGAGTTCACAGAAGAA,GCTGGGAATTTCTCTCCTGG,AGTGTAACAGAATATCAAAT,ACCCACTGTGACTAGACAAA,⋯,TACCGGAGGAGAAAGATGGG,TATTTCCTCCCCAAGATGTA,CCTTCCCACAGCACACCGCG,TCTTTCCCCAGACTTCTGCA,CTTTCCACTCCCACATAACA,TCCTCCCCACGGCCACCAGA,TCCTTCCCCTCAGTACACCA,ACATGCTGTTTCCAGAGCAG,CTTCTTCCACAAGAAGACAA,CTTTCCACAGACAAGGGGTA
1,1,1,1,1,1,1,1,1,1,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [8]:
# load the gene counts matrix and label its rows with the gene names
print('reading in counts matrix!')
counts.matrix <- h5read(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    name = 'gene.counts'
)
gene.names <- h5read(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/gasperini_data.h5',
    name = 'gene.names'
)

# one row per gene
rownames(counts.matrix) <- gene.names

# preview the first rows
head(counts.matrix)

[1] "reading in counts matrix!"


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,1,0,0,0,1,0,0
ENSG00000228463,1,0,1,1,1,0,0,0,0,1,⋯,0,0,0,0,1,0,0,2,0,2
ENSG00000237094,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000235373,0,0,1,0,0,0,0,1,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSG00000228327,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [9]:
# derive one scaling factor per column of the counts matrix:
# the column's total count divided by one million
# (columns are presumably cells, since the rows are labelled with gene names)
print('computing scaling factors!')
scaling.factors <- colSums(x = counts.matrix) / 1e6

# preview the first values
head(scaling.factors)

[1] "computing scaling factors!"


In [10]:
# load the enhancer-enhancer pairs table (supplementary table 2 CSV)
enhancer.enhancer.pairs <- read.csv(
    file = '/iblm/netapp/data1/external/Gasperini2019/processed/enhancer_pairs_suppl_table_2.csv'
)

In [None]:
# select the LMO2 enhancer pair:
# the target gene identifier ...
gene <- 'ENSG00000135363'

# ... and the names of the two enhancers in the pair
enhancer.2 <- 'chr11.1734'
enhancer.1 <- 'chr11.1735'