In [33]:
library(rhdf5)
library(MASS)
library(broom)

# Test reading h5 and accessing data

In [2]:
# Relative path to the simulated dataset (HDF5 file) used throughout.
fh <- file.path("..", "simulated_data", "sim.h5")

Get internal structure of h5 file

In [3]:
# List every group and dataset stored in the file, with type and dimensions.
h5ls(file = fh)

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,coeffs,H5I_DATASET,COMPOUND,13000
1,/,counts,H5I_DATASET,INTEGER,13000 x 50000
2,/,guides,H5I_GROUP,,
3,/guides,metadata,H5I_DATASET,COMPOUND,2000
4,/guides,one_hot,H5I_DATASET,INTEGER,50000 x 2000
5,/,scaling_factors,H5I_DATASET,FLOAT,50000
6,/,x,H5I_GROUP,,
7,/x,cell_cycle_scores,H5I_DATASET,COMPOUND,50000
8,/x,x1,H5I_DATASET,INTEGER,13000 x 50000


Test reading first row from counts matrix

In [4]:
# Time a partial read: only row 1 of "counts", columns 1..50000.
system.time({
    row.counts <- h5read(file = fh, name = "counts",
                         index = list(1, seq_len(50000)))
})

   user  system elapsed 
  0.836   0.141   0.979 

In [5]:
# Show the row that was just read from "counts".
row.counts

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,15,4,2,1,0,0,44,30,20,⋯,6,8,16,1,24,0,0,0,0,4


Check if this matches

In [6]:
# Read the complete "counts" dataset into memory, then show its dimensions and
# the first ten entries of row 1 so they can be compared with the partial read.
counts.mtx <- h5read(file = fh, name = "counts")
dim(counts.mtx)
counts.mtx[1, seq_len(10)]

Try pulling a row from matrix of $X_1$

In [7]:
# Time a partial read: only row 1 of "x/x1", columns 1..50000.
system.time({
    row.vars <- h5read(file = fh, name = "x/x1",
                       index = list(1, seq_len(50000)))
})

   user  system elapsed 
  0.518   0.030   0.549 

In [8]:
# Total of the row read from "x/x1"; used as a quick checksum.
sum(row.vars)

Check if this is correct

In [9]:
# Read the complete "x/x1" dataset and sum the same slice (row 1, columns
# 1..50000) so the result can be compared with the partial read.
x1.mtx <- h5read(file = fh, name = "x/x1")
sum(x1.mtx[1, seq_len(50000)])

# Load fixed values

Coefficients (ground truth for evaluating model performance) 

Cell cycle scores (same for every gene)

In [39]:
# Values that are read once and reused for every gene.

# Ground-truth coefficient table.
coeffs <- h5read(file = fh, name = "coeffs")

# Guide metadata table.
guides.metadata <- h5read(file = fh, name = "guides/metadata")

# Covariates shared across genes: cell cycle scores and scaling factors.
cell.cycle.scores <- h5read(file = fh, name = "x/cell_cycle_scores")
scaling.factors <- h5read(file = fh, name = "scaling_factors")

In [40]:
# Preview the first rows of the guide metadata table.
head(guides.metadata)

Unnamed: 0_level_0,target.gene,efficiency,effect.size
Unnamed: 0_level_1,<int>,<dbl>,<dbl>
1,8890,0.5839696,-3.091298
2,8344,0.6177571,-2.482861
3,10894,0.7364427,-2.701824
4,5123,0.7752164,-2.813812
5,6571,0.7739046,-2.201368
6,3693,0.5629113,-1.948516


In [42]:
# Check whether the value 2 occurs among the guides' target genes.
is.element(2, guides.metadata$target.gene)

# Define null and alternative models

## Model
$$y \sim \text{NB}\left(\mu = s \cdot \exp\left(\beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3\right),\ n = \sigma\right)$$
### $H_0: \beta_1 = 0$
### $H_A: \beta_1 \neq 0$



Test for one gene

In [52]:
# Assemble the modelling data frame for one gene (row `test.gene` of the
# "counts" and "x/x1" datasets); the score and scaling columns are the shared
# values loaded earlier.
test.gene <- 1

obs.counts <- h5read(
    file = fh,
    name = "counts",
    index = list(test.gene, seq_len(50000))
)
gene.data <- data.frame(
    # as.integer() flattens the block returned by h5read into a plain vector.
    guide.eff = as.integer(
        h5read(file = fh, name = "x/x1", index = list(test.gene, seq_len(50000)))
    ),
    s.score = cell.cycle.scores$s.scores,
    g2m.score = cell.cycle.scores$g2m.scores,
    counts = as.integer(obs.counts),
    scaling.factor = scaling.factors
)

In [53]:
# Preview the assembled data frame for the test gene.
head(gene.data)

Unnamed: 0_level_0,guide.eff,s.score,g2m.score,counts,scaling.factor
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<int>,<dbl>
1,0,-0.6605419,-0.31670538,0,0.050167
2,0,1.3238916,-1.14640695,15,0.050184
3,0,-0.1031474,0.2984761,4,0.050145
4,0,-0.3260718,0.6081172,2,0.049669
5,0,0.538556,-0.86269276,1,0.050237
6,0,-0.3940143,-0.02239488,0,0.050092


In [54]:
# Fit the full negative-binomial GLM for the test gene and time the fit.
# NOTE(review): scaling.factor enters here as an ordinary linear covariate,
# whereas the model stated above multiplies the mean by s. If s is this scaling
# factor, offset(log(scaling.factor)) would match that form -- TODO confirm.
system.time(ml <- glm.nb(counts ~ guide.eff + s.score + g2m.score + scaling.factor, data = gene.data))

   user  system elapsed 
 17.488  15.248   2.113 

In [55]:
# Coefficient table, deviances and theta estimate of the full model.
summary(ml)


Call:
glm.nb(formula = counts ~ guide.eff + s.score + g2m.score + scaling.factor, 
    data = gene.data, init.theta = 1.488067264, link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-4.0994  -0.8992  -0.3800   0.3112   3.4680  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    -0.493377   1.082956  -0.456    0.649    
guide.eff      -2.633906   0.089586 -29.401   <2e-16 ***
s.score         3.592147   0.010014 358.723   <2e-16 ***
g2m.score       1.937629   0.008672 223.432   <2e-16 ***
scaling.factor 28.782100  21.658571   1.329    0.184    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1.4881) family taken to be 1)

    Null deviance: 360986  on 49999  degrees of freedom
Residual deviance:  47562  on 49995  degrees of freedom
AIC: 241949

Number of Fisher Scoring iterations: 1


              Theta:  1.4881 
          Std. Err.:  0.0141 

 2 x log-likeli

In [56]:
# Ground-truth coefficients stored for the test gene, for comparison with the
# estimates from the fitted model.
coeffs[test.gene,]

Unnamed: 0_level_0,baselines,beta1,beta2,beta3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,3.939874,-2.46796,3.600448,1.945159


In [57]:
# Refit the same model with guide.eff dropped from the formula (the null model)
# and time the fit.
system.time(ml.null <- update(ml, . ~ . - guide.eff))

   user  system elapsed 
 16.277  13.327   2.043 

In [58]:
# Print the fitted null model.
ml.null


Call:  glm.nb(formula = counts ~ s.score + g2m.score + scaling.factor, 
    data = gene.data, init.theta = 1.452914358, link = log)

Coefficients:
   (Intercept)         s.score       g2m.score  scaling.factor  
       -0.4006          3.5905          1.9370         26.8179  

Degrees of Freedom: 49999 Total (i.e. Null);  49996 Residual
Null Deviance:	    353900 
Residual Deviance: 47640 	AIC: 242700

# Test for all genes

In [None]:
# Fit the full and the null negative-binomial model for every gene.
#
# For each row of `coeffs` this reads that gene's row from "counts" and "x/x1",
# fits the full model, and refits it without guide.eff. Fits are stored in
# ml.list / ml.null.list at the gene's index; an entry stays NULL when the fit
# for that gene raised an error (a warning is emitted instead of aborting the
# whole run).
#
# NOTE(review): every stored fit keeps per-observation vectors, so holding two
# fits per gene for all genes may not fit in memory -- consider keeping only
# summaries (e.g. coefficients / log-likelihoods) if downstream code allows.
n.genes <- nrow(coeffs)

# Preallocate so the lists are not regrown on every iteration.
ml.list <- vector("list", n.genes)
ml.null.list <- vector("list", n.genes)

system.time(
for (gene in seq_len(n.genes)) {
    # Periodic progress report rather than one line per gene.
    if (gene %% 100 == 0) {
        message("Fitting gene ", gene, " of ", n.genes)
    }

    # A NULL index element selects every entry along that dimension, so the
    # number of cells is not hard-coded.
    obs.counts <- h5read(file = fh, name = "counts", index = list(gene, NULL))
    gene.data <- data.frame(guide.eff = as.integer(h5read(file = fh, name = "x/x1", index = list(gene, NULL))),
                       s.score = cell.cycle.scores$s.scores,
                       g2m.score = cell.cycle.scores$g2m.scores,
                       counts = as.integer(obs.counts),
                       scaling.factor = scaling.factors)

    # The model is mu = s * exp(linear predictor), i.e. log(mu) = log(s) + ...,
    # so the scaling factor enters as an offset on the log scale rather than as
    # a covariate with its own coefficient.
    fit.ok <- tryCatch({
        ml <- glm.nb(counts ~ guide.eff + s.score + g2m.score + offset(log(scaling.factor)),
                     data = gene.data)
        ml.null <- update(ml, . ~ . - guide.eff)
        TRUE
    }, error = function(e) {
        warning("glm.nb failed for gene ", gene, ": ", conditionMessage(e), call. = FALSE)
        FALSE
    })

    # Only store results of a successful fit; otherwise the preallocated NULL
    # marks the gene as failed.
    if (fit.ok) {
        ml.list[[gene]] <- ml
        ml.null.list[[gene]] <- ml.null
    }
}
)

