In [1]:
library(rhdf5)
library(MASS)

# Test reading h5 and accessing data

In [2]:
fh <- "../simulated_data/sim.h5"

Get internal structure of h5 file

In [3]:
h5ls(fh)

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,coeffs,H5I_DATASET,COMPOUND,13000
1,/,counts,H5I_DATASET,INTEGER,13000 x 50000
2,/,guides,H5I_GROUP,,
3,/guides,metadata,H5I_DATASET,COMPOUND,2000
4,/guides,one_hot,H5I_DATASET,INTEGER,50000 x 2000
5,/,scaling_factors,H5I_DATASET,FLOAT,50000
6,/,x,H5I_GROUP,,
7,/x,cell_cycle_scores,H5I_DATASET,COMPOUND,50000
8,/x,x1,H5I_DATASET,INTEGER,13000 x 50000


Test reading first row from counts matrix

In [4]:
system.time(row.counts <- h5read(file = fh, name = "counts", index = list(1, 1:50000)))

   user  system elapsed 
  0.837   0.241   1.083 

In [5]:
row.counts

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,15,4,2,1,0,0,44,30,20,⋯,6,8,16,1,24,0,0,0,0,4


Check if this matches

In [6]:
counts.mtx <- h5read(fh, "counts")
dim(counts.mtx)
counts.mtx[1,1:10]

Try pulling a row from matrix of $X_1$

In [7]:
system.time(row.vars <- h5read(file = fh, name = "x/x1", index = list(1, 1:50000)))

   user  system elapsed 
  0.490   0.015   0.508 

In [8]:
sum(row.vars)

Check if this is correct

In [9]:
x1.mtx <- h5read(fh, "x/x1")
sum(x1.mtx[1,1:50000])

# Load fixed values

Coefficients (ground truth for evaluating model performance) 

Cell cycle scores (same for every gene)

In [13]:
coeffs <- h5read(file = fh, name = "coeffs")
cell.cycle.scores <- h5read(fh, "x/cell_cycle_scores")

In [14]:
head(cell.cycle.scores)

Unnamed: 0_level_0,s.scores,g2m.scores
Unnamed: 0_level_1,<dbl>,<dbl>
1,-0.6605419,-0.31670538
2,1.3238916,-1.14640695
3,-0.1031474,0.2984761
4,-0.3260718,0.6081172
5,0.538556,-0.86269276
6,-0.3940143,-0.02239488


# Define null and alternative models

## Model
$$y = \text{NB}(\mu=s \cdot \exp \left( \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 \right), n=\sigma)$$
### $H_0: \beta_1 = 0 $
### $H_A: \beta_1 \neq 0 $



Test for one gene

In [26]:
test.gene <- 1

obs.counts <- h5read(file = fh, name = "counts", index = list(test.gene, 1:50000))
gene.data <- data.frame(guide.eff = as.integer(h5read(file = fh, name = "x/x1", index = list(test.gene, 1:50000))),
                       s.score = cell.cycle.scores$s.scores,
                       g2m.score = cell.cycle.scores$g2m.scores)

In [28]:
ml <- glm.nb(obs.counts ~ guide.eff + s.score + g2m.score, data = gene.data)

ERROR: Error in model.frame.default(formula = obs.counts ~ guide.eff + s.score + : variable lengths differ (found for 'guide.eff')


In [None]:
null <- function(par, counts, scale, s, g2m){
    # h0: beta1 == 0 (gene is not repressed by any gRNA)
    # par[1] = beta0 (baseline)
    # par[2] = beta2 (s score effect)
    # par[3] = beta3 (g2m score effect)
    # par[4] = ln(disp)
    # counts = counts for gene being modeled 
    # scale = scaling factors (per cell)
    # s = s scores
    # g2m = g2m scores
    
    disp <- exp(par[4])
    mu <- s*exp(par[1] + par[2] * s + par[3] * g2m)
    -sum(dnbinom(data, mean = mu, size = disp, log = TRUE))
}

alt <- function(par, counts, scale, gRNA.fx, s, g2m){
    # hA: beta1 != 0 (gene expression is repressed by gRNA)
    # par[1] = beta0 (baseline)
    # par[2] = beta2 (s score effect)
    # par[3] = beta3 (g2m score effect)
    # par[4] = beta1 (gRNA effect size on target gene)
    # par[5] = ln(disp)
    # counts = counts for gene being modeled 
    # scale = scaling factors (per cell)
    # gRNA.fx = x1, binary indicator for gRNA effectiveness
    # s = s scores
    # g2m = g2m scores
    
    disp <- exp(par[5])
    mu <- s*exp(par[1] + par[2] * s + par[3] * g2m + par[4] * gRNA.fx)
    -sum(dnbinom(data, mean = mu, size = disp, log = TRUE))
}