From 56154b1f7c81060f583920f7ee2f550641d9e0e3 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Fri, 5 May 2023 14:46:53 +0200
Subject: [PATCH 1/3] fix #640

---
 docs/usage/output.rst | 107 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 100 insertions(+), 7 deletions(-)

diff --git a/docs/usage/output.rst b/docs/usage/output.rst
index 3cf62b21..8bcd2f8f 100644
--- a/docs/usage/output.rst
+++ b/docs/usage/output.rst
@@ -183,12 +183,28 @@ This rule produces the following output file for the whole dataset.
 
 
 
-Since version 2.15 the output of the counts are stored in a hdf file.
+Since version 2.15 the output of the quantification are stored in 2 hdf-files`in the folder ``Genecatalog/counts/``:	
+  - ``median_coverage.h5``
+  - ``Nmapped_reads.h5.fna``
+
+Together with the statistics per gene and per sample.
+  - ``gene_coverage_stats.parquet``
+  - ``sample_coverage_stats.tsv``
+
+
+
+The hdf only contains a matrix of abundances or counts under the name ``data``. The sample names are stored as attributes.
+The gene names (e.g. ``Gene00001``) are simply the row number.
+
+
+
+
+
 
 You can open the hdf file in R or python as following:
 
   
-..code: python
+.. code-block:: python
 
   import h5py
 
@@ -200,7 +216,7 @@ You can open the hdf file in R or python as following:
       sample_names = hdf_file['data'].attrs['sample_names'].astype(str)
 
 
-..code: R
+.. code-block:: R
 
   library(rhdf5)
 
@@ -214,16 +230,93 @@ You can open the hdf file in R or python as following:
   colnames(data) <- attributes$sample_names
 
 
-You don't need to load the full data. You could only select genes with annotations.
-You can use the file ``Genecatalog/counts/gene_coverage_stats.tsv`` To normalize the counts.
+You don't need to load the full data.
+You could only select a subset of genes, e.g. the genes with annotations, or genes that are not singletons.
+To find out which gene is a singleton or not you can use the file ``gene_coverage_stats.parquet``
+
+
+.. code-block:: R
+
+  library(rhdf5)
+  library(dplyr)
+  library(tibble)
+
+  # read only subset of data
+  indexes_of_genes_to_load = c(2,5,100,150) # e.g. genes with annotations
+  abundance_file <- file.path(atlas_dir,"Genecatalog/counts/median_coverage.h5")
+ 
+
+  # get dimension of data
+
+  h5overview=h5ls(abundance_file)
+  dim= h5overview[1,"dim"] %>% stringr::str_split(" x ",simplify=T) %>% as.numeric
+  cat("Load ",length(indexes_of_genes_to_load), " out of ", dim[1] , " genes\n")
+
+
+  data <- h5read(file = abundance_file, name = "data", 
+                  index = list(indexes_of_genes_to_load, NULL))
+
+  # add sample names
+  attributes= h5readAttributes(abundance_file, "data")
+  colnames(data) <- attributes$sample_names
+
 
+  # add gene names (e.g. Gene00001) as rownames
+  gene_names = paste0("Gene", formatC(format="d",indexes_of_genes_to_load,flag="0",width=ceiling(log10(max(dim[1])))))
+  rownames(data) <- gene_names
+
+
+  data[1:5,1:5]
+
+If you do this you can use the information in the file ``Genecatalog/counts/sample_coverage_stats.tsv`` to normalize the counts.
+
+Here is the R code to calculate the gene copies per million (analogous to transcript per million) for the subset of genes.
+
+.. code-block:: R
+
+  # Load gene stats per sample
+  gene_stats_file = file.path(atlas_dir,"Genecatalog/counts/sample_coverage_stats.tsv") 
+
+  gene_stats <- read.table(gene_stats_file,sep='\t',header=T,row.names=1)
+
+  gene_stats <- t(gene_stats) # might be transposed, sample names should be index
+
+  head(gene_stats)
+
+  # calculate copies per million
+  total_covarage <- gene_stats[colnames(data)  ,"Sum_coverage"] 
+
+  # gives wrong results
+  #gene_gcpm<- data / total_covarage *1e6 
+
+  gene_gcpm<- data %*% diag(1/total_covarage) *1e6 
+  colnames(gene_gcpm) <- colnames(data)
+
+  gene_gcpm[1:5,1:5]
 
 .. seealso:: See in Atlas Tutorial
 
 
-Before version 2.15 the output of the counts were stored in a parquet file. The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```.
+Before version 2.15 the output of the counts were stored in a parquet file.
+The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```.
+However you need to load the full data into memory.
+
+.. code-block:: R
+
+  parquet_file <- file.path(atlas_dir,"Genecatalog/counts/median_coverage.parquet")
+  gene_abundances<- arrow::read_parquet(parquet_file)
+
+  # transform tibble to a matrix
+  gene_matrix= as.matrix(gene_abundances[,-1])
+  rownames(gene_matrix) <- gene_abundances$GeneNr 
+
+
+  #calculate copies per million
+  gene_gcpm= gene_matrix/ colSums(gene_matrix) *1e6
+
+
+  gene_gcpm[1:5,1:5]
 
-      
 
 
 

From 72f9e64d9c091db70a9ef9d04c44a02f063ee1a3 Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Fri, 5 May 2023 14:57:38 +0200
Subject: [PATCH 2/3] add ci for docs

---
 .github/workflows/documentation.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/workflows/documentation.yml

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
new file mode 100644
index 00000000..7cb565bd
--- /dev/null
+++ b/.github/workflows/documentation.yml
@@ -0,0 +1,10 @@
+name: ReadTheDocs Preview
+on: [pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: ReadTheDocs Preview
+      uses: readthedocs/actions/preview@v1
\ No newline at end of file

From 9e11c54af992d31cc79806be5ed204424e66403f Mon Sep 17 00:00:00 2001
From: Silas Kieser <silas.kieser@gmail.com>
Date: Fri, 5 May 2023 15:07:26 +0200
Subject: [PATCH 3/3] drop docs ci

---
 .github/workflows/documentation.yml | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 .github/workflows/documentation.yml

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
deleted file mode 100644
index 7cb565bd..00000000
--- a/.github/workflows/documentation.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: ReadTheDocs Preview
-on: [pull_request]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: ReadTheDocs Preview
-      uses: readthedocs/actions/preview@v1
\ No newline at end of file