From 56154b1f7c81060f583920f7ee2f550641d9e0e3 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Fri, 5 May 2023 14:46:53 +0200 Subject: [PATCH 1/3] fix #640 --- docs/usage/output.rst | 107 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 7 deletions(-) diff --git a/docs/usage/output.rst b/docs/usage/output.rst index 3cf62b21..8bcd2f8f 100644 --- a/docs/usage/output.rst +++ b/docs/usage/output.rst @@ -183,12 +183,28 @@ This rule produces the following output file for the whole dataset. -Since version 2.15 the output of the counts are stored in a hdf file. +Since version 2.15 the output of the quantification are stored in 2 hdf-files`in the folder ``Genecatalog/counts/``: + - ``median_coverage.h5`` + - ``Nmapped_reads.h5.fna`` + +Together with the statistics per gene and per sample. + - ``gene_coverage_stats.parquet`` + - ``sample_coverage_stats.tsv`` + + + +The hdf only contains a matrix of abundances or counts under the name ``data``. The sample names are stored as attributes. +The gene names (e.g. ``Gene00001``) are simply the row number. + + + + + You can open the hdf file in R or python as following: -..code: python +.. code-block:: python import h5py @@ -200,7 +216,7 @@ You can open the hdf file in R or python as following: sample_names = hdf_file['data'].attrs['sample_names'].astype(str) -..code: R +.. code-block:: R library(rhdf5) @@ -214,16 +230,93 @@ You can open the hdf file in R or python as following: colnames(data) <- attributes$sample_names -You don't need to load the full data. You could only select genes with annotations. -You can use the file ``Genecatalog/counts/gene_coverage_stats.tsv`` To normalize the counts. +You don't need to load the full data. +You could only select a subset of genes, e.g. the genes with annotations, or genes that are not singletons. +To find out which gene is a singleton or not you can use the file ``gene_coverage_stats.parquet`` + + +.. code-block:: R + + library(rhdf5) + library(dplyr) + library(tibble) + + # read only subset of data + indexes_of_genes_to_load = c(2,5,100,150) # e.g. genes with annotations + abundance_file <- file.path(atlas_dir,"Genecatalog/counts/median_coverage.h5") + + + # get dimension of data + + h5overview=h5ls(abundance_file) + dim= h5overview[1,"dim"] %>% stringr::str_split(" x ",simplify=T) %>% as.numeric + cat("Load ",length(indexes_of_genes_to_load), " out of ", dim[1] , " genes\n") + + + data <- h5read(file = abundance_file, name = "data", + index = list(indexes_of_genes_to_load, NULL)) + + # add sample names + attributes= h5readAttributes(abundance_file, "data") + colnames(data) <- attributes$sample_names + + # add gene names (e.g. Gene00001) as rownames + gene_names = paste0("Gene", formatC(format="d",indexes_of_genes_to_load,flag="0",width=ceiling(log10(max(dim[1]))))) + rownames(data) <- gene_names + + + data[1:5,1:5] + +If you do this you can use the information in the file ``Genecatalog/counts/sample_coverage_stats.tsv`` to normalize the counts. + +Here is the R code to calculate the gene copies per million (analogous to transcript per million) for the subset of genes. + +.. code-block:: R + + # Load gene stats per sample + gene_stats_file = file.path(atlas_dir,"Genecatalog/counts/sample_coverage_stats.tsv") + + gene_stats <- read.table(gene_stats_file,sep='\t',header=T,row.names=1) + + gene_stats <- t(gene_stats) # might be transposed, sample names should be index + + head(gene_stats) + + # calculate copies per million + total_covarage <- gene_stats[colnames(data) ,"Sum_coverage"] + + # gives wrong results + #gene_gcpm<- data / total_covarage *1e6 + + gene_gcpm<- data %*% diag(1/total_covarage) *1e6 + colnames(gene_gcpm) <- colnames(data) + + gene_gcpm[1:5,1:5] .. seealso:: See in Atlas Tutorial -Before version 2.15 the output of the counts were stored in a parquet file. The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```. +Before version 2.15 the output of the counts were stored in a parquet file. +The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```. +However you need to load the full data into memory. + +.. code-block:: R + + parquet_file <- file.path(atlas_dir,"Genecatalog/counts/median_coverage.parquet") + gene_abundances<- arrow::read_parquet(parquet_file) + + # transform tibble to a matrix + gene_matrix= as.matrix(gene_abundances[,-1]) + rownames(gene_matrix) <- gene_abundances$GeneNr + + + #calculate copies per million + gene_gcpm= gene_matrix/ colSums(gene_matrix) *1e6 + + + gene_gcpm[1:5,1:5] - From 72f9e64d9c091db70a9ef9d04c44a02f063ee1a3 Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Fri, 5 May 2023 14:57:38 +0200 Subject: [PATCH 2/3] add ci for docs --- .github/workflows/documentation.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .github/workflows/documentation.yml diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 00000000..7cb565bd --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,10 @@ +name: ReadTheDocs Preview +on: [pull_request] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: ReadTheDocs Preview + uses: readthedocs/actions/preview@v1 \ No newline at end of file From 9e11c54af992d31cc79806be5ed204424e66403f Mon Sep 17 00:00:00 2001 From: Silas Kieser Date: Fri, 5 May 2023 15:07:26 +0200 Subject: [PATCH 3/3] drop docs ci --- .github/workflows/documentation.yml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 .github/workflows/documentation.yml diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml deleted file mode 100644 index 7cb565bd..00000000 --- a/.github/workflows/documentation.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: ReadTheDocs Preview -on: [pull_request] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: ReadTheDocs Preview - uses: readthedocs/actions/preview@v1 \ No newline at end of file