Skip to content

Commit

Permalink
exporting versions in mtx_to_h5ad and unified cellranger w other alig…
Browse files Browse the repository at this point in the history
…ners
  • Loading branch information
kafkasl committed Nov 15, 2022
1 parent ccfeba3 commit d6bc6ca
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 92 deletions.
68 changes: 0 additions & 68 deletions bin/cellranger_mtx_to_h5ad.py

This file was deleted.

66 changes: 50 additions & 16 deletions bin/mtx_to_h5ad.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@
from scipy import io
from anndata import AnnData

def _10x_h5_to_adata(mtx_h5: str, sample: str):
adata = sc.read_10x_h5(mtx_h5)
adata.var["gene_symbols"] = adata.var_names
adata.var.set_index("gene_ids", inplace=True)
adata.obs["sample"] = sample

# reorder columns for 10x mtx files
adata.var = adata.var[['gene_symbols', 'feature_types', 'genome']]

return adata


def mtx_to_adata(
def _mtx_to_adata(
mtx_file: str,
barcode_file: str,
feature_file: str,
sample: str,
aligner: str,
verbose: bool = True,
):

if verbose:
print("Reading in {}".format(mtx_file))

adata = sc.read_mtx(mtx_file)
if (
aligner == "star"
Expand All @@ -31,44 +37,69 @@ def mtx_to_adata(

return adata

def input_to_adata(
input_data: str,
barcode_file: str,
feature_file: str,
sample: str,
aligner: str,
verbose: bool = True,
):

if verbose:
print("Reading in {}".format(input_data))

if aligner == 'cellranger':
return _10x_h5_to_adata(input_data, sample)
else:
return _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner)


def write_counts(
adata: AnnData,
txp2gene: str,
star_index: str,
aligner: str,
out: str,
verbose: bool = False,
):

if verbose:
print("Reading in {}".format(txp2gene))

t2g = None
if txp2gene:
t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2])
elif star_index:
t2g = pd.read_table(f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["transcript_id", "gene_symbol"])
t2g = pd.read_table(f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1])

if txp2gene or star_index:
t2g = t2g.drop_duplicates(subset='gene_id').set_index("gene_id")
adata.var["gene_symbol"] = t2g["gene_symbol"]

# when assigning a pandas column to a dataframe, pandas already takes care of matching the index.
# therefore, setting the index is enough.
t2g = t2g.drop_duplicates(subset='gene_id').set_index("gene_id")
adata.var["gene_symbol"] = t2g["gene_symbol"]

pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None)
pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None)
io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer")

if verbose:
print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(args["out"]))

def dump_versions():
import pkg_resources
with open('versions.yml', 'w') as f:
f.write("\n".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set]))


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.")

parser.add_argument("-m", "--mtx", dest="mtx", help="Path to mtx file.")
parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.")
parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False)
parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.")
parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.")
parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="")
parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="")
parser.add_argument("-s", "--sample", dest="sample", help="Sample name")
parser.add_argument("-o", "--out", dest="out", help="Output path.")
parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?")
Expand All @@ -82,12 +113,15 @@ def write_counts(
# create the directory with the sample name
os.makedirs(os.path.dirname(args["out"]), exist_ok=True)

adata = mtx_to_adata(
args["mtx"], args["barcode"], args["feature"], args["sample"], args["aligner"], verbose=args["verbose"]
print(args)
adata = input_to_adata(
args["input_data"], args["barcode"], args["feature"], args["sample"], args["aligner"], verbose=args["verbose"]
)

write_counts(adata, args["txp2gene"], args["star_index"], args["sample"], verbose=args["verbose"])
write_counts(adata, args["txp2gene"], args["star_index"], args["aligner"], args["sample"], verbose=args["verbose"])

adata.write_h5ad(args["out"], compression="gzip")

print("Wrote h5ad file to {}".format(args["out"]))

dump_versions()
12 changes: 7 additions & 5 deletions modules/local/mtx_to_h5ad.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ process MTX_TO_H5AD {
output:
path "${meta.id}/*h5ad", emit: h5ad
path "${meta.id}/*", emit: counts
path "versions.yml", emit: versions

when:
task.ext.when == null || task.ext.when
Expand All @@ -43,10 +44,10 @@ process MTX_TO_H5AD {
if (params.aligner == 'cellranger')
"""
# convert file types
cellranger_mtx_to_h5ad.py \\
--mtx filtered_feature_bc_matrix.h5 \\
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--input filtered_feature_bc_matrix.h5 \\
--sample ${meta.id} \\
--txp2gene ${txp2gene} \\
--out ${meta.id}/${meta.id}_matrix.h5ad
"""

Expand All @@ -57,7 +58,7 @@ process MTX_TO_H5AD {
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--sample ${meta.id} \\
--mtx *count/counts_unfiltered/\${input_type}.mtx \\
--input *count/counts_unfiltered/\${input_type}.mtx \\
--barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\
--feature *count/counts_unfiltered/\${input_type}.genes.txt \\
--txp2gene ${txp2gene} \\
Expand All @@ -72,7 +73,7 @@ process MTX_TO_H5AD {
mtx_to_h5ad.py \\
--aligner ${params.aligner} \\
--sample ${meta.id} \\
--mtx $mtx_matrix \\
--input $mtx_matrix \\
--barcode $barcodes_tsv \\
--feature $features_tsv \\
--txp2gene ${txp2gene} \\
Expand All @@ -84,5 +85,6 @@ process MTX_TO_H5AD {
"""
mkdir ${meta.id}
touch ${meta.id}/${meta.id}_matrix.h5ad
touch versions.yml
"""
}
2 changes: 1 addition & 1 deletion subworkflows/local/align_cellranger.nf
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,5 @@ workflow CELLRANGER_ALIGN {
emit:
ch_versions
cellranger_out = CELLRANGER_COUNT.out.outs
txp2gene = cellranger_index
star_index = cellranger_index
}
2 changes: 1 addition & 1 deletion subworkflows/local/mtx_conversion.nf
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ workflow MTX_CONVERSION {
)

//TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output
ch_version = ch_versions.mix(MTX_TO_SEURAT.out.versions)
ch_version = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions)

emit:
ch_versions
Expand Down
2 changes: 1 addition & 1 deletion workflows/scrnaseq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ workflow SCRNASEQ {
)
ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions)
ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out)
ch_txp2gene = CELLRANGER_ALIGN.out.txp2gene
ch_star_index = CELLRANGER_ALIGN.out.star_index
}

// Run mtx to h5ad conversion subworkflow
Expand Down

0 comments on commit d6bc6ca

Please sign in to comment.