In [1]:
# Print the installed versions of the splitcode, kb (kb_python) and fastq-dump
# command-line tools so the environment is recorded alongside the results.
!splitcode --version
# `kb --version` prints kb's usage text; the kb_python version appears within it.
!kb --version
# NOTE(review): the download cell calls `prefetch`, `fasterq-dump` and `pigz`,
# none of which are version-checked here -- consider adding them.
!fastq-dump --version

splitcode, version 0.30.0
usage: kb [-h] [--list] <CMD> ...

kb_python 0.29.3

positional arguments:
  <CMD>
    info      Display package and citation information
    compile   Compile `kallisto` and `bustools` binaries from source
    ref       Build a kallisto index and transcript-to-gene mapping
    count     Generate count matrices from a set of single-cell FASTQ files
    extract   Extract sequencing reads that were pseudoaligned to specific
              genes/transcripts (or extract all reads that were / were not
              pseudoaligned)

options:
  -h, --help  Show this help message and exit
  --list      Display list of supported single-cell technologies

fastq-dump : 3.0.3 ( 3.0.2 )



In [None]:
# --- Storage layout -------------------------------------------------------
# `outdir` is the root of the analysis; the *_folder values are appended to it
# by plain string concatenation, so each one keeps its trailing slash.
outdir = "/mnt/data1/10XvParse/Analysis_3/"
fasta_folder = "FASTA/Processed/"
sra_folder = "SRA/"
tmp_folder = "tmp/"

# --- Run bookkeeping ------------------------------------------------------
# Parallel lists: output file prefix and SRR accession for each run.
# They start empty and are filled when the accession file is parsed.
file_names, srrs = [], []

# Sample labels handed to `align` for pseudoalignment.
sample_names = ["H1", "H2"]

In [None]:
# Get SRA numbers organized by sample
# Lines starting with 'H' are treated as sample headers; every other non-blank
# line is recorded as a run accession belonging to the most recent header.
# The lists are (re)initialised here so that re-running this cell does not
# append the same accessions a second time.
file_names = []
srrs = []
name = None  # prefix for the current sample; None until the first header is seen
with open("SRA_accession/10x_accession.txt") as access:
    for line in access:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of recording
            # an empty accession.
            continue
        if line.startswith('H'):
            name = f"10x_{line}"
        elif name is None:
            # An accession before any header has no sample to attach to.
            raise ValueError(
                f"Accession {line!r} appears before any sample header line"
            )
        else:
            file_names.append(name)
            srrs.append(line)

In [None]:
# Fetch fasta files given SRR numbers
for srr, name in zip(srrs, file_names):
    print(outdir+fasta_folder+name)
    !prefetch {srr} --max-size u -O {outdir+sra_folder}
    !fasterq-dump --outdir {outdir+fasta_folder} --temp {outdir+tmp_folder} \
        --outfile {name+".fasta"} --split-files --skip-technical -f \
        {outdir+sra_folder+srr+"/"+srr+".sra"} --threads 8 --fasta-unsorted
    !pigz {outdir+fasta_folder+name+"_1.fasta"} -p 8
    !pigz {outdir+fasta_folder+name+"_2.fasta"} -p 8

/mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1

2025-08-19T18:09:18 prefetch.3.0.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2025-08-19T18:09:18 prefetch.3.0.3: 1) 'SRR26594154' is found locally
2025-08-19T18:09:19 prefetch.3.0.3: 'SRR26594154' has 0 unresolved dependencies
spots read      : 203,134,663
reads read      : 406,269,326
reads written   : 406,269,326
/mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2

2025-08-19T18:38:24 prefetch.3.0.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2025-08-19T18:38:25 prefetch.3.0.3: 1) 'SRR26594153' is found locally
2025-08-19T18:38:25 prefetch.3.0.3: 'SRR26594153' has 0 unresolved dependencies
spots read      : 254,103,388
reads read      : 508,206,776
reads written   : 508,206,776


In [16]:
# Download the pre-built human reference with `kb ref -d human`: the kallisto
# index is written to index.idx and the transcript-to-gene mapping to t2g.txt
# (both in the current working directory); --overwrite replaces existing files.
!kb ref --overwrite -d human -i index.idx -g t2g.txt

[2025-08-07 14:11:04,583]    INFO [download] Downloading files for human (standard workflow) from https://github.com/pachterlab/kallisto-transcriptome-indices/releases/download/v1/human_index_standard.tar.xz to tmp/human_index_standard.tar.xz
100%|████████████████████████████████████████| 138M/138M [00:03<00:00, 40.7MB/s]
[2025-08-07 14:11:08,147]    INFO [download] Extracting files from tmp/human_index_standard.tar.xz


In [4]:
import multiprocessing

# Perform pseudoalignment of 10x reads
def align(name):
    read_in = outdir + fasta_folder + '10x_' + name
    read_out = outdir + '10x_' + name + '_out'
    !kb count --overwrite --h5ad -i index.idx -g t2g.txt -x 10XV3 -o {read_out} {read_in + "_1.fasta.gz"} {read_in + "_2.fasta.gz"}

with multiprocessing.Pool(processes=2) as pool:
    pool.map(align,sample_names)
    print("done")

[2025-08-19 14:32:41,902]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_3/10x_H1_out from
[2025-08-19 14:32:41,903]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1_1.fasta.gz
[2025-08-19 14:32:41,903]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1_2.fasta.gz
[2025-08-19 14:32:41,895]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_3/10x_H2_out from
[2025-08-19 14:32:41,895]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2_1.fasta.gz
[2025-08-19 14:32:41,895]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2_2.fasta.gz
[2025-08-19 14:48:34,867]    INFO [count] Sorting BUS file /mnt/data1/10XvParse/Analysis_3/10x_H1_out/output.bus to /mnt/data1/10XvParse/Analysis_3/10x_H1_out/tmp/output.s.bus
[2025-08-19 14:49:04,321]    INFO [count] On-list not provided
[2025-08-19 14:49:04,321]