In [1]:
# Record the versions of the command-line tools this notebook depends on.
# NOTE(review): `kb --version` prints its full usage text along with the version
# (see the saved output). This cell checks `fastq-dump`, while the download cell
# further down calls `prefetch`, `fasterq-dump` and `pigz` — presumably the SRA
# tools all come from the same sra-tools install; confirm.
!splitcode --version
!kb --version
!fastq-dump --version

splitcode, version 0.30.0
usage: kb [-h] [--list] <CMD> ...

kb_python 0.29.3

positional arguments:
  <CMD>
    info      Display package and citation information
    compile   Compile `kallisto` and `bustools` binaries from source
    ref       Build a kallisto index and transcript-to-gene mapping
    count     Generate count matrices from a set of single-cell FASTQ files
    extract   Extract sequencing reads that were pseudoaligned to specific
              genes/transcripts (or extract all reads that were / were not
              pseudoaligned)

options:
  -h, --help  Show this help message and exit
  --list      Display list of supported single-cell technologies

fastq-dump : 3.0.3 ( 3.0.2 )



In [None]:
# Storage layout. `outdir` is the analysis root; the *_folder values are
# sub-paths that later cells concatenate onto it, so every one of them keeps
# its trailing slash.
outdir = "/mnt/data1/10XvParse/Analysis_2/"
fasta_folder, sra_folder, tmp_folder = "FASTA/Raw/", "SRA/", "tmp/"

# Parallel lists filled in by the accession-reading cell: srrs[i] is an SRA
# run accession and file_names[i] is the sample label it is stored under.
file_names, srrs = [], []

In [None]:
# Get SRA numbers
# Reads one SRA run accession per line and assigns each a sample label.
# Both lists are rebuilt from scratch in this cell, so re-running it cannot
# append duplicate entries. Blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of becoming an empty accession that would later
# be handed to `prefetch`.
with open("SRA_accession/10x_accession.txt") as access:
    srrs = [line.strip() for line in access if line.strip()]

# Labels are numbered 10x_1, 10x_2, ... over the non-blank lines, in file
# order, so file_names[i] always corresponds to srrs[i].
file_names = [f"10x_{i}" for i in range(1, len(srrs) + 1)]

In [None]:
# Fetch fasta files given SRR numbers
for srr, name in zip(srrs, file_names):
    print(outdir+fasta_folder+name)
    !prefetch {srr} --max-size u -O {outdir+sra_folder}
    !fasterq-dump --outdir {outdir+fasta_folder} --temp {outdir+tmp_folder} \
        --outfile {name+".fasta"} --split-files --skip-technical -f \
        {outdir+sra_folder+srr+"/"+srr+".sra"} --threads 8 --fasta_unsorted
    !pigz {outdir+fasta_folder+name+"_3.fasta"} -p 8
    !pigz {outdir+fasta_folder+name+"_4.fasta"} -p 8

TypeError: 'list' object cannot be interpreted as an integer

In [None]:
# Generate a batch file to combine fasta files
# One tab-separated line per sample: <label>, <_3 read file>, <_4 read file>,
# where both paths point at the gzipped FASTA files in the raw folder.
batch_file = "batch/10x_batch.txt"
raw_dir = outdir + fasta_folder
with open(batch_file, "w") as handle:
    handle.writelines(
        f"{sample}\t{raw_dir}{sample}_3.fasta.gz\t{raw_dir}{sample}_4.fasta.gz\n"
        for sample in file_names
    )

In [None]:
# Output locations for the remultiplexing step: the two combined read files
# under <outdir>/FASTA/Processed/, plus the mapping and barcode files that are
# written under run_info/.
proc_folder = "FASTA/Processed/"
proc_files = [f"{outdir}{proc_folder}10x_{read_no}.fasta.gz" for read_no in (1, 2)]
mapping, outb = "run_info/10x_mapping.txt", "run_info/10x_barcodes.fastq.gz"

In [6]:
# Combine the per-sample read pairs listed in `batch_file` into a single pair of
# gzipped outputs (proc_files[0], proc_files[1]); --nFastqs=2 matches the two
# read files given per sample. A barcode file (--outb) and a mapping file
# (--mapping) are written alongside.
# NOTE(review): presumably the barcodes in `outb` tag each read with its sample
# of origin and `mapping` records which barcode belongs to which sample —
# confirm against the splitcode documentation. -t 8 is assumed to be the
# thread count.
!splitcode --remultiplex \
    --nFastqs=2 --gzip -o {proc_files[0]},{proc_files[1]} \
    --outb={outb} --mapping={mapping} {batch_file} -t 8

* Using a list of 0 tags (vector size: 0; map size: 0; num elements in map: 0)
* will process sample 1: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_1_3.fastq.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_1_4.fastq.gz
* will process sample 2: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_2_3.fastq.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_2_4.fastq.gz
* will process sample 3: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_3_3.fastq.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_3_4.fastq.gz
* will process sample 4: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_4_3.fastq.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_4_4.fastq.gz
* will process sample 5: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_5_3.fastq.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_5_4.fastq.gz
631M reads processed         
done 
* processed 639,108,133 reads


In [3]:
# Download the mouse reference for kallisto via `kb ref -d mouse`: the index is
# saved as index.idx and the transcript-to-gene mapping as t2g.txt.
# This experiment used C57BL/6N (Black 6) mice, so we can use a general reference genome.
# As the saved log shows, kb skips the download when some of these files already
# exist unless --overwrite is passed.
!kb ref -d mouse -i index.idx -g t2g.txt

[2025-07-24 16:28:36,636]    INFO [download] Skipping download because some files already exist. Use the --overwrite flag to overwrite.


In [6]:
# Perform pseudoalignment of 10x reads
# Inputs are the two remultiplexed read files (proc_files); results are written
# to <outdir>/10x_out/ using the index and transcript-to-gene map fetched by
# `kb ref`.
#   -x 10XV3     technology string; no on-list is passed, so kb copies its
#                pre-packaged 10XV3 on-list into the output folder (see log)
#   --h5ad       presumably also writes the counts as an AnnData .h5ad — confirm
#   --overwrite  presumably replaces results already in the output folder — confirm
# In the saved log, generating the BUS file took roughly 55 minutes.
read_out = outdir + "10x_out/"
!kb count --overwrite --h5ad -i index.idx -g t2g.txt -x 10XV3 -o {read_out} {proc_files[0]} {proc_files[1]}

[2025-07-24 16:29:24,070]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_2/10x_out/ from
[2025-07-24 16:29:24,070]    INFO [count]         /mnt/data1/10XvParse/Analysis_2/FASTA/Processed/10x_1.fastq.gz
[2025-07-24 16:29:24,070]    INFO [count]         /mnt/data1/10XvParse/Analysis_2/FASTA/Processed/10x_2.fastq.gz
[2025-07-24 17:24:07,058]    INFO [count] Sorting BUS file /mnt/data1/10XvParse/Analysis_2/10x_out/output.bus to /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus
[2025-07-24 17:25:16,158]    INFO [count] On-list not provided
[2025-07-24 17:25:16,158]    INFO [count] Copying pre-packaged 10XV3 on-list to /mnt/data1/10XvParse/Analysis_2/10x_out/
[2025-07-24 17:25:16,894]    INFO [count] Inspecting BUS file /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus
[2025-07-24 17:25:37,028]    INFO [count] Correcting BUS records in /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus to /mnt/data1/10XvParse/Analysis_2/10x_out