In [6]:
# Record the versions of the command-line tools this notebook shells out to.
!splitcode --version
# NOTE(review): in the recorded run this printed kb's full usage text; the version is the "kb_python" line.
!kb --version
# NOTE(review): this checks fastq-dump, while the download cell below invokes fasterq-dump.
!fastq-dump --version

splitcode, version 0.30.0
usage: kb [-h] [--list] <CMD> ...

kb_python 0.29.3

positional arguments:
  <CMD>
    info      Display package and citation information
    compile   Compile `kallisto` and `bustools` binaries from source
    ref       Build a kallisto index and transcript-to-gene mapping
    count     Generate count matrices from a set of single-cell FASTQ files
    extract   Extract sequencing reads that were pseudoaligned to specific
              genes/transcripts (or extract all reads that were / were not
              pseudoaligned)

options:
  -h, --help  Show this help message and exit
  --list      Display list of supported single-cell technologies

fastq-dump : 3.0.3 ( 3.0.2 )



In [7]:
# Directories where data will be stored
# All sub-folders below are joined to outdir by plain string concatenation,
# so every value must keep its trailing "/".
outdir = "/mnt/data1/10XvParse/Analysis_2/"
fasta_folder = "FASTA/Raw/"        # fasterq-dump output (--outdir)
proc_folder = "FASTA/Processed/"   # location of the combined files passed to kb count
sra_folder = "SRA/"                # expected location of <srr>/<srr>.sra inputs
tmp_folder = "tmp/"                # scratch space handed to fasterq-dump (--temp)
# Parallel lists filled from the accession file: file_names[k] is the label for srrs[k]
file_names = []
srrs = []

In [3]:
import os

def make_dir(directory_name, outdir = ""):
    """Create ``outdir + directory_name`` (including missing parents) and print the outcome.

    Parameters
    ----------
    directory_name : str
        Directory to create; appended verbatim to ``outdir``.
    outdir : str, optional
        Prefix joined to ``directory_name`` by plain string concatenation, so a
        non-empty value must already end with a path separator. The default ""
        makes the path relative to the current working directory.

    Returns
    -------
    None
        Nothing is returned and no exception propagates; success, "already
        exists" and error conditions are all reported with ``print``.
    """
    path = outdir+directory_name
    try:
        os.makedirs(path)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        # os.makedirs raises FileExistsError for ANY existing path, including a
        # regular file, so confirm it really is a directory before saying so.
        if os.path.isdir(path):
            print(f"Directory '{directory_name}' already exists.")
        else:
            print(f"An error occurred: '{path}' exists but is not a directory.")
    except Exception as e:
        # Best-effort by design: report the problem and let the notebook continue.
        print(f"An error occurred: {e}")

In [4]:
# Make sure each working sub-directory exists under outdir
# (make_dir only reports, and does not fail, when one is already there)
required_dirs = (fasta_folder, sra_folder, tmp_folder, proc_folder)
for directory_name in required_dirs:
    make_dir(directory_name, outdir)

Directory 'FASTA/Raw/' already exists.
Directory 'SRA/' already exists.
Directory 'tmp/' already exists.
Directory 'FASTA/Processed/' already exists.


In [3]:
# Get SRA numbers
# Rebuild both lists from scratch so that re-running this cell does not append
# the same samples a second time.
file_names = []
srrs = []
with open("SRA_accession/10x_accession.txt") as access:
    for i, line in enumerate(access):
        srr = line.strip()
        if not srr:
            # Blank line (e.g. a trailing newline at the end of the file):
            # there is no accession to record, so do not create a sample for it.
            continue
        # The sample label comes from the 1-based line number in the accession file
        name = f"10x_{i+1}"
        file_names.append(name)
        srrs.append(srr)

In [7]:
# Fetch fasta files given SRR numbers
for srr, name in zip(srrs, file_names):
    print(outdir+fasta_folder+name)
    #!prefetch {srr} --max-size u -O {outdir+sra_folder}
    !fasterq-dump --outdir {outdir+fasta_folder} --temp {outdir+tmp_folder} \
        --outfile {name+".fasta"} --split-files --skip-technical -f \
        {outdir+sra_folder+srr+"/"+srr+".sra"} --threads 8 --fasta-unsorted
    !pigz {outdir+fasta_folder+name+"_3.fasta"} -p 8
    !pigz {outdir+fasta_folder+name+"_4.fasta"} -p 8

/mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_1
spots read      : 215,517,133
reads read      : 862,068,532
reads written   : 431,034,266
technical reads : 431,034,266
/mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_2
spots read      : 41,508,492
reads read      : 166,033,968
reads written   : 83,016,984
technical reads : 83,016,984
/mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_3
spots read      : 18,620,924
reads read      : 74,483,696
reads written   : 37,241,848
technical reads : 37,241,848
/mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_4
spots read      : 146,491,184
reads read      : 585,964,736
reads written   : 292,982,368
technical reads : 292,982,368
/mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_5
spots read      : 216,970,400
reads read      : 867,881,600
reads written   : 433,940,800
technical reads : 433,940,800


In [15]:
# Folders relative to the working directory: "batch" holds the splitcode batch
# file, "run_info" holds the mapping and barcode outputs.
for local_dir in ("batch", "run_info"):
    make_dir(local_dir)

Directory 'batch' already exists.
Directory 'run_info' already exists.


In [14]:
# Generate a batch file to combine fasta files
# One tab-separated row per sample: sample name, then the gzipped _3 and _4
# FASTA paths inside outdir + fasta_folder.
batch_file = "batch/10x_batch.txt"
raw_prefix = outdir + fasta_folder
with open(batch_file, "w") as batch:
    batch.writelines(
        f"{sample}\t{raw_prefix}{sample}_3.fasta.gz\t{raw_prefix}{sample}_4.fasta.gz\n"
        for sample in file_names
    )

In [None]:
# Paths of the two combined FASTA files (written by splitcode, read by kb count)
proc_files = [
    outdir + proc_folder + combined_name
    for combined_name in ("10x_1.fasta.gz", "10x_2.fasta.gz")
]
# Side outputs of the splitcode run, kept relative to the working directory
mapping = "run_info/10x_mapping.txt"      # passed to --mapping
outb = "run_info/10x_barcodes.fastq.gz"   # passed to --outb

In [17]:
# Run splitcode in --remultiplex mode over the samples listed in batch_file
# (two FASTA files per sample, hence --nFastqs=2), writing two gzipped output
# files to proc_files and using 8 threads.
# --outb and --mapping receive the barcode and mapping output paths set above.
# NOTE(review): presumably the mapping file ties each written barcode back to its
# sample name from the batch file - confirm against the splitcode documentation.
!splitcode --remultiplex \
    --nFastqs=2 --gzip -o {proc_files[0]},{proc_files[1]} \
    --outb={outb} --mapping={mapping} {batch_file} -t 8

* Using a list of 0 tags (vector size: 0; map size: 0; num elements in map: 0)
* will process sample 1: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_1_3.fasta.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_1_4.fasta.gz
* will process sample 2: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_2_3.fasta.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_2_4.fasta.gz
* will process sample 3: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_3_3.fasta.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_3_4.fasta.gz
* will process sample 4: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_4_3.fasta.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_4_4.fasta.gz
* will process sample 5: /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_5_3.fasta.gz
                         /mnt/data1/10XvParse/Analysis_2/FASTA/Raw/10x_5_4.fasta.gz
631M reads processed         
done 
* processed 639,108,133 reads


In [11]:
# Download the mouse reference with kb ref, which writes the kallisto index to
# index.idx and the transcript-to-gene mapping to t2g.txt in the working directory.
# This experiment used C57BL/6N (Black 6) mice, so we can use a general reference genome.
# In the recorded run the download was skipped because the files already existed;
# kb reports that --overwrite forces a fresh download.
!kb ref -d mouse -i index.idx -g t2g.txt

[2025-09-22 17:26:17,041]    INFO [download] Skipping download because some files already exist. Use the --overwrite flag to overwrite.


In [20]:
# Perform pseudoalignment of 10x reads
read_out = outdir + "10x_out/"
!kb count --overwrite --h5ad -i index.idx -g t2g.txt -x 10XV3 -o {read_out} {proc_files[0]} {proc_files[1]}

[2025-08-27 17:45:43,973]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_2/10x_out/ from
[2025-08-27 17:45:43,973]    INFO [count]         /mnt/data1/10XvParse/Analysis_2/FASTA/Processed/10x_1.fasta.gz
[2025-08-27 17:45:43,973]    INFO [count]         /mnt/data1/10XvParse/Analysis_2/FASTA/Processed/10x_2.fasta.gz
[2025-08-27 18:40:39,674]    INFO [count] Sorting BUS file /mnt/data1/10XvParse/Analysis_2/10x_out/output.bus to /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus
[2025-08-27 18:41:44,468]    INFO [count] On-list not provided
[2025-08-27 18:41:44,468]    INFO [count] Copying pre-packaged 10XV3 on-list to /mnt/data1/10XvParse/Analysis_2/10x_out/
[2025-08-27 18:41:45,054]    INFO [count] Inspecting BUS file /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus
[2025-08-27 18:42:03,583]    INFO [count] Correcting BUS records in /mnt/data1/10XvParse/Analysis_2/10x_out/tmp/output.s.bus to /mnt/data1/10XvParse/Analysis_2/10x_out