In [None]:
# Print the versions of the command-line tools this notebook relies on.
# NOTE(review): in the captured output, `kb --version` printed its usage text
# together with "kallisto: command not found" — confirm the kb installation.
# NOTE(review): the download cell calls `fasterq-dump`, while the version
# recorded here is the one reported by `fastq-dump`.
!splitcode --version
!kb --version
!fastq-dump --version

splitcode, version 0.30.0
usage: kb [-h] [--list] <CMD> ...

kb_python 0.29.3

positional arguments:
  <CMD>
    info      Display package and citation information
    compile   Compile `kallisto` and `bustools` binaries from source
    ref       Build a kallisto index and transcript-to-gene mapping
    count     Generate count matrices from a set of single-cell FASTQ files
    extract   Extract sequencing reads that were pseudoaligned to specific
              genes/transcripts (or extract all reads that were / were not
              pseudoaligned)

options:
  -h, --help  Show this help message and exit
  --list      Display list of supported single-cell technologies
/bin/bash: kallisto: command not found

fastq-dump : 3.0.3 ( 3.0.2 )



In [2]:
# Root directory for all outputs; the sub-folders below are appended to it
# by plain string concatenation, so every entry keeps its trailing slash.
outdir = "/mnt/data1/10XvParse/Analysis_3/"
fasta_folder, sra_folder, tmp_folder = "FASTA/Processed/", "SRA/", "tmp/"

# Sample labels to process
sample_names = ["H1", "H2"]

# Parallel lists, filled in later from the accession file:
# file_names[i] is the output prefix used for run accession srrs[i]
file_names, srrs = [], []

In [3]:
import os

def make_dir(directory_name, outdir = ""):
    """Create the directory ``outdir + directory_name`` and report the outcome.

    Missing parent directories are created as well. The function is
    best-effort: it never raises, it prints one status line instead.

    Args:
        directory_name: Path of the directory, relative to ``outdir``.
        outdir: Prefix prepended to ``directory_name`` by plain string
            concatenation (so it should end with a path separator).

    Returns:
        None.
    """
    path = outdir+directory_name
    try:
        os.makedirs(path)
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        # os.makedirs also raises FileExistsError when the path exists as a
        # regular file; only claim "already exists" for a real directory.
        if os.path.isdir(path):
            print(f"Directory '{directory_name}' already exists.")
        else:
            print(f"An error occurred: '{path}' exists but is not a directory.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [4]:
# Create the FASTA, SRA and temp sub-directories under outdir when missing
for directory_name in (fasta_folder, sra_folder, tmp_folder):
    make_dir(directory_name, outdir=outdir)

Directory 'FASTA/Processed/' already exists.
Directory 'SRA/' already exists.
Directory 'tmp/' already exists.


In [5]:
# Get SRA numbers organized by sample
# The accession file is read as blocks: a line starting with 'H' names a
# sample, and every following line is a run accession for that sample.
# The lists are rebuilt from scratch so re-running this cell does not append
# duplicate entries.
file_names = []
srrs = []
name = None  # output prefix of the sample header seen most recently
with open("SRA_accession/10x_accession.txt") as access:
    for line in access:
        line = line.strip()
        if not line:
            # Blank lines would otherwise be recorded as empty accessions
            continue
        if line.startswith('H'):
            name = f"10x_{line}"
        else:
            if name is None:
                raise ValueError(
                    f"Accession '{line}' appears before any sample header "
                    "in SRA_accession/10x_accession.txt"
                )
            file_names.append(name)
            srrs.append(line)

In [6]:
# Fetch fasta files given SRR numbers
# For each (run accession, output prefix) pair: run prefetch, convert the
# .sra archive to FASTA with fasterq-dump, then compress the results with pigz.
for srr, name in zip(srrs, file_names):
    # Show the output path prefix for the run being processed
    print(outdir+fasta_folder+name)
    # Fetch the archive into the SRA folder (the captured log reports runs
    # that are already "found locally"); `--max-size u` presumably lifts the
    # download size limit — TODO confirm
    !prefetch {srr} --max-size u -O {outdir+sra_folder}
    # Convert <sra_folder>/<srr>/<srr>.sra into FASTA files in the FASTA folder,
    # using the tmp folder for temporary files. The pigz calls below expect the
    # outputs to be named <name>_1.fasta and <name>_2.fasta.
    !fasterq-dump --outdir {outdir+fasta_folder} --temp {outdir+tmp_folder} \
        --outfile {name+".fasta"} --split-files --skip-technical -f \
        {outdir+sra_folder+srr+"/"+srr+".sra"} --threads 8 --fasta-unsorted
    # Compress both read files; the alignment step reads the resulting
    # *_1.fasta.gz / *_2.fasta.gz files (-p 8 presumably sets 8 threads)
    !pigz {outdir+fasta_folder+name+"_1.fasta"} -p 8
    !pigz {outdir+fasta_folder+name+"_2.fasta"} -p 8

/mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1

2025-08-28T05:14:56 prefetch.3.0.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2025-08-28T05:14:57 prefetch.3.0.3: 1) 'SRR26594154' is found locally
2025-08-28T05:14:57 prefetch.3.0.3: 'SRR26594154' has 0 unresolved dependencies
spots read      : 203,134,663
reads read      : 406,269,326
reads written   : 406,269,326
/mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2

2025-08-28T05:41:34 prefetch.3.0.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2025-08-28T05:41:34 prefetch.3.0.3: 1) 'SRR26594153' is found locally
2025-08-28T05:41:34 prefetch.3.0.3: 'SRR26594153' has 0 unresolved dependencies
spots read      : 254,103,388
reads read      : 508,206,776
reads written   : 508,206,776


In [7]:
# Download the prebuilt human kallisto index (the captured log shows the
# "human_index_standard" archive being fetched), not the genome itself.
# index.idx and t2g.txt are the file names later passed to `kb count`.
!kb ref --overwrite -d human -i index.idx -g t2g.txt

[2025-08-27 23:26:38,232]    INFO [download] Downloading files for human (standard workflow) from https://github.com/pachterlab/kallisto-transcriptome-indices/releases/download/v1/human_index_standard.tar.xz to tmp/human_index_standard.tar.xz
100%|████████████████████████████████████████| 138M/138M [00:03<00:00, 45.9MB/s]
[2025-08-27 23:26:41,398]    INFO [download] Extracting files from tmp/human_index_standard.tar.xz


In [8]:
import multiprocessing

# Perform pseudoalignment of 10x reads
def align(name):
    read_in = outdir + fasta_folder + '10x_' + name
    read_out = outdir + '10x_' + name + '_out'
    !kb count --overwrite --h5ad -i index.idx -g t2g.txt -x 10XV3 -o {read_out} {read_in + "_1.fasta.gz"} {read_in + "_2.fasta.gz"}

# Pseudoalign the samples concurrently using two worker processes.
# Pool.map blocks until every align() call has returned, so "done" is printed
# only after all samples have been processed.
with multiprocessing.Pool(2) as pool:
    pool.map(align, sample_names)
    print("done")

[2025-08-27 23:26:58,508]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_3/10x_H2_out from
[2025-08-27 23:26:58,508]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2_1.fasta.gz
[2025-08-27 23:26:58,508]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H2_2.fasta.gz
[2025-08-27 23:26:58,518]    INFO [count] Using index index.idx to generate BUS file to /mnt/data1/10XvParse/Analysis_3/10x_H1_out from
[2025-08-27 23:26:58,519]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1_1.fasta.gz
[2025-08-27 23:26:58,519]    INFO [count]         /mnt/data1/10XvParse/Analysis_3/FASTA/Processed/10x_H1_2.fasta.gz
[2025-08-27 23:42:55,316]    INFO [count] Sorting BUS file /mnt/data1/10XvParse/Analysis_3/10x_H1_out/output.bus to /mnt/data1/10XvParse/Analysis_3/10x_H1_out/tmp/output.s.bus
[2025-08-27 23:43:28,061]    INFO [count] On-list not provided
[2025-08-27 23:43:28,061]