## Week 6: simple single-cell RNA-seq analysis

In [1]:
# imports 
import tarfile
import os
import urllib.request, gzip, shutil
from pathlib import Path
import subprocess

In [2]:
# Project data layout: every input and output folder lives under this root
base = Path("data")

# Input data directories
fastq_dir = base.joinpath("fastq")
ref_dir = base.joinpath("reference")
white_dir = base.joinpath("whitelist")

# Output directories
index_dir = base.joinpath("alevin_index")
quant_dir = base.joinpath("alevin_quant")

# Create the whole tree up front; exist_ok=True makes re-running this cell harmless
required_dirs = (base, fastq_dir, ref_dir, white_dir, index_dir, quant_dir)
for folder in required_dirs:
    folder.mkdir(parents=True, exist_ok=True)

print("Directory structure created.")

Directory structure created.


In [3]:
# Path to the archive toy_read_ref_set.tar.gz and the folder it is unpacked into
tar_path = "toy_read_ref_set.tar.gz"
extract_dir = "toy_data"

# Extract the tar.gz file.
# When the running Python provides extraction filters, use the "data" filter so
# that archive members cannot be written outside extract_dir (absolute paths,
# "..", unsafe links). Older interpreters fall back to the plain call.
with tarfile.open(tar_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall(extract_dir, filter="data")
    else:
        tar.extractall(extract_dir)

# List every extracted file so the expected layout can be checked by eye
print("Extracted files:")
for root, _dirs, files in os.walk(extract_dir):
    for f in files:
        print(os.path.join(root, f))


Extracted files:
toy_data/toy_ref_read/toy_human_ref/fasta/genome.fa
toy_data/toy_ref_read/toy_human_ref/genes/genes.gtf
toy_data/toy_ref_read/toy_read_fastq/selected_R1_reads.fastq
toy_data/toy_ref_read/toy_read_fastq/selected_R2_reads.fastq


In [4]:
# Decompress 3M-february-2018.txt.gz into a plain-text file in the working directory
with gzip.open("3M-february-2018.txt.gz", "rb") as compressed, \
        open("3M-february-2018.txt", "wb") as plain:
    # Stream the bytes across rather than reading the whole file into memory
    shutil.copyfileobj(compressed, plain)



In [5]:
def move_if_absent(src, dst, moved_label, skip_label=None):
    """Move ``src`` to ``dst`` unless ``dst`` is already in place.

    Parameters
    ----------
    src, dst : str or Path
        Source file and destination file.
    moved_label : str
        Name printed in the "Moved ..." message.
    skip_label : str, optional
        Name printed in the skip messages; defaults to ``moved_label``.

    Returns
    -------
    str
        ``"moved"`` if the file was moved, ``"exists"`` if ``dst`` was already
        present, ``"missing"`` if neither ``dst`` nor ``src`` exists.
    """
    src, dst = Path(src), Path(dst)
    if skip_label is None:
        skip_label = moved_label

    # Check the destination first: on a re-run the source is gone but the
    # destination is already populated, which is the normal "nothing to do" case.
    if dst.exists():
        print(f"{skip_label} already exists, skipping")
        return "exists"
    if not src.exists():
        print(f"{skip_label} source not found at {src}, skipping")
        return "missing"

    # Make sure the target folder exists so this cell does not depend on
    # another cell having created it.
    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.move(str(src), str(dst))
    print(f"Moved {moved_label}")
    return "moved"


# Destination dirs
fastq_dir = Path("data/fastq")
ref_dir = Path("data/reference")
white_dir = Path("data/whitelist")

# Source paths
src_R1 = Path("toy_data/toy_ref_read/toy_read_fastq/selected_R1_reads.fastq")
src_R2 = Path("toy_data/toy_ref_read/toy_read_fastq/selected_R2_reads.fastq")
src_fasta = Path("toy_data/toy_ref_read/toy_human_ref/fasta/genome.fa")
src_gtf   = Path("toy_data/toy_ref_read/toy_human_ref/genes/genes.gtf")
src_whitelist = Path("3M-february-2018.txt")

# Destination paths
dst_R1 = fastq_dir / src_R1.name
dst_R2 = fastq_dir / src_R2.name
dst_fasta = ref_dir / src_fasta.name
dst_gtf   = ref_dir / src_gtf.name
dst_whitelist = white_dir / src_whitelist.name

# --- Move FASTQ, reference and whitelist files into the data/ tree ---
# Each entry: (source, destination, label for "Moved ...", label for skip messages)
transfers = [
    (src_R1, dst_R1, "selected_R1_reads.fastq", "R1"),
    (src_R2, dst_R2, "selected_R2_reads.fastq", "R2"),
    (src_fasta, dst_fasta, "genome.fa", "genome.fa"),
    (src_gtf, dst_gtf, "genes.gtf", "genes.gtf"),
    (src_whitelist, dst_whitelist, "whitelist", "Whitelist"),
]
for src_path, dst_path, moved_label, skip_label in transfers:
    move_if_absent(src_path, dst_path, moved_label, skip_label)

# --- Clean up the extraction scratch folder ---
toy_data_dir = Path("toy_data")

if toy_data_dir.exists():
    shutil.rmtree(toy_data_dir)
    print("toy_data directory deleted.")
else:
    print("toy_data directory does not exist.")

# --- Clean up the decompressed whitelist left in the working directory ---
# It is still present here only when the move above was skipped because a copy
# already existed in data/whitelist.
top_level_whitelist = Path("3M-february-2018.txt")

if top_level_whitelist.exists():
    top_level_whitelist.unlink()
    print("Deleted top-level 3M-february-2018.txt")
else:
    print("Top-level whitelist file not found.")





R1 already exists, skipping
R2 already exists, skipping
genome.fa already exists, skipping
genes.gtf already exists, skipping
Whitelist already exists or missing, skipping
toy_data directory deleted.
Deleted top-level 3M-february-2018.txt


## 2. Alevin-fry

The toy dataset's R2 reads are ≈ 91 bp long (the index below is built with `-r 90`).

Single-cell Best Practices: “Generating splici transcriptomes”: Trim ~5 bp from each end to avoid pseudoalignment to boundary artifacts

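Salmon indexing parameters: 4–8 threads. These notes originally picked 4, but the `simpleaf index` command below is run with `-t 8`; keep the two in sync.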

Chemistry = Chromium v3

etc 

In [10]:
def run_checked(cmd):
    """Run an external command, echo its output, and fail loudly on error.

    Parameters
    ----------
    cmd : list of str
        Program name followed by its arguments (no shell is involved).

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero status. Unlike an IPython ``!``
        line, this stops "Run All" instead of silently continuing.
    """
    print("Running:", " ".join(cmd))
    # Merge stderr into stdout so log lines appear in the order they were written
    proc = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    )
    if proc.stdout:
        print(proc.stdout, end="")
    if proc.returncode != 0:
        raise RuntimeError(f"{' '.join(cmd[:2])} failed with code {proc.returncode}")


print("=== Setting up ALEVIN_FRY_HOME ===")
# Set ALEVIN_FRY_HOME to an absolute path; child processes started below inherit it
alevin_home = os.path.abspath('alevin_fry_home')
os.makedirs(alevin_home, exist_ok=True)
os.environ['ALEVIN_FRY_HOME'] = alevin_home
print(f"ALEVIN_FRY_HOME: {alevin_home}")

print("=== Configuring simpleaf paths ===")
run_checked(["simpleaf", "set-paths"])

print("=== Building simpleaf index ===")
# Same arguments as the original `!simpleaf index ...` line, passed as a list
run_checked([
    "simpleaf", "index",
    "-o", "simpleaf_index",
    "-f", "data/reference/genome.fa",
    "-g", "data/reference/genes.gtf",
    "-r", "90",
    "-t", "8",
])


=== Setting up ALEVIN_FRY_HOME ===
✓ Created ALEVIN_FRY_HOME: /Users/oliviawhitelaw/week6/alevin_fry_home

=== Configuring simpleaf paths ===
found `piscem` in the PATH at /Users/oliviawhitelaw/miniforge3/envs/scrna/bin/piscem
found `salmon` in the PATH at /Users/oliviawhitelaw/miniforge3/envs/scrna/bin/salmon
found `alevin-fry` in the PATH at /Users/oliviawhitelaw/miniforge3/envs/scrna/bin/alevin-fry
[2m2025-11-24T03:59:57.287043Z[0m [32m INFO[0m [2msimpleaf::utils::prog_utils[0m[2m:[0m Could not find macs3 executable, peak calling cannot be peformed by simpleaf


=== Building simpleaf index ===
Running: simpleaf index -o simpleaf_index -f data/reference/genome.fa -g data/reference/genes.gtf -r 90 -t 8
[2m2025-11-24T03:59:57.500100Z[0m [32m INFO[0m [2msimpleaf::simpleaf_commands::indexing[0m[2m:[0m preparing to make reference with roers
[2m2025-11-24T03:59:57.507984Z[0m [32m INFO[0m [2mgrangers::reader::gtf[0m[2m:[0m Finished parsing the input file. Found 3 com

RuntimeError: simpleaf index failed with code 1