In [1]:
# Print the current date/time so the saved output records when the notebook was run.
!date

Tue Apr 27 11:45:40 PDT 2021


{
    "accession": "SRR9916613",
    "experiment": {
        "accession": "SRX6665859",
        "title": "NextSeq 500 paired end sequencing; GSM4012688: TASC_DIFFEX, sample 1; Homo sapiens; RNA-Seq",
        "platform": "ILLUMINA",
        "instrument": "NextSeq 500"
    },
    "study": {
        "accession": "SRP217685",
        "title": "Targeted single-cell transcriptome readouts for high-throughput genetics and functional genomics",
        "abstract": "We show that targeted single-cell RNA-sequencing (TAP-seq) permits reliable mapping of cell (sub)types with as little as 100 reads per cell and reduces the sequencing costs for differential expression testing by a factor of 10-30, thereby enabling a cost-effective profiling of a large number of genotypes at the single cell level. We demonstrate the use of TAP-seq by generating comprehensive perturbation-based enhancer-target gene maps for 1.5% of the human genome. Overall design: The performance of targeted single-cell RNA-seq (TAP)

In [5]:
!tail -15 ../../../references/tapseq/TAP_DIFFEX/SRR9916613.json

    },
    "title": "NextSeq 500 paired end sequencing; GSM4012688: TASC_DIFFEX, sample 1; Homo sapiens; RNA-Seq",
    "files": [
        {
            "url": "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/003/SRR9916613/SRR9916613_1.fastq.gz",
            "md5": "00b566e49e4ba511a44e051fbc54ab24",
            "size": "825262568"
        },
        {
            "url": "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/003/SRR9916613/SRR9916613_2.fastq.gz",
            "md5": "56f33bbbc2774742fbdd408be8d37256",
            "size": "1261114788"
        }
    ]
}

In [6]:
!tail -15 ../../../references/tapseq/TAP_DIFFEX/SRR9916614.json

    },
    "title": "NextSeq 500 paired end sequencing; GSM4012689: TASC_DIFFEX, sample 2; Homo sapiens; RNA-Seq",
    "files": [
        {
            "url": "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/004/SRR9916614/SRR9916614_1.fastq.gz",
            "md5": "94b923c0a629058cc374879289f70c60",
            "size": "815990487"
        },
        {
            "url": "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/004/SRR9916614/SRR9916614_2.fastq.gz",
            "md5": "1fa1c974998d70a8d6f1fc286ff533ab",
            "size": "1259156375"
        }
    ]
}

# Install packages and download data

In [7]:
import sys
# True when the `google.colab` module is already loaded in this interpreter,
# i.e. the notebook is running on Google Colab. The setup/download cells in
# this notebook are guarded by this flag.
COLAB = "google.colab" in sys.modules


In [12]:
if COLAB:
    !git clone https://github.com/sbooeshaghi/BMGP_2020.git

Cloning into 'BMGP_2020'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 75 (delta 16), reused 67 (delta 8), pack-reused 0[K
Unpacking objects: 100% (75/75), done.


In [4]:
if COLAB:
    !pip install --quiet \
    kb-python==0.26.0 \
    anndata==0.7.6 \


In [5]:
if COLAB:
    # Download the Cell Ranger 6.0.1 tarball (author's timing note: ~30 sec).
    # NOTE(review): the URL below embeds `Expires=1619593023` together with
    # `Policy`/`Signature`/`Key-Pair-Id` parameters, so it looks like a
    # time-limited signed link -- presumably it has to be regenerated from the
    # 10x Genomics download page once it expires; confirm before re-running.
    # `-q --show-progress --progress=bar:force` keeps wget quiet apart from the
    # progress bar, and `2>&1` redirects stderr into stdout.
    !wget -O cellranger-6.0.1.tar.gz \
    -q --show-progress --progress=bar:force  \
    "https://cf.10xgenomics.com/releases/cell-exp/cellranger-6.0.1.tar.gz?Expires=1619593023&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly9jZi4xMHhnZW5vbWljcy5jb20vcmVsZWFzZXMvY2VsbC1leHAvY2VsbHJhbmdlci02LjAuMS50YXIuZ3oiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE2MTk1OTMwMjN9fX1dfQ__&Signature=EfUmpmiiOtatL9nfNGsODQnSC2a5GmvKesMiVee~04H2ndGVBNkLGB7uWuU3c1NvgG5R1GMk6MYNaPcTaF100IPncjjnFFot7qawOb1LiZQx7sxXLlDfyEPr4TshlXdwzVkU8DX1yZeq82iHVc3etUHiugVLVttUW7wrfd1do35jVoJzjHacWeFGBS6CWvFpvAhNsT29-YULeJRDbdYe~TYhjsKnl1BVWSQzKUtdNsrsiPgXAnAf4GsNmEDNA~b~Gnwyubs3cGGna4TDXw65IcoduOHcIxOMHlhgX5wIabfcMZx3i9Jaw7TpwFxRHHEpoFftHuEsGHs1lusxB8pKYQ__&Key-Pair-Id=APKAI7S6A5RYOXBWRPDA" \
    2>&1
    
    # Unpack the tarball into the current directory.
    !tar -xf cellranger-6.0.1.tar.gz

    
    # Download the GRCh38 2020-A Cell Ranger reference (author's timing note: ~5 min).
    # No -O is given, so wget keeps the remote file name.
    !wget \
    -q --show-progress --progress=bar:force  \
    https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz \
    2>&1
    
    # Unpack the reference tarball into the current directory.
    !tar -xf refdata-gex-GRCh38-2020-A.tar.gz




In [29]:
# Load the Cell Ranger environment into this kernel.
#
# Running `source cellranger-6.0.1/sourceme.bash` inside a `%%bash` cell only
# changes the environment of that cell's own bash subprocess, which exits as
# soon as the cell finishes, so no later cell ever sees the result. Instead,
# source the script in a child shell, dump the resulting environment
# (NUL-separated so values containing newlines survive), and copy it into
# os.environ, which subsequent `!` and `%%bash` cells inherit.
import os
import subprocess

_sourced_env = subprocess.run(
    ["bash", "-c", "source cellranger-6.0.1/sourceme.bash >/dev/null && env -0"],
    check=True,  # fail loudly if the script is missing or errors out
    stdout=subprocess.PIPE,
).stdout
os.environ.update(
    entry.split("=", 1)
    for entry in os.fsdecode(_sourced_env).split("\0")
    if "=" in entry
)

In [8]:
!mkdir -p fastqs
if COLAB:
    # download the relevant data
    !wget -O fastqs/SRR9916613_R1.fastq.gz \
    -q --show-progress --progress=bar:force  \
    ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/003/SRR9916613/SRR9916613_1.fastq.gz \
    2>&1

    # download the relevant data
    !wget -O fastqs/SRR9916613_R2.fastq.gz \
    -q --show-progress --progress=bar:force  \
    ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR991/003/SRR9916613/SRR9916613_2.fastq.gz \
    2>&1



In [None]:
# Targets the genes in panel 2 (panel_2_chr8.csv).
# Uses the guide RNAs from the chr8 control perturbations (gRNA_chr8_control.csv).

# Preprocess with kb

In [14]:
%%time
# !FB="BMGP_2020/references/tapseq/kite/feature_barcodes.txt" && \
# INDEX="BMGP_2020/references/tapseq/kite/features.idx" && \
# F2B="BMGP_2020/references/tapseq/kite/f2b.txt" && \
# FASTA="BMGP_2020/references/tapseq/kite/features.fa" && \
!FB="../../../references/tapseq/kite/feature_barcodes.txt" && \
INDEX="../../../references/tapseq/kite/features.idx" && \
F2B="../../../references/tapseq/kite/f2b.txt" && \
FASTA="../../../references/tapseq/kite/features.fa" && \
kb ref \
-i $INDEX \
-g $F2B \
-f1 $FASTA \
--workflow kite \
$FB

[2021-06-09 12:17:57,130]    INFO Generating mismatch FASTA at ../../../references/tapseq/kite/features.fa
[2021-06-09 12:17:57,182]    INFO Creating transcript-to-gene mapping at ../../../references/tapseq/kite/f2b.txt
[2021-06-09 12:17:57,219]    INFO Indexing ../../../references/tapseq/kite/features.fa to ../../../references/tapseq/kite/features.idx
CPU times: user 18 ms, sys: 19.4 ms, total: 37.3 ms
Wall time: 1.79 s


In [15]:
%%bash
# Count feature barcodes with `kb count` against the kite index and write the
# results (including an h5ad file and a bustools-filtered matrix) to ./kite.
set -e  # stop at the first failing command

# Use the in-repo reference directory when it exists; otherwise fall back to
# the ./BMGP_2020 clone that the Colab setup cell creates.
REF_ROOT="../../../references"
[ -d "$REF_ROOT" ] || REF_ROOT="BMGP_2020/references"
KITE_DIR="$REF_ROOT/tapseq/kite"
INDEX="$KITE_DIR/features.idx"
F2B="$KITE_DIR/f2b.txt"
OUT="kite"

# Collect the FASTQs as R1/R2 pairs, in the order kb expects
# (R1a R2a R1b R2b ...). Matching on the "_R1.fastq.gz" suffix, rather than
# grepping for "R1" anywhere in the name, keeps accessions that happen to
# contain "R1" or "R2" from being picked up in the wrong list.
FASTQS=()
for r1 in fastqs/*_R1.fastq.gz; do
    FASTQS+=("$r1" "${r1%_R1.fastq.gz}_R2.fastq.gz")
done

kb count \
    -i "$INDEX" \
    -g "$F2B" \
    -x 10xv2 \
    -o "$OUT" \
    --h5ad \
    --filter bustools \
    "${FASTQS[@]}"

[2021-06-09 12:19:10,908]    INFO Using index ../../../references/tapseq/kite/features.idx to generate BUS file to kite from
[2021-06-09 12:19:10,908]    INFO         fastqs/SRR9916613_R1.fastq.gz
[2021-06-09 12:19:10,909]    INFO         fastqs/SRR9916613_R2.fastq.gz
[2021-06-09 12:20:05,695]    INFO Sorting BUS file kite/output.bus to kite/tmp/output.s.bus
[2021-06-09 12:20:09,318]    INFO Whitelist not provided
[2021-06-09 12:20:09,319]    INFO Copying pre-packaged 10XV2 whitelist to kite
[2021-06-09 12:20:09,408]    INFO Inspecting BUS file kite/tmp/output.s.bus
[2021-06-09 12:20:09,772]    INFO Correcting BUS records in kite/tmp/output.s.bus to kite/tmp/output.s.c.bus with whitelist kite/10xv2_whitelist.txt
[2021-06-09 12:20:10,362]    INFO Sorting BUS file kite/tmp/output.s.c.bus to kite/output.unfiltered.bus
[2021-06-09 12:20:13,306]    INFO Generating count matrix kite/counts_unfiltered/cells_x_genes from BUS file kite/output.unfiltered.bus
[2021-06-09 12:20:13,567]    INFO Rea