<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/align_macaque_PBMC_data/align_to_host.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Align the data from Kotliar *et al.* to the host reference genomes: macaque and dog (the latter to account for the MDCK spike-in)

# Download raw data

In [1]:
!pip install -q ffq
import json

out = "GSE158390_data.json"

# # Download the complete dataset (106 paired fastqs containing a total of 30 billion reads)
# !ffq GSE158390 --ftp -o $out

# Download only two fastq pairs to demonstrate this notebook
!ffq SRR12698499 SRR12698500 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))

for dataset in data:
    url = dataset["url"]
    !curl -O $url

[2023-12-13 04:56:38,436]    INFO Parsing run SRR12698499
[2023-12-13 04:56:40,716]    INFO Parsing run SRR12698500
4
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4558M  100 4558M    0     0  45.4M      0  0:01:40  0:01:40 --:--:-- 43.6M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11.9G  100 11.9G    0     0  42.4M      0  0:04:47  0:04:47 --:--:-- 42.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4046M  100 4046M    0     0  45.9M      0  0:01:28  0:01:28 --:--:-- 47.2M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.6G  10

# Align to the macaque and dog reference genomes

In [2]:
# Install gget (used below for `gget ref`) and kb_python (used below for `kb ref`)
!pip install -q gget kb_python
import os
import glob
# Number of threads passed to kb ref, kallisto bus and bustools sort
threads = 8 # Set to 2 if not using a TPU runtime

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.1/43.1 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.2/119.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.9/21.9 MB[0m [31m68.2 MB/s[

Generate batch file so we can align all fastq files at once:

In [3]:
# Names (without any directory part) of all gzipped fastq files in the
# working directory, in sorted order
fastqs = sorted(path.split("/")[-1] for path in glob.glob("*.fastq.gz"))

In [4]:
# Number of fastq files found in the working directory
len(fastqs)

4

In [5]:
# Derive the unique sample names from the fastq file names: everything before
# the first underscore (e.g. "SRR12698499_1.fastq.gz" -> "SRR12698499").
# The names are sorted because the iteration order of a set of strings can
# change between kernel sessions, which would make the order of the batch
# file (and of the alignment) non-reproducible.
samples = sorted({fastq.split("_")[0] for fastq in fastqs})
len(samples)

2

In [6]:
# Batch file with one tab-separated line per sample:
# <sample name> <TAB> <sample>_1.fastq.gz <TAB> <sample>_2.fastq.gz
sample_batch_file = "batch.txt"
with open(sample_batch_file, "w") as batch_file:
    batch_file.writelines(
        f"{sample}\t{sample}_1.fastq.gz\t{sample}_2.fastq.gz\n" for sample in samples
    )

### Generate combined macaque / canine reference index

In [7]:
# Download macaque and dog reference genomes and gtf files
!gget ref -w dna,gtf -r 110 -d macaca_mulatta
!gget ref -w dna,gtf -r 110 -d canis_lupus_familiaris

macaque_fasta = "Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz"
macaque_gtf = "Macaca_mulatta.Mmul_10.110.gtf.gz"
canine_fasta = "Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz"
canine_gtf = "Canis_lupus_familiaris.ROS_Cfam_1.0.110.gtf.gz"

Wed Dec 13 05:09:40 2023 INFO Fetching reference information for macaca_mulatta from Ensembl release: 110.
{
    "macaca_mulatta": {
        "genome_dna": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/fasta/macaca_mulatta/dna/Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz",
            "ensembl_release": 110,
            "release_date": "2023-04-21",
            "release_time": "17:52",
            "bytes": "832M"
        },
        "annotation_gtf": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/gtf/macaca_mulatta/Macaca_mulatta.Mmul_10.110.gtf.gz",
            "ensembl_release": 110,
            "release_date": "2023-04-26",
            "release_time": "11:45",
            "bytes": "19M"
        }
    }
}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  832M  100  832M    0     0   687k      0  0:20:39  0:20:39 --:--:--  687k
  % Total    % Receive

In [8]:
index = "host_index.idx"
t2g = "host_t2g.txt"
fa = "host_transcriptome.fa"

!kb ref \
    -t $threads \
    -i $index \
    -g $t2g \
    -f1 $fa \
    $canine_fasta,$macaque_fasta $canine_gtf,$macaque_gtf

[2023-12-13 05:48:49,496]    INFO [ref] Preparing Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz, Canis_lupus_familiaris.ROS_Cfam_1.0.110.gtf.gz
[2023-12-13 05:49:28,082]    INFO [ref] Splitting genome Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz into cDNA at /content/tmp/tmpinekqsth
[2023-12-13 05:50:46,307]    INFO [ref] Preparing Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz, Macaca_mulatta.Mmul_10.110.gtf.gz
[2023-12-13 05:51:29,833]    INFO [ref] Splitting genome Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz into cDNA at /content/tmp/tmp2nvjiol7
[2023-12-13 05:55:11,349]    INFO [ref] Concatenating 2 cDNAs to host_transcriptome.fa
[2023-12-13 05:55:12,690]    INFO [ref] Creating transcript-to-gene mapping at host_t2g.txt
[2023-12-13 05:55:16,041]    INFO [ref] Indexing host_transcriptome.fa to host_index.idx


### Align to combined macaque and canine reference index

In [9]:
# Folder that receives the kallisto bus output and all bustools results below
host_out_folder = "host"

In [10]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Install bustools from source
!git clone -q https://github.com/BUStools/bustools.git
!cd bustools && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"
bustools = "/content/bustools/build/src/bustools"

  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test COMPILER_SUPPORTS_CXX17
-- Performing Test COMPILER_SUPPORTS_CXX17 - Success
[0mshared build[0m
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- Found ZLIB: /us

In [11]:
%%time
# Align
!$kallisto bus \
      -i $index \
      -o $host_out_folder \
      -t $threads \
      -B $sample_batch_file \
      --batch-barcodes \
      -x 0,0,12:0,12,20:1,0,0


[bus] will try running read files supplied in batch file
[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology
[index] k-mer length: 31
[index] number of targets: 119,563
[index] number of k-mers: 146,102,401
[index] number of D-list k-mers: 5,872,378
[quant] running in single-end mode
[quant] will process file 1: SRR12698500_1.fastq.gz
[quant] will process file 2: SRR12698500_2.fastq.gz
[quant] will process file 1: SRR12698499_1.fastq.gz
[quant] will process file 2: SRR12698499_2.fastq.gz
[progress] 575M reads processed (52.0% mapped)              done
[quant] processed 575,902,616 reads, 299,180,862 reads pseudoaligned

CPU times: user 1min 57s, sys: 15.2 s, total: 2min 12s
Wall time: 5h 48min 30s


In [12]:
%%time
# Sort bus file
!$bustools sort \
    -m 4G \
    -t $threads \
    -o $host_out_folder/output_sorted.bus \
    $host_out_folder/output.bus

partition time: 0.941355s
partition time: 0.868547s
partition time: 0.331474s
Read in 299180862 BUS records
reading time 5.92495s
sorting time 88.7832s
writing time 0s
CPU times: user 970 ms, sys: 121 ms, total: 1.09 s
Wall time: 2min 46s


In [13]:
%%time
# Generate a barcode onlist from data since SeqWell does not have an onlist
# threshold: Minimum number of times a barcode must appear to be included in the onlist
!$bustools allowlist \
    --threshold 1000 \
    -o $host_out_folder/bustools_onlist.txt \
    $host_out_folder/output_sorted.bus

Read in 79373167 BUS records, wrote 34678 barcodes to on-list with threshold 1000
CPU times: user 14.1 ms, sys: 1.95 ms, total: 16.1 ms
Wall time: 1.01 s


In [14]:
%%time
# Correct barcodes based on generated onlist
!$bustools correct \
    -w $host_out_folder/bustools_onlist.txt \
    -o $host_out_folder/output_sorted_corrected.bus \
    $host_out_folder/output_sorted.bus

Found 34624 barcodes in the on-list
Processed 79373167 BUS records
In on-list = 59409606
Corrected    = 9088773
Uncorrected  = 10874788
CPU times: user 134 ms, sys: 13.6 ms, total: 148 ms
Wall time: 23.4 s


In [15]:
%%time
# Sort corrected bus file
!$bustools sort \
    -m 4G \
    -t $threads \
    -o $host_out_folder/output_sorted_corrected_sorted.bus \
    $host_out_folder/output_sorted_corrected.bus

partition time: 0.368369s
 all fits in buffer
Read in 68498379 BUS records
reading time 0.65573s
sorting time 13.8489s
writing time 3.8505s
CPU times: user 146 ms, sys: 17 ms, total: 163 ms
Wall time: 24.3 s


In [16]:
%%time
# Generate count matrix
!$bustools count \
    --genecounts \
    -o $host_out_folder/bustools_count/ \
    -g $t2g \
    -e $host_out_folder/matrix.ec \
    -t $host_out_folder/transcripts.txt \
    $host_out_folder/output_sorted_corrected_sorted.bus

CPU times: user 120 ms, sys: 13.4 ms, total: 133 ms
Wall time: 21.4 s
