<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/align_macaque_PBMC_data/1_align_to_host.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download raw data

In [None]:
!pip install -q ffq
import json

In [None]:
# Use ffq to get all FTP download links
out = "GSE158390_data.json"
!ffq GSE158390 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))
data[0]

In [None]:
%%time
# Optional: uncomment the loop below to download the complete dataset
# (106 paired fastqs containing a total of 30 billion reads)
# for dataset in data:
#     url = dataset["url"]
#     !curl -O $url

# Download only the first two entries of `data` to demonstrate this notebook.
# The "url" field of each entry is handed to curl; -O saves the file in the
# current working directory under its remote file name.
for dataset in data[:2]:
    url = dataset["url"]
    !curl -O $url

# Align to the macaque and dog (to account for the MDCK spike-in) reference genomes

In [None]:
!pip install -q gget kb_python
import os
import glob
# Number of threads used for the alignment
threads = 8 # Set to 2 if not using a TPU runtime

Generate a batch file so that all fastq files can be aligned in a single run:

In [None]:
# Collect the names of all gzipped fastq files in the current working
# directory (any leading directory component is dropped), in sorted order
fastqs = sorted(
    path.split("/")[-1] for path in glob.glob("*.fastq.gz")
)

In [None]:
# Number of fastq files found (shown as the cell output)
len(fastqs)

In [None]:
# The sample ID is the part of each fastq file name before the first underscore.
# The unique IDs are sorted because the iteration order of a set of strings
# differs between Python sessions; sorting keeps the sample order (and the
# batch file written from it) identical on every run.
samples = sorted({fastq.split("_")[0] for fastq in fastqs})
len(samples)

In [None]:
sample_batch_file = "batch.txt"
# Write one tab-separated line per sample:
# <sample ID> <read 1 fastq> <read 2 fastq>
with open(sample_batch_file, "w") as batch_file:
    for sample in samples:
        fastq1, fastq2 = (sample + suffix for suffix in ("_1.fastq.gz", "_2.fastq.gz"))
        print(sample, fastq1, fastq2, sep="\t", file=batch_file)

### Generate combined macaque / canine reference index

In [None]:
# Download macaque and dog reference genomes and gtf files
# (-w dna,gtf: fetch the genome FASTA and the GTF annotation; -r 110: Ensembl release;
#  -d: download the files instead of only returning their links)
!gget ref -w dna,gtf -r 110 -d macaca_mulatta
!gget ref -w dna,gtf -r 110 -d canis_lupus_familiaris

# Names of the downloaded reference files. They are hardcoded (the GTF names
# contain the release number), so update them if the release above is changed.
macaque_fasta = "Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz"
macaque_gtf = "Macaca_mulatta.Mmul_10.110.gtf.gz"
canine_fasta = "Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz"
canine_gtf = "Canis_lupus_familiaris.ROS_Cfam_1.0.110.gtf.gz"

In [None]:
# Output files written by kb ref: the index (-i), the transcript-to-gene
# mapping (-g) and the transcriptome FASTA (-f1)
index = "host_index.idx"
t2g = "host_t2g.txt"
fa = "host_transcriptome.fa"

# Build one combined index by passing the genomes and annotations as
# comma-separated lists. The species order must be the same in both lists
# (here: canine first, then macaque).
!kb ref \
    -t $threads \
    -i $index \
    -g $t2g \
    -f1 $fa \
    $canine_fasta,$macaque_fasta $canine_gtf,$macaque_gtf

### Align to combined macaque and canine reference index

In [None]:
# Output directory for the kallisto bus and bustools results of the host alignment
host_out_folder = "host"

In [None]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Install bustools from source
!git clone -q https://github.com/BUStools/bustools.git
!cd bustools && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"
bustools = "/content/bustools/build/src/bustools"

In [None]:
%%time
# Align all samples listed in the batch file against the combined host index
# -B: batch file with one line per sample (sample ID, read 1 fastq, read 2 fastq)
# --batch-barcodes: per the kallisto help text, records both the batch and the
#     extracted barcode in the BUS file
# -x: custom technology string "barcode:UMI:cDNA", each given as file,start,stop:
#     barcode = file 0, bases 0-12; UMI = file 0, bases 12-20; cDNA = file 1
#     (a stop of 0 means "to the end of the read")
!$kallisto bus \
      -i $index \
      -o $host_out_folder \
      -t $threads \
      -B $sample_batch_file \
      --batch-barcodes \
      -x 0,0,12:0,12,20:1,0,0

In [None]:
%%time
# Sort bus file (-m: maximum memory, -t: threads, -o: sorted output file)
!$bustools sort \
    -m 4G \
    -t $threads \
    -o $host_out_folder/output_sorted.bus \
    $host_out_folder/output.bus

In [None]:
%%time
# Generate a barcode onlist from the sorted bus file, since SeqWell does not have a predefined onlist
# --threshold: minimum number of times a barcode must appear to be included in the onlist
!$bustools allowlist \
    --threshold 1000 \
    -o $host_out_folder/bustools_onlist.txt \
    $host_out_folder/output_sorted.bus

In [None]:
%%time
# Correct barcodes based on the onlist generated in the previous step
# (-w: onlist file, -o: corrected output file)
!$bustools correct \
    -w $host_out_folder/bustools_onlist.txt \
    -o $host_out_folder/output_sorted_corrected.bus \
    $host_out_folder/output_sorted.bus

In [None]:
%%time
# Sort the barcode-corrected bus file again; the sorted file is the input
# of the count step below
!$bustools sort \
    -m 4G \
    -t $threads \
    -o $host_out_folder/output_sorted_corrected_sorted.bus \
    $host_out_folder/output_sorted_corrected.bus

In [None]:
%%time
# Generate count matrix
# --genecounts: aggregate counts to genes using the transcript-to-gene mapping (-g)
# -e / -t: equivalence class map and transcript list from the kallisto bus output folder
# -o: output prefix for the count matrix files
!$bustools count \
    --genecounts \
    -o $host_out_folder/bustools_count/ \
    -g $t2g \
    -e $host_out_folder/matrix.ec \
    -t $host_out_folder/transcripts.txt \
    $host_out_folder/output_sorted_corrected_sorted.bus