# Extract Plasmodium loci

In [None]:
# Prerequisite (run in a terminal before starting Jupyter, not in this cell):
# the phyluce_* commands called from the %%bash cells are presumably provided
# by this conda environment.
#conda activate phyluce

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from Bio import SeqIO
import pathlib
import glob
import re

In [2]:
# Work from the project root; the sample list and the results directories are
# addressed relative to it.
os.chdir('/master/nplatt/patho_detect/')

In [4]:
# Sample identifiers: one entry per line of the sample list.
samples = pathlib.Path('data/19047-23/samples.list').read_text().splitlines()

In [3]:
# Create the results directory and its logs/ subdirectory.
# makedirs(..., exist_ok=True) keeps the cell re-runnable: os.mkdir raises
# FileExistsError as soon as the directories exist from a previous run.
os.makedirs('results/extract_plasmodium_loci/logs', exist_ok=True)

In [5]:
# Switch into the results directory; the relative paths used by the following
# cells (confs/, ncbi-genomes/, ...) are resolved from here.
os.chdir("/master/nplatt/patho_detect/results/extract_plasmodium_loci")

## UCEs from reference genomes

Plasmodium

In [None]:
%%bash

# Stage the reference genomes under <genus>_<species>_<accession> names and
# convert each one to 2bit, leaving one directory per genome that holds the
# .fa.gz, .2bit and .tab (twoBitInfo) files.
#
# Genomes were downloaded beforehand - here all refseq representatives (n=36) - to
# "/master/nplatt/patho_detect/data/plasmodium_genomes/"

# stop at the first failing command instead of carrying on with partial output
set -euo pipefail

IN_DIR="/master/nplatt/patho_detect/data/plasmodium_genomes"
OUT_DIR="/master/nplatt/patho_detect/results/extract_plasmodium_loci/ncbi-genomes"

# -p: do not fail when the cell is re-run
mkdir -p "$OUT_DIR"

#get plasmodium genomes (to be modified)
cp "$IN_DIR/GCA_001625125.1_ASM162512v1_genomic.fna.gz" "$OUT_DIR/haemoproteus_tartakovskyi_GCA_001625125.fa.gz"
cp "$IN_DIR/GCF_000002415.2_ASM241v2_genomic.fna.gz" "$OUT_DIR/plasmodium_vivax_GCF_000002415.fa.gz"
cp "$IN_DIR/GCF_000002765.5_GCA_000002765_genomic.fna.gz" "$OUT_DIR/plasmodium_falciparum_GCF_000002765.fa.gz"
cp "$IN_DIR/GCF_000006355.2_GCA_000006355.2_genomic.fna.gz" "$OUT_DIR/plasmodium_knowlesi_GCF_000006355.fa.gz"
cp "$IN_DIR/GCF_000321355.1_PcynB_1.0_genomic.fna.gz" "$OUT_DIR/plasmodium_cynomolgi_GCF_000321355.fa.gz"
cp "$IN_DIR/GCF_000524495.1_Plas_inui_San_Antonio_1_V1_genomic.fna.gz" "$OUT_DIR/plasmodium_inui_GCF_000524495.fa.gz"
cp "$IN_DIR/GCF_000956335.1_Plas_frag_nilgiri_V1_genomic.fna.gz" "$OUT_DIR/plasmodium_fragile_GCF_000956335.fa.gz"
cp "$IN_DIR/GCF_001601855.1_ASM160185v1_genomic.fna.gz" "$OUT_DIR/plasmodium_reichenowi_GCF_001601855.fa.gz"
cp "$IN_DIR/GCF_001602025.1_ASM160202v1_genomic.fna.gz" "$OUT_DIR/plasmodium_gaboni_GCF_001602025.fa.gz"
cp "$IN_DIR/GCF_001680005.1_ASM168000v1_genomic.fna.gz" "$OUT_DIR/plasmodium_coatneyi_GCF_001680005.fa.gz"
cp "$IN_DIR/GCF_002157705.1_Pgonderi_assembly01_genomic.fna.gz" "$OUT_DIR/plasmodium_gonderi_GCF_002157705.fa.gz"
cp "$IN_DIR/GCF_900002375.2_GCA_900002375_genomic.fna.gz" "$OUT_DIR/plasmodium_berghei_GCF_900002375.fa.gz"
cp "$IN_DIR/GCF_900002385.2_GCA_900002385_genomic.fna.gz" "$OUT_DIR/plasmodium_yoelii_GCF_900002385.fa.gz"
cp "$IN_DIR/GCF_900005765.1_PRELSG_genomic.fna.gz" "$OUT_DIR/plasmodium_relictum_GCF_900005765.fa.gz"
cp "$IN_DIR/GCF_900005855.1_PGAL8A_genomic.fna.gz" "$OUT_DIR/plasmodium_gallinaceum_GCF_900005855.fa.gz"
cp "$IN_DIR/GCF_900090045.1_PmUG01_genomic.fna.gz" "$OUT_DIR/plasmodium_malariae_GCF_900090045.fa.gz"
cp "$IN_DIR/GCF_900097015.1_PADLG01_genomic.fna.gz" "$OUT_DIR/plasmodium_g2_GCF_900097015.fa.gz"
cp "$IN_DIR/GCF_900681995.1_PVVCY_v1_genomic.fna.gz" "$OUT_DIR/plasmodium_vinckei_GCF_900681995.fa.gz"

# convert every staged genome to 2bit and file it in its own directory
for FAS in "$OUT_DIR"/*.fa.gz; do
    NAME=$(basename "$FAS" .fa.gz)

    faToTwoBit "$FAS" "$OUT_DIR/$NAME.2bit"
    twoBitInfo "$OUT_DIR/$NAME.2bit" "$OUT_DIR/$NAME.tab"

    mkdir -p "$OUT_DIR/$NAME"

    # Move the three files by name. A "$NAME"* glob would also match the
    # destination directory itself and make mv report an error.
    mv "$OUT_DIR/$NAME.fa.gz" "$OUT_DIR/$NAME.2bit" "$OUT_DIR/$NAME.tab" "$OUT_DIR/$NAME/"
done



In [13]:
%%bash

# Pull the apicomplexa probes out of the decon probe set: every line that
# mentions "apicomplexa" (case-insensitive) plus the line following it.
grep --no-group-separator -i -A1 apicomplexa ~/patho_detect/decon_probes.fas > apicomplexa_probes.fas

# Rewrite the probe names to the "uce-" prefix used by the Faircloth format.
sed -i 's/uce_apicomplexa_/uce-/' apicomplexa_probes.fas

In [None]:
%%bash

# Align the apicomplexa probes against every reference genome with lastz.

# the open-file limit has to be above 4k for this step
ulimit -n 8192

BASE=/master/nplatt/patho_detect/results/extract_plasmodium_loci

# genome names, as used for the directories under ncbi-genomes/
GENOMES=(
    haemoproteus_tartakovskyi_GCA_001625125
    plasmodium_vivax_GCF_000002415
    plasmodium_falciparum_GCF_000002765
    plasmodium_knowlesi_GCF_000006355
    plasmodium_cynomolgi_GCF_000321355
    plasmodium_inui_GCF_000524495
    plasmodium_fragile_GCF_000956335
    plasmodium_reichenowi_GCF_001601855
    plasmodium_gaboni_GCF_001602025
    plasmodium_coatneyi_GCF_001680005
    plasmodium_gonderi_GCF_002157705
    plasmodium_berghei_GCF_900002375
    plasmodium_yoelii_GCF_900002385
    plasmodium_relictum_GCF_900005765
    plasmodium_gallinaceum_GCF_900005855
    plasmodium_malariae_GCF_900090045
    plasmodium_g2_GCF_900097015
    plasmodium_vinckei_GCF_900681995
)

phyluce_probe_run_multiple_lastzs_sqlite \
    --db ncbi-genomes-lastz/ncbi-genomes.sqlite \
    --output ncbi-genomes-lastz \
    --identity 0.75 \
    --scaffoldlist "${GENOMES[@]}" \
    --genome-base-path "$BASE/ncbi-genomes/" \
    --probefile apicomplexa_probes.fas \
    --cores 24 \
    --log-path "$BASE/logs/"

In [16]:
# Write confs/genomes.conf: a [scaffolds] section with one "name:path" line per
# 2bit genome found under ncbi-genomes/.

# Create confs/ first; open(..., 'w') does not create missing directories, so
# the cell would otherwise fail on a fresh run.
os.makedirs("confs", exist_ok=True)

with open("confs/genomes.conf", 'w') as out_f:
    out_f.write("[scaffolds]\n")
    # sorted() gives a stable line order (glob order depends on the filesystem)
    for twobit in sorted(glob.glob("ncbi-genomes/*/*2bit")):
        # genome name = file name without the .2bit extension
        name = twobit.split("/")[-1].replace(".2bit", "")
        out_f.write("{}:{}\n".format(name, twobit))

In [14]:
%%bash

# Slice every UCE locus, with 1 kb of flanking sequence, out of the reference
# genomes listed in confs/genomes.conf. The resulting genome UCE contigs are
# combined with the sample assemblies further down.
phyluce_probe_slice_sequence_from_genomes \
    --conf confs/genomes.conf \
    --lastz ncbi-genomes-lastz \
    --name-pattern "apicomplexa_probes.fas_v_{}.lastz.clean" \
    --flank 1000 \
    --output ncbi-genomes-uce-fasta

bash: line 4: phyluce_probe_slice_sequence_from_genomes: command not found


## UCEs from enriched samples

In [27]:
# Make sure the working directory is the results directory before the
# sample-processing cells (the same directory that was set near the top of the
# notebook).
os.chdir("/master/nplatt/patho_detect/results/extract_plasmodium_loci")

### Get reads that map to Plasmodium from Bartonella Kraken run

In [18]:
# For every sample, write the IDs of the reads whose Kraken entry mentions
# Plasmodium to plasmodium-reads/<sample>/reads.txt.
kraken_root = "/master/nplatt/patho_detect/results/extract_bartonella_loci/04_kraken2"
reads_root = "/master/nplatt/patho_detect/results/extract_plasmodium_loci/plasmodium-reads"

for sample in tqdm(samples):

    # short sample name: everything before the first underscore
    short_name = sample.split("_")[0]

    # directory that will hold the read list and the extracted reads
    sample_dir = pathlib.Path("{}/{}/".format(reads_root, short_name))
    sample_dir.mkdir(parents=True, exist_ok=True)

    # Kraken table listing reads with their terminal taxa
    results_tbl = "{}/{}/{}_kraken.tbl".format(kraken_root, short_name, short_name)
    read_list = "{}/{}/reads.txt".format(reads_root, short_name)

    with open(results_tbl, 'r') as in_f, open(read_list, 'w') as out_f:
        # keep entries that mention the target taxon (case-insensitive) ...
        hits = (entry for entry in in_f if "plasmodium" in entry.lower())
        # ... and write their second tab-separated field (the read ID)
        out_f.writelines("{}\n".format(entry.split("\t")[1]) for entry in hits)
                    

  0%|          | 0/54 [00:00<?, ?it/s]

In [23]:
#extract the reads
                           
samples_w_gt10k_reads = []
for sample in tqdm(samples):
    
    short_name=sample.split("_")[0]

    for read in ["1", "2"]:
        in_reads  = "/master/nplatt/patho_detect/results/extract_bartonella_loci/03_clean-fastq/{}/split-adapter-quality-trimmed/{}-READ{}.fastq.gz".format(short_name, short_name, read)
        in_list   = "/master/nplatt/patho_detect/results/extract_plasmodium_loci/plasmodium-reads/{}/reads.txt".format(short_name)
        out_reads = "/master/nplatt/patho_detect/results/extract_plasmodium_loci/plasmodium-reads/{}/{}-READ{}.fastq".format(short_name, short_name, read)

        #count num reads (only proceed if greater than 10K)
        count = len(open(in_list).readlines(  ))
        
        if count >= 10_000:
            samples_w_gt10k_reads.append(short_name)
            #extract reads with seqtk
            cmd="seqtk subseq {} {} >{}".format(in_reads, in_list, out_reads)
            !{cmd}
            
samples_w_gt10k_reads=list(set(samples_w_gt10k_reads))

  0%|          | 0/54 [00:00<?, ?it/s]

In [24]:
# Show the samples that passed the read-count threshold.
samples_w_gt10k_reads

['TK164728',
 'TK164845',
 'TK25651',
 'TK25656',
 'TK164683',
 'TK164731',
 'TK164702',
 'TK164715']

In [None]:
%%bash

# gzip all of the extracted fastq files, one background job per file
for FASTQ in plasmodium-reads/*/*-READ*.fastq; do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "$FASTQ" ] || continue
    gzip "$FASTQ" &
done

# Block until every background gzip has finished, so that the cell only
# returns once all files are fully compressed.
wait

### Assemble Plasmodium loci

In [28]:
# Write confs/assembly.conf for phyluce: a [samples] section with one
# "name:read_directory" line per selected sample.
read_dir_tmpl = "/master/nplatt/patho_detect/results/extract_plasmodium_loci/plasmodium-reads/{}/"

with open("confs/assembly.conf", 'w') as out_f:
    out_f.write("[samples]\n")

    short_names = (sample.split("_")[0] for sample in samples_w_gt10k_reads)
    out_f.writelines(
        "{}:{}\n".format(name, read_dir_tmpl.format(name)) for name in short_names
    )

In [None]:
%%bash

# Run the phyluce SPAdes assembly for the samples listed in confs/assembly.conf.
BASE=/master/nplatt/patho_detect/results/extract_plasmodium_loci

phyluce_assembly_assemblo_spades \
    --conf "$BASE/confs/assembly.conf" \
    --output "$BASE/spades-assemblies" \
    --cores 24 \
    --memory 768 \
    --log-path "$BASE/logs"

In [None]:
%%bash

# Pool the reference-genome UCE contigs and the SPAdes sample assemblies in a
# single all_assemblies/ directory.

set -euo pipefail

BASE=/master/nplatt/patho_detect/results/extract_plasmodium_loci
ALL="$BASE/all_assemblies"

# -p: do not fail when the cell is re-run
mkdir -p "$ALL"

# use the same absolute destination as the mkdir, independent of the current directory
cp "$BASE"/ncbi-genomes-uce-fasta/*.fasta "$ALL"/
cp "$BASE"/spades-assemblies/contigs/*.fasta "$ALL"/

# Rename x.contigs.fasta -> x.fasta. Done with mv because the `rename` command
# has two incompatible syntaxes (util-linux vs. Perl) depending on the system.
for CONTIGS in "$ALL"/*.contigs.fasta; do
    [ -e "$CONTIGS" ] || continue
    mv "$CONTIGS" "${CONTIGS%.contigs.fasta}.fasta"
done

### Find UCE loci in assemblies

In [None]:
%%bash

# Search all pooled assemblies for the regions targeted by the apicomplexa probes.
BASE=/master/nplatt/patho_detect/results/extract_plasmodium_loci

phyluce_assembly_match_contigs_to_probes \
    --contigs "$BASE/all_assemblies" \
    --probes "$BASE/apicomplexa_probes.fas" \
    --min-identity 75 \
    --output "$BASE/plasmodium_uce-search-results" \
    --log-path "$BASE/logs"

### Extract Loci

In [29]:
# Write confs/taxon-set.conf for phyluce: an [all] group with one taxon name
# per fasta file in all_assemblies/.
with open("confs/taxon-set.conf", 'w') as out_f:
    out_f.write("[all]\n")
    # taxon name = file name with ".fasta" removed
    taxa = (
        fasta.replace(".fasta", "").split("/")[-1]
        for fasta in glob.glob("all_assemblies/*.fasta")
    )
    out_f.writelines("{}\n".format(taxon) for taxon in taxa)

In [None]:
%%bash

# Build the data-matrix configuration file for the "all" taxon group
# (incomplete matrix).
LOGS=/master/nplatt/patho_detect/results/extract_plasmodium_loci/logs
OUT_DIR=taxon-sets/all

mkdir -p "$OUT_DIR"

phyluce_assembly_get_match_counts \
    --locus-db plasmodium_uce-search-results/probe.matches.sqlite \
    --taxon-list-config confs/taxon-set.conf \
    --taxon-group all \
    --incomplete-matrix \
    --output "$OUT_DIR/all-taxa-incomplete.conf" \
    --log-path "$LOGS"

In [None]:
%%bash

# Shell commands need the %%bash cell magic; without it the cell is parsed as
# Python and fails. Every %%bash cell is a separate shell, so the working
# directory is set explicitly here.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# get FASTA data for taxa in our taxon set
phyluce_assembly_get_fastas_from_match_counts \
    --contigs /master/nplatt/patho_detect/results/extract_plasmodium_loci/all_assemblies \
    --locus-db /master/nplatt/patho_detect/results/extract_plasmodium_loci/plasmodium_uce-search-results/probe.matches.sqlite \
    --match-count-output all-taxa-incomplete.conf \
    --output all-taxa-incomplete.fasta \
    --incomplete-matrix all-taxa-incomplete.incomplete \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# split the combined fasta into one file per taxon
phyluce_assembly_explode_get_fastas_file \
    --input all-taxa-incomplete.fasta \
    --output exploded-fastas \
    --by-taxon

# get summary stats on the FASTAS
# NOTE(review): the header spells "median legnth"; left unchanged in case
# something downstream reads the column by that name.
echo "samples,contigs,total bp,mean length,95 CI length,min length,max length,median legnth,contigs >1kb">uce_stats.csv

# one CSV row per taxon, appended below the header
for i in exploded-fastas/*.fasta; do
    phyluce_assembly_get_fasta_lengths \
        --input "$i" \
        --csv
done >>uce_stats.csv


### Align and trim loci

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# align the data
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-edge-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# align again without trimming (--no-trim), writing fasta alignments
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-internal-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --output-format fasta \
    --no-trim \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# trim the untrimmed alignments with gblocks
phyluce_align_get_gblocks_trimmed_alignments_from_untrimmed \
    --alignments mafft-nexus-internal-trimmed \
    --output mafft-nexus-internal-trimmed-gblocks \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# remove the locus names from the trimmed alignment files
phyluce_align_remove_locus_name_from_files \
    --alignments mafft-nexus-internal-trimmed-gblocks \
    --output mafft-nexus-internal-trimmed-gblocks-clean \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# Keep only loci present in at least 25% of the taxa.
# --taxa has to be the total number of taxa in the data set, because the
# minimum taxon count is derived from percent x taxa. With a hard-coded
# "--taxa 4" the 25% threshold came out as a single taxon, i.e. nothing was
# filtered. The count is taken from the taxon-set conf instead: every
# non-blank line that is not a [section] header.
N_TAXA=$(grep -c -v -e '^\[' -e '^[[:space:]]*$' \
    /master/nplatt/patho_detect/results/extract_plasmodium_loci/confs/taxon-set.conf)

phyluce_align_get_only_loci_with_min_taxa \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean \
    --taxa "$N_TAXA" \
    --percent 0.25 \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# concatenate the filtered alignments into one nexus alignment
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --nexus \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly (the cp source is relative).
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# -p: do not fail when the directory already exists
mkdir -p /master/nplatt/patho_detect/results/extract_plasmodium_loci/uce_alignments

# keep a copy of the concatenated (nexus) alignment files
cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_plasmodium_loci/uce_alignments/

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# This run reuses the output directory of the nexus concatenation. phyluce is
# expected to ask for interactive confirmation before replacing an existing
# output directory, which cannot be answered from a %%bash cell, so the old
# directory is removed up front.
# NOTE(review): make sure the nexus files have been copied to uce_alignments/
# before running this cell.
rm -rf mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml

# concatenate the filtered alignments into one phylip alignment
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --phylip \
    --log-path /master/nplatt/patho_detect/results/extract_plasmodium_loci/logs

In [None]:
%%bash

# %%bash is required for shell commands; the working directory does not carry
# over between %%bash cells, so it is set explicitly (the cp source is relative).
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci/taxon-sets/all

# keep a copy of the concatenated (phylip) alignment files
cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_plasmodium_loci/uce_alignments/

# Build phylogeny

In [None]:
%%bash

# Shell commands need the %%bash cell magic; without it the cell is parsed as
# Python and fails.
cd /master/nplatt/patho_detect/results/extract_plasmodium_loci

# ML tree search plus bootstrapping (--all) on the concatenated phylip alignment:
# 10 parsimony starting trees, 200 bootstrap replicates, fixed seed.
# NOTE(review): the output prefix says "bacillus" although this notebook
# analyses Plasmodium - probably copied from another notebook. Left unchanged
# so existing output file names stay valid.
raxml-ng \
    --all \
    --prefix 25pn_75pid_bacillus \
    --seed 12345 \
    --msa /master/nplatt/patho_detect/results/extract_plasmodium_loci/uce_alignments/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml.phylip \
    --msa-format PHYLIP \
    --data-type DNA \
    --model GTR+G \
    --tree 'pars{10}' \
    --bs-trees 200 \
    --threads 48 \
    --workers 48