# Extract Schistosoma loci

In [None]:
# Prerequisite (run in the terminal before starting Jupyter, not in this cell):
# the phyluce_* command-line tools called by the bash cells must be on PATH.
#conda activate phyluce

In [1]:
import glob
import os
import pathlib
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm.notebook import tqdm

In [2]:
# Move to the project root; the relative paths in the next cells
# (data/..., results/...) are resolved against this directory.
os.chdir('/master/nplatt/patho_detect/')

In [3]:
# Sample names: one list entry per line of the sample list file.
samples = pathlib.Path('data/19047-23/samples.list').read_text().splitlines()

In [4]:
# Create the results directory together with its logs/ sub-directory.
# makedirs(..., exist_ok=True) builds any missing parent and does not raise
# when the directories are left over from a previous run, so the cell can be
# re-executed (Restart & Run All) without a FileExistsError.
os.makedirs('results/extract_schistosoma_loci/logs', exist_ok=True)

In [5]:
# Work from the results directory: later cells use paths relative to it
# (e.g. confs/, ncbi-genomes/, cestoda_probes.fas).
os.chdir("/master/nplatt/patho_detect/results/extract_schistosoma_loci")

## UCEs from reference genomes

Schistosoma

In [None]:
%%bash
# Stop at the first failing command instead of continuing with partial data.
set -euo pipefail

# Stage the reference genomes under <genus>_<species>_<accession> names and
# convert each one to 2bit. Final layout:
#   $OUT_DIR/<name>/<name>.fa.gz
#   $OUT_DIR/<name>/<name>.2bit
#   $OUT_DIR/<name>/<name>.tab
# The genome FASTA files are expected to already be present in $IN_DIR.
IN_DIR="/master/nplatt/patho_detect/data/schistosoma_genomes/"
OUT_DIR="/master/nplatt/patho_detect/results/extract_schistosoma_loci/ncbi-genomes"

# -p: do not fail when the directory already exists from an earlier run
mkdir -p "$OUT_DIR"

# copy and rename the Schistosoma (plus Trichobilharzia) assemblies
cp "$IN_DIR/GCA_000237925.4_GCA_000237925.3_genomic.fna.gz" "$OUT_DIR/schistosoma_mansoni_GCA_000237925.fa.gz"
cp "$IN_DIR/GCA_006368765.1_ASM636876v1_genomic.fna.gz" "$OUT_DIR/schistosoma_japonicum_GCA_006368765.fa.gz"
cp "$IN_DIR/GCA_900618395.1_S_margrebowiei_Zambia_0011_upd_genomic.fna.gz" "$OUT_DIR/schistosoma_margrebowiei_GCA_900618395.fa.gz"
cp "$IN_DIR/GCA_000699445.2_SchHae_2.0_genomic.fna.gz" "$OUT_DIR/schistosoma_haematobium_GCA_000699445.fa.gz"
cp "$IN_DIR/GCA_900617995.1_S_mattheei_Denwood_0011_upd_genomic.fna.gz" "$OUT_DIR/schistosoma_mattheei_GCA_900617995.fa.gz"
cp "$IN_DIR/GCA_900618515.1_T_regenti_v1_0_4_001_upd_genomic.fna.gz" "$OUT_DIR/trichobilharzia_regenti_GCA_900618515.fa.gz"
cp "$IN_DIR/GCA_003958945.1_ASM395894v1_genomic.fna.gz" "$OUT_DIR/schistosoma_bovis_GCA_003958945.fa.gz"
cp "$IN_DIR/GCA_900618015.1_S_curassoni_Dakar_0011_upd_genomic.fna.gz" "$OUT_DIR/schistosoma_curassoni_GCA_900618015.fa.gz"

# convert every staged genome and file it into its own directory
for FAS in "$OUT_DIR"/*.fa.gz; do
    NAME=$(basename "$FAS" .fa.gz)

    faToTwoBit "$FAS" "$OUT_DIR/$NAME.2bit"
    twoBitInfo "$OUT_DIR/$NAME.2bit" "$OUT_DIR/$NAME.tab"

    mkdir -p "$OUT_DIR/$NAME"
    # Move exactly this genome's three files. A "$NAME*" glob would also match
    # the directory just created, and mv would then fail trying to move the
    # directory into itself.
    mv "$OUT_DIR/$NAME.fa.gz" "$OUT_DIR/$NAME.2bit" "$OUT_DIR/$NAME.tab" "$OUT_DIR/$NAME/"
done



In [13]:
%%bash

# Pull the cestoda probes out of the combined probe file.
# -i matches "cestoda" case-insensitively; -A1 keeps the line following each
# match (assumes every probe sequence sits on a single line -- TODO confirm).
grep -i -A1 --no-group-separator cestoda ~/patho_detect/decon_probes.fas > cestoda_probes.fas

# rewrite the "uce_cestoda_" prefix as "uce-" so names follow the Faircloth format
sed -i 's/uce_cestoda_/uce-/' cestoda_probes.fas

In [None]:
%%bash

# the open-file limit needs to be above 4k for this step
ulimit -n 8192

WORK_DIR="/master/nplatt/patho_detect/results/extract_schistosoma_loci"

# genome names; they match the per-genome directories under ncbi-genomes/
SCAFFOLDS=(
    schistosoma_mansoni_GCA_000237925
    schistosoma_japonicum_GCA_006368765
    schistosoma_margrebowiei_GCA_900618395
    schistosoma_haematobium_GCA_000699445
    schistosoma_mattheei_GCA_900617995
    trichobilharzia_regenti_GCA_900618515
    schistosoma_bovis_GCA_003958945
    schistosoma_curassoni_GCA_900618015
)

# align the probes to every genome at 75% identity
phyluce_probe_run_multiple_lastzs_sqlite \
    --probefile cestoda_probes.fas \
    --scaffoldlist "${SCAFFOLDS[@]}" \
    --genome-base-path "$WORK_DIR/ncbi-genomes/" \
    --identity 0.75 \
    --db ncbi-genomes-lastz/ncbi-genomes.sqlite \
    --output ncbi-genomes-lastz \
    --cores 24 \
    --log-path "$WORK_DIR/logs/"

In [7]:
# Write confs/genomes.conf: a [scaffolds] section with one
# "<genome name>:<path to its 2bit file>" line per reference genome.
# The 2bit paths are written relative to the current working directory, so the
# tool reading this file has to be started from the same directory.

# exist_ok=True keeps the cell re-runnable when confs/ already exists
os.makedirs("confs", exist_ok=True)
with open("confs/genomes.conf", 'w') as out_f:
    out_f.write("[scaffolds]\n")
    # glob returns matches in arbitrary order; sort so the file is reproducible
    for twobit in sorted(glob.glob("ncbi-genomes/*/*2bit")):
        # genome name = file name without the .2bit extension
        name = twobit.split("/")[-1].replace(".2bit", "")
        out_f.write("{}:{}\n".format(name, twobit))

In [14]:
%%bash

# Cut each UCE locus, plus 1 kb of flank, out of the reference genomes.
# "{}" in --name-pattern is a placeholder inside the lastz result file name
# (presumably filled with the genome name -- confirm against phyluce docs).
phyluce_probe_slice_sequence_from_genomes \
    --conf confs/genomes.conf \
    --lastz ncbi-genomes-lastz \
    --name-pattern "cestoda_probes.fas_v_{}.lastz.clean" \
    --flank 1000 \
    --output ncbi-genomes-uce-fasta

# the genome UCE contigs written here are later combined with the assemblies

bash: line 4: phyluce_probe_slice_sequence_from_genomes: command not found


## UCEs from enriched samples

In [8]:
# Same results directory the reference-genome section works in; presumably
# repeated so this section can be started on its own -- confirm.
os.chdir("/master/nplatt/patho_detect/results/extract_schistosoma_loci")

### Get reads that map to Schistosoma from Bartonella Kraken run

In [9]:
# For every sample, list the IDs of reads whose Kraken table line mentions
# "schistosoma" and save them to <sample>/reads.txt.
kraken_root = "/master/nplatt/patho_detect/results/extract_bartonella_loci/04_kraken2"
reads_root = "/master/nplatt/patho_detect/results/extract_schistosoma_loci/schistosoma-reads"

for sample in tqdm(samples):

    # short name = everything before the first underscore
    short_name = sample.split("_")[0]

    # per-sample directory holding the read list (and later the reads)
    sample_dir = pathlib.Path("{}/{}/".format(reads_root, short_name))
    sample_dir.mkdir(parents=True, exist_ok=True)

    # Kraken table listing reads with their terminal taxa
    results_tbl = "{}/{}/{}_kraken.tbl".format(kraken_root, short_name, short_name)
    read_list = "{}/{}/reads.txt".format(reads_root, short_name)

    with open(results_tbl, 'r') as in_f, open(read_list, 'w') as out_f:
        for entry in in_f:
            # case-insensitive match anywhere on the line
            if "schistosoma" not in entry.lower():
                continue
            # the second tab-separated field is written out as the read ID
            out_f.write("{}\n".format(entry.split("\t")[1]))
                    

  0%|          | 0/54 [00:00<?, ?it/s]

In [10]:
# Extract the listed reads (both mates) for every sample whose read list has
# at least MIN_READS entries, and remember which samples qualified.
MIN_READS = 10_000  # inclusive threshold on the number of listed read IDs

clean_root = "/master/nplatt/patho_detect/results/extract_bartonella_loci/03_clean-fastq"
reads_root = "/master/nplatt/patho_detect/results/extract_schistosoma_loci/schistosoma-reads"

samples_w_gt10k_reads = []
for sample in tqdm(samples):

    short_name = sample.split("_")[0]
    in_list = "{}/{}/reads.txt".format(reads_root, short_name)

    # count the read IDs once per sample; the with-block closes the file
    with open(in_list) as list_f:
        count = sum(1 for _ in list_f)

    if count < MIN_READS:
        continue

    # keep first-seen order (no set round-trip) so downstream files are reproducible
    if short_name not in samples_w_gt10k_reads:
        samples_w_gt10k_reads.append(short_name)

    for read in ["1", "2"]:
        in_reads = "{}/{}/split-adapter-quality-trimmed/{}-READ{}.fastq.gz".format(clean_root, short_name, short_name, read)
        out_reads = "{}/{}/{}-READ{}.fastq".format(reads_root, short_name, short_name, read)

        # seqtk writes the selected records to stdout, which is redirected
        # into out_reads; check=True raises if seqtk exits with an error
        # instead of silently leaving a truncated file behind.
        with open(out_reads, "w") as out_f:
            subprocess.run(["seqtk", "subseq", in_reads, in_list], stdout=out_f, check=True)

  0%|          | 0/54 [00:00<?, ?it/s]

In [11]:
# display the samples that passed the read-count threshold
samples_w_gt10k_reads

['Hamster1', 'TK25651', 'Hamster2', 'Hamster4']

In [None]:
%%bash

# gzip all of the extracted fastq files, one background job per file
for FASTQ in schistosoma-reads/*/*-READ*.fastq; do
    # when nothing matches, the pattern is left unexpanded -- skip it
    [ -e "$FASTQ" ] || continue
    # -f: replace a stale .gz from an earlier run instead of refusing to overwrite
    gzip -f "$FASTQ" &
done

# block until every background gzip has finished, so the next cell
# never sees a partially written .gz file
wait

### Assemble Schistosoma loci

In [12]:
# Build confs/assembly.conf for phyluce: a [samples] section with one
# "<sample>:<directory holding its reads>" line per retained sample.
reads_root = "/master/nplatt/patho_detect/results/extract_schistosoma_loci/schistosoma-reads"

conf_lines = ["[samples]\n"]
for sample in samples_w_gt10k_reads:
    short_name = sample.split("_")[0]
    read_dir = "{}/{}/".format(reads_root, short_name)
    conf_lines.append("{}:{}\n".format(short_name, read_dir))

with open("confs/assembly.conf", 'w') as out_f:
    out_f.writelines(conf_lines)

In [None]:
%%bash

# run the phyluce SPAdes assembly for the samples listed in assembly.conf
WORK_DIR="/master/nplatt/patho_detect/results/extract_schistosoma_loci"

phyluce_assembly_assemblo_spades \
    --conf "$WORK_DIR/confs/assembly.conf" \
    --output "$WORK_DIR/spades-assemblies" \
    --log-path "$WORK_DIR/logs" \
    --memory 768 \
    --cores 24

In [None]:
%%bash
# Stop at the first failing command instead of continuing with partial data.
set -euo pipefail

# Gather the genome-derived UCE contigs and the SPAdes contigs in one
# directory, all named <taxon>.fasta.
WORK_DIR="/master/nplatt/patho_detect/results/extract_schistosoma_loci"
ASM_DIR="$WORK_DIR/all_assemblies"

# -p: do not fail when the directory already exists from an earlier run
mkdir -p "$ASM_DIR"

# absolute paths throughout, so the cell does not depend on the working directory
cp "$WORK_DIR"/ncbi-genomes-uce-fasta/*.fasta "$ASM_DIR/"
cp "$WORK_DIR"/spades-assemblies/contigs/*.fasta "$ASM_DIR/"

# Turn <sample>.contigs.fasta into <sample>.fasta. A plain mv loop is used
# because the argument syntax of `rename` differs between its util-linux and
# Perl implementations.
for CONTIGS in "$ASM_DIR"/*.contigs.fasta; do
    mv "$CONTIGS" "${CONTIGS%.contigs.fasta}.fasta"
done

### Find UCE loci in assemblies

In [None]:
%%bash

# locate the probed regions in every assembly (minimum identity 75)
WORK_DIR="/master/nplatt/patho_detect/results/extract_schistosoma_loci"

phyluce_assembly_match_contigs_to_probes \
    --probes "$WORK_DIR/cestoda_probes.fas" \
    --contigs "$WORK_DIR/all_assemblies" \
    --min-identity 75 \
    --output "$WORK_DIR/schistosoma_uce-search-results" \
    --log-path "$WORK_DIR/logs"

### Extract Loci

In [13]:
# Write confs/taxon-set.conf: an [all] group with one taxon per line, where
# each taxon is an all_assemblies/ file name without its .fasta extension.
with open("confs/taxon-set.conf", 'w') as out_f:
    out_f.write("[all]\n")
    # glob returns matches in arbitrary order; sort so the file is reproducible
    for assembly in sorted(glob.glob("all_assemblies/*.fasta")):
        short_name = assembly.replace(".fasta", "").split("/")[-1]
        out_f.write("{}\n".format(short_name))

In [None]:
%%bash

# output directory for the "all" taxon set
mkdir -p taxon-sets/all

# build the data-matrix configuration file for the 'all' group,
# allowing an incomplete matrix
phyluce_assembly_get_match_counts \
    --taxon-list-config confs/taxon-set.conf \
    --taxon-group 'all' \
    --locus-db schistosoma_uce-search-results/probe.matches.sqlite \
    --incomplete-matrix \
    --output taxon-sets/all/all-taxa-incomplete.conf \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so the cd below applies to this cell only.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# get FASTA data for the taxa in our taxon set
# (log files go to the shared logs/ directory given by --log-path)
phyluce_assembly_get_fastas_from_match_counts \
    --contigs /master/nplatt/patho_detect/results/extract_schistosoma_loci/all_assemblies \
    --locus-db /master/nplatt/patho_detect/results/extract_schistosoma_loci/schistosoma_uce-search-results/probe.matches.sqlite \
    --match-count-output all-taxa-incomplete.conf \
    --output all-taxa-incomplete.fasta \
    --incomplete-matrix all-taxa-incomplete.incomplete \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# split the combined FASTA into one file per taxon
phyluce_assembly_explode_get_fastas_file \
    --input all-taxa-incomplete.fasta \
    --output exploded-fastas \
    --by-taxon

# get summary stats on the FASTAs: header first, then one CSV row per taxon file
# NOTE(review): the header keeps the original "legnth" spelling so anything
# already reading uce_stats.csv by column name is unaffected.
echo "samples,contigs,total bp,mean length,95 CI length,min length,max length,median legnth,contigs >1kb">uce_stats.csv

for i in exploded-fastas/*.fasta; do
    phyluce_assembly_get_fasta_lengths \
        --input "$i" \
        --csv
done >>uce_stats.csv


### Align and trim loci

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# align the data with mafft; output goes to mafft-nexus-edge-trimmed
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-edge-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# align again with mafft, this time without trimming (--no-trim) and writing
# FASTA; the output directory is the input of the gblocks trimming step
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-internal-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --output-format fasta \
    --no-trim \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# gblocks-trim the untrimmed alignments
phyluce_align_get_gblocks_trimmed_alignments_from_untrimmed \
    --alignments mafft-nexus-internal-trimmed \
    --output mafft-nexus-internal-trimmed-gblocks \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# strip the locus name from the entries of the trimmed alignments
phyluce_align_remove_locus_name_from_files \
    --alignments mafft-nexus-internal-trimmed-gblocks \
    --output mafft-nexus-internal-trimmed-gblocks-clean \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# keep only loci present in at least 25% of the taxa
# NOTE(review): --taxa is 4, but the taxon set is built from all_assemblies
# (reference-genome UCEs plus the assembled samples) -- confirm that 4 is the
# intended taxon count for the 25% cutoff.
phyluce_align_get_only_loci_with_min_taxa \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean \
    --taxa 4 \
    --percent 0.25 \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# concatenate the filtered alignments into a single nexus alignment
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --nexus \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# -p: do not fail when the directory already exists from an earlier run
mkdir -p /master/nplatt/patho_detect/results/extract_schistosoma_loci/uce_alignments

# copy the concatenated alignment files to the shared uce_alignments directory
cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_schistosoma_loci/uce_alignments/

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# concatenate the filtered alignments again, this time in phylip format
# NOTE(review): --output is the same directory the nexus concatenation wrote
# to; confirm the tool accepts an existing output directory when it is run
# non-interactively.
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --phylip \
    --log-path /master/nplatt/patho_detect/results/extract_schistosoma_loci/logs

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
# Every %%bash cell runs in its own shell, so change into the taxon-set directory here.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci/taxon-sets/all || exit 1

# make sure the destination exists even when this cell is run on its own
mkdir -p /master/nplatt/patho_detect/results/extract_schistosoma_loci/uce_alignments

# copy the concatenated alignment files to the shared uce_alignments directory
cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_schistosoma_loci/uce_alignments/

# Build phylogeny

In [None]:
%%bash
# Shell cell: without the %%bash magic the kernel would parse these commands as Python.
cd /master/nplatt/patho_detect/results/extract_schistosoma_loci || exit 1

# Tree search plus 200 bootstrap replicates (--all) on the concatenated
# phylip alignment, GTR+G model, 10 parsimony starting trees, fixed seed.
# 'pars{10}' is quoted so the shell passes the braces through literally.
# NOTE(review): the output prefix still says "bacillus" although this
# notebook is about Schistosoma -- confirm before renaming, since it
# determines the output file names.
raxml-ng \
    --all \
    --prefix 25pn_75pid_bacillus \
    --seed 12345 \
    --msa /master/nplatt/patho_detect/results/extract_schistosoma_loci/uce_alignments/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml.phylip \
    --msa-format PHYLIP \
    --data-type DNA \
    --model GTR+G \
    --tree 'pars{10}' \
    --bs-trees 200 \
    --threads 24 \
    --workers 8