# Extract and analyze bacillus reads

In [None]:
# Run this notebook from the phyluce conda environment: conda activate phyluce

In [1]:
import glob
import os
import pathlib
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm.notebook import tqdm

In [2]:
os.chdir('/master/nplatt/patho_detect/')

In [3]:
# Sample names for run 19047-23: one entry per line of the list file.
samples = pathlib.Path('data/19047-23/samples.list').read_text().splitlines()

In [4]:
# Create the results directory for this analysis together with its logs/
# subdirectory. makedirs(..., exist_ok=True) is a no-op when the directories
# already exist, so re-running the notebook does not raise FileExistsError.
os.makedirs('results/extract_bacillus_from_TK25656/logs', exist_ok=True)

In [5]:
# All relative paths used from here on (confs/, ncbi-genomes/, bacillus-reads/, ...)
# are relative to this results directory.
os.chdir("/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656")

## UCEs from reference genomes

Bacillus and related reference genomes

In [None]:
%%bash

# #download genomes - here all refseq representatives (n=36)
# ~/patho_detect/data/bartonella-ncbi-genomes-2021-03-10

#process to appropriate 2bit format (and rename)
IN_DIR="/master/nplatt/patho_detect/data/ncbi-genomes-2021-03-24"
OUT_DIR="/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/ncbi-genomes"

#-p: do not fail when the directory is left over from an earlier run
mkdir -p "$OUT_DIR"

#get bacillus genomes (to be modified)
#each RefSeq assembly is copied under a short <species>_<accession> name; these
#names are what the later 2bit conversion and lastz search use for each genome
cp $IN_DIR/GCF_000008445.1_ASM844v1_genomic.fna.gz $OUT_DIR/b_anthracis_GCF_000008445.fa.gz
cp $IN_DIR/GCF_000011645.1_ASM1164v1_genomic.fna.gz $OUT_DIR/b_licheniformis_GCF_000011645.fa.gz
cp $IN_DIR/GCF_000021205.1_ASM2120v1_genomic.fna.gz $OUT_DIR/b_cereus_GCF_000021205.fa.gz
cp $IN_DIR/GCF_000177235.2_ASM17723v2_genomic.fna.gz $OUT_DIR/b_cellulosilyticus_GCF_000177235.fa.gz
cp $IN_DIR/GCF_000473245.1_ASM47324v1_genomic.fna.gz $OUT_DIR/b_infantis_GCF_000473245.fa.gz
cp $IN_DIR/GCF_000972685.1_ASM97268v1_genomic.fna.gz $OUT_DIR/b_altitudinis_GCF_000972685.fa.gz
cp $IN_DIR/GCF_001050115.1_ASM105011v1_genomic.fna.gz  $OUT_DIR/b_smithii_GCF_001050115.fa.gz
cp $IN_DIR/GCF_001721685.1_ASM172168v1_genomic.fna.gz $OUT_DIR/b_beveridgei_GCF_001721685.fa.gz
cp $IN_DIR/GCF_001730235.1_ASM173023v1_genomic.fna.gz $OUT_DIR/vulcanibacillus_modesticaldus_GCF_001730235.fa.gz
cp $IN_DIR/GCF_001857925.1_ASM185792v1_genomic.fna.gz $OUT_DIR/b_xiamenensis_GCF_001857925.fa.gz
cp $IN_DIR/GCF_001889165.1_ASM188916v1_genomic.fna.gz  $OUT_DIR/b_weihaiensis_GCF_001889165.fa.gz
cp $IN_DIR/GCF_002117165.1_ASM211716v1_genomic.fna.gz $OUT_DIR/b_velezensis_GCF_002117165.fa.gz
cp $IN_DIR/GCF_002250055.1_ASM225005v1_genomic.fna.gz $OUT_DIR/b_cohnii_GCF_002250055.fa.gz
cp $IN_DIR/GCF_002250945.2_ASM225094v2_genomic.fna.gz $OUT_DIR/b_cytotoxicus_GCF_002250945.fa.gz
cp $IN_DIR/GCF_002993925.1_ASM299392v1_genomic.fna.gz $OUT_DIR/b_paralicheniformis_GCF_002993925.fa.gz
cp $IN_DIR/GCF_003096215.1_ASM309621v1_genomic.fna.gz $OUT_DIR/b_thermoamylovorans_GCF_003096215.fa.gz
cp $IN_DIR/GCF_003667885.1_ASM366788v1_genomic.fna.gz $OUT_DIR/b_vallismortis_GCF_003667885.fa.gz
cp $IN_DIR/GCF_004006435.1_ASM400643v1_genomic.fna.gz $OUT_DIR/b_halotolerans_GCF_004006435.fa.gz
cp $IN_DIR/GCF_006704205.1_ASM670420v1_genomic.fna.gz $OUT_DIR/b_ciccensis_GCF_006704205.fa.gz
cp $IN_DIR/GCF_007995155.1_ASM799515v1_genomic.fna.gz $OUT_DIR/b_dafuensis_GCF_007995155.fa.gz
cp $IN_DIR/GCF_008244765.1_ASM824476v1_genomic.fna.gz $OUT_DIR/b_safensis_GCF_008244765.fa.gz
cp $IN_DIR/GCF_008807735.1_ASM880773v1_genomic.fna.gz $OUT_DIR/b_wiedmannii_GCF_008807735.fa.gz
cp $IN_DIR/GCF_009739945.1_ASM973994v1_genomic.fna.gz $OUT_DIR/b_luti_GCF_009739945.fa.gz
cp $IN_DIR/GCF_012225885.1_ASM1222588v1_genomic.fna.gz $OUT_DIR/b_tequilensis_GCF_012225885.fa.gz
cp $IN_DIR/GCF_012648005.1_ASM1264800v1_genomic.fna.gz $OUT_DIR/b_mojavensis_GCF_012648005.fa.gz
cp $IN_DIR/GCF_013122255.1_ASM1312225v1_genomic.fna.gz $OUT_DIR/b_amyloliquefaciens_GCF_013122255.fa.gz
cp $IN_DIR/GCF_013267435.1_ASM1326743v1_genomic.fna.gz $OUT_DIR/b_circulans_GCF_013267435.fa.gz
cp $IN_DIR/GCF_900093775.1_EVONIK_BGLY_genomic.fna.gz $OUT_DIR/b_glycinifermentans_GCF_900093775.fa.gz
cp $IN_DIR/GCF_900475885.1_49595_C01_genomic.fna.gz $OUT_DIR/haemophilus_aegyptius_GCF_900475885.fa.gz
cp $IN_DIR/GCF_900636945.1_45532_G01_genomic.fna.gz $OUT_DIR/b_freudenreichii_GCF_900636945.fa.gz

#convert every genome to 2bit, record its sequence sizes, and collect the
#files of each genome in a directory named after it (<OUT_DIR>/<NAME>/<NAME>.2bit)
for FAS in "$OUT_DIR"/*.fa.gz; do
    NAME=$(basename "$FAS" .fa.gz)

    faToTwoBit "$FAS" "$OUT_DIR/$NAME.2bit"
    twoBitInfo "$OUT_DIR/$NAME.2bit" "$OUT_DIR/$NAME.tab"

    mkdir -p "$OUT_DIR/$NAME"

    #match "$NAME." (with the dot) so only the .fa.gz/.2bit/.tab files move;
    #a bare "$NAME*" also matches the directory itself and mv then complains
    #about moving a directory into itself
    mv "$OUT_DIR/$NAME".* "$OUT_DIR/$NAME"/
done



In [None]:
%%bash

#pull every probe whose header mentions "bacillus" (case-insensitive) out of the
#full probe set; -A1 keeps the line that follows each matching header
grep -A1 --no-group-separator -i bacillus ~/patho_detect/decon_probes.fas >bacillus_probes.fas

#rename uce_bacillus_* to uce-* so the names match the Faircloth format
sed -i 's/uce_bacillus_/uce-/' bacillus_probes.fas

####TODO: get rid of streptobacillus probes

In [None]:
%%bash

#make sure ulimit is greater than 4k
ulimit -n 8192

#search the bacillus probes against each genome named in --scaffoldlist
#(the directories created under ncbi-genomes/), keeping hits at --identity 0.75
#NOTE(review): the --db file sits inside the --output directory; confirm that
#directory exists (or is created) before the sqlite database is opened
phyluce_probe_run_multiple_lastzs_sqlite \
    --db ncbi-genomes-lastz/ncbi-genomes.sqlite \
    --output ncbi-genomes-lastz \
    --identity 0.75 \
    --scaffoldlist b_anthracis_GCF_000008445 \
        b_licheniformis_GCF_000011645 \
        b_cereus_GCF_000021205 \
        b_cellulosilyticus_GCF_000177235 \
        b_infantis_GCF_000473245 \
        b_altitudinis_GCF_000972685 \
        b_smithii_GCF_001050115 \
        b_beveridgei_GCF_001721685 \
        vulcanibacillus_modesticaldus_GCF_001730235 \
        b_xiamenensis_GCF_001857925 \
        b_weihaiensis_GCF_001889165 \
        b_velezensis_GCF_002117165 \
        b_cohnii_GCF_002250055 \
        b_cytotoxicus_GCF_002250945 \
        b_paralicheniformis_GCF_002993925 \
        b_thermoamylovorans_GCF_003096215 \
        b_vallismortis_GCF_003667885 \
        b_halotolerans_GCF_004006435 \
        b_ciccensis_GCF_006704205 \
        b_dafuensis_GCF_007995155 \
        b_safensis_GCF_008244765 \
        b_wiedmannii_GCF_008807735 \
        b_luti_GCF_009739945 \
        b_tequilensis_GCF_012225885 \
        b_mojavensis_GCF_012648005 \
        b_amyloliquefaciens_GCF_013122255 \
        b_circulans_GCF_013267435 \
        b_glycinifermentans_GCF_900093775 \
        haemophilus_aegyptius_GCF_900475885 \
        b_freudenreichii_GCF_900636945 \
    --genome-base-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/ncbi-genomes/ \
    --probefile bacillus_probes.fas \
    --cores 24 \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs/

In [19]:
#make a conf file
# Writes confs/genomes.conf: a [scaffolds] header followed by one
# "<genome name>:<path to its .2bit>" line per reference genome.

# confs/ is otherwise only created by a later cell, so create it here to keep
# this cell runnable when the notebook is executed top to bottom.
os.makedirs("confs", exist_ok=True)

with open("confs/genomes.conf", 'w') as out_f:
    out_f.write("[scaffolds]\n")
    # sorted() so the entries are written in a reproducible order
    for twobit in sorted(glob.glob("ncbi-genomes/*/*2bit")):
        # genome name = file name without the .2bit extension
        name = os.path.basename(twobit).replace(".2bit", "")
        out_f.write("{}:{}\n".format(name, twobit))

In [None]:
%%bash

#extract UCE loci +- 1Kb
#reads the lastz results in ncbi-genomes-lastz (file names follow --name-pattern,
#with {} replaced per genome) and the genome locations listed in confs/genomes.conf
phyluce_probe_slice_sequence_from_genomes \
    --lastz ncbi-genomes-lastz \
    --conf confs/genomes.conf \
    --flank 1000 \
    --name-pattern "bacillus_probes.fas_v_{}.lastz.clean" \
    --output ncbi-genomes-uce-fasta

#the genome uce contigs will be combined with the assemblies

## UCEs from enriched samples

In [5]:
# Make sure the kernel is in the results directory before the sample-processing
# cells, which use paths relative to it (raw-fastq/, confs/, clean-fastq/, ...).
os.chdir("/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656")

### Get and clean reads

In [None]:
%%bash

#get raw data in the proper name format
#-p: do not fail when raw-fastq is left over from an earlier run
mkdir -p raw-fastq

cp ~/patho_detect/data/19047-23/TK25656_Exp5_43_P4_R*.fastq.gz raw-fastq/

#rewrite the R1/R2 suffix as L001_R1_001 / L001_R2_001
rename R1.fastq.gz L001_R1_001.fastq.gz raw-fastq/*R1*.gz
rename R2.fastq.gz L001_R2_001.fastq.gz raw-fastq/*R2*.gz

In [None]:
%%bash

#-p: confs may already exist (other cells write their conf files there too)
mkdir -p confs

#write the illumiprocessor config: adapters, the index used, which index belongs
#to which read set, and the name the cleaned sample is given
echo '[adapters]
i7:GATCGGAAGAGCACACGTCTGAACTCCAGTCAC*ATCTCGTATGCCGTCTTCTGCTTG
i5:AATGATACGGCGACCACCGAGATCTACAC*ACACTCTTTCCCTACACGACGCTCTTCCGATCT

# this is the list of indexes we used
[tag sequences]
H10:TCCGCGAA

# this is how each index maps to each set of reads
[tag map]
TK25656_Exp5_43_P4:H10

# this is how we want files to be renamed
[names]
TK25656_Exp5_43_P4:TK25656' >confs/illumiprocessor.conf

In [None]:
%%bash

#clean up raw reads
#(%%bash is required: without it the cell is parsed as Python and fails)
illumiprocessor \
    --input raw-fastq/ \
    --output clean-fastq \
    --config confs/illumiprocessor.conf \
    --cores 24

### Get reads that map to Bacillus

In [None]:
#previously ran kraken and know there are a lot of bacillus reads in this sample

In [None]:
%%bash

#classify the cleaned reads with kraken2 so the bacillus-mapping reads can be found
#one output directory per entry in clean-fastq (report, per-read table, and the
#classified / unclassified read files)
#NOTE(review): results land in ./kraken2, while the following Python cell reads
#the table from extract_bartonella_loci/04_kraken2 -- confirm which run is intended
for SAMPLE in $(ls clean-fastq); do
    echo $SAMPLE

    mkdir -p kraken2/$SAMPLE

    #quantify/classify
    ~/patho_detect/code/kraken2/kraken2 \
        --use-names \
        --threads 24 \
        --db ~/patho_detect/results/kraken2/pathodb_20201215 \
        --report kraken2/$SAMPLE/"$SAMPLE"_kraken.report \
        --classified-out kraken2/$SAMPLE/"$SAMPLE"_classifed#.fq \
        --unclassified-out kraken2/$SAMPLE/"$SAMPLE"_unclassifed#.fq \
        --gzip-compressed \
        --paired \
        clean-fastq/$SAMPLE/split-adapter-quality-trimmed/"$SAMPLE"-READ1.fastq.gz \
        clean-fastq/$SAMPLE/split-adapter-quality-trimmed/"$SAMPLE"-READ2.fastq.gz \
        >kraken2/$SAMPLE/"$SAMPLE"_kraken.tbl
    #------------------

done

Use the previous kraken2 runs from the Bartonella experiment (`extract_bartonella_loci`) to get the Bacillus reads.

In [9]:
#get the read ids
# For each sample, write the ids of all reads whose kraken row mentions
# "bacillus" (case-insensitive) to bacillus-reads/<sample>/reads.txt.
for sample in ["TK25656"]:

    #short name
    short_name = sample

    #directory that holds the read list (and later the reads) of this sample
    read_dir = pathlib.Path("/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus-reads/{}/".format(short_name))
    read_dir.mkdir(parents=True, exist_ok=True)

    #kraken per-read table (tab-delimited; the read id is the second column)
    results_tbl = "/master/nplatt/patho_detect/results/extract_bartonella_loci/04_kraken2/{}/{}_kraken.tbl".format(short_name, short_name)
    read_list = "/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus-reads/{}/reads.txt".format(short_name)

    #stream the table and keep the id of every matching row
    with open(results_tbl, 'r') as in_f, open(read_list, 'w') as out_f:
        hit_ids = (row.split("\t")[1] for row in in_f if "bacillus" in row.lower())
        out_f.writelines("{}\n".format(read_id) for read_id in hit_ids)
                    

In [12]:
#extract the reads
# For every sample with at least MIN_READS bacillus read ids, pull those reads
# out of the cleaned READ1/READ2 files with seqtk and record the sample in
# samples_w_gt10k_reads (used to build the assembly conf).

# minimum number of read ids a sample needs before it is extracted/assembled
MIN_READS = 10_000

samples_w_gt10k_reads = []
for sample in ["TK25656"]:

    #short name
    short_name = sample

    in_list = "/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus-reads/{}/reads.txt".format(short_name)

    #count num reads once per sample (only proceed if at least 10K)
    with open(in_list) as list_f:
        count = sum(1 for _ in list_f)

    if count < MIN_READS:
        continue

    samples_w_gt10k_reads.append(short_name)

    for read in ["1", "2"]:
        in_reads  = "/master/nplatt/patho_detect/results/extract_bartonella_loci/03_clean-fastq/{}/split-adapter-quality-trimmed/{}-READ{}.fastq.gz".format(short_name, short_name, read)
        out_reads = "/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus-reads/{}/{}-READ{}.fastq".format(short_name, short_name, read)

        #already extracted (plain, or gzipped by the following cell): nothing to do
        if os.path.exists(out_reads) or os.path.exists(out_reads + ".gz"):
            continue

        #extract reads with seqtk; write to a temp file first so an interrupted
        #run never leaves a truncated FASTQ that looks complete
        tmp_reads = out_reads + ".tmp"
        with open(tmp_reads, 'w') as out_f:
            subprocess.run(["seqtk", "subseq", in_reads, in_list], stdout=out_f, check=True)
        os.replace(tmp_reads, out_reads)

In [None]:
%%bash

#gzip all of the fastq files (one background job per file)
for FASTQ in bacillus-reads/*/*-READ*.fastq; do
    #the pattern stays literal when nothing matches; skip in that case
    [ -e "$FASTQ" ] || continue
    gzip "$FASTQ" &
done

#do not leave the cell until every background gzip has finished, otherwise
#the next cells can start while files are still being compressed
wait

### New heading

### Assemble Bacillus reads

In [16]:
#make phyluce assembly.conf
# A [samples] header followed by one "<sample>:<directory with its reads>" line
# for every sample that passed the read-count threshold.
with open("confs/assembly.conf", 'w') as out_f:
    out_f.write("[samples]\n")
    out_f.writelines(
        "{}:{}\n".format(
            name,
            "/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus-reads/{}/".format(name),
        )
        # sample id = text before the first underscore
        for name in (entry.split("_")[0] for entry in samples_w_gt10k_reads)
    )

In [None]:
%%bash 

#run phyluce assembly with spades
#assembles the samples listed in confs/assembly.conf; contigs go to spades-assemblies
#--memory 768: presumably GB available to spades -- confirm against the phyluce docs
phyluce_assembly_assemblo_spades \
    --conf /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/confs/assembly.conf \
    --output /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/spades-assemblies \
    --cores 24 \
    --memory 768 \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs 

In [None]:
%%bash

BASE=/master/nplatt/patho_detect/results/extract_bacillus_from_TK25656

#add genome uces to the assembly dir
#-p: do not fail when all_assemblies is left over from an earlier run
mkdir -p "$BASE/all_assemblies"

#reference-genome UCE slices + the spades contigs of the sample(s)
cp "$BASE"/ncbi-genomes-uce-fasta/*.fasta "$BASE/all_assemblies/"
cp "$BASE"/spades-assemblies/contigs/*.fasta "$BASE/all_assemblies/"

#spades contigs are named <sample>.contigs.fasta; shorten to <sample>.fasta
rename .contigs.fasta .fasta "$BASE"/all_assemblies/*.contigs.fasta

### Find UCE loci in assemblies

In [None]:
%%bash 

#find probed regions in assemblies
#searches every FASTA in all_assemblies with the bacillus probes (--min-identity 75);
#results go to bacillus_uce-search-results
phyluce_assembly_match_contigs_to_probes \
    --contigs /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/all_assemblies \
    --probes /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus_probes.fas \
    --min-identity 75 \
    --output /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus_uce-search-results \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs 

### Extract Loci

In [20]:
#make phyluce taxon-set.conf
# An [all] group listing every assembly in all_assemblies/ by its file name
# without the .fasta extension.
with open("confs/taxon-set.conf", 'w') as out_f:
    out_f.write("[all]\n")
    taxa = (
        fasta.replace(".fasta", "").split("/")[-1]
        for fasta in glob.glob("all_assemblies/*.fasta")
    )
    out_f.writelines("{}\n".format(taxon) for taxon in taxa)

In [None]:
%%bash 

mkdir -p taxon-sets/all

# create the data matrix configuration file
# uses the probe-match database from the search step and the 'all' group of
# confs/taxon-set.conf; --incomplete-matrix is passed, so loci need not be
# present in every taxon (presumably -- confirm against the phyluce docs)
phyluce_assembly_get_match_counts \
    --locus-db bacillus_uce-search-results/probe.matches.sqlite \
    --taxon-list-config confs/taxon-set.conf \
    --taxon-group 'all' \
    --incomplete-matrix \
    --output taxon-sets/all/all-taxa-incomplete.conf \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs 

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# get FASTA data for taxa in our taxon set
# (log files go to the shared logs directory of this analysis)
phyluce_assembly_get_fastas_from_match_counts \
    --contigs /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/all_assemblies \
    --locus-db /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/bacillus_uce-search-results/probe.matches.sqlite \
    --match-count-output all-taxa-incomplete.conf \
    --output all-taxa-incomplete.fasta \
    --incomplete-matrix all-taxa-incomplete.incomplete \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# split the combined FASTA into one file per taxon
phyluce_assembly_explode_get_fastas_file \
    --input all-taxa-incomplete.fasta \
    --output exploded-fastas \
    --by-taxon

# get summary stats on the FASTAS
echo "samples,contigs,total bp,mean length,95 CI length,min length,max length,median legnth,contigs >1kb">uce_stats.csv

# one csv row per taxon, appended below the header
for i in exploded-fastas/*.fasta; do
    phyluce_assembly_get_fasta_lengths \
        --input "$i" \
        --csv
done >>uce_stats.csv


### Align and trim loci

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# align the data (mafft), writing to mafft-nexus-edge-trimmed
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-edge-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# align again without trimming (--no-trim, fasta output); these alignments are
# the input of the gblocks trimming step
phyluce_align_seqcap_align \
    --input all-taxa-incomplete.fasta \
    --output mafft-nexus-internal-trimmed \
    --taxa 4 \
    --aligner mafft \
    --cores 12 \
    --incomplete-matrix \
    --output-format fasta \
    --no-trim \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# trim the untrimmed alignments with gblocks
phyluce_align_get_gblocks_trimmed_alignments_from_untrimmed \
    --alignments mafft-nexus-internal-trimmed \
    --output mafft-nexus-internal-trimmed-gblocks \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# strip the locus name from the sequence names in every alignment
phyluce_align_remove_locus_name_from_files \
    --alignments mafft-nexus-internal-trimmed-gblocks \
    --output mafft-nexus-internal-trimmed-gblocks-clean \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# keep only loci present in at least 25% of the taxa.
# --taxa is the TOTAL number of taxa in the data set (30 reference genomes +
# TK25656 = 31; update it if the taxon set changes). The cutoff is
# percent * taxa, so the previous --taxa 4 gave a cutoff of a single taxon and
# the "25p" filter removed nothing.
phyluce_align_get_only_loci_with_min_taxa \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean \
    --taxa 31 \
    --percent 0.25 \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --cores 12 \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# concatenate the filtered loci into one alignment (nexus)
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --nexus \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# create the directory the files are actually copied into (and that raxml-ng
# reads its --msa from); the cell used to create 10_uce_alignments instead, so
# the cp below had no destination directory
mkdir -p /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/uce_alignments

cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/uce_alignments/

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# concatenate the filtered loci again, this time as phylip (input for raxml-ng)
# NOTE(review): --output is the directory the --nexus run already wrote to;
# confirm phyluce accepts an existing output directory when run non-interactively
phyluce_align_concatenate_alignments \
    --alignments mafft-nexus-internal-trimmed-gblocks-clean-25p \
    --output mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml \
    --phylip \
    --log-path /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/logs

In [None]:
%%bash

# %%bash is required for shell commands; a cd does not carry over between
# cells, so this cell changes into the taxon-set directory itself
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/taxon-sets/all

# make sure the destination exists, then copy the concatenated alignment
mkdir -p /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/uce_alignments

cp mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml* \
    /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/uce_alignments/

# ...go to bartonella_phylogenetics.ipynb

In [None]:
%%bash

# %%bash is required: without it the cell is parsed as Python and fails
cd /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656

# tree inference on the concatenated phylip alignment: GTR+G model, 10 parsimony
# starting trees, 200 bootstrap trees, fixed seed for reproducibility.
# 'pars{10}' is quoted so the shell can never treat the braces specially.
raxml-ng \
    --all \
    --prefix 25pn_75pid_bacillus \
    --seed 12345 \
    --msa  /master/nplatt/patho_detect/results/extract_bacillus_from_TK25656/uce_alignments/mafft-nexus-internal-trimmed-gblocks-clean-25p-raxml.phylip \
    --msa-format PHYLIP \
    --data-type DNA \
    --model GTR+G \
    --tree 'pars{10}' \
    --bs-trees 200 \
    --threads 24 \
    --workers 4