## Prep python

In [1]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

# Every data/ and results/ path used below is relative, so move into the
# project root first.
# NOTE(review): this is an absolute, machine-specific path.
os.chdir("/master/nplatt/pathogen_probes/")


def wait_on_running_jobs():
    """Block until `qstat` reports no jobs, polling once a minute.

    The `qstat` output is assumed to be two header lines followed by one line
    per job (and nothing at all when the queue is empty), so the job count is
    taken as the number of output lines minus two.  A "." is printed after
    each wait so progress is visible in the notebook.

    Raises:
        subprocess.CalledProcessError: if `qstat` exits with a non-zero status.
    """
    num_jobs = 1

    while num_jobs > 0:
        # universal_newlines=True makes check_output return text instead of
        # bytes, so the line handling below works on Python 3 as well as 2.
        qstat_out = subprocess.check_output('qstat', shell=True, universal_newlines=True)

        # splitlines() does not produce an extra empty entry for the trailing
        # newline, so the count is not inflated by one.
        num_jobs = max(len(qstat_out.splitlines()) - 2, 0)

        # Only wait (and report) while there is still something to wait for.
        if num_jobs > 0:
            time.sleep(60)
            print(".")

# Anaplasma

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [171]:
# Name of the taxonomic group being processed; it is spliced into the data/
# and results/ paths used by the cells that follow.
group = "anaplasma"

In [172]:
# Start from a clean slate: drop any results left by an earlier run of this
# group, then recreate the (now empty) directory.
phyluce_group_dir = "results/phyluce/{}".format(group)

if os.path.exists(phyluce_group_dir):
    shutil.rmtree(phyluce_group_dir)

os.makedirs(phyluce_group_dir)

In [173]:
# Genomes that are compared against the reference.  Each entry follows the
# Genus_species_<assembly accession> convention (accession without version).
group_taxa = [ "Anaplasma_marginale_GCF_000011945",
               "Anaplasma_marginale_GCF_000020305",
               "Anaplasma_marginale_GCF_000172475",
               "Anaplasma_marginale_GCF_000172495",
               "Anaplasma_marginale_GCF_000172515",
               "Anaplasma_marginale_GCF_000215485",
               "Anaplasma_marginale_GCF_000215505",
               "Anaplasma_marginale_GCF_000215525",
               "Anaplasma_marginale_GCF_000215545",
               "Anaplasma_marginale_GCF_000215565",
               "Anaplasma_marginale_GCF_000215585",
               "Anaplasma_marginale_GCF_000215605",
               "Anaplasma_marginale_GCF_000495495",
               "Anaplasma_marginale_GCF_000495535",
               "Anaplasma_marginale_GCF_002849365",
               "Anaplasma_marginale_GCF_003331125",
               "Anaplasma_marginale_GCF_003515675",
               "Anaplasma_marginale_GCF_003515735",
               "Anaplasma_marginale_GCF_008274665",
               "Anaplasma_marginale_GCF_008690255",
               "Anaplasma_marginale_GCF_008690265",
               "Anaplasma_marginale_GCF_008801275",
               "Anaplasma_marginale_GCF_008801305",
               "Anaplasma_marginale_GCF_008801325",
               "Anaplasma_ovis_GCF_002214625",
               "Anaplasma_ovis_GCF_002849345",
               "Anaplasma_phagocytophilum_GCA_900088675",
               "Anaplasma_phagocytophilum_GCF_000439755",
               "Anaplasma_phagocytophilum_GCF_000439775",
               "Anaplasma_phagocytophilum_GCF_000439795",
               "Anaplasma_phagocytophilum_GCF_000478425",
               "Anaplasma_phagocytophilum_GCF_000478445",
               "Anaplasma_phagocytophilum_GCF_000689615",
               "Anaplasma_phagocytophilum_GCF_000689635",
               "Anaplasma_phagocytophilum_GCF_000689655",
               "Anaplasma_phagocytophilum_GCF_000964685",
               "Anaplasma_phagocytophilum_GCF_000964725",
               "Anaplasma_phagocytophilum_GCF_000964745",
               "Anaplasma_phagocytophilum_GCF_000964785",
               "Anaplasma_phagocytophilum_GCF_000964915",
               "Anaplasma_phagocytophilum_GCF_000964935",
               "Anaplasma_phagocytophilum_GCF_000964945",
               "Anaplasma_phagocytophilum_GCF_000964985",
               "Anaplasma_phagocytophilum_GCF_000965125",
               "Anaplasma_phagocytophilum_GCF_000968455",
               "Anaplasma_phagocytophilum_GCF_000968465",
               "Anaplasma_phagocytophilum_GCF_002849375",
               "Anaplasma_phagocytophilum_GCF_900000025",
               "Anaplasma_phagocytophilum_GCF_900078505",
               "Anaplasma_phagocytophilum_GCF_900088605",
               "Anaplasma_phagocytophilum_GCF_900088615",
               "Anaplasma_phagocytophilum_GCF_900088625",
               "Anaplasma_phagocytophilum_GCF_900088645",
               "Anaplasma_phagocytophilum_GCF_900088665" ]

# Genome that simulated reads from every entry in group_taxa are mapped to.
reference_taxon = "Anaplasma_phagocytophilum_GCF_000013125"

# Every genome including the reference; this is the list iterated over when
# the genomes are cleaned and converted to 2bit.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [176]:
# Download the genome FASTA for every accession listed for this group, rename
# each file to the Genus_species_<accession> convention and record what was
# fetched in data/genomes/<group>/ftp_info.csv.
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
# The accession table must provide the columns "genus", "species" and
# "accession" (these are the only ones read below).
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    # Look up the assembly's FtpPath_GenBank via the esearch/esummary/xtract
    # command-line tools; the captured output is a list of output lines.
    # NOTE(review): ftp[0] raises IndexError when the lookup returns nothing.
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    # Build the path of the genomic FASTA (<dir>/<dir basename>_genomic.fna.gz)
    # and fetch it into data/genomes/<group> with rsync.
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    # Create the "clean/standardized" name for the FASTA:
    # Genus_species_GCA_12345678 (accession version suffix dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    # Finally, keep a record of the transfer:
    # accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
# Convert the table to a DataFrame
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

# Save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

# Finally, uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000024505.1_ASM2450v1_genomic.fna.gz

sent 42 bytes  received 360892 bytes  42462.82 bytes/sec
total size is 360692  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary 

GCA_000215565.2_ASM21556v2_genomic.fna.gz

sent 42 bytes  received 351375 bytes  702834.00 bytes/sec
total size is 351182  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000215585.2_ASM21558v2_genomic.fna.gz

sent 42 bytes  received 429018 bytes  286040.00 bytes/sec
total size is 428801  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_008690265.1_ASM869026v1_genomic.fna.gz

sent 42 bytes  received 357949 bytes  715982.00 bytes/sec
total size is 357755  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_008801275.1_ASM880127v1_genomic.fna.gz

sent 42 bytes  received 356136 bytes  237452.00 bytes/sec
total size is 355942  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_000478425.1_ASM47842v1_genomic.fna.gz

sent 42 bytes  received 430437 bytes  860958.00 bytes/sec
total size is 430220  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000478445.1_ASM47844v1_genomic.fna.gz

sent 42 bytes  received 442389 bytes  294954.00 bytes/sec
total size is 442172  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000964945.1_ASM96494v1_genomic.fna.gz

sent 42 bytes  received 435834 bytes  290584.00 bytes/sec
total size is 435617  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000964985.1_ASM96498v1_genomic.fna.gz

sent 42 bytes  received 439469 bytes  175804.40 bytes/sec
total size is 439252  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw


GCA_900088645.2_ANAPRD1_2_genomic.fna.gz

sent 42 bytes  received 478411 bytes  50363.47 bytes/sec
total size is 478187  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900088665.2_ANAPC5_genomic.fna.gz

sent 42 bytes  received 519887 bytes  346619.33 bytes/sec
total size is 519658  speedup is 1.00


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [177]:
# Rebuild the cleaned-genome directory from scratch so files from an earlier
# run cannot leak into this one.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite the FASTA with name and description blanked so that each header
    # carries only the record id.  Plain "r" is used because the "U" flag is
    # deprecated in Python 3 and rejected from Python 3.11 onwards.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA (not the raw download) so the
    # "_formatted" .fas/.2bit pair always describes the same sequences.  The
    # command is passed as an argument list so paths are never split on spaces.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage (`--fcov 10`) of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage because, with 100bp reads, it should provide roughly 10bp resolution on conserved orthologous regions, and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [178]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483454 ("sim_Anaplasma_marginale_GCF_000011945") has been submitted
Your job 5483456 ("sim_Anaplasma_marginale_GCF_000020305") has been submitted
Your job 5483457 ("sim_Anaplasma_marginale_GCF_000172475") has been submitted
Your job 5483459 ("sim_Anaplasma_marginale_GCF_000172495") has been submitted
Your job 5483461 ("sim_Anaplasma_marginale_GCF_000172515") has been submitted
Your job 5483463 ("sim_Anaplasma_marginale_GCF_000215485") has been submitted
Your job 5483465 ("sim_Anaplasma_marginale_GCF_000215505") has been submitted
Your job 5483467 ("sim_Anaplasma_marginale_GCF_000215525") has been submitted
Your job 5483469 ("sim_Anaplasma_marginale_GCF_000215545") has been submitted
Your job 5483471 ("sim_Anaplasma_marginale_GCF_000215565") has been submitted
Your job 5483473 ("sim_Anaplasma_marginale_GCF_000215585") has been submitted
Your job 5483475 ("sim_Anaplasma_marginale_GCF_000215605") has been submitted
Your job 5483477 ("sim_Anaplasma_marginale_GCF_000495495") has b

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [179]:
# Recreate the mapping directory together with its logs/ sub-directory.
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# Index the cleaned reference genome with bbmap, handing it the mapping
# directory through path=.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

# Show the command, then run it locally; the exit status is the cell output.
print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/anaplasma/cleaned_genomes/Anaplasma_phagocytophilum_GCF_000013125_formatted.fas path=results/phyluce/anaplasma/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [180]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483598 ("map_Anaplasma_marginale_GCF_000011945") has been submitted
Your job 5483600 ("map_Anaplasma_marginale_GCF_000020305") has been submitted
Your job 5483602 ("map_Anaplasma_marginale_GCF_000172475") has been submitted
Your job 5483604 ("map_Anaplasma_marginale_GCF_000172495") has been submitted
Your job 5483606 ("map_Anaplasma_marginale_GCF_000172515") has been submitted
Your job 5483608 ("map_Anaplasma_marginale_GCF_000215485") has been submitted
Your job 5483610 ("map_Anaplasma_marginale_GCF_000215505") has been submitted
Your job 5483612 ("map_Anaplasma_marginale_GCF_000215525") has been submitted
Your job 5483614 ("map_Anaplasma_marginale_GCF_000215545") has been submitted
Your job 5483616 ("map_Anaplasma_marginale_GCF_000215565") has been submitted
Your job 5483618 ("map_Anaplasma_marginale_GCF_000215585") has been submitted
Your job 5483620 ("map_Anaplasma_marginale_GCF_000215605") has been submitted
Your job 5483622 ("map_Anaplasma_marginale_GCF_000495495") has b

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [214]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484252 ("merge_Anaplasma_marginale_GCF_000011945") has been submitted
Your job 5484253 ("merge_Anaplasma_marginale_GCF_000020305") has been submitted
Your job 5484254 ("merge_Anaplasma_marginale_GCF_000172475") has been submitted
Your job 5484255 ("merge_Anaplasma_marginale_GCF_000172495") has been submitted
Your job 5484256 ("merge_Anaplasma_marginale_GCF_000172515") has been submitted
Your job 5484257 ("merge_Anaplasma_marginale_GCF_000215485") has been submitted
Your job 5484258 ("merge_Anaplasma_marginale_GCF_000215505") has been submitted
Your job 5484259 ("merge_Anaplasma_marginale_GCF_000215525") has been submitted
Your job 5484260 ("merge_Anaplasma_marginale_GCF_000215545") has been submitted
Your job 5484261 ("merge_Anaplasma_marginale_GCF_000215565") has been submitted
Your job 5484262 ("merge_Anaplasma_marginale_GCF_000215585") has been submitted
Your job 5484263 ("merge_Anaplasma_marginale_GCF_000215605") has been submitted
Your job 5484264 ("merge_Anaplasma_margi

Remove loci that are masked in the reference taxon's genome (more than 25% masked bases) or shorter than 160bp.

In [215]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 18 sequences from Anaplasma_marginale_GCF_000011945_merged.bed.  Filtered 12 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 21 sequences from Anaplasma_marginale_GCF_000020305_merged.bed.  Filtered 15 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 24 sequences from Anaplasma_marginale_GCF_000172475_merged.bed.  Filtered 17 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7.
Screened 23 sequences from Anaplasma_marginale_GCF_000172495_merged.bed.  Filtered 17 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 20 sequences from Anaplasma_marginale_GCF_000172515_merged.bed.  Filtered 16 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 23 sequences from Anaplasma_marginale_GCF_000215485_merged.bed.  Filtered 19 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 21 sequences from Anaplasma_marginale_GCF_000215505_merged.bed.  Filtered 17 wi

Screened 882 sequences from Anaplasma_phagocytophilum_GCF_900088665_merged.bed.  Filtered 225 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 657.


## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [216]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# The conf file is a "[beds]" header followed by one "<taxon>:<stripped bed>"
# line per sample.
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [217]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_900088665_merged.bed --twobit results/phyluce/anaplasma/cleaned_genomes/Anaplasma_phagocytophilum_GCF_000013125_formatted.2bit --output results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_900088665_stripped.bed;
anaplasma_marginale_gcf_000011945.
anaplasma_marginale_gcf_000020305.
anaplasma_marginale_gcf_000172475.
anaplasma_marginale_gcf_000172495.
anaplasma_marginale_gcf_000172515.
anaplasma_marginale_gcf_000215485.
anaplasma_marginale_gcf_000215505.
anaplasma_marginale_gcf_000215525.
anaplasma_marginale_gcf_000215545.
anaplasma_marginale_gcf_000215565.
anaplasma_marginale_gcf_000215585.
anaplasma_marginale_gcf_000215605.
anaplasma_marginale_gcf_000495495.
anaplasma_marginale_gcf_000495535.
anaplasma_marginale_gcf_002849365.
anaplasma_marginale_gcf_003331125.
anaplasma_marginale_gcf_003515675.
anaplasma_marg

Quantify probes and the number of targeted taxa for each.

In [218]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_900088665_merged.bed --twobit results/phyluce/anaplasma/cleaned_genomes/Anaplasma_phagocytophilum_GCF_000013125_formatted.2bit --output results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_900088665_stripped.bed;
Screened 882 sequences from Anaplasma_phagocytophilum_GCF_900088665_merged.bed.  Filtered 225 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 657.


In [219]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = str(18)
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/anaplasma/initial_intervals/anaplasma-to-Anaplasma_phagocytophilum_GCF_000013125.sqlite --base-taxon Anaplasma_phagocytophilum_GCF_000013125 --output results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18.bed --specific-counts 18;
Counter({'anaplasma_phagocytophilum_gcf_000964685': 776, 'anaplasma_phagocytophilum_gcf_000964985': 776, 'anaplasma_phagocytophilum_gcf_000964945': 775, 'anaplasma_phagocytophilum_gcf_000968465': 775, 'anaplasma_phagocytophilum_gcf_000968455': 775, 'anaplasma_phagocytophilum_gcf_000964935': 775, 'anaplasma_phagocytophilum_gcf_000964725': 774, 'anaplasma_phagocytophilum_gcf_000965125': 774, 'anaplasma_phagocytophilum_gcf_000689655': 761, 'anaplasma_phagocytophilum_gcf_002849375': 756, 'anaplasma_phagocytophilum_gcf_000964785': 751, 'anaplasma_phagocytophilum_gcf_000964745': 748, 'anaplasma_phagocytophilum_gcf_900088625': 744, 'anaplasma_phagocytophilum_gcf_000689635': 7

## Design temp set of baits

In [220]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/anaplasma/initial_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18.bed --twobit results/phyluce/anaplasma/cleaned_genomes/Anaplasma_phagocytophilum_GCF_000013125_formatted.2bit --buffer-to 160 --output results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18.fasta;
Screened 777 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 777.


Design the baits.

In [221]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18.fasta --probe-prefix uce_anaplasma_ --design anaplasma_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGG


Conserved locus count = 773
Probe Count = 1540


## Find duplicate baited regions

In [222]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas --query results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  13:23:54
Ended:  Fri Feb 07, 2020  13:23:55
Time for execution:  0.0193908691406 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas                        --lastz results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes_vself.lastz                       --probe-prefix=uce_anaplasma_;
Parsing lastz file...
Screening results...
Screened 1539 fasta sequences.  Filtered 148 

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [223]:
# Give every taxon its own sub-directory under cleaned_genomes/ containing a
# copy of its formatted genome named <taxon>.2bit.
for sample in all_taxa:

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [224]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/anaplasma/validate_intervals/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas --scaffoldlist Anaplasma_marginale_GCF_000011945 Anaplasma_marginale_GCF_000020305 Anaplasma_marginale_GCF_000172475 Anaplasma_marginale_GCF_000172495 Anaplasma_marginale_GCF_000172515 Anaplasma_marginale_GCF_000215485 Anaplasma_marginale_GCF_000215505 Anaplasma_marginale_GCF_000215525 Anaplasma_marginale_GCF_000215545 Anaplasma_marginale_GCF_000215565 Anaplasma_marginale_GCF_000215585 Anaplasma_marginale_GCF_000215605 Anaplasma_marginale_GCF_000495495 Anaplasma_marginale_GCF_000495535 Anaplasma_marginale_GCF_002849365 Anaplasma_marginale_GCF_003331125 Anaplasma_marginale_GCF_003515675 Anaplasma_marginale_GCF_003515735 Anaplasma_marginale_GCF_008274665 Anaplasma_marginale_GCF_008690255 Anaplasma_marginale_GCF_008690265 Anaplasma_marginale_GCF_008801275 Anaplasma_marginale_GCF_008801305 Anaplasma_marginale_GCF_008801325 Anaplasma_o

Running the targets against 1 queries...
	/tmp/tmp0NgLLp.fasta

Writing the results file...
	/tmp/tmp8N611q.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_intervals/lastz/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_Anaplasma_marginale_GCF_000215585.lastz
Creating Anaplasma_marginale_GCF_000215585 table
Inserting data to Anaplasma_marginale_GCF_000215585 table

Running against Anaplasma_marginale_GCF_000215605.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp522Hks.fasta

Writing the results file...
	/tmp/tmpuptKFH.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_intervals/lastz/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_Anaplasma_marginale_GCF_000215605.lastz
Creating Anaplasma_marginale_GCF_000215605 table
Inserting data to Anaplasma_marginale_GC

Creating Anaplasma_ovis_GCF_002214625 table
Inserting data to Anaplasma_ovis_GCF_002214625 table

Running against Anaplasma_ovis_GCF_002849345.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp85GxPP.fasta

Writing the results file...
	/tmp/tmpjsYbTD.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_intervals/lastz/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_Anaplasma_ovis_GCF_002849345.lastz
Creating Anaplasma_ovis_GCF_002849345 table
Inserting data to Anaplasma_ovis_GCF_002849345 table

Running against Anaplasma_phagocytophilum_GCA_900088675.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpyotXoG.fasta

Writing the results file...
	/tmp/tmpzlVLge.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_

Creating Anaplasma_phagocytophilum_GCF_000964785 table
Inserting data to Anaplasma_phagocytophilum_GCF_000964785 table

Running against Anaplasma_phagocytophilum_GCF_000964915.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpdG990K.fasta

Writing the results file...
	/tmp/tmphRaFFY.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_intervals/lastz/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_Anaplasma_phagocytophilum_GCF_000964915.lastz
Creating Anaplasma_phagocytophilum_GCF_000964915 table
Inserting data to Anaplasma_phagocytophilum_GCF_000964915 table

Running against Anaplasma_phagocytophilum_GCF_000964935.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpFwewBI.fasta

Writing the results file...
	/tmp/tmpj4eYNQ.lastz
Cleaning up the chunked files...
Cleaning 

Creating Anaplasma_phagocytophilum_GCF_900088645 table
Inserting data to Anaplasma_phagocytophilum_GCF_900088645 table

Running against Anaplasma_phagocytophilum_GCF_900088665.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpiCkIAJ.fasta

Writing the results file...
	/tmp/tmpto8D8I.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/anaplasma/validate_intervals/lastz/Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_Anaplasma_phagocytophilum_GCF_900088665.lastz
Creating Anaplasma_phagocytophilum_GCF_900088665 table
Inserting data to Anaplasma_phagocytophilum_GCF_900088665 table

Running against Anaplasma_phagocytophilum_GCF_000013125.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpLalAcS.fasta

Writing the results file...
	/tmp/tmpNoSVJl.lastz
Cleaning up the chunked files...
Cleaning 

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [225]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/anaplasma/extract_probes_from_group/anaplasma_genome.conf --lastz results/phyluce/anaplasma/validate_intervals/lastz --probes 120 --probe-prefix uce_anaplasma_ --name-pattern "Anaplasma_phagocytophilum_GCF_000013125_+18_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/anaplasma/extract_probes_from_group/probe_fasta;
2020-02-07 13:29:43,212 - Phyluce - INFO - ------ Working on Anaplasma_marginale_GCF_000011945 genome ------
2020-02-07 13:29:43,219 - Phyluce - INFO - Reading Anaplasma_marginale_GCF_000011945 genome
2020-02-07 13:29:44,546 - Phyluce - INFO - Anaplasma_marginale_GCF_000011945: 236 uces, 0 dupes, 236 non-dupes, 41 orient drop, 1 length drop, 194 written
2020-02-07 13:29:44,546 - Phyluce - INFO - ------ Working on Anaplasma_marginale_GCF_000020305 genome ------
2020-02-07 13:29:44,787 - Phyluce - INFO - Reading Anaplasma_marginale_GCF_000020305 genome
2020-02-07 13:29:45,960 - Phyluce - INFO - Anaplas

2020-02-07 13:30:06,909 - Phyluce - INFO - Anaplasma_marginale_GCF_008801305: 233 uces, 31 dupes, 202 non-dupes, 0 orient drop, 2 length drop, 200 written
2020-02-07 13:30:06,909 - Phyluce - INFO - ------ Working on Anaplasma_marginale_GCF_008801325 genome ------
2020-02-07 13:30:06,920 - Phyluce - INFO - Reading Anaplasma_marginale_GCF_008801325 genome
2020-02-07 13:30:07,823 - Phyluce - INFO - Anaplasma_marginale_GCF_008801325: 232 uces, 25 dupes, 207 non-dupes, 0 orient drop, 0 length drop, 207 written
2020-02-07 13:30:07,824 - Phyluce - INFO - --------- Working on Anaplasma_ovis_GCF_002214625 genome --------
2020-02-07 13:30:07,825 - Phyluce - INFO - Reading Anaplasma_ovis_GCF_002214625 genome
2020-02-07 13:30:08,979 - Phyluce - INFO - Anaplasma_ovis_GCF_002214625: 230 uces, 0 dupes, 230 non-dupes, 39 orient drop, 4 length drop, 187 written
2020-02-07 13:30:08,979 - Phyluce - INFO - --------- Working on Anaplasma_ovis_GCF_002849345 genome --------
2020-02-07 13:30:08,999 - Phyluce 

2020-02-07 13:35:51,253 - Phyluce - INFO - Anaplasma_phagocytophilum_GCF_000968465: 772 uces, 134 dupes, 638 non-dupes, 69 orient drop, 67 length drop, 502 written
2020-02-07 13:35:51,254 - Phyluce - INFO - --- Working on Anaplasma_phagocytophilum_GCF_002849375 genome ---
2020-02-07 13:35:51,254 - Phyluce - INFO - Reading Anaplasma_phagocytophilum_GCF_002849375 genome
2020-02-07 13:36:04,064 - Phyluce - INFO - Anaplasma_phagocytophilum_GCF_002849375: 759 uces, 189 dupes, 570 non-dupes, 2 orient drop, 14 length drop, 553 written
2020-02-07 13:36:04,064 - Phyluce - INFO - --- Working on Anaplasma_phagocytophilum_GCF_900000025 genome ---
2020-02-07 13:36:04,114 - Phyluce - INFO - Reading Anaplasma_phagocytophilum_GCF_900000025 genome
2020-02-07 13:36:14,030 - Phyluce - INFO - Anaplasma_phagocytophilum_GCF_900000025: 729 uces, 199 dupes, 530 non-dupes, 2 orient drop, 9 length drop, 515 written
2020-02-07 13:36:14,031 - Phyluce - INFO - --- Working on Anaplasma_phagocytophilum_GCF_900078505

In [226]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/anaplasma/extract_probes_from_group/probe_fasta --output results/phyluce/anaplasma/extract_probes_from_group/multifastas.sqlite --base-taxon Anaplasma_phagocytophilum_GCF_000013125;
anaplasma_marginale_gcf_000011945.
anaplasma_marginale_gcf_000020305.
anaplasma_marginale_gcf_000172475.
anaplasma_marginale_gcf_000172495.
anaplasma_marginale_gcf_000172515.
anaplasma_marginale_gcf_000215485.
anaplasma_marginale_gcf_000215505.
anaplasma_marginale_gcf_000215525.
anaplasma_marginale_gcf_000215545.
anaplasma_marginale_gcf_000215565.
anaplasma_marginale_gcf_000215585.
anaplasma_marginale_gcf_000215605.
anaplasma_marginale_gcf_000495495.
anaplasma_marginale_gcf_000495535.
anaplasma_marginale_gcf_002849365.
anaplasma_marginale_gcf_003331125.
anaplasma_marginale_gcf_003515675.
anaplasma_marginale_gcf_003515735.
anaplasma_marginale_gcf_008274665.
anaplasma_marginale_gcf_008690255.
anaplasma_marginale_gcf_008690265.
anaplasma_marginale_gc

In [227]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(55)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/anaplasma/extract_probes_from_group/multifastas.sqlite  --base-taxon Anaplasma_phagocytophilum_GCF_000013125 --output results/phyluce/anaplasma/extract_probes_from_group/Anaplasma_phagocytophilum_GCF_000013125+18-back-to-55.conf --specific-counts 55;
Counter({'anaplasma_marginale_gcf_008801305': 61, 'anaplasma_ovis_gcf_002214625': 61, 'anaplasma_marginale_gcf_008274665': 61, 'anaplasma_ovis_gcf_002849345': 61, 'anaplasma_marginale_gcf_008801325': 61, 'anaplasma_phagocytophilum_gcf_000964945': 61, 'anaplasma_phagocytophilum_gcf_000478425': 61, 'anaplasma_phagocytophilum_gcf_000478445': 61, 'anaplasma_phagocytophilum_gcf_000964785': 61, 'anaplasma_phagocytophilum_gcf_000968465': 61, 'anaplasma_marginale_gcf_000495535': 61, 'anaplasma_marginale_gcf_000215545': 61, 'anaplasma_phagocytophilum_gcf_000964685': 61, 'anaplasma_marginale_gcf_008801275': 61, 'anaplasma_marginale_gcf_000172475': 61, 'anaplasma_marginale_gcf_003331125': 61,

## Final group specific bait design

In [228]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/anaplasma/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/anaplasma/extract_probes_from_group/Anaplasma_phagocytophilum_GCF_000013125+18-back-to-55.conf --probe-prefix uce_anaplasma_ --designer rnplattii --design anaplasma_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGG


Conserved locus count = 61
Probe Count = 6693


In [229]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.fasta --query results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  13:38:30
Ended:  Fri Feb 07, 2020  13:39:00
Time for execution:  0.506612567107 minutes


In [230]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.fasta --lastz results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_anaplasma_;
Parsing lastz file...
Screening results...
Screened 6692 fasta sequences.  Filtered 0 duplicates. Kept 6693.


## CDhit to reduce numbers

In [231]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.fasta
         -o
         results/phyluce/anaplasma/final_probe_design/anaplasma_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 13:50:40 2020
                            Output                              
----------------------------------------------------------------
total seq: 6693
longest and shortest : 80 and 80
Total letters: 535440
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3953615
Max number of word counting entries: 88995891

# comparing sequences from          0  to       1115
.---------- new table with       83 representatives
# comparing sequences from       1115 

# Bacillus cereus group

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [232]:
# group name; used to build every results/phyluce/<group>/ path below
group = "bacillus"

In [233]:
# wipe any results from a previous run of this group ...
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

# ... and recreate the (now empty) working directory
os.makedirs("results/phyluce/{}".format(group))

In [234]:
group_taxa = [ "Bacillus_albus_GCF_001884185",
               "Bacillus_albus_GCF_003966295",
               "Bacillus_albus_GCF_004116025",
               "Bacillus_albus_GCF_004116035",
               "Bacillus_albus_GCF_004116065",
               "Bacillus_albus_GCF_004116085",
               "Bacillus_albus_GCF_004116415",
               "Bacillus_albus_GCF_004153665",
               "Bacillus_albus_GCF_009740005",
               "Bacillus_anthracis_GCF_000558965",
               "Bacillus_anthracis_GCF_000559005",
               "Bacillus_anthracis_GCF_000697555",
               "Bacillus_anthracis_GCF_000742875",
               "Bacillus_anthracis_GCF_000783055",
               "Bacillus_anthracis_GCF_001883895",
               "Bacillus_anthracis_GCF_002233635",
               "Bacillus_anthracis_GCF_002525695",
               "Bacillus_anthracis_GCF_002525705",
               "Bacillus_bombysepticus_GCF_000831065",
               "Bacillus_cereus_GCF_000161315",
               "Bacillus_cereus_GCF_000290795",
               "Bacillus_cereus_GCF_000291075",
               "Bacillus_cereus_GCF_001583875",
               "Bacillus_cereus_GCF_001584025",
               "Bacillus_cereus_GCF_001683695",
               "Bacillus_cereus_GCF_002530565",
               "Bacillus_cereus_GCF_002552015",
               "Bacillus_cereus_GCF_002560715",
               "Bacillus_cereus_GCF_002565765",
               "Bacillus_cytotoxicus_GCF_000017425",
               "Bacillus_cytotoxicus_GCF_001050335",
               "Bacillus_cytotoxicus_GCF_002250885",
               "Bacillus_cytotoxicus_GCF_002250905",
               "Bacillus_cytotoxicus_GCF_002250925",
               "Bacillus_cytotoxicus_GCF_002250945",
               "Bacillus_cytotoxicus_GCF_002250965",
               "Bacillus_cytotoxicus_GCF_002251005",
               "Bacillus_cytotoxicus_GCF_002251025",
               "Bacillus_cytotoxicus_GCF_002251045",
               "Bacillus_gaemokensis_GCF_000712615",
               "Bacillus_gaemokensis_GCF_001590835",
               "Bacillus_luti_GCF_001884105",
               "Bacillus_luti_GCF_008923665",
               "Bacillus_luti_GCF_009739945",
               "Bacillus_manliponensis_GCF_000712595",
               "Bacillus_mobilis_GCF_001884045",
               "Bacillus_mobilis_GCF_003612955",
               "Bacillus_mobilis_GCF_007681185",
               "Bacillus_mobilis_GCF_007681195",
               "Bacillus_mobilis_GCF_007682195",
               "Bacillus_mycoides_GCF_000018825",
               "Bacillus_mycoides_GCF_000290695",
               "Bacillus_mycoides_GCF_000517985",
               "Bacillus_mycoides_GCF_001044935",
               "Bacillus_mycoides_GCF_001541985",
               "Bacillus_mycoides_GCF_001757825",
               "Bacillus_mycoides_GCF_001757915",
               "Bacillus_mycoides_GCF_001757985",
               "Bacillus_mycoides_GCF_002014505",
               "Bacillus_mycoides_GCF_002014695",
               "Bacillus_nitratireducens_GCF_001884135",
               "Bacillus_nitratireducens_GCF_007676595",
               "Bacillus_nitratireducens_GCF_007676605",
               "Bacillus_nitratireducens_GCF_007681065",
               "Bacillus_nitratireducens_GCF_007681365",
               "Bacillus_nitratireducens_GCF_007682355",
               "Bacillus_pacificus_GCA_009884315",
               "Bacillus_pacificus_GCF_001884025",
               "Bacillus_pacificus_GCF_003858675",
               "Bacillus_pacificus_GCF_006349595",
               "Bacillus_paramycoides_GCF_001884235",
               "Bacillus_paramycoides_GCF_007682005",
               "Bacillus_paranthracis_GCA_009648955",
               "Bacillus_paranthracis_GCA_009873395",
               "Bacillus_paranthracis_GCA_009884335",
               "Bacillus_paranthracis_GCF_001883995",
               "Bacillus_paranthracis_GCF_004307985",
               "Bacillus_paranthracis_GCF_005117095",
               "Bacillus_paranthracis_GCF_005848985",
               "Bacillus_paranthracis_GCF_007682135",
               "Bacillus_paranthracis_GCF_007682155",
               "Bacillus_paranthracis_GCF_009498675",
               "Bacillus_paranthracis_GCF_009799965",
               "Bacillus_proteolyticus_GCF_001884065",
               "Bacillus_pseudomycoides_GCF_000161415",
               "Bacillus_pseudomycoides_GCF_000161435",
               "Bacillus_pseudomycoides_GCF_000746965",
               "Bacillus_pseudomycoides_GCF_002550095",
               "Bacillus_pseudomycoides_GCF_002555615",
               "Bacillus_pseudomycoides_GCF_002555685",
               "Bacillus_pseudomycoides_GCF_002555995",
               "Bacillus_pseudomycoides_GCF_002556205",
               "Bacillus_pseudomycoides_GCF_002557395",
               "Bacillus_pseudomycoides_GCF_002569075",
               "Bacillus_thuringiensis_GCF_000161735",
               "Bacillus_thuringiensis_GCF_000710255",
               "Bacillus_thuringiensis_GCF_001402735",
               "Bacillus_thuringiensis_GCF_001675515",
               "Bacillus_thuringiensis_GCF_002148135",
               "Bacillus_thuringiensis_GCF_002551895",
               "Bacillus_thuringiensis_GCF_002559825",
               "Bacillus_thuringiensis_GCF_002562665",
               "Bacillus_thuringiensis_GCF_002564505",
               "Bacillus_thuringiensis_GCF_002570925",
               "Bacillus_toyonensis_GCA_009799785",
               "Bacillus_toyonensis_GCF_002550925",
               "Bacillus_toyonensis_GCF_002551965",
               "Bacillus_toyonensis_GCF_002554865",
               "Bacillus_toyonensis_GCF_002555905",
               "Bacillus_toyonensis_GCF_002556675",
               "Bacillus_toyonensis_GCF_002570155",
               "Bacillus_toyonensis_GCF_002580355",
               "Bacillus_toyonensis_GCF_002581325",
               "Bacillus_toyonensis_GCF_002581955",
               "Bacillus_tropicus_GCF_001884035",
               "Bacillus_tropicus_GCF_006349625",
               "Bacillus_tropicus_GCF_006349645",
               "Bacillus_tropicus_GCF_006457285",
               "Bacillus_tropicus_GCF_007672275",
               "Bacillus_tropicus_GCF_007676425",
               "Bacillus_tropicus_GCF_007682035",
               "Bacillus_tropicus_GCF_007682405",
               "Bacillus_wiedmannii_GCF_000160955",
               "Bacillus_wiedmannii_GCF_001645395",
               "Bacillus_wiedmannii_GCF_002014635",
               "Bacillus_wiedmannii_GCF_002555555",
               "Bacillus_wiedmannii_GCF_002555735",
               "Bacillus_wiedmannii_GCF_002555855",
               "Bacillus_wiedmannii_GCF_002555945",
               "Bacillus_wiedmannii_GCF_002556175",
               "Bacillus_wiedmannii_GCF_002556405",
               "Bacillus_wiedmannii_GCF_002569595" ]
                    
# Genome that every other taxon in this group is compared against.
reference_taxon = "Bacillus_anthracis_GCF_000008165"

# Full sample list: every taxon in the group followed by the reference taxon.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read the accessions from `data/<group>_accessions.csv`, then find the FTP link for each one, download the genome, and rename the file to the standard format `Genus_species_GCA_123456789`.

For sanity's sake, I have saved all of the FTP info in a CSV file stored at `data/genomes/<group>/ftp_info.csv`.

In [235]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001884185.1_ASM188418v1_genomic.fna.gz

sent 42 bytes  received 1711674 bytes  380381.33 bytes/sec
total size is 1711144  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipli

GCA_000558965.1_Ba8903G_1.0_genomic.fna.gz

sent 42 bytes  received 1612544 bytes  460738.86 bytes/sec
total size is 1612038  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000559005.1_Ba52G_1.0_genomic.fna.gz

sent 42 bytes  received 1613112 bytes  460901.14 bytes/sec
total size is 1612608  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, 

GCA_000290795.1_Baci_cere_VD154_V1_genomic.fna.gz

sent 42 bytes  received 1814030 bytes  213420.24 bytes/sec
total size is 1813469  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000291075.1_Baci_cere_HuA4-10_V1_genomic.fna.gz

sent 42 bytes  received 1693351 bytes  483826.57 bytes/sec
total size is 1692820  speedup is 1.00


You are accessing a U.S. Government information system which includ

GCA_002250885.2_ASM225088v2_genomic.fna.gz

sent 42 bytes  received 1271918 bytes  363417.14 bytes/sec
total size is 1271500  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002250905.2_ASM225090v2_genomic.fna.gz

sent 42 bytes  received 1271118 bytes  363188.57 bytes/sec
total size is 1270700  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_008923665.1_ASM892366v1_genomic.fna.gz

sent 42 bytes  received 1632111 bytes  652861.20 bytes/sec
total size is 1631605  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_009739945.1_ASM973994v1_genomic.fna.gz

sent 42 bytes  received 1550593 bytes  443038.57 bytes/sec
total size is 1550103  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_001044935.1_ASM104493v1_genomic.fna.gz

sent 42 bytes  received 1785317 bytes  510102.57 bytes/sec
total size is 1784771  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001541985.1_ASM154198v1_genomic.fna.gz

sent 42 bytes  received 1748306 bytes  499528.00 bytes/sec
total size is 1747768  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_007681365.1_ASM768136v1_genomic.fna.gz

sent 42 bytes  received 1719053 bytes  382021.11 bytes/sec
total size is 1718523  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_007682355.1_ASM768235v1_genomic.fna.gz

sent 42 bytes  received 1730813 bytes  203630.00 bytes/sec
total size is 1730283  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_001883995.1_ASM188399v1_genomic.fna.gz

sent 42 bytes  received 1624670 bytes  361047.11 bytes/sec
total size is 1624164  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_004307985.1_ASM430798v1_genomic.fna.gz

sent 42 bytes  received 1219474 bytes  128370.11 bytes/sec
total size is 1219064  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_000746965.1_BHU_1_genomic.fna.gz

sent 42 bytes  received 1590023 bytes  244625.38 bytes/sec
total size is 1589531  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002550095.1_ASM255009v1_genomic.fna.gz

sent 42 bytes  received 1680194 bytes  305497.45 bytes/sec
total size is 1679672  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_001675515.1_ASM167551v1_genomic.fna.gz

sent 42 bytes  received 1834626 bytes  333576.00 bytes/sec
total size is 1834072  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002148135.1_ASM214813v1_genomic.fna.gz

sent 42 bytes  received 1766187 bytes  271727.54 bytes/sec
total size is 1765649  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_002555905.1_ASM255590v1_genomic.fna.gz

sent 42 bytes  received 1756276 bytes  206625.65 bytes/sec
total size is 1755738  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002556675.1_ASM255667v1_genomic.fna.gz

sent 42 bytes  received 1745341 bytes  387862.89 bytes/sec
total size is 1744803  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_007676425.1_ASM767642v1_genomic.fna.gz

sent 42 bytes  received 1531947 bytes  1021326.00 bytes/sec
total size is 1531465  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_007682035.1_ASM768203v1_genomic.fna.gz

sent 42 bytes  received 1781098 bytes  1187426.67 bytes/sec
total size is 1780552  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_002556405.1_ASM255640v1_genomic.fna.gz

sent 42 bytes  received 1542952 bytes  1028662.67 bytes/sec
total size is 1542462  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002569595.1_ASM256959v1_genomic.fna.gz

sent 42 bytes  received 1774754 bytes  709918.40 bytes/sec
total size is 1774208  speedup is 1.00


## Clean up and format genome

Further process the genomes so that each FASTA header contains only the sequence accession, and then convert each genome to 2bit format.  Both are requirements of the `phyluce` pipeline.

In [236]:
# Start from an empty output directory so files from an earlier run never linger.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite every record with its name and description blanked so that only
    # the record id is left in the FASTA header.
    # Plain "r" is used because the "U" mode flag is deprecated and was removed
    # in Python 3.11.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA (not the raw download) so the
    # "_formatted" .fas and .2bit files describe the same records, and raise
    # if faToTwoBit fails instead of silently leaving a missing/partial 2bit.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10`) for this group because, with 100 bp reads, it should provide roughly 10 bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine, but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [237]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485056 ("sim_Bacillus_albus_GCF_001884185") has been submitted
Your job 5485057 ("sim_Bacillus_albus_GCF_003966295") has been submitted
Your job 5485058 ("sim_Bacillus_albus_GCF_004116025") has been submitted
Your job 5485059 ("sim_Bacillus_albus_GCF_004116035") has been submitted
Your job 5485060 ("sim_Bacillus_albus_GCF_004116065") has been submitted
Your job 5485061 ("sim_Bacillus_albus_GCF_004116085") has been submitted
Your job 5485062 ("sim_Bacillus_albus_GCF_004116415") has been submitted
Your job 5485063 ("sim_Bacillus_albus_GCF_004153665") has been submitted
Your job 5485064 ("sim_Bacillus_albus_GCF_009740005") has been submitted
Your job 5485065 ("sim_Bacillus_anthracis_GCF_000558965") has been submitted
Your job 5485066 ("sim_Bacillus_anthracis_GCF_000559005") has been submitted
Your job 5485067 ("sim_Bacillus_anthracis_GCF_000697555") has been submitted
Your job 5485068 ("sim_Bacillus_anthracis_GCF_000742875") has been submitted
Your job 5485069 ("sim_Bacillus_ant

Your job 5485162 ("sim_Bacillus_toyonensis_GCF_002551965") has been submitted
Your job 5485163 ("sim_Bacillus_toyonensis_GCF_002554865") has been submitted
Your job 5485164 ("sim_Bacillus_toyonensis_GCF_002555905") has been submitted
Your job 5485165 ("sim_Bacillus_toyonensis_GCF_002556675") has been submitted
Your job 5485166 ("sim_Bacillus_toyonensis_GCF_002570155") has been submitted
Your job 5485167 ("sim_Bacillus_toyonensis_GCF_002580355") has been submitted
Your job 5485168 ("sim_Bacillus_toyonensis_GCF_002581325") has been submitted
Your job 5485169 ("sim_Bacillus_toyonensis_GCF_002581955") has been submitted
Your job 5485170 ("sim_Bacillus_tropicus_GCF_001884035") has been submitted
Your job 5485171 ("sim_Bacillus_tropicus_GCF_006349625") has been submitted
Your job 5485172 ("sim_Bacillus_tropicus_GCF_006349645") has been submitted
Your job 5485173 ("sim_Bacillus_tropicus_GCF_006457285") has been submitted
Your job 5485174 ("sim_Bacillus_tropicus_GCF_007672275") has been submit

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [238]:
# Recreate the mapping directory (and its logs/ subdirectory) from scratch.
map_reads_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_reads_dir):
    shutil.rmtree(map_reads_dir)

os.makedirs(map_reads_dir + "/logs")

# Index the cleaned reference genome with bbmap; the index is written under o_dir.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_reads_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# last expression of the cell: the bbmap.sh exit status is displayed as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/bacillus/cleaned_genomes/Bacillus_anthracis_GCF_000008165_formatted.fas path=results/phyluce/bacillus/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and the simulated reads.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [239]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485188 ("map_Bacillus_albus_GCF_001884185") has been submitted
Your job 5485189 ("map_Bacillus_albus_GCF_003966295") has been submitted
Your job 5485190 ("map_Bacillus_albus_GCF_004116025") has been submitted
Your job 5485191 ("map_Bacillus_albus_GCF_004116035") has been submitted
Your job 5485192 ("map_Bacillus_albus_GCF_004116065") has been submitted
Your job 5485193 ("map_Bacillus_albus_GCF_004116085") has been submitted
Your job 5485194 ("map_Bacillus_albus_GCF_004116415") has been submitted
Your job 5485195 ("map_Bacillus_albus_GCF_004153665") has been submitted
Your job 5485196 ("map_Bacillus_albus_GCF_009740005") has been submitted
Your job 5485197 ("map_Bacillus_anthracis_GCF_000558965") has been submitted
Your job 5485198 ("map_Bacillus_anthracis_GCF_000559005") has been submitted
Your job 5485199 ("map_Bacillus_anthracis_GCF_000697555") has been submitted
Your job 5485200 ("map_Bacillus_anthracis_GCF_000742875") has been submitted
Your job 5485201 ("map_Bacillus_ant

Your job 5485294 ("map_Bacillus_toyonensis_GCF_002551965") has been submitted
Your job 5485295 ("map_Bacillus_toyonensis_GCF_002554865") has been submitted
Your job 5485296 ("map_Bacillus_toyonensis_GCF_002555905") has been submitted
Your job 5485297 ("map_Bacillus_toyonensis_GCF_002556675") has been submitted
Your job 5485298 ("map_Bacillus_toyonensis_GCF_002570155") has been submitted
Your job 5485299 ("map_Bacillus_toyonensis_GCF_002580355") has been submitted
Your job 5485300 ("map_Bacillus_toyonensis_GCF_002581325") has been submitted
Your job 5485301 ("map_Bacillus_toyonensis_GCF_002581955") has been submitted
Your job 5485302 ("map_Bacillus_tropicus_GCF_001884035") has been submitted
Your job 5485303 ("map_Bacillus_tropicus_GCF_006349625") has been submitted
Your job 5485304 ("map_Bacillus_tropicus_GCF_006349645") has been submitted
Your job 5485305 ("map_Bacillus_tropicus_GCF_006457285") has been submitted
Your job 5485306 ("map_Bacillus_tropicus_GCF_007672275") has been submit

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [240]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485320 ("merge_Bacillus_albus_GCF_001884185") has been submitted
Your job 5485321 ("merge_Bacillus_albus_GCF_003966295") has been submitted
Your job 5485322 ("merge_Bacillus_albus_GCF_004116025") has been submitted
Your job 5485323 ("merge_Bacillus_albus_GCF_004116035") has been submitted
Your job 5485324 ("merge_Bacillus_albus_GCF_004116065") has been submitted
Your job 5485325 ("merge_Bacillus_albus_GCF_004116085") has been submitted
Your job 5485326 ("merge_Bacillus_albus_GCF_004116415") has been submitted
Your job 5485327 ("merge_Bacillus_albus_GCF_004153665") has been submitted
Your job 5485328 ("merge_Bacillus_albus_GCF_009740005") has been submitted
Your job 5485329 ("merge_Bacillus_anthracis_GCF_000558965") has been submitted
Your job 5485330 ("merge_Bacillus_anthracis_GCF_000559005") has been submitted
Your job 5485331 ("merge_Bacillus_anthracis_GCF_000697555") has been submitted
Your job 5485332 ("merge_Bacillus_anthracis_GCF_000742875") has been submitted
Your job 

Your job 5485423 ("merge_Bacillus_thuringiensis_GCF_002570925") has been submitted
Your job 5485424 ("merge_Bacillus_toyonensis_GCA_009799785") has been submitted
Your job 5485425 ("merge_Bacillus_toyonensis_GCF_002550925") has been submitted
Your job 5485426 ("merge_Bacillus_toyonensis_GCF_002551965") has been submitted
Your job 5485427 ("merge_Bacillus_toyonensis_GCF_002554865") has been submitted
Your job 5485428 ("merge_Bacillus_toyonensis_GCF_002555905") has been submitted
Your job 5485429 ("merge_Bacillus_toyonensis_GCF_002556675") has been submitted
Your job 5485430 ("merge_Bacillus_toyonensis_GCF_002570155") has been submitted
Your job 5485431 ("merge_Bacillus_toyonensis_GCF_002580355") has been submitted
Your job 5485432 ("merge_Bacillus_toyonensis_GCF_002581325") has been submitted
Your job 5485433 ("merge_Bacillus_toyonensis_GCF_002581955") has been submitted
Your job 5485434 ("merge_Bacillus_tropicus_GCF_001884035") has been submitted
Your job 5485435 ("merge_Bacillus_tropi

Remove loci that were masked in the original genome.

In [241]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 3525 sequences from Bacillus_albus_GCF_001884185_merged.bed.  Filtered 566 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2959.
Screened 3372 sequences from Bacillus_albus_GCF_003966295_merged.bed.  Filtered 562 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2810.
Screened 3563 sequences from Bacillus_albus_GCF_004116025_merged.bed.  Filtered 584 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2979.
Screened 3530 sequences from Bacillus_albus_GCF_004116035_merged.bed.  Filtered 580 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2950.
Screened 3566 sequences from Bacillus_albus_GCF_004116065_merged.bed.  Filtered 574 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2992.
Screened 3547 sequences from Bacillus_albus_GCF_004116085_merged.bed.  Filtered 575 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2972.
Screened 3562 sequences from Bacillus_albus_GCF_004116415_merged.bed.  Filtered 59

Screened 7190 sequences from Bacillus_mycoides_GCF_001044935_merged.bed.  Filtered 2061 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 5129.
Screened 7300 sequences from Bacillus_mycoides_GCF_001541985_merged.bed.  Filtered 2035 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 5265.
Screened 6945 sequences from Bacillus_mycoides_GCF_001757825_merged.bed.  Filtered 2067 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4878.
Screened 6885 sequences from Bacillus_mycoides_GCF_001757915_merged.bed.  Filtered 1992 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4893.
Screened 7014 sequences from Bacillus_mycoides_GCF_001757985_merged.bed.  Filtered 2080 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4934.
Screened 7203 sequences from Bacillus_mycoides_GCF_002014505_merged.bed.  Filtered 2033 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 5170.
Screened 7195 sequences from Bacillus_mycoides_GCF_0020146

Screened 6269 sequences from Bacillus_toyonensis_GCA_009799785_merged.bed.  Filtered 1456 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4813.
Screened 6384 sequences from Bacillus_toyonensis_GCF_002550925_merged.bed.  Filtered 1485 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4899.
Screened 6391 sequences from Bacillus_toyonensis_GCF_002551965_merged.bed.  Filtered 1489 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4902.
Screened 6342 sequences from Bacillus_toyonensis_GCF_002554865_merged.bed.  Filtered 1513 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4829.
Screened 6396 sequences from Bacillus_toyonensis_GCF_002555905_merged.bed.  Filtered 1491 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4905.
Screened 6325 sequences from Bacillus_toyonensis_GCF_002556675_merged.bed.  Filtered 1448 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4877.
Screened 6314 sequences from Bacillus_toyonens

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [242]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# Write the phyluce conf file: a "[beds]" header followed by one
# "<taxon>:<path to stripped bed>" line per taxon in the group.
with open(o_bed_conf, "w") as outfile:

    outfile.write("[beds]\n")

    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [243]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/bacillus/initial_intervals/Bacillus_wiedmannii_GCF_002569595_merged.bed --twobit results/phyluce/bacillus/cleaned_genomes/Bacillus_anthracis_GCF_000008165_formatted.2bit --output results/phyluce/bacillus/initial_intervals/Bacillus_wiedmannii_GCF_002569595_stripped.bed;
bacillus_albus_gcf_001884185...
bacillus_albus_gcf_003966295...
bacillus_albus_gcf_004116025...
bacillus_albus_gcf_004116035...
bacillus_albus_gcf_004116065...
bacillus_albus_gcf_004116085...
bacillus_albus_gcf_004116415...
bacillus_albus_gcf_004153665.....
bacillus_albus_gcf_009740005...
bacillus_anthracis_gcf_000558965.
bacillus_anthracis_gcf_000559005.
bacillus_anthracis_gcf_000697555.
bacillus_anthracis_gcf_000742875.
bacillus_anthracis_gcf_000783055.
bacillus_anthracis_gcf_001883895..
bacillus_anthracis_gcf_002233635.
bacillus_anthracis_gcf_002525695.
bacillus_anthracis_gcf_002525705.
bacillus_bombysepticus_gcf_0008310

Quantify probes and the number of targeted taxa for each.

In [244]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/bacillus/initial_intervals/Bacillus_wiedmannii_GCF_002569595_merged.bed --twobit results/phyluce/bacillus/cleaned_genomes/Bacillus_anthracis_GCF_000008165_formatted.2bit --output results/phyluce/bacillus/initial_intervals/Bacillus_wiedmannii_GCF_002569595_stripped.bed;
Loci shared by Bacillus_anthracis_GCF_000008165 + 0 taxa:	4,391.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 1 taxa:	4,391.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 2 taxa:	4,345.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 3 taxa:	4,284.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 4 taxa:	4,253.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 5 taxa:	4,227.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 6 taxa:	4,203.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 7 taxa:	4,187.0
Loci shared by Bacillus_anthracis_GCF_000008165 + 8 taxa:	4,163.0
Loci shared by Bacillus_anthracis_GCF

In [245]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 132
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/bacillus/initial_intervals/bacillus-to-Bacillus_anthracis_GCF_000008165.sqlite --base-taxon Bacillus_anthracis_GCF_000008165 --output results/phyluce/bacillus/initial_intervals/Bacillus_anthracis_GCF_000008165_+132.bed --specific-counts 132;
Counter({'bacillus_cereus_gcf_000290795': 129, 'bacillus_albus_gcf_004116415': 129, 'bacillus_paranthracis_gca_009873395': 129, 'bacillus_pseudomycoides_gcf_002556205': 129, 'bacillus_mycoides_gcf_001757985': 129, 'bacillus_thuringiensis_gcf_002564505': 129, 'bacillus_anthracis_gcf_000697555': 129, 'bacillus_nitratireducens_gcf_007676605': 129, 'bacillus_paranthracis_gcf_009799965': 129, 'bacillus_albus_gcf_003966295': 129, 'bacillus_thuringiensis_gcf_000161735': 129, 'bacillus_cereus_gcf_002530565': 129, 'bacillus_nitratireducens_gcf_007681065': 129, 'bacillus_cereus_gcf_000161315': 129, 'bacillus_nitratireducens_gcf_007676595': 129, 'bacillus_luti_gcf_001884105': 129, 'bacillus_cereus_gcf

## Design temp set of baits

In [246]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/bacillus/initial_intervals/Bacillus_anthracis_GCF_000008165_+132.bed --twobit results/phyluce/bacillus/cleaned_genomes/Bacillus_anthracis_GCF_000008165_formatted.2bit --buffer-to 160 --output results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132.fasta;
Screened 129 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 129.


Design the baits.

In [247]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132.fasta --probe-prefix uce_bacillus_ --design bacillus_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 121
Probe Count = 235


## Find duplicate baited regions

In [248]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas --query results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  15:31:37
Ended:  Fri Feb 07, 2020  15:31:37
Time for execution:  0.00862280130386 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas                        --lastz results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes_vself.lastz                       --probe-prefix=uce_bacillus_;
Parsing lastz file...
Screening results...
Screened 234 fasta sequences.  Filtered 0 duplicates. Kept 235.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [249]:
# Stage each genome as cleaned_genomes/<taxon>/<taxon>.2bit by copying the
# formatted 2bit file into a freshly created per-taxon directory.
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # discard any directory left over from a previous run before recreating it
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [250]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/bacillus/validate_intervals/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas --scaffoldlist Bacillus_albus_GCF_001884185 Bacillus_albus_GCF_003966295 Bacillus_albus_GCF_004116025 Bacillus_albus_GCF_004116035 Bacillus_albus_GCF_004116065 Bacillus_albus_GCF_004116085 Bacillus_albus_GCF_004116415 Bacillus_albus_GCF_004153665 Bacillus_albus_GCF_009740005 Bacillus_anthracis_GCF_000558965 Bacillus_anthracis_GCF_000559005 Bacillus_anthracis_GCF_000697555 Bacillus_anthracis_GCF_000742875 Bacillus_anthracis_GCF_000783055 Bacillus_anthracis_GCF_001883895 Bacillus_anthracis_GCF_002233635 Bacillus_anthracis_GCF_002525695 Bacillus_anthracis_GCF_002525705 Bacillus_bombysepticus_GCF_000831065 Bacillus_cereus_GCF_000161315 Bacillus_cereus_GCF_000290795 Bacillus_cereus_GCF_000291075 Bacillus_cereus_GCF_001583875 Bacillus_cereus_GCF_001584025 Bacillus_cereus_GCF_001683695 Bacillus_cereus_GCF_002530565 Bacillus_cereus_GCF_002552015

Running the targets against 1 queries...
	/tmp/tmpfb84Gn.fasta

Writing the results file...
	/tmp/tmpzwphgF.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_albus_GCF_004116415.lastz
Creating Bacillus_albus_GCF_004116415 table
Inserting data to Bacillus_albus_GCF_004116415 table

Running against Bacillus_albus_GCF_004153665.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp12Yq1B.fasta

Writing the results file...
	/tmp/tmp55cyeO.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_albus_GCF_004153665.lastz
Creating Bacillus_albus_GCF_004153665 table
Inserting data to Bacillus_albus_GCF_004153665 table

Running against Bacillus_albus

Running the targets against 1 queries...
	/tmp/tmpWUlXm5.fasta

Writing the results file...
	/tmp/tmpaEM49w.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_cereus_GCF_000291075.lastz
Creating Bacillus_cereus_GCF_000291075 table
Inserting data to Bacillus_cereus_GCF_000291075 table

Running against Bacillus_cereus_GCF_001583875.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp5Sp_Hu.fasta

Writing the results file...
	/tmp/tmpDTSQBX.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_cereus_GCF_001583875.lastz
Creating Bacillus_cereus_GCF_001583875 table
Inserting data to Bacillus_cereus_GCF_001583875 table

Running against Bacillu


Running against Bacillus_cytotoxicus_GCF_002251005.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpNYMhGr.fasta

Writing the results file...
	/tmp/tmpkDkC4q.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_cytotoxicus_GCF_002251005.lastz
Creating Bacillus_cytotoxicus_GCF_002251005 table
Inserting data to Bacillus_cytotoxicus_GCF_002251005 table

Running against Bacillus_cytotoxicus_GCF_002251025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpDhJ1UL.fasta

Writing the results file...
	/tmp/tmpnx12b2.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_cytot


Running against Bacillus_mycoides_GCF_000290695.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpKoaAKO.fasta

Writing the results file...
	/tmp/tmpu0EuU0.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_mycoides_GCF_000290695.lastz
Creating Bacillus_mycoides_GCF_000290695 table
Inserting data to Bacillus_mycoides_GCF_000290695 table

Running against Bacillus_mycoides_GCF_000517985.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpqRX6Sv.fasta

Writing the results file...
	/tmp/tmpbhIe7g.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_mycoides_GCF_0005179

Inserting data to Bacillus_nitratireducens_GCF_007682355 table

Running against Bacillus_pacificus_GCA_009884315.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp0zY48V.fasta

Writing the results file...
	/tmp/tmp5cvgIv.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_pacificus_GCA_009884315.lastz
Creating Bacillus_pacificus_GCA_009884315 table
Inserting data to Bacillus_pacificus_GCA_009884315 table

Running against Bacillus_pacificus_GCF_001884025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp4fTUjG.fasta

Writing the results file...
	/tmp/tmpWae4Dw.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthraci

Inserting data to Bacillus_paranthracis_GCF_007682155 table

Running against Bacillus_paranthracis_GCF_009498675.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp5PXdbB.fasta

Writing the results file...
	/tmp/tmpTnO4Ho.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_paranthracis_GCF_009498675.lastz
Creating Bacillus_paranthracis_GCF_009498675 table
Inserting data to Bacillus_paranthracis_GCF_009498675 table

Running against Bacillus_paranthracis_GCF_009799965.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpPFQRN7.fasta

Writing the results file...
	/tmp/tmp9sE_VY.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacil

Running the targets against 1 queries...
	/tmp/tmpvIqjmY.fasta

Writing the results file...
	/tmp/tmpmiuwC1.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_thuringiensis_GCF_000710255.lastz
Creating Bacillus_thuringiensis_GCF_000710255 table
Inserting data to Bacillus_thuringiensis_GCF_000710255 table

Running against Bacillus_thuringiensis_GCF_001402735.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpxYQ2Qj.fasta

Writing the results file...
	/tmp/tmpOUvaXY.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_thuringiensis_GCF_001402735.lastz
Creating Bacillus_thuringiensis_GCF_001402735 table
Inserting data to Bacillus_thuringie

Inserting data to Bacillus_toyonensis_GCF_002556675 table

Running against Bacillus_toyonensis_GCF_002570155.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpThaBk3.fasta

Writing the results file...
	/tmp/tmpnWfwG9.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_toyonensis_GCF_002570155.lastz
Creating Bacillus_toyonensis_GCF_002570155 table
Inserting data to Bacillus_toyonensis_GCF_002570155 table

Running against Bacillus_toyonensis_GCF_002580355.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpzsEMRg.fasta

Writing the results file...
	/tmp/tmplJTiRv.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthraci


Running against Bacillus_wiedmannii_GCF_002555555.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpRWHajo.fasta

Writing the results file...
	/tmp/tmpkoRYj1.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_wiedmannii_GCF_002555555.lastz
Creating Bacillus_wiedmannii_GCF_002555555 table
Inserting data to Bacillus_wiedmannii_GCF_002555555 table

Running against Bacillus_wiedmannii_GCF_002555735.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp95UrNL.fasta

Writing the results file...
	/tmp/tmp2K6HHE.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bacillus/validate_intervals/lastz/Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_Bacillus_wiedmannii

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [251]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/bacillus/extract_probes_from_group/bacillus_genome.conf --lastz results/phyluce/bacillus/validate_intervals/lastz --probes 120 --probe-prefix uce_bacillus_ --name-pattern "Bacillus_anthracis_GCF_000008165_+132_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/bacillus/extract_probes_from_group/probe_fasta;
2020-02-07 15:38:53,419 - Phyluce - INFO - --------- Working on Bacillus_albus_GCF_001884185 genome --------
2020-02-07 15:38:53,419 - Phyluce - INFO - Reading Bacillus_albus_GCF_001884185 genome
2020-02-07 15:38:53,957 - Phyluce - INFO - Bacillus_albus_GCF_001884185: 121 uces, 10 dupes, 111 non-dupes, 0 orient drop, 0 length drop, 111 written
2020-02-07 15:38:53,958 - Phyluce - INFO - --------- Working on Bacillus_albus_GCF_003966295 genome --------
2020-02-07 15:38:53,962 - Phyluce - INFO - Reading Bacillus_albus_GCF_003966295 genome
2020-02-07 15:38:54,428 - Phyluce - INFO - Bacillus_albus_GCF_003966295: 120

2020-02-07 15:39:04,053 - Phyluce - INFO - Bacillus_cereus_GCF_001583875: 121 uces, 8 dupes, 113 non-dupes, 0 orient drop, 1 length drop, 112 written
2020-02-07 15:39:04,054 - Phyluce - INFO - -------- Working on Bacillus_cereus_GCF_001584025 genome --------
2020-02-07 15:39:04,087 - Phyluce - INFO - Reading Bacillus_cereus_GCF_001584025 genome
2020-02-07 15:39:04,580 - Phyluce - INFO - Bacillus_cereus_GCF_001584025: 121 uces, 11 dupes, 110 non-dupes, 0 orient drop, 1 length drop, 109 written
2020-02-07 15:39:04,580 - Phyluce - INFO - -------- Working on Bacillus_cereus_GCF_001683695 genome --------
2020-02-07 15:39:04,610 - Phyluce - INFO - Reading Bacillus_cereus_GCF_001683695 genome
2020-02-07 15:39:05,080 - Phyluce - INFO - Bacillus_cereus_GCF_001683695: 120 uces, 11 dupes, 109 non-dupes, 0 orient drop, 0 length drop, 109 written
2020-02-07 15:39:05,080 - Phyluce - INFO - -------- Working on Bacillus_cereus_GCF_002530565 genome --------
2020-02-07 15:39:05,081 - Phyluce - INFO - Re

2020-02-07 15:39:15,285 - Phyluce - INFO - Bacillus_mobilis_GCF_003612955: 121 uces, 0 dupes, 121 non-dupes, 7 orient drop, 5 length drop, 109 written
2020-02-07 15:39:15,285 - Phyluce - INFO - -------- Working on Bacillus_mobilis_GCF_007681185 genome -------
2020-02-07 15:39:15,286 - Phyluce - INFO - Reading Bacillus_mobilis_GCF_007681185 genome
2020-02-07 15:39:15,762 - Phyluce - INFO - Bacillus_mobilis_GCF_007681185: 120 uces, 9 dupes, 111 non-dupes, 0 orient drop, 0 length drop, 111 written
2020-02-07 15:39:15,763 - Phyluce - INFO - -------- Working on Bacillus_mobilis_GCF_007681195 genome -------
2020-02-07 15:39:15,792 - Phyluce - INFO - Reading Bacillus_mobilis_GCF_007681195 genome
2020-02-07 15:39:16,303 - Phyluce - INFO - Bacillus_mobilis_GCF_007681195: 121 uces, 13 dupes, 108 non-dupes, 0 orient drop, 1 length drop, 107 written
2020-02-07 15:39:16,303 - Phyluce - INFO - -------- Working on Bacillus_mobilis_GCF_007682195 genome -------
2020-02-07 15:39:16,304 - Phyluce - INFO 

2020-02-07 15:39:26,331 - Phyluce - INFO - Bacillus_paramycoides_GCF_001884235: 121 uces, 12 dupes, 109 non-dupes, 0 orient drop, 1 length drop, 108 written
2020-02-07 15:39:26,332 - Phyluce - INFO - ----- Working on Bacillus_paramycoides_GCF_007682005 genome -----
2020-02-07 15:39:26,333 - Phyluce - INFO - Reading Bacillus_paramycoides_GCF_007682005 genome
2020-02-07 15:39:26,810 - Phyluce - INFO - Bacillus_paramycoides_GCF_007682005: 121 uces, 10 dupes, 111 non-dupes, 0 orient drop, 1 length drop, 110 written
2020-02-07 15:39:26,810 - Phyluce - INFO - ----- Working on Bacillus_paranthracis_GCA_009648955 genome -----
2020-02-07 15:39:26,814 - Phyluce - INFO - Reading Bacillus_paranthracis_GCA_009648955 genome
2020-02-07 15:39:27,331 - Phyluce - INFO - Bacillus_paranthracis_GCA_009648955: 121 uces, 1 dupes, 120 non-dupes, 5 orient drop, 7 length drop, 108 written
2020-02-07 15:39:27,331 - Phyluce - INFO - ----- Working on Bacillus_paranthracis_GCA_009873395 genome -----
2020-02-07 15:3

2020-02-07 15:39:36,540 - Phyluce - INFO - Reading Bacillus_pseudomycoides_GCF_002569075 genome
2020-02-07 15:39:36,956 - Phyluce - INFO - Bacillus_pseudomycoides_GCF_002569075: 119 uces, 10 dupes, 109 non-dupes, 0 orient drop, 0 length drop, 109 written
2020-02-07 15:39:36,956 - Phyluce - INFO - ----- Working on Bacillus_thuringiensis_GCF_000161735 genome ----
2020-02-07 15:39:36,969 - Phyluce - INFO - Reading Bacillus_thuringiensis_GCF_000161735 genome
2020-02-07 15:39:37,422 - Phyluce - INFO - Bacillus_thuringiensis_GCF_000161735: 121 uces, 0 dupes, 121 non-dupes, 9 orient drop, 3 length drop, 109 written
2020-02-07 15:39:37,422 - Phyluce - INFO - ----- Working on Bacillus_thuringiensis_GCF_000710255 genome ----
2020-02-07 15:39:37,423 - Phyluce - INFO - Reading Bacillus_thuringiensis_GCF_000710255 genome
2020-02-07 15:39:37,876 - Phyluce - INFO - Bacillus_thuringiensis_GCF_000710255: 120 uces, 10 dupes, 110 non-dupes, 0 orient drop, 0 length drop, 110 written
2020-02-07 15:39:37,87

2020-02-07 15:39:47,480 - Phyluce - INFO - Bacillus_tropicus_GCF_006349645: 121 uces, 11 dupes, 110 non-dupes, 0 orient drop, 2 length drop, 108 written
2020-02-07 15:39:47,480 - Phyluce - INFO - ------- Working on Bacillus_tropicus_GCF_006457285 genome -------
2020-02-07 15:39:47,481 - Phyluce - INFO - Reading Bacillus_tropicus_GCF_006457285 genome
2020-02-07 15:39:47,992 - Phyluce - INFO - Bacillus_tropicus_GCF_006457285: 121 uces, 0 dupes, 121 non-dupes, 10 orient drop, 4 length drop, 107 written
2020-02-07 15:39:47,992 - Phyluce - INFO - ------- Working on Bacillus_tropicus_GCF_007672275 genome -------
2020-02-07 15:39:47,993 - Phyluce - INFO - Reading Bacillus_tropicus_GCF_007672275 genome
2020-02-07 15:39:48,421 - Phyluce - INFO - Bacillus_tropicus_GCF_007672275: 121 uces, 9 dupes, 112 non-dupes, 0 orient drop, 4 length drop, 108 written
2020-02-07 15:39:48,422 - Phyluce - INFO - ------- Working on Bacillus_tropicus_GCF_007676425 genome -------
2020-02-07 15:39:48,422 - Phyluce -

In [252]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/bacillus/extract_probes_from_group/probe_fasta --output results/phyluce/bacillus/extract_probes_from_group/multifastas.sqlite --base-taxon Bacillus_anthracis_GCF_000008165;
bacillus_albus_gcf_001884185.
bacillus_albus_gcf_003966295.
bacillus_albus_gcf_004116025.
bacillus_albus_gcf_004116035.
bacillus_albus_gcf_004116065.
bacillus_albus_gcf_004116085.
bacillus_albus_gcf_004116415.
bacillus_albus_gcf_004153665.
bacillus_albus_gcf_009740005.
bacillus_anthracis_gcf_000558965.
bacillus_anthracis_gcf_000559005.
bacillus_anthracis_gcf_000697555.
bacillus_anthracis_gcf_000742875.
bacillus_anthracis_gcf_000783055.
bacillus_anthracis_gcf_001883895.
bacillus_anthracis_gcf_002233635.
bacillus_anthracis_gcf_002525695.
bacillus_anthracis_gcf_002525705.
bacillus_bombysepticus_gcf_000831065.
bacillus_cereus_gcf_000161315.
bacillus_cereus_gcf_000290795.
bacillus_cereus_gcf_000291075.
bacillus_cereus_gcf_001583875.
bacillus_cereus_gcf_00158402

In [253]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(133)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/bacillus/extract_probes_from_group/multifastas.sqlite  --base-taxon Bacillus_anthracis_GCF_000008165 --output results/phyluce/bacillus/extract_probes_from_group/Bacillus_anthracis_GCF_000008165+132-back-to-133.conf --specific-counts 133;
Counter({'bacillus_pseudomycoides_gcf_002555685': 76, 'bacillus_albus_gcf_004116415': 76, 'bacillus_paranthracis_gca_009873395': 76, 'bacillus_pseudomycoides_gcf_002556205': 76, 'bacillus_mycoides_gcf_001757985': 76, 'bacillus_thuringiensis_gcf_002564505': 76, 'bacillus_anthracis_gcf_000697555': 76, 'bacillus_nitratireducens_gcf_007676605': 76, 'bacillus_paranthracis_gcf_009799965': 76, 'bacillus_albus_gcf_003966295': 76, 'bacillus_thuringiensis_gcf_000161735': 76, 'bacillus_cereus_gcf_002530565': 76, 'bacillus_nitratireducens_gcf_007681065': 76, 'bacillus_cereus_gcf_000161315': 76, 'bacillus_nitratireducens_gcf_007676595': 76, 'bacillus_luti_gcf_001884105': 76, 'bacillus_cereus_gcf_001584025':

## Final group specific bait design

In [254]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/bacillus/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/bacillus/extract_probes_from_group/Bacillus_anthracis_GCF_000008165+132-back-to-133.conf --probe-prefix uce_bacillus_ --designer rnplattii --design bacillus_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGNNNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [255]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.fasta --query results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  15:45:24
Ended:  Fri Feb 07, 2020  15:49:31
Time for execution:  4.11266578436 minutes


In [256]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.fasta --lastz results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_bacillus_;
Parsing lastz file...
Screening results...
Screened 19199 fasta sequences.  Filtered 0 duplicates. Kept 19200.


## CDhit to reduce numbers

In [257]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.fasta
         -o
         results/phyluce/bacillus/final_probe_design/bacillus_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 17:36:29 2020
                            Output                              
----------------------------------------------------------------
total seq: 19200
longest and shortest : 80 and 80
Total letters: 1536000
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 4M
Buffer          : 4 X 12M = 49M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 91M

Table limit with the given memory limit:
Max number of representatives: 3932319
Max number of word counting entries: 88516514

# comparing sequences from          0  to       3200
...---------- new table with      262 representatives
# comparing sequences from       3200 

# Borrelia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [32]:
# Name of the group processed by the cells below; it is used to build every
# data/ and results/ path for this section.
group = 'borrelia'

In [33]:
# Start this group from an empty results directory: remove whatever a previous
# run left behind, then recreate the directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [34]:
# Non-reference taxa for this group, named Genus_species_<assembly accession>.
# These names are used to build genome, read and alignment file names in the
# cells below.
# NOTE(review): "Borrelia_hispnica" looks like a misspelling of "hispanica".  The
# genome files are named from the accession csv, so confirm the spelling there
# before changing it here.
group_taxa = [ "Borrelia_anserina_GCF_001936255",
               "Borrelia_coriaceae_GCF_000568755",
               "Borrelia_crocidurae_GCF_000259345",
               "Borrelia_duttonii_GCF_000019685",
               "Borrelia_hermsii_GCF_001660005",
               "Borrelia_hispnica_GCF_000500065",
               "Borrelia_miyamotoi_GCF_000445425",
               "Borrelia_parkeri_GCF_000512145",
               "Borrelia_persica_GCF_000500045",
               "Borrelia_recurrentis_GCF_000019705",
               "Borrelia_turcica_GCF_003606285",
               "Borrelia_turicatae_GCF_000012085",
               "Borreliella_afzelii_GCA_000502195",
               "Borreliella_garinii_GCF_000501735" ]

# genome that the simulated reads of all other taxa are mapped against
reference_taxon = "Borreliella_burgdorferi_GCF_000502155"

# every taxon in the group, reference included
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [35]:
#download the genome fasta for every accession listed for this group, rename it
#to the standard Genus_species_<accession> form, and record what was downloaded
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
# NOTE(review): the csv is read from data/<group>_accessions.csv, while the text
# above mentions data/genomes/*_accessions.csv -- confirm which location is right.
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    # NOTE(review): this requests FtpPath_GenBank even when the csv accession is
    # a GCF_ one; the downloaded file is then renamed using the csv accession.
    # Confirm that is intended.
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    # only the first captured line is used; ftp[0] raises IndexError if the
    # query printed nothing
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to the genomic fasta and download it with rsync
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_GCA_12345678  (the accession's ".N" version suffix is dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    #name of the file as it was downloaded
    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001936255.1_ASM193625v1_genomic.fna.gz

sent 42 bytes  received 297132 bytes  198116.00 bytes/sec
total size is 296946  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplina

GCA_003606285.1_ASM360628v1_genomic.fna.gz

sent 42 bytes  received 365396 bytes  243625.33 bytes/sec
total size is 365194  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000012085.2_ASM1208v2_genomic.fna.gz

sent 42 bytes  received 329927 bytes  659938.00 bytes/sec
total size is 329735  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [36]:
# Rewrite every genome with minimal fasta headers and convert the result to
# 2bit, as required by the phyluce pipeline.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# always start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # plain "r": the "U" mode flag is deprecated and was removed in Python 3.11
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                # blank out name/description so the header written below is
                # reduced to the record id
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    # Convert the *cleaned* fasta to 2bit.  This used to convert the raw input
    # file, which bypassed the header clean-up done above.
    # An argument list (rather than str.split) also keeps paths intact if they
    # ever contain spaces.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` in the cell below) because, with 100bp reads, it should provide roughly 10bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [37]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481461 ("sim_Borrelia_anserina_GCF_001936255") has been submitted
Your job 5481462 ("sim_Borrelia_coriaceae_GCF_000568755") has been submitted
Your job 5481463 ("sim_Borrelia_crocidurae_GCF_000259345") has been submitted
Your job 5481464 ("sim_Borrelia_duttonii_GCF_000019685") has been submitted
Your job 5481465 ("sim_Borrelia_hermsii_GCF_001660005") has been submitted
Your job 5481466 ("sim_Borrelia_hispnica_GCF_000500065") has been submitted
Your job 5481467 ("sim_Borrelia_miyamotoi_GCF_000445425") has been submitted
Your job 5481468 ("sim_Borrelia_parkeri_GCF_000512145") has been submitted
Your job 5481469 ("sim_Borrelia_persica_GCF_000500045") has been submitted
Your job 5481470 ("sim_Borrelia_recurrentis_GCF_000019705") has been submitted
Your job 5481471 ("sim_Borrelia_turcica_GCF_003606285") has been submitted
Your job 5481472 ("sim_Borrelia_turicatae_GCF_000012085") has been submitted
Your job 5481473 ("sim_Borreliella_afzelii_GCA_000502195") has been submitted
Your j

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [38]:
# Reset the mapping directory and build the bbmap index of the reference genome
# inside it.
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference fasta to index, and where bbmap writes the index
i_ref_genome = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.fas"
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# kept as the cell's last expression so the exit status is displayed
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/borrelia/cleaned_genomes/Borreliella_burgdorferi_GCF_000502155_formatted.fas path=results/phyluce/borrelia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [39]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481475 ("map_Borrelia_anserina_GCF_001936255") has been submitted
Your job 5481476 ("map_Borrelia_coriaceae_GCF_000568755") has been submitted
Your job 5481477 ("map_Borrelia_crocidurae_GCF_000259345") has been submitted
Your job 5481478 ("map_Borrelia_duttonii_GCF_000019685") has been submitted
Your job 5481479 ("map_Borrelia_hermsii_GCF_001660005") has been submitted
Your job 5481480 ("map_Borrelia_hispnica_GCF_000500065") has been submitted
Your job 5481481 ("map_Borrelia_miyamotoi_GCF_000445425") has been submitted
Your job 5481482 ("map_Borrelia_parkeri_GCF_000512145") has been submitted
Your job 5481483 ("map_Borrelia_persica_GCF_000500045") has been submitted
Your job 5481484 ("map_Borrelia_recurrentis_GCF_000019705") has been submitted
Your job 5481485 ("map_Borrelia_turcica_GCF_003606285") has been submitted
Your job 5481486 ("map_Borrelia_turicatae_GCF_000012085") has been submitted
Your job 5481487 ("map_Borreliella_afzelii_GCA_000502195") has been submitted
Your j

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [40]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481489 ("merge_Borrelia_anserina_GCF_001936255") has been submitted
Your job 5481490 ("merge_Borrelia_coriaceae_GCF_000568755") has been submitted
Your job 5481491 ("merge_Borrelia_crocidurae_GCF_000259345") has been submitted
Your job 5481492 ("merge_Borrelia_duttonii_GCF_000019685") has been submitted
Your job 5481493 ("merge_Borrelia_hermsii_GCF_001660005") has been submitted
Your job 5481494 ("merge_Borrelia_hispnica_GCF_000500065") has been submitted
Your job 5481495 ("merge_Borrelia_miyamotoi_GCF_000445425") has been submitted
Your job 5481496 ("merge_Borrelia_parkeri_GCF_000512145") has been submitted
Your job 5481497 ("merge_Borrelia_persica_GCF_000500045") has been submitted
Your job 5481498 ("merge_Borrelia_recurrentis_GCF_000019705") has been submitted
Your job 5481499 ("merge_Borrelia_turcica_GCF_003606285") has been submitted
Your job 5481500 ("merge_Borrelia_turicatae_GCF_000012085") has been submitted
Your job 5481501 ("merge_Borreliella_afzelii_GCA_000502195")

Remove loci that were masked in the original genome.

In [41]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 176 sequences from Borrelia_anserina_GCF_001936255_merged.bed.  Filtered 142 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 34.
Screened 187 sequences from Borrelia_coriaceae_GCF_000568755_merged.bed.  Filtered 159 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 28.
Screened 216 sequences from Borrelia_crocidurae_GCF_000259345_merged.bed.  Filtered 166 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 50.
Screened 220 sequences from Borrelia_duttonii_GCF_000019685_merged.bed.  Filtered 176 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 44.
Screened 198 sequences from Borrelia_hermsii_GCF_001660005_merged.bed.  Filtered 160 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 38.
Screened 227 sequences from Borrelia_hispnica_GCF_000500065_merged.bed.  Filtered 182 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 45.
Screened 187 sequences from Borrelia_miyamotoi_GCF_000445425_merged.bed.  Filter

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [42]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# conf layout: a "[beds]" header, then one "<taxon>:<stripped bed path>" line per taxon
with open(o_bed_conf, "w") as outfile:

    outfile.write("[beds]\n")

    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [43]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/borrelia/initial_intervals/Borreliella_garinii_GCF_000501735_merged.bed --twobit results/phyluce/borrelia/cleaned_genomes/Borreliella_burgdorferi_GCF_000502155_formatted.2bit --output results/phyluce/borrelia/initial_intervals/Borreliella_garinii_GCF_000501735_stripped.bed;
borrelia_anserina_gcf_001936255.
borrelia_coriaceae_gcf_000568755.
borrelia_crocidurae_gcf_000259345.
borrelia_duttonii_gcf_000019685.
borrelia_hermsii_gcf_001660005.
borrelia_hispnica_gcf_000500065.
borrelia_miyamotoi_gcf_000445425.
borrelia_parkeri_gcf_000512145.
borrelia_persica_gcf_000500045.
borrelia_recurrentis_gcf_000019705.
borrelia_turcica_gcf_003606285.
borrelia_turicatae_gcf_000012085.
borreliella_afzelii_gca_000502195.
borreliella_garinii_gcf_000501735.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [44]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/borrelia/initial_intervals/Borreliella_garinii_GCF_000501735_merged.bed --twobit results/phyluce/borrelia/cleaned_genomes/Borreliella_burgdorferi_GCF_000502155_formatted.2bit --output results/phyluce/borrelia/initial_intervals/Borreliella_garinii_GCF_000501735_stripped.bed;
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 0 taxa:	1,077.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 1 taxa:	1,077.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 2 taxa:	928.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 3 taxa:	128.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 4 taxa:	86.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 5 taxa:	68.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 6 taxa:	50.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 7 taxa:	39.0
Loci shared by Borreliella_burgdorferi_GCF_000502155 + 8 taxa:	30.0
Loci s

In [45]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 4
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/borrelia/initial_intervals/borrelia-to-Borreliella_burgdorferi_GCF_000502155.sqlite --base-taxon Borreliella_burgdorferi_GCF_000502155 --output results/phyluce/borrelia/initial_intervals/Borreliella_burgdorferi_GCF_000502155_+4.bed --specific-counts 4;
Counter({'borreliella_afzelii_gca_000502195': 85, 'borreliella_garinii_gcf_000501735': 83, 'borrelia_recurrentis_gcf_000019705': 47, 'borrelia_crocidurae_gcf_000259345': 46, 'borrelia_hispnica_gcf_000500065': 41, 'borrelia_turicatae_gcf_000012085': 40, 'borrelia_duttonii_gcf_000019685': 39, 'borrelia_miyamotoi_gcf_000445425': 37, 'borrelia_parkeri_gcf_000512145': 35, 'borrelia_hermsii_gcf_001660005': 32, 'borrelia_persica_gcf_000500045': 29, 'borrelia_turcica_gcf_003606285': 29, 'borrelia_anserina_gcf_001936255': 28, 'borrelia_coriaceae_gcf_000568755': 27})


## Design temp set of baits

In [46]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/borrelia/initial_intervals/Borreliella_burgdorferi_GCF_000502155_+4.bed --twobit results/phyluce/borrelia/cleaned_genomes/Borreliella_burgdorferi_GCF_000502155_formatted.2bit --buffer-to 160 --output results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4.fasta;
Screened 86 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 86.


design the baits

In [47]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4.fasta --probe-prefix uce_borrelia_ --design borrelia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 68
Probe Count = 125


## Find duplicate baited regions

In [48]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas --query results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  16:33:56
Ended:  Thu Feb 06, 2020  16:33:57
Time for execution:  0.0021835009257 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas                        --lastz results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes_vself.lastz                       --probe-prefix=uce_borrelia_;
Parsing lastz file...
Screening results...
Screened 124 fasta sequences.  Filtered 0 duplicates. Kept 125.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [49]:
# Lay the genomes out as cleaned_genomes/<sample>/<sample>.2bit.
# NOTE: the 2bit files are copied (shutil.copy), not soft-linked.
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-sample directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [50]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/borrelia/validate_intervals/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas --scaffoldlist Borrelia_anserina_GCF_001936255 Borrelia_coriaceae_GCF_000568755 Borrelia_crocidurae_GCF_000259345 Borrelia_duttonii_GCF_000019685 Borrelia_hermsii_GCF_001660005 Borrelia_hispnica_GCF_000500065 Borrelia_miyamotoi_GCF_000445425 Borrelia_parkeri_GCF_000512145 Borrelia_persica_GCF_000500045 Borrelia_recurrentis_GCF_000019705 Borrelia_turcica_GCF_003606285 Borrelia_turicatae_GCF_000012085 Borreliella_afzelii_GCA_000502195 Borreliella_garinii_GCF_000501735 Borreliella_burgdorferi_GCF_000502155 --genome-base-path results/phyluce/borrelia/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/borrelia/validate_intervals/borrelia-to-Borreliella_burgdorferi_GCF_000502155.sqlite --output results/phyluce/borrelia/validate_intervals/lastz/;

Running against Borrelia_anserina_GCF_001936255.2bit
Running with the --huge option. 

	/tmp/tmpBpRSNI.fasta

Writing the results file...
	/tmp/tmpoVuuS2.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/borrelia/validate_intervals/lastz/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas_v_Borreliella_garinii_GCF_000501735.lastz
Creating Borreliella_garinii_GCF_000501735 table
Inserting data to Borreliella_garinii_GCF_000501735 table

Running against Borreliella_burgdorferi_GCF_000502155.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpIGkiX8.fasta

Writing the results file...
	/tmp/tmp1LmNOt.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/borrelia/validate_intervals/lastz/Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas_v_Borreliella_burgdorferi_GCF_000502155.lastz
Creating Borreliella_burgdorferi_GCF_000502155 table
Inserting data to Borreliella_burgdorferi_GCF_000502155 table


## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [51]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/borrelia/extract_probes_from_group/borrelia_genome.conf --lastz results/phyluce/borrelia/validate_intervals/lastz --probes 120 --probe-prefix uce_borrelia_ --name-pattern "Borreliella_burgdorferi_GCF_000502155_+4_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/borrelia/extract_probes_from_group/probe_fasta;
2020-02-06 16:34:09,003 - Phyluce - INFO - ------- Working on Borrelia_anserina_GCF_001936255 genome -------
2020-02-06 16:34:09,018 - Phyluce - INFO - Reading Borrelia_anserina_GCF_001936255 genome
2020-02-06 16:34:09,287 - Phyluce - INFO - Borrelia_anserina_GCF_001936255: 67 uces, 0 dupes, 67 non-dupes, 0 orient drop, 1 length drop, 66 written
2020-02-06 16:34:09,287 - Phyluce - INFO - ------- Working on Borrelia_coriaceae_GCF_000568755 genome ------
2020-02-06 16:34:09,288 - Phyluce - INFO - Reading Borrelia_coriaceae_GCF_000568755 genome
2020-02-06 16:34:09,497 - Phyluce - INFO - Borrelia_coriaceae_GCF_0

In [52]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/borrelia/extract_probes_from_group/probe_fasta --output results/phyluce/borrelia/extract_probes_from_group/multifastas.sqlite --base-taxon Borreliella_burgdorferi_GCF_000502155;
borrelia_anserina_gcf_001936255.
borrelia_coriaceae_gcf_000568755.
borrelia_crocidurae_gcf_000259345.
borrelia_duttonii_gcf_000019685.
borrelia_hermsii_gcf_001660005.
borrelia_hispnica_gcf_000500065.
borrelia_miyamotoi_gcf_000445425.
borrelia_parkeri_gcf_000512145.
borrelia_persica_gcf_000500045.
borrelia_recurrentis_gcf_000019705.
borrelia_turcica_gcf_003606285.
borrelia_turicatae_gcf_000012085.
borreliella_afzelii_gca_000502195.
borreliella_garinii_gcf_000501735.
borreliella_burgdorferi_gcf_000502155.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/borrelia/extract_probes_from_group/multifastas.sqlite --base-taxon Borreliella_burgdorferi_GCF_000502155;
Loci shared by 0 taxa:	67.0
Loci shared by 1 taxa:	

In [53]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(1)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/borrelia/extract_probes_from_group/multifastas.sqlite  --base-taxon Borreliella_burgdorferi_GCF_000502155 --output results/phyluce/borrelia/extract_probes_from_group/Borreliella_burgdorferi_GCF_000502155+4-back-to-1.conf --specific-counts 1;
Counter({'borrelia_hispnica_gcf_000500065': 66, 'borrelia_turicatae_gcf_000012085': 66, 'borrelia_recurrentis_gcf_000019705': 66, 'borrelia_anserina_gcf_001936255': 66, 'borrelia_persica_gcf_000500045': 66, 'borrelia_parkeri_gcf_000512145': 66, 'borrelia_crocidurae_gcf_000259345': 66, 'borrelia_turcica_gcf_003606285': 66, 'borrelia_duttonii_gcf_000019685': 66, 'borrelia_coriaceae_gcf_000568755': 65, 'borrelia_miyamotoi_gcf_000445425': 65, 'borreliella_garinii_gcf_000501735': 65, 'borreliella_burgdorferi_gcf_000502155': 65, 'borreliella_afzelii_gca_000502195': 64, 'borrelia_hermsii_gcf_001660005': 64})
Total loci = 67


## Final group specific bait design

In [54]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/borrelia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/borrelia/extract_probes_from_group/Borreliella_burgdorferi_GCF_000502155+4-back-to-1.conf --probe-prefix uce_borrelia_ --designer rnplattii --design borrelia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 67
Probe Count = 1783


In [55]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.fasta --query results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  16:35:06
Ended:  Thu Feb 06, 2020  16:35:08
Time for execution:  0.0482173323631 minutes


In [56]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.fasta --lastz results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_borrelia_;
Parsing lastz file...
Screening results...
Screened 1782 fasta sequences.  Filtered 0 duplicates. Kept 1783.


## CDhit to reduce numbers

In [57]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.fasta
         -o
         results/phyluce/borrelia/final_probe_design/borrelia_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 16:36:16 2020
                            Output                              
----------------------------------------------------------------
total seq: 1783
longest and shortest : 80 and 80
Total letters: 142640
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3961964
Max number of word counting entries: 89183821

# comparing sequences from          0  to        297
---------- new table with      193 representatives
# comparing sequences from        297  to  

# Bartonella

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [58]:
# Group label; it is used to build every data/ and results/ path below.
group = "bartonella"

In [59]:
# Start from an empty results directory for this group: remove any previous
# run's directory tree, then recreate it.
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

os.makedirs("results/phyluce/{}".format(group))

In [60]:
# Taxa to compare against the reference, one Genus_species_accession label per
# line; splitting on whitespace yields the list in the order written here.
group_taxa = """
    Bartonella_alsatica_GCF_000280015
    Bartonella_ancashensis_GCF_001281405
    Bartonella_apis_GCF_002007565
    Bartonella_australis_GCF_000341355
    Bartonella_birtlesii_GCF_000273375
    Bartonella_bovis_GCF_000384965
    Bartonella_clarridgeiae_GCF_000253015
    Bartonella_doshiae_GCF_000278155
    Bartonella_elizabethae_GCF_900638615
    Bartonella_florencae_GCF_000312525
    Bartonella_grahamii_GCF_000022725
    Bartonella_henselae_GCF_000046705
    Bartonella_koehlerae_GCF_000706625
    Bartonella_massiliensis_GCF_902150025
    Bartonella_mastomydis_GCF_900185775
    Bartonella_melophagi_GCF_000278255
    Bartonella_queenslandensis_GCF_000312585
    Bartonella_quintana_GCF_000046685
    Bartonella_rattaustraliani_GCF_000312565
    Bartonella_rattimassiliensis_GCF_000278215
    Bartonella_rochalimae_GCF_000706645
    Bartonella_schoenbuchensis_GCF_002022685
    Bartonella_senegalensis_GCF_000312545
    Bartonella_sp_GCF_003606325
    Bartonella_tamiae_GCF_000279995
    Bartonella_taylorii_GCF_000278295
    Bartonella_tribocorum_GCF_000196435
    Bartonella_vinsonii_GCF_000341385
    Bartonella_washoeensis_GCF_000278135
""".split()

# Genome that every taxon in group_taxa is compared to.
reference_taxon = "Bartonella_bacilliformis_GCF_000015445"

# Every genome handled by the pipeline: the group taxa followed by the reference.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored at `data/genomes/*group*/ftp_info.csv`.

In [61]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000280015.1_Bart_alsa_IBS_382_V1_genomic.fna.gz

sent 42 bytes  received 486645 bytes  194674.80 bytes/sec
total size is 486410  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
d

GCA_000312525.1_ASM31252v1_genomic.fna.gz

sent 42 bytes  received 590098 bytes  393426.67 bytes/sec
total size is 589841  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000022725.1_ASM2272v1_genomic.fna.gz

sent 42 bytes  received 690170 bytes  460141.33 bytes/sec
total size is 689890  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netwo

GCA_000706645.1_Bart_roch_BMGH_V1_genomic.fna.gz

sent 42 bytes  received 444131 bytes  296115.33 bytes/sec
total size is 443907  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002022685.1_ASM202268v1_genomic.fna.gz

sent 42 bytes  received 501791 bytes  334555.33 bytes/sec
total size is 501557  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [62]:
# Rebuild the cleaned-genome directory from scratch so files from an earlier
# run cannot linger.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Re-write every record with its name and description blanked, so only
    # the record id is left to appear in the FASTA header.
    # Plain "r" is used because the "U" flag of the old "rU" mode is rejected
    # by open() from Python 3.11 onwards.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the *cleaned* FASTA so that <sample>_formatted.2bit
    # really is the 2bit form of <sample>_formatted.fas (previously the raw
    # download was converted instead).
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)

    # Report a failed conversion instead of silently discarding the exit status.
    if subprocess.call(f2bit_cmd.split(" ")) != 0:
        print("WARNING: faToTwoBit returned a non-zero exit status for " + sample)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` below) because, with 100 bp reads, it should provide roughly 10bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [63]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481503 ("sim_Bartonella_alsatica_GCF_000280015") has been submitted
Your job 5481504 ("sim_Bartonella_ancashensis_GCF_001281405") has been submitted
Your job 5481505 ("sim_Bartonella_apis_GCF_002007565") has been submitted
Your job 5481506 ("sim_Bartonella_australis_GCF_000341355") has been submitted
Your job 5481507 ("sim_Bartonella_birtlesii_GCF_000273375") has been submitted
Your job 5481508 ("sim_Bartonella_bovis_GCF_000384965") has been submitted
Your job 5481509 ("sim_Bartonella_clarridgeiae_GCF_000253015") has been submitted
Your job 5481510 ("sim_Bartonella_doshiae_GCF_000278155") has been submitted
Your job 5481511 ("sim_Bartonella_elizabethae_GCF_900638615") has been submitted
Your job 5481512 ("sim_Bartonella_florencae_GCF_000312525") has been submitted
Your job 5481513 ("sim_Bartonella_grahamii_GCF_000022725") has been submitted
Your job 5481514 ("sim_Bartonella_henselae_GCF_000046705") has been submitted
Your job 5481515 ("sim_Bartonella_koehlerae_GCF_000706625")

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [64]:
# Start with an empty map_simulated_reads directory (including its logs/ folder).
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# Index the cleaned reference genome with bbmap; the index is written under o_dir.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)

# Runs locally (not through the queue); the exit status is the cell's output.
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/bartonella/cleaned_genomes/Bartonella_bacilliformis_GCF_000015445_formatted.fas path=results/phyluce/bartonella/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [65]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481532 ("map_Bartonella_alsatica_GCF_000280015") has been submitted
Your job 5481533 ("map_Bartonella_ancashensis_GCF_001281405") has been submitted
Your job 5481534 ("map_Bartonella_apis_GCF_002007565") has been submitted
Your job 5481535 ("map_Bartonella_australis_GCF_000341355") has been submitted
Your job 5481536 ("map_Bartonella_birtlesii_GCF_000273375") has been submitted
Your job 5481537 ("map_Bartonella_bovis_GCF_000384965") has been submitted
Your job 5481538 ("map_Bartonella_clarridgeiae_GCF_000253015") has been submitted
Your job 5481539 ("map_Bartonella_doshiae_GCF_000278155") has been submitted
Your job 5481540 ("map_Bartonella_elizabethae_GCF_900638615") has been submitted
Your job 5481541 ("map_Bartonella_florencae_GCF_000312525") has been submitted
Your job 5481542 ("map_Bartonella_grahamii_GCF_000022725") has been submitted
Your job 5481543 ("map_Bartonella_henselae_GCF_000046705") has been submitted
Your job 5481544 ("map_Bartonella_koehlerae_GCF_000706625")

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [66]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481561 ("merge_Bartonella_alsatica_GCF_000280015") has been submitted
Your job 5481562 ("merge_Bartonella_ancashensis_GCF_001281405") has been submitted
Your job 5481563 ("merge_Bartonella_apis_GCF_002007565") has been submitted
Your job 5481564 ("merge_Bartonella_australis_GCF_000341355") has been submitted
Your job 5481565 ("merge_Bartonella_birtlesii_GCF_000273375") has been submitted
Your job 5481566 ("merge_Bartonella_bovis_GCF_000384965") has been submitted
Your job 5481567 ("merge_Bartonella_clarridgeiae_GCF_000253015") has been submitted
Your job 5481568 ("merge_Bartonella_doshiae_GCF_000278155") has been submitted
Your job 5481569 ("merge_Bartonella_elizabethae_GCF_900638615") has been submitted
Your job 5481570 ("merge_Bartonella_florencae_GCF_000312525") has been submitted
Your job 5481571 ("merge_Bartonella_grahamii_GCF_000022725") has been submitted
Your job 5481572 ("merge_Bartonella_henselae_GCF_000046705") has been submitted
Your job 5481573 ("merge_Bartonella

remove loci that were masked in the original genome

In [67]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 720 sequences from Bartonella_alsatica_GCF_000280015_merged.bed.  Filtered 550 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 170.
Screened 562 sequences from Bartonella_ancashensis_GCF_001281405_merged.bed.  Filtered 430 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 132.
Screened 38 sequences from Bartonella_apis_GCF_002007565_merged.bed.  Filtered 32 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 400 sequences from Bartonella_australis_GCF_000341355_merged.bed.  Filtered 333 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 67.
Screened 678 sequences from Bartonella_birtlesii_GCF_000273375_merged.bed.  Filtered 521 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 157.
Screened 1124 sequences from Bartonella_bovis_GCF_000384965_merged.bed.  Filtered 784 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 340.
Screened 897 sequences from Bartonella_clarridgeiae_GCF_000253015_merg

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [68]:
# Write the conf file listing every taxon's stripped bed file under a [beds] header.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    # section header
    outfile.write("[beds]\n")

    # one "<taxon>:<path to stripped bed>" line per taxon in the group
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [69]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/bartonella/initial_intervals/Bartonella_washoeensis_GCF_000278135_merged.bed --twobit results/phyluce/bartonella/cleaned_genomes/Bartonella_bacilliformis_GCF_000015445_formatted.2bit --output results/phyluce/bartonella/initial_intervals/Bartonella_washoeensis_GCF_000278135_stripped.bed;
bartonella_alsatica_gcf_000280015.
bartonella_ancashensis_gcf_001281405.
bartonella_apis_gcf_002007565.
bartonella_australis_gcf_000341355.
bartonella_birtlesii_gcf_000273375.
bartonella_bovis_gcf_000384965.
bartonella_clarridgeiae_gcf_000253015.
bartonella_doshiae_gcf_000278155.
bartonella_elizabethae_gcf_900638615.
bartonella_florencae_gcf_000312525.
bartonella_grahamii_gcf_000022725.
bartonella_henselae_gcf_000046705.
bartonella_koehlerae_gcf_000706625.
bartonella_massiliensis_gcf_902150025.
bartonella_mastomydis_gcf_900185775.
bartonella_melophagi_gcf_000278255.
bartonella_queenslandensis_gcf_000312585

Quantify probes and the number of targeted taxa for each.

In [70]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/bartonella/initial_intervals/Bartonella_washoeensis_GCF_000278135_merged.bed --twobit results/phyluce/bartonella/cleaned_genomes/Bartonella_bacilliformis_GCF_000015445_formatted.2bit --output results/phyluce/bartonella/initial_intervals/Bartonella_washoeensis_GCF_000278135_stripped.bed;
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 0 taxa:	911.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 1 taxa:	911.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 2 taxa:	643.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 3 taxa:	498.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 4 taxa:	400.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 5 taxa:	335.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 6 taxa:	274.0
Loci shared by Bartonella_bacilliformis_GCF_000015445 + 7 taxa:	231.0
Loci shared by Bartonella_bacilliformis_GCF_000015445

In [71]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 11
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/bartonella/initial_intervals/bartonella-to-Bartonella_bacilliformis_GCF_000015445.sqlite --base-taxon Bartonella_bacilliformis_GCF_000015445 --output results/phyluce/bartonella/initial_intervals/Bartonella_bacilliformis_GCF_000015445_+11.bed --specific-counts 11;
Counter({'bartonella_melophagi_gcf_000278255': 113, 'bartonella_schoenbuchensis_gcf_002022685': 107, 'bartonella_bovis_gcf_000384965': 106, 'bartonella_vinsonii_gcf_000341385': 101, 'bartonella_koehlerae_gcf_000706625': 98, 'bartonella_clarridgeiae_gcf_000253015': 98, 'bartonella_quintana_gcf_000046685': 97, 'bartonella_taylorii_gcf_000278295': 96, 'bartonella_washoeensis_gcf_000278135': 96, 'bartonella_henselae_gcf_000046705': 95, 'bartonella_birtlesii_gcf_000273375': 88, 'bartonella_senegalensis_gcf_000312545': 86, 'bartonella_rochalimae_gcf_000706645': 85, 'bartonella_grahamii_gcf_000022725': 85, 'bartonella_alsatica_gcf_000280015': 84, 'bartonella_doshiae_gcf_00027

## Design temp set of baits

In [72]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/bartonella/initial_intervals/Bartonella_bacilliformis_GCF_000015445_+11.bed --twobit results/phyluce/bartonella/cleaned_genomes/Bartonella_bacilliformis_GCF_000015445_formatted.2bit --buffer-to 160 --output results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11.fasta;
Screened 136 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 136.


design the baits

In [73]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11.fasta --probe-prefix uce_bartonella_ --design bartonella_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
G


Conserved locus count = 136
Probe Count = 271


## Find duplicate baited regions

In [74]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas --query results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  16:42:19
Ended:  Thu Feb 06, 2020  16:42:19
Time for execution:  0.00309858322144 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas                        --lastz results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes_vself.lastz                       --probe-prefix=uce_bartonella_;
Parsing lastz file...
Screening results...
Screened 270 fasta sequences.  Filtered 6 d

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [75]:
# Lay the genomes out as cleaned_genomes/<taxon>/<taxon>.2bit.
# Note: the 2bit files are copied into place (not soft-linked).
for sample in all_taxa:

    # existing formatted 2bit for this taxon
    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    # per-taxon directory and the file name the copy gets inside it
    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always rebuild the per-taxon directory from scratch
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [76]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/bartonella/validate_intervals/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas --scaffoldlist Bartonella_alsatica_GCF_000280015 Bartonella_ancashensis_GCF_001281405 Bartonella_apis_GCF_002007565 Bartonella_australis_GCF_000341355 Bartonella_birtlesii_GCF_000273375 Bartonella_bovis_GCF_000384965 Bartonella_clarridgeiae_GCF_000253015 Bartonella_doshiae_GCF_000278155 Bartonella_elizabethae_GCF_900638615 Bartonella_florencae_GCF_000312525 Bartonella_grahamii_GCF_000022725 Bartonella_henselae_GCF_000046705 Bartonella_koehlerae_GCF_000706625 Bartonella_massiliensis_GCF_902150025 Bartonella_mastomydis_GCF_900185775 Bartonella_melophagi_GCF_000278255 Bartonella_queenslandensis_GCF_000312585 Bartonella_quintana_GCF_000046685 Bartonella_rattaustraliani_GCF_000312565 Bartonella_rattimassiliensis_GCF_000278215 Bartonella_rochalimae_GCF_000706645 Bartonella_schoenbuchensis_GCF_002022685 Bartonella_senegalensis_GCF_000312

Inserting data to Bartonella_henselae_GCF_000046705 table

Running against Bartonella_koehlerae_GCF_000706625.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpxvhd4u.fasta

Writing the results file...
	/tmp/tmptue79K.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bartonella/validate_intervals/lastz/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas_v_Bartonella_koehlerae_GCF_000706625.lastz
Creating Bartonella_koehlerae_GCF_000706625 table
Inserting data to Bartonella_koehlerae_GCF_000706625 table

Running against Bartonella_massiliensis_GCF_902150025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpQAdrFi.fasta

Writing the results file...
	/tmp/tmpcgXntp.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bartonella/validate_intervals/lastz/


Running against Bartonella_tribocorum_GCF_000196435.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpgGSIBS.fasta

Writing the results file...
	/tmp/tmprTXj4n.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bartonella/validate_intervals/lastz/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas_v_Bartonella_tribocorum_GCF_000196435.lastz
Creating Bartonella_tribocorum_GCF_000196435 table
Inserting data to Bartonella_tribocorum_GCF_000196435 table

Running against Bartonella_vinsonii_GCF_000341385.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp4Tk4M2.fasta

Writing the results file...
	/tmp/tmpmkv9rH.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/bartonella/validate_intervals/lastz/Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [77]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/bartonella/extract_probes_from_group/bartonella_genome.conf --lastz results/phyluce/bartonella/validate_intervals/lastz --probes 120 --probe-prefix uce_bartonella_ --name-pattern "Bartonella_bacilliformis_GCF_000015445_+11_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/bartonella/extract_probes_from_group/probe_fasta;
2020-02-06 16:42:52,840 - Phyluce - INFO - ------ Working on Bartonella_alsatica_GCF_000280015 genome ------
2020-02-06 16:42:52,841 - Phyluce - INFO - Reading Bartonella_alsatica_GCF_000280015 genome
2020-02-06 16:42:53,399 - Phyluce - INFO - Bartonella_alsatica_GCF_000280015: 136 uces, 0 dupes, 136 non-dupes, 11 orient drop, 4 length drop, 121 written
2020-02-06 16:42:53,399 - Phyluce - INFO - ----- Working on Bartonella_ancashensis_GCF_001281405 genome ----
2020-02-06 16:42:53,411 - Phyluce - INFO - Reading Bartonella_ancashensis_GCF_001281405 genome
2020-02-06 16:42:53,901 - Phyluce - INFO - 

2020-02-06 16:43:03,631 - Phyluce - INFO - Bartonella_schoenbuchensis_GCF_002022685: 136 uces, 0 dupes, 136 non-dupes, 12 orient drop, 4 length drop, 120 written
2020-02-06 16:43:03,631 - Phyluce - INFO - ---- Working on Bartonella_senegalensis_GCF_000312545 genome ----
2020-02-06 16:43:03,632 - Phyluce - INFO - Reading Bartonella_senegalensis_GCF_000312545 genome
2020-02-06 16:43:04,092 - Phyluce - INFO - Bartonella_senegalensis_GCF_000312545: 136 uces, 5 dupes, 131 non-dupes, 2 orient drop, 1 length drop, 128 written
2020-02-06 16:43:04,093 - Phyluce - INFO - --------- Working on Bartonella_sp_GCF_003606325 genome ---------
2020-02-06 16:43:04,099 - Phyluce - INFO - Reading Bartonella_sp_GCF_003606325 genome
2020-02-06 16:43:04,590 - Phyluce - INFO - Bartonella_sp_GCF_003606325: 136 uces, 0 dupes, 136 non-dupes, 7 orient drop, 10 length drop, 119 written
2020-02-06 16:43:04,590 - Phyluce - INFO - ------- Working on Bartonella_tamiae_GCF_000279995 genome -------
2020-02-06 16:43:04,59

In [78]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/bartonella/extract_probes_from_group/probe_fasta --output results/phyluce/bartonella/extract_probes_from_group/multifastas.sqlite --base-taxon Bartonella_bacilliformis_GCF_000015445;
bartonella_alsatica_gcf_000280015.
bartonella_ancashensis_gcf_001281405.
bartonella_apis_gcf_002007565.
bartonella_australis_gcf_000341355.
bartonella_birtlesii_gcf_000273375.
bartonella_bovis_gcf_000384965.
bartonella_clarridgeiae_gcf_000253015.
bartonella_doshiae_gcf_000278155.
bartonella_elizabethae_gcf_900638615.
bartonella_florencae_gcf_000312525.
bartonella_grahamii_gcf_000022725.
bartonella_henselae_gcf_000046705.
bartonella_koehlerae_gcf_000706625.
bartonella_massiliensis_gcf_902150025.
bartonella_mastomydis_gcf_900185775.
bartonella_melophagi_gcf_000278255.
bartonella_queenslandensis_gcf_000312585.
bartonella_quintana_gcf_000046685.
bartonella_rattaustraliani_gcf_000312565.
bartonella_rattimassiliensis_gcf_000278215.
bartonella_rochalima

In [79]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(29)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/bartonella/extract_probes_from_group/multifastas.sqlite  --base-taxon Bartonella_bacilliformis_GCF_000015445 --output results/phyluce/bartonella/extract_probes_from_group/Bartonella_bacilliformis_GCF_000015445+11-back-to-29.conf --specific-counts 29;
Counter({'bartonella_bovis_gcf_000384965': 112, 'bartonella_rochalimae_gcf_000706645': 112, 'bartonella_tamiae_gcf_000279995': 112, 'bartonella_tribocorum_gcf_000196435': 112, 'bartonella_koehlerae_gcf_000706625': 112, 'bartonella_melophagi_gcf_000278255': 112, 'bartonella_doshiae_gcf_000278155': 112, 'bartonella_birtlesii_gcf_000273375': 112, 'bartonella_henselae_gcf_000046705': 112, 'bartonella_grahamii_gcf_000022725': 112, 'bartonella_schoenbuchensis_gcf_002022685': 112, 'bartonella_massiliensis_gcf_902150025': 112, 'bartonella_australis_gcf_000341355': 112, 'bartonella_elizabethae_gcf_900638615': 112, 'bartonella_ancashensis_gcf_001281405': 112, 'bartonella_rattaustraliani_gcf_

## Final group specific bait design

In [80]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/bartonella/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/bartonella/extract_probes_from_group/Bartonella_bacilliformis_GCF_000015445+11-back-to-29.conf --probe-prefix uce_bartonella_ --designer rnplattii --design bartonella_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 112
Probe Count = 6644


In [81]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.fasta --query results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  16:45:53
Ended:  Thu Feb 06, 2020  16:46:14
Time for execution:  0.355904750029 minutes


In [82]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.fasta --lastz results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_bartonella_;
Parsing lastz file...
Screening results...
Screened 6643 fasta sequences.  Filtered 0 duplicates. Kept 6644.


## CDhit to reduce numbers

In [83]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.fasta
         -o
         results/phyluce/bartonella/final_probe_design/bartonella_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 16:54:31 2020
                            Output                              
----------------------------------------------------------------
total seq: 6644
longest and shortest : 80 and 80
Total letters: 531520
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3953696
Max number of word counting entries: 88997714

# comparing sequences from          0  to       1107
.---------- new table with      760 representatives
# comparing sequences from       1

# Burkholderia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (albeit genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [84]:
# Name of the group processed by the cells below; it is spliced into every
# results/phyluce/<group>/... and data/genomes/<group>/... path.
group = 'burkholderia'

In [85]:
# Start this group from an empty results directory: remove whatever an earlier
# run left behind, then recreate the directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [86]:
# Every non-reference genome in the group, named Genus_species_<assembly accession>.
group_taxa = [
    "Burkholderia_agricolaris_GCF_009455635",
    "Burkholderia_ambifaria_GCF_000959545",
    "Burkholderia_anthina_GCF_001547525",
    "Burkholderia_bonniea_GCF_009455625",
    "Burkholderia_catarinensis_GCF_001883705",
    "Burkholderia_cenocepacia_GCF_001999785",
    "Burkholderia_cepacia_GCF_001411495",
    "Burkholderia_contaminans_GCF_001029145",
    "Burkholderia_diffusa_GCF_001718315",
    "Burkholderia_dolosa_GCF_003813045",
    "Burkholderia_gladioli_GCF_000959725",
    "Burkholderia_glumae_GCF_000960995",
    "Burkholderia_hayleyella_GCF_009455685",
    "Burkholderia_humptydooensis_GCF_001462435",
    "Burkholderia_insecticola_GCF_000402035",
    "Burkholderia_lata_GCF_000012945",
    "Burkholderia_latens_GCF_001718795",
    "Burkholderia_metallica_GCF_001718555",
    "Burkholderia_multivorans_GCF_000959525",
    "Burkholderia_novacaledonica_GCF_900258035",
    "Burkholderia_oklahomensis_GCF_000959365",
    "Burkholderia_paludis_GCF_000732615",
    "Burkholderia_plantarii_GCF_001411805",
    "Burkholderia_pseudomallei_GCF_000011545",
    "Burkholderia_pseudomultivorans_GCF_001718415",
    "Burkholderia_puraquae_GCF_002099195",
    "Burkholderia_pyrrocinia_GCF_001028665",
    "Burkholderia_reimsis_GCF_003294055",
    "Burkholderia_seminalis_GCF_001718535",
    "Burkholderia_singularis_GCF_900176645",
    "Burkholderia_sp_GCF_005280735",
    "Burkholderia_stabilis_GCF_001742165",
    "Burkholderia_stagnalis_GCF_001718955",
    "Burkholderia_territorii_GCF_001718335",
    "Burkholderia_thailandensis_GCF_003568605",
    "Burkholderia_ubonensis_GCF_000959245",
    "Burkholderia_vietnamiensis_GCF_000959445",
]

# The genome every other taxon is compared against; kept out of group_taxa.
reference_taxon = "Burkholderia_mallei_GCF_000011705"

# Full working set: the group members followed by the reference taxon.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [87]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_009455635.1_ASM945563v1_genomic.fna.gz

sent 42 bytes  received 2550418 bytes  1020184.00 bytes/sec
total size is 2549688  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipl

GCA_000959725.1_ASM95972v1_genomic.fna.gz

sent 42 bytes  received 2512846 bytes  1675258.67 bytes/sec
total size is 2512125  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000960995.1_ASM96099v1_genomic.fna.gz

sent 42 bytes  received 1926199 bytes  1284160.67 bytes/sec
total size is 1925622  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_000959365.1_ASM95936v1_genomic.fna.gz

sent 42 bytes  received 2003289 bytes  1335554.00 bytes/sec
total size is 2002688  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000732615.1_ASM73261v1_genomic.fna.gz

sent 42 bytes  received 2453364 bytes  981362.40 bytes/sec
total size is 2452659  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer,

GCA_001742165.1_ASM174216v1_genomic.fna.gz

sent 42 bytes  received 2414315 bytes  1609571.33 bytes/sec
total size is 2413617  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001718955.1_ASM171895v1_genomic.fna.gz

sent 42 bytes  received 2129384 bytes  1419617.33 bytes/sec
total size is 2128758  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [88]:
# Rewrite every genome with minimal FASTA headers (record id only) and convert
# the cleaned FASTA to 2bit.

# start from an empty cleaned_genomes directory
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # plain "r" instead of "rU": the "U" flag is deprecated and was removed in
    # Python 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            # blank name/description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA written above (previously the raw
    # download was converted), so both formatted files share the same headers.
    # check_call raises CalledProcessError if faToTwoBit fails, instead of
    # silently leaving a missing or partial .2bit for later cells to trip over.
    # Passing a list keeps paths containing spaces intact.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome (`--fcov 10` in the cell below).  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage because, with 100bp reads, it should provide roughly 10bp resolution on conserved orthologous regions (25x would give ~4bp resolution), and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [89]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481656 ("sim_Burkholderia_agricolaris_GCF_009455635") has been submitted
Your job 5481657 ("sim_Burkholderia_ambifaria_GCF_000959545") has been submitted
Your job 5481658 ("sim_Burkholderia_anthina_GCF_001547525") has been submitted
Your job 5481659 ("sim_Burkholderia_bonniea_GCF_009455625") has been submitted
Your job 5481660 ("sim_Burkholderia_catarinensis_GCF_001883705") has been submitted
Your job 5481661 ("sim_Burkholderia_cenocepacia_GCF_001999785") has been submitted
Your job 5481662 ("sim_Burkholderia_cepacia_GCF_001411495") has been submitted
Your job 5481663 ("sim_Burkholderia_contaminans_GCF_001029145") has been submitted
Your job 5481664 ("sim_Burkholderia_diffusa_GCF_001718315") has been submitted
Your job 5481665 ("sim_Burkholderia_dolosa_GCF_003813045") has been submitted
Your job 5481666 ("sim_Burkholderia_gladioli_GCF_000959725") has been submitted
Your job 5481667 ("sim_Burkholderia_glumae_GCF_000960995") has been submitted
Your job 5481668 ("sim_Burkholderi

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [90]:
#where the bbmap index and the per-sample alignments will live
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)

#wipe any previous mapping run, then recreate the directory together with its logs/ folder
if os.path.exists(o_dir.rstrip("/")):
    shutil.rmtree(o_dir.rstrip("/"))

os.makedirs(o_dir + "logs")

#generate a bbmap index of the reference genome (run locally, not through qsub)
bbmap_cmd = " ".join(["bbmap.sh",
                      "ref=" + i_ref_genome,
                      "path=" + o_dir])

print(bbmap_cmd)
#last expression of the cell, so the exit status is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/burkholderia/cleaned_genomes/Burkholderia_mallei_GCF_000011705_formatted.fas path=results/phyluce/burkholderia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence between the reference genome and each simulated read (`minid=0.90` in the command below).  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [91]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481693 ("map_Burkholderia_agricolaris_GCF_009455635") has been submitted
Your job 5481694 ("map_Burkholderia_ambifaria_GCF_000959545") has been submitted
Your job 5481695 ("map_Burkholderia_anthina_GCF_001547525") has been submitted
Your job 5481696 ("map_Burkholderia_bonniea_GCF_009455625") has been submitted
Your job 5481697 ("map_Burkholderia_catarinensis_GCF_001883705") has been submitted
Your job 5481698 ("map_Burkholderia_cenocepacia_GCF_001999785") has been submitted
Your job 5481699 ("map_Burkholderia_cepacia_GCF_001411495") has been submitted
Your job 5481700 ("map_Burkholderia_contaminans_GCF_001029145") has been submitted
Your job 5481701 ("map_Burkholderia_diffusa_GCF_001718315") has been submitted
Your job 5481702 ("map_Burkholderia_dolosa_GCF_003813045") has been submitted
Your job 5481703 ("map_Burkholderia_gladioli_GCF_000959725") has been submitted
Your job 5481704 ("map_Burkholderia_glumae_GCF_000960995") has been submitted
Your job 5481705 ("map_Burkholderi

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [92]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481730 ("merge_Burkholderia_agricolaris_GCF_009455635") has been submitted
Your job 5481731 ("merge_Burkholderia_ambifaria_GCF_000959545") has been submitted
Your job 5481732 ("merge_Burkholderia_anthina_GCF_001547525") has been submitted
Your job 5481733 ("merge_Burkholderia_bonniea_GCF_009455625") has been submitted
Your job 5481734 ("merge_Burkholderia_catarinensis_GCF_001883705") has been submitted
Your job 5481735 ("merge_Burkholderia_cenocepacia_GCF_001999785") has been submitted
Your job 5481736 ("merge_Burkholderia_cepacia_GCF_001411495") has been submitted
Your job 5481737 ("merge_Burkholderia_contaminans_GCF_001029145") has been submitted
Your job 5481738 ("merge_Burkholderia_diffusa_GCF_001718315") has been submitted
Your job 5481739 ("merge_Burkholderia_dolosa_GCF_003813045") has been submitted
Your job 5481740 ("merge_Burkholderia_gladioli_GCF_000959725") has been submitted
Your job 5481741 ("merge_Burkholderia_glumae_GCF_000960995") has been submitted
Your job 5

Remove loci that are masked in the reference genome (more than 25% masked bases) or that are shorter than 160 bp.

In [93]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 2546 sequences from Burkholderia_agricolaris_GCF_009455635_merged.bed.  Filtered 1621 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 925.
Screened 5940 sequences from Burkholderia_ambifaria_GCF_000959545_merged.bed.  Filtered 2660 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3280.
Screened 5988 sequences from Burkholderia_anthina_GCF_001547525_merged.bed.  Filtered 2724 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3264.
Screened 670 sequences from Burkholderia_bonniea_GCF_009455625_merged.bed.  Filtered 519 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 151.
Screened 6137 sequences from Burkholderia_catarinensis_GCF_001883705_merged.bed.  Filtered 2805 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3332.
Screened 6075 sequences from Burkholderia_cenocepacia_GCF_001999785_merged.bed.  Filtered 2766 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3309.
Screened 6080 sequences from 

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [94]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#conf layout: a "[beds]" header, then one "<sample>:<stripped bed path>" line per sample
with open(o_bed_conf, "w") as outfile:

    #header line
    outfile.write("[beds]\n")

    #one entry per taxon in the group
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [95]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/burkholderia/initial_intervals/Burkholderia_vietnamiensis_GCF_000959445_merged.bed --twobit results/phyluce/burkholderia/cleaned_genomes/Burkholderia_mallei_GCF_000011705_formatted.2bit --output results/phyluce/burkholderia/initial_intervals/Burkholderia_vietnamiensis_GCF_000959445_stripped.bed;
burkholderia_agricolaris_gcf_009455635.
burkholderia_ambifaria_gcf_000959545....
burkholderia_anthina_gcf_001547525....
burkholderia_bonniea_gcf_009455625.
burkholderia_catarinensis_gcf_001883705....
burkholderia_cenocepacia_gcf_001999785....
burkholderia_cepacia_gcf_001411495....
burkholderia_contaminans_gcf_001029145....
burkholderia_diffusa_gcf_001718315....
burkholderia_dolosa_gcf_003813045....
burkholderia_gladioli_gcf_000959725...
burkholderia_glumae_gcf_000960995...
burkholderia_hayleyella_gcf_009455685.
burkholderia_humptydooensis_gcf_001462435....
burkholderia_insecticola_gcf_000402035..


Quantify probes and the number of targeted taxa for each.

In [97]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/burkholderia/initial_intervals/Burkholderia_vietnamiensis_GCF_000959445_merged.bed --twobit results/phyluce/burkholderia/cleaned_genomes/Burkholderia_mallei_GCF_000011705_formatted.2bit --output results/phyluce/burkholderia/initial_intervals/Burkholderia_vietnamiensis_GCF_000959445_stripped.bed;
Loci shared by Burkholderia_mallei_GCF_000011705 + 0 taxa:	9,314.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 1 taxa:	9,314.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 2 taxa:	8,927.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 3 taxa:	8,337.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 4 taxa:	7,220.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 5 taxa:	6,128.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 6 taxa:	5,723.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 7 taxa:	5,469.0
Loci shared by Burkholderia_mallei_GCF_000011705 + 8 taxa:	5,214.0
L

In [98]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 36
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/burkholderia/initial_intervals/burkholderia-to-Burkholderia_mallei_GCF_000011705.sqlite --base-taxon Burkholderia_mallei_GCF_000011705 --output results/phyluce/burkholderia/initial_intervals/Burkholderia_mallei_GCF_000011705_+36.bed --specific-counts 36;
Counter({'burkholderia_ambifaria_gcf_000959545': 153, 'burkholderia_humptydooensis_gcf_001462435': 153, 'burkholderia_pyrrocinia_gcf_001028665': 153, 'burkholderia_dolosa_gcf_003813045': 153, 'burkholderia_contaminans_gcf_001029145': 153, 'burkholderia_cepacia_gcf_001411495': 153, 'burkholderia_plantarii_gcf_001411805': 153, 'burkholderia_latens_gcf_001718795': 153, 'burkholderia_cenocepacia_gcf_001999785': 153, 'burkholderia_vietnamiensis_gcf_000959445': 153, 'burkholderia_gladioli_gcf_000959725': 153, 'burkholderia_insecticola_gcf_000402035': 153, 'burkholderia_ubonensis_gcf_000959245': 153, 'burkholderia_territorii_gcf_001718335': 153, 'burkholderia_glumae_gcf_000960995': 15

## Design temp set of baits

In [99]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/burkholderia/initial_intervals/Burkholderia_mallei_GCF_000011705_+36.bed --twobit results/phyluce/burkholderia/cleaned_genomes/Burkholderia_mallei_GCF_000011705_formatted.2bit --buffer-to 160 --output results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36.fasta;
Screened 153 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 153.


design the baits

In [100]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36.fasta --probe-prefix uce_burkholderia_ --design burkholderia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGG


Conserved locus count = 149
Probe Count = 292


## Find duplicate baited regions

In [101]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas --query results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  17:10:16
Ended:  Thu Feb 06, 2020  17:10:16
Time for execution:  0.00308758020401 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas                        --lastz results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes_vself.lastz                       --probe-prefix=uce_burkholderia_;
Parsing lastz file...
Screening results...
Screened 291 fasta sequences.  Filtered 2 duplicates. Ke

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [102]:
#give every taxon its own sub-directory, <cleaned_genomes>/<taxon>/<taxon>.2bit,
#holding a copy (not a soft link) of its formatted 2bit file
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    #rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [103]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/burkholderia/validate_intervals/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas --scaffoldlist Burkholderia_agricolaris_GCF_009455635 Burkholderia_ambifaria_GCF_000959545 Burkholderia_anthina_GCF_001547525 Burkholderia_bonniea_GCF_009455625 Burkholderia_catarinensis_GCF_001883705 Burkholderia_cenocepacia_GCF_001999785 Burkholderia_cepacia_GCF_001411495 Burkholderia_contaminans_GCF_001029145 Burkholderia_diffusa_GCF_001718315 Burkholderia_dolosa_GCF_003813045 Burkholderia_gladioli_GCF_000959725 Burkholderia_glumae_GCF_000960995 Burkholderia_hayleyella_GCF_009455685 Burkholderia_humptydooensis_GCF_001462435 Burkholderia_insecticola_GCF_000402035 Burkholderia_lata_GCF_000012945 Burkholderia_latens_GCF_001718795 Burkholderia_metallica_GCF_001718555 Burkholderia_multivorans_GCF_000959525 Burkholderia_novacaledonica_GCF_900258035 Burkholderia_oklahomensis_GCF_000959365 Burkholderia_paludis_GCF_000732615 Burkholderia_p


Running against Burkholderia_glumae_GCF_000960995.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp4wTLPw.fasta

Writing the results file...
	/tmp/tmpzM4ijp.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/burkholderia/validate_intervals/lastz/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas_v_Burkholderia_glumae_GCF_000960995.lastz
Creating Burkholderia_glumae_GCF_000960995 table
Inserting data to Burkholderia_glumae_GCF_000960995 table

Running against Burkholderia_hayleyella_GCF_009455685.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpZ3w3W3.fasta

Writing the results file...
	/tmp/tmpl3Ggpx.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/burkholderia/validate_intervals/lastz/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas_v_Burkhol

Running the targets against 1 queries...
	/tmp/tmpzxCplW.fasta

Writing the results file...
	/tmp/tmpkaxWlo.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/burkholderia/validate_intervals/lastz/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas_v_Burkholderia_puraquae_GCF_002099195.lastz
Creating Burkholderia_puraquae_GCF_002099195 table
Inserting data to Burkholderia_puraquae_GCF_002099195 table

Running against Burkholderia_pyrrocinia_GCF_001028665.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpBqaoau.fasta

Writing the results file...
	/tmp/tmp5q6zIR.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/burkholderia/validate_intervals/lastz/Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas_v_Burkholderia_pyrrocinia_GCF_001028665.lastz
Creating Burkholderia_pyrrocinia_GCF_001028665 table
Inserting data to Burkholder

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [104]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/burkholderia/extract_probes_from_group/burkholderia_genome.conf --lastz results/phyluce/burkholderia/validate_intervals/lastz --probes 120 --probe-prefix uce_burkholderia_ --name-pattern "Burkholderia_mallei_GCF_000011705_+36_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/burkholderia/extract_probes_from_group/probe_fasta;
2020-02-06 17:12:20,800 - Phyluce - INFO - ---- Working on Burkholderia_agricolaris_GCF_009455635 genome ---
2020-02-06 17:12:20,801 - Phyluce - INFO - Reading Burkholderia_agricolaris_GCF_009455635 genome
2020-02-06 17:12:21,652 - Phyluce - INFO - Burkholderia_agricolaris_GCF_009455635: 149 uces, 27 dupes, 122 non-dupes, 11 orient drop, 8 length drop, 103 written
2020-02-06 17:12:21,652 - Phyluce - INFO - ----- Working on Burkholderia_ambifaria_GCF_000959545 genome ----
2020-02-06 17:12:21,662 - Phyluce - INFO - Reading Burkholderia_ambifaria_GCF_000959545 genome
2020-02-06 17:12:22,427 - P

2020-02-06 17:12:36,679 - Phyluce - INFO - Burkholderia_paludis_GCF_000732615: 149 uces, 38 dupes, 111 non-dupes, 0 orient drop, 2 length drop, 109 written
2020-02-06 17:12:36,679 - Phyluce - INFO - ----- Working on Burkholderia_plantarii_GCF_001411805 genome ----
2020-02-06 17:12:36,691 - Phyluce - INFO - Reading Burkholderia_plantarii_GCF_001411805 genome
2020-02-06 17:12:37,427 - Phyluce - INFO - Burkholderia_plantarii_GCF_001411805: 149 uces, 28 dupes, 121 non-dupes, 8 orient drop, 12 length drop, 101 written
2020-02-06 17:12:37,427 - Phyluce - INFO - --- Working on Burkholderia_pseudomallei_GCF_000011545 genome ---
2020-02-06 17:12:37,427 - Phyluce - INFO - Reading Burkholderia_pseudomallei_GCF_000011545 genome
2020-02-06 17:12:38,125 - Phyluce - INFO - Burkholderia_pseudomallei_GCF_000011545: 149 uces, 27 dupes, 122 non-dupes, 2 orient drop, 15 length drop, 105 written
2020-02-06 17:12:38,125 - Phyluce - INFO - - Working on Burkholderia_pseudomultivorans_GCF_001718415 genome 
202

In [105]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/burkholderia/extract_probes_from_group/probe_fasta --output results/phyluce/burkholderia/extract_probes_from_group/multifastas.sqlite --base-taxon Burkholderia_mallei_GCF_000011705;
burkholderia_agricolaris_gcf_009455635.
burkholderia_ambifaria_gcf_000959545.
burkholderia_anthina_gcf_001547525.
burkholderia_bonniea_gcf_009455625.
burkholderia_catarinensis_gcf_001883705.
burkholderia_cenocepacia_gcf_001999785.
burkholderia_cepacia_gcf_001411495.
burkholderia_contaminans_gcf_001029145.
burkholderia_diffusa_gcf_001718315.
burkholderia_dolosa_gcf_003813045.
burkholderia_gladioli_gcf_000959725.
burkholderia_glumae_gcf_000960995.
burkholderia_hayleyella_gcf_009455685.
burkholderia_humptydooensis_gcf_001462435.
burkholderia_insecticola_gcf_000402035.
burkholderia_lata_gcf_000012945.
burkholderia_latens_gcf_001718795.
burkholderia_metallica_gcf_001718555.
burkholderia_multivorans_gcf_000959525.
burkholderia_novacaledonica_gcf_9002580

In [106]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(32)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/burkholderia/extract_probes_from_group/multifastas.sqlite  --base-taxon Burkholderia_mallei_GCF_000011705 --output results/phyluce/burkholderia/extract_probes_from_group/Burkholderia_mallei_GCF_000011705+36-back-to-32.conf --specific-counts 32;
Counter({'burkholderia_dolosa_gcf_003813045': 96, 'burkholderia_cenocepacia_gcf_001999785': 96, 'burkholderia_puraquae_gcf_002099195': 96, 'burkholderia_multivorans_gcf_000959525': 96, 'burkholderia_diffusa_gcf_001718315': 96, 'burkholderia_singularis_gcf_900176645': 96, 'burkholderia_contaminans_gcf_001029145': 96, 'burkholderia_hayleyella_gcf_009455685': 96, 'burkholderia_reimsis_gcf_003294055': 96, 'burkholderia_ambifaria_gcf_000959545': 95, 'burkholderia_mallei_gcf_000011705': 95, 'burkholderia_vietnamiensis_gcf_000959445': 95, 'burkholderia_catarinensis_gcf_001883705': 95, 'burkholderia_metallica_gcf_001718555': 95, 'burkholderia_seminalis_gcf_001718535': 95, 'burkholderia_stabilis_

## Final group specific bait design

In [108]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/burkholderia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/burkholderia/extract_probes_from_group/Burkholderia_mallei_GCF_000011705+36-back-to-32.conf --probe-prefix uce_burkholderia_ --designer rnplattii --design burkholderia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGN


Conserved locus count = 96
Probe Count = 7005


In [110]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.fasta --query results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  17:13:39
Ended:  Thu Feb 06, 2020  17:14:07
Time for execution:  0.463900315762 minutes


In [111]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.fasta --lastz results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_burkholderia_;
Parsing lastz file...
Screening results...
Screened 7004 fasta sequences.  Filtered 0 duplicates. Kept 7005.


## CDhit to reduce numbers

In [112]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.fasta
         -o
         results/phyluce/burkholderia/final_probe_design/burkholderia_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 17:25:14 2020
                            Output                              
----------------------------------------------------------------
total seq: 7005
longest and shortest : 80 and 80
Total letters: 560400
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3953038
Max number of word counting entries: 88982893

# comparing sequences from          0  to       1167
.---------- new table with      340 representatives
# comparing sequences from

# Campylobacter

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so: 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (albeit genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [113]:
# Name of the group handled by the cells below; it is spliced into the data/
# and results/ paths they build.
group = "campylobacter"

In [114]:
# Start this group from an empty results directory.
# WARNING: anything already under results/phyluce/<group> is deleted.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [115]:
# Every non-reference genome in this group, named Genus_species_ACCESSION.
# These strings are used to build genome file names in later cells, so they
# must match the files on disk exactly.
# NOTE(review): "Campylobacter_sptorum" may be a misspelling of "sputorum";
# it is left untouched because it has to agree with the downloaded file name.
group_taxa = [
    "Campylobacter_armoricus_GCF_009036385",
    "Campylobacter_avium_GCF_002238335",
    "Campylobacter_blaseri_GCF_003015205",
    "Campylobacter_coli_GCF_002024185",
    "Campylobacter_concisus_GCF_001298465",
    "Campylobacter_corcagiensis_GCF_000597805",
    "Campylobacter_cuniculorum_GCF_002104335",
    "Campylobacter_curvus_GCF_000017465",
    "Campylobacter_fetus_GCF_008271385",
    "Campylobacter_geochelonis_GCF_900065885",
    "Campylobacter_gracilis_GCF_001190745",
    "Campylobacter_helveticus_GCF_002080395",
    "Campylobacter_hepaticus_GCF_001687475",
    "Campylobacter_hominis_GCF_000017585",
    "Campylobacter_hyointestinalis_GCF_001643955",
    "Campylobacter_iguaniorum_GCF_000736415",
    "Campylobacter_insulaenigrae_GCF_000816185",
    "Campylobacter_lanienae_GCF_002139935",
    "Campylobacter_lari_GCF_000019205",
    "Campylobacter_ornithocola_GCF_001705345",
    "Campylobacter_peloridis_GCF_000816785",
    "Campylobacter_pinnipediorum_GCF_002021925",
    "Campylobacter_rectus_GCF_000174175",
    "Campylobacter_showae_GCF_000313615",
    "Campylobacter_sp_GCF_002139895",
    "Campylobacter_sptorum_GCF_002220735",
    "Campylobacter_subantarcticus_GCF_000816305",
    "Campylobacter_troglodytis_GCF_006864425",
    "Campylobacter_upsaliensis_GCF_000620965",
    "Campylobacter_ureolyticus_GCF_001190755",
    "Campylobacter_volucris_GCF_000816345",
]

# Genome that all the other taxa are compared against.
reference_taxon = "Campylobacter_jejuni_GCF_000009085"

# Full list: the group members followed by the reference taxon.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [116]:
# Download one genome FASTA per accession listed for this group, rename each
# file to the standardized Genus_species_ACCESSION form, record what was
# fetched in ftp_info.csv, and finally gunzip everything.
# (Only the *_genomic.fna.gz FASTA is fetched here; no gff is downloaded.)
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
# Columns read below: "genus", "species" and "accession".
# NOTE(review): this reads data/<group>_accessions.csv, whereas the markdown
# above mentions data/genomes/*_accessions.csv -- confirm the intended path.
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
# One [accession, ftp, orig_fas_name, clean_fas_name] entry per download.
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    # Look up the assembly's GenBank FTP directory with NCBI EDirect.  The
    # IPython "!" capture returns the command's output lines; only the first
    # line is used, so ftp[0] raises IndexError if nothing was printed.
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    # Build the path of the genomic FASTA (<dir>/<last path element>_genomic.fna.gz)
    # and fetch it into data/genomes/<group> over rsync.
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    # Create the "clean/standardized" name for the FASTA:
    # Genus_species_ACCESSION, with the ".N" version suffix of the accession dropped.
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    # os.rename raises if the expected download is not present.
    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    # Finally, keep a record of the change:
    # accession, ftp directory, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
# Convert the collected records to a DataFrame.
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

# Save the record to csv.
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

# Finally, uncompress all of the gzipped files (easier to manage down the line).
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_009036385.1_ASM903638v1_genomic.fna.gz

sent 42 bytes  received 491694 bytes  327824.00 bytes/sec
total size is 491468  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplina

GCA_001190745.1_ASM119074v1_genomic.fna.gz

sent 42 bytes  received 664480 bytes  443014.67 bytes/sec
total size is 664206  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002080395.1_ASM208039v1_genomic.fna.gz

sent 42 bytes  received 534429 bytes  356314.00 bytes/sec
total size is 534187  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_000816785.1_ASM81678v1_genomic.fna.gz

sent 42 bytes  received 480434 bytes  320317.33 bytes/sec
total size is 480209  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002021925.1_ASM202192v1_genomic.fna.gz

sent 42 bytes  received 507997 bytes  338692.67 bytes/sec
total size is 507763  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, net

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [117]:
# Write a cleaned copy of every genome (FASTA headers reduced to the record
# id) and convert that cleaned copy to 2bit.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# Always rebuild from scratch; any earlier cleaned genomes are deleted.
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # Plain "r" mode: the "U" flag is deprecated and rejected by newer
    # Python 3 releases.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, "fasta"):
            # Blank the name/description so only the record id is written
            # to the header line.
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # Build the 2bit from the *formatted* FASTA (it was previously built from
    # the raw input) so both "_formatted" outputs come from the same records.
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    # check_call raises CalledProcessError if the conversion fails, instead
    # of silently carrying on without a 2bit file.
    subprocess.check_call(f2bit_cmd.split(" "))

## Simulate and map reads at 10x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [118]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481767 ("sim_Campylobacter_armoricus_GCF_009036385") has been submitted
Your job 5481768 ("sim_Campylobacter_avium_GCF_002238335") has been submitted
Your job 5481769 ("sim_Campylobacter_blaseri_GCF_003015205") has been submitted
Your job 5481770 ("sim_Campylobacter_coli_GCF_002024185") has been submitted
Your job 5481771 ("sim_Campylobacter_concisus_GCF_001298465") has been submitted
Your job 5481772 ("sim_Campylobacter_corcagiensis_GCF_000597805") has been submitted
Your job 5481773 ("sim_Campylobacter_cuniculorum_GCF_002104335") has been submitted
Your job 5481774 ("sim_Campylobacter_curvus_GCF_000017465") has been submitted
Your job 5481775 ("sim_Campylobacter_fetus_GCF_008271385") has been submitted
Your job 5481776 ("sim_Campylobacter_geochelonis_GCF_900065885") has been submitted
Your job 5481777 ("sim_Campylobacter_gracilis_GCF_001190745") has been submitted
Your job 5481778 ("sim_Campylobacter_helveticus_GCF_002080395") has been submitted
Your job 5481779 ("sim_Campy

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [119]:
# Index the reference taxon's cleaned genome with bbmap; the index is written
# into the (freshly recreated) map_simulated_reads folder.
map_root = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_root):
    shutil.rmtree(map_root)

os.makedirs(map_root + "/logs")

# Reference genome to index, and the folder that receives the index.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_root + "/"

bbmap_cmd = "bbmap.sh ref={} path={}".format(i_ref_genome, o_dir)

print(bbmap_cmd)
# Runs locally (not through qsub); the exit status is the cell's displayed value.
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/campylobacter/cleaned_genomes/Campylobacter_jejuni_GCF_000009085_formatted.fas path=results/phyluce/campylobacter/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 15% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [120]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481798 ("map_Campylobacter_armoricus_GCF_009036385") has been submitted
Your job 5481799 ("map_Campylobacter_avium_GCF_002238335") has been submitted
Your job 5481800 ("map_Campylobacter_blaseri_GCF_003015205") has been submitted
Your job 5481801 ("map_Campylobacter_coli_GCF_002024185") has been submitted
Your job 5481802 ("map_Campylobacter_concisus_GCF_001298465") has been submitted
Your job 5481803 ("map_Campylobacter_corcagiensis_GCF_000597805") has been submitted
Your job 5481804 ("map_Campylobacter_cuniculorum_GCF_002104335") has been submitted
Your job 5481805 ("map_Campylobacter_curvus_GCF_000017465") has been submitted
Your job 5481806 ("map_Campylobacter_fetus_GCF_008271385") has been submitted
Your job 5481807 ("map_Campylobacter_geochelonis_GCF_900065885") has been submitted
Your job 5481808 ("map_Campylobacter_gracilis_GCF_001190745") has been submitted
Your job 5481809 ("map_Campylobacter_helveticus_GCF_002080395") has been submitted
Your job 5481810 ("map_Campy

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [121]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481829 ("merge_Campylobacter_armoricus_GCF_009036385") has been submitted
Your job 5481830 ("merge_Campylobacter_avium_GCF_002238335") has been submitted
Your job 5481831 ("merge_Campylobacter_blaseri_GCF_003015205") has been submitted
Your job 5481832 ("merge_Campylobacter_coli_GCF_002024185") has been submitted
Your job 5481833 ("merge_Campylobacter_concisus_GCF_001298465") has been submitted
Your job 5481834 ("merge_Campylobacter_corcagiensis_GCF_000597805") has been submitted
Your job 5481835 ("merge_Campylobacter_cuniculorum_GCF_002104335") has been submitted
Your job 5481836 ("merge_Campylobacter_curvus_GCF_000017465") has been submitted
Your job 5481837 ("merge_Campylobacter_fetus_GCF_008271385") has been submitted
Your job 5481838 ("merge_Campylobacter_geochelonis_GCF_900065885") has been submitted
Your job 5481839 ("merge_Campylobacter_gracilis_GCF_001190745") has been submitted
Your job 5481840 ("merge_Campylobacter_helveticus_GCF_002080395") has been submitted
Your

remove loci that were masked in the original genome

In [122]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 437 sequences from Campylobacter_armoricus_GCF_009036385_merged.bed.  Filtered 326 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 111.
Screened 66 sequences from Campylobacter_avium_GCF_002238335_merged.bed.  Filtered 52 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 14.
Screened 44 sequences from Campylobacter_blaseri_GCF_003015205_merged.bed.  Filtered 32 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 12.
Screened 1953 sequences from Campylobacter_coli_GCF_002024185_merged.bed.  Filtered 1057 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 896.
Screened 25 sequences from Campylobacter_concisus_GCF_001298465_merged.bed.  Filtered 15 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10.
Screened 30 sequences from Campylobacter_corcagiensis_GCF_000597805_merged.bed.  Filtered 21 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 9.
Screened 439 sequences from Campylobacter_cuniculorum_GCF_00

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [123]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# The conf file is a "[beds]" header followed by one
# "<taxon>:<path to stripped bed>" line per taxon in the group.
with open(o_bed_conf, "w") as outfile:
    outfile.write("[beds]\n")

    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [124]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/campylobacter/initial_intervals/Campylobacter_volucris_GCF_000816345_merged.bed --twobit results/phyluce/campylobacter/cleaned_genomes/Campylobacter_jejuni_GCF_000009085_formatted.2bit --output results/phyluce/campylobacter/initial_intervals/Campylobacter_volucris_GCF_000816345_stripped.bed;
campylobacter_armoricus_gcf_009036385.
campylobacter_avium_gcf_002238335.
campylobacter_blaseri_gcf_003015205.
campylobacter_coli_gcf_002024185.
campylobacter_concisus_gcf_001298465.
campylobacter_corcagiensis_gcf_000597805.
campylobacter_cuniculorum_gcf_002104335.
campylobacter_curvus_gcf_000017465.
campylobacter_fetus_gcf_008271385.
campylobacter_geochelonis_gcf_900065885.
campylobacter_gracilis_gcf_001190745.
campylobacter_helveticus_gcf_002080395.
campylobacter_hepaticus_gcf_001687475..
campylobacter_hominis_gcf_000017585.
campylobacter_hyointestinalis_gcf_001643955.
campylobacter_iguaniorum_gcf_0

Quantify probes and the number of targeted taxa for each.

In [125]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/campylobacter/initial_intervals/Campylobacter_volucris_GCF_000816345_merged.bed --twobit results/phyluce/campylobacter/cleaned_genomes/Campylobacter_jejuni_GCF_000009085_formatted.2bit --output results/phyluce/campylobacter/initial_intervals/Campylobacter_volucris_GCF_000816345_stripped.bed;
Loci shared by Campylobacter_jejuni_GCF_000009085 + 0 taxa:	1,514.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 1 taxa:	1,514.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 2 taxa:	661.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 3 taxa:	244.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 4 taxa:	169.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 5 taxa:	130.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 6 taxa:	78.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 7 taxa:	59.0
Loci shared by Campylobacter_jejuni_GCF_000009085 + 8 taxa:	50.0
Loci shared b

In [126]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 4
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/campylobacter/initial_intervals/campylobacter-to-Campylobacter_jejuni_GCF_000009085.sqlite --base-taxon Campylobacter_jejuni_GCF_000009085 --output results/phyluce/campylobacter/initial_intervals/Campylobacter_jejuni_GCF_000009085_+4.bed --specific-counts 4;
Counter({'campylobacter_hepaticus_gcf_001687475': 154, 'campylobacter_coli_gcf_002024185': 138, 'campylobacter_lari_gcf_000019205': 102, 'campylobacter_subantarcticus_gcf_000816305': 97, 'campylobacter_ornithocola_gcf_001705345': 93, 'campylobacter_peloridis_gcf_000816785': 92, 'campylobacter_armoricus_gcf_009036385': 91, 'campylobacter_volucris_gcf_000816345': 87, 'campylobacter_insulaenigrae_gcf_000816185': 66, 'campylobacter_cuniculorum_gcf_002104335': 58, 'campylobacter_helveticus_gcf_002080395': 57, 'campylobacter_upsaliensis_gcf_000620965': 48, 'campylobacter_avium_gcf_002238335': 11, 'campylobacter_geochelonis_gcf_900065885': 10, 'campylobacter_lanienae_gcf_002139935

## Design temp set of baits

In [127]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/campylobacter/initial_intervals/Campylobacter_jejuni_GCF_000009085_+4.bed --twobit results/phyluce/campylobacter/cleaned_genomes/Campylobacter_jejuni_GCF_000009085_formatted.2bit --buffer-to 160 --output results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4.fasta;
Screened 169 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 169.


design the baits

In [128]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4.fasta --probe-prefix uce_campylobacter_ --design campylobacter_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 153
Probe Count = 296


## Find duplicate baited regions

In [129]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas --query results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  17:31:09
Ended:  Thu Feb 06, 2020  17:31:09
Time for execution:  0.00314044952393 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas                        --lastz results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes_vself.lastz                       --probe-prefix=uce_campylobacter_;
Parsing lastz file...
Screening results...
Screened 295 fasta sequences.  Filtered 3 duplicat

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [130]:
# Give every taxon its own sub-directory under cleaned_genomes/ holding a
# copy of its formatted 2bit file named <taxon>.2bit (files are copied, not
# linked).
for sample in all_taxa:

    dst_dir  = "results/phyluce/{0}/cleaned_genomes/{1}/".format(group, sample)
    src_file = "results/phyluce/{0}/cleaned_genomes/{1}_formatted.2bit".format(group, sample)
    dst_file = "{0}{1}.2bit".format(dst_dir, sample)

    # rebuild the per-taxon directory from scratch
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [131]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/campylobacter/validate_intervals/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas --scaffoldlist Campylobacter_armoricus_GCF_009036385 Campylobacter_avium_GCF_002238335 Campylobacter_blaseri_GCF_003015205 Campylobacter_coli_GCF_002024185 Campylobacter_concisus_GCF_001298465 Campylobacter_corcagiensis_GCF_000597805 Campylobacter_cuniculorum_GCF_002104335 Campylobacter_curvus_GCF_000017465 Campylobacter_fetus_GCF_008271385 Campylobacter_geochelonis_GCF_900065885 Campylobacter_gracilis_GCF_001190745 Campylobacter_helveticus_GCF_002080395 Campylobacter_hepaticus_GCF_001687475 Campylobacter_hominis_GCF_000017585 Campylobacter_hyointestinalis_GCF_001643955 Campylobacter_iguaniorum_GCF_000736415 Campylobacter_insulaenigrae_GCF_000816185 Campylobacter_lanienae_GCF_002139935 Campylobacter_lari_GCF_000019205 Campylobacter_ornithocola_GCF_001705345 Campylobacter_peloridis_GCF_000816785 Campylobacter_pinnipediorum_GCF_002021

	/tmp/tmpgPDhyE.fasta

Writing the results file...
	/tmp/tmpSGPfzE.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/campylobacter/validate_intervals/lastz/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas_v_Campylobacter_helveticus_GCF_002080395.lastz
Creating Campylobacter_helveticus_GCF_002080395 table
Inserting data to Campylobacter_helveticus_GCF_002080395 table

Running against Campylobacter_hepaticus_GCF_001687475.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmphluDLO.fasta

Writing the results file...
	/tmp/tmpgTn_lB.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/campylobacter/validate_intervals/lastz/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas_v_Campylobacter_hepaticus_GCF_001687475.lastz
Creating Campylobacter_hepaticus_GCF_001687475 table
Inserting data to Campylobacter_hepaticus_GCF_001687475 ta

Inserting data to Campylobacter_sptorum_GCF_002220735 table

Running against Campylobacter_subantarcticus_GCF_000816305.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp6k_dBW.fasta

Writing the results file...
	/tmp/tmpPWFHlG.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/campylobacter/validate_intervals/lastz/Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas_v_Campylobacter_subantarcticus_GCF_000816305.lastz
Creating Campylobacter_subantarcticus_GCF_000816305 table
Inserting data to Campylobacter_subantarcticus_GCF_000816305 table

Running against Campylobacter_troglodytis_GCF_006864425.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpC_X02a.fasta

Writing the results file...
	/tmp/tmpNYCl0J.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/ca

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [132]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/campylobacter/extract_probes_from_group/campylobacter_genome.conf --lastz results/phyluce/campylobacter/validate_intervals/lastz --probes 120 --probe-prefix uce_campylobacter_ --name-pattern "Campylobacter_jejuni_GCF_000009085_+4_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/campylobacter/extract_probes_from_group/probe_fasta;
2020-02-06 17:31:49,381 - Phyluce - INFO - ---- Working on Campylobacter_armoricus_GCF_009036385 genome ----
2020-02-06 17:31:49,382 - Phyluce - INFO - Reading Campylobacter_armoricus_GCF_009036385 genome
2020-02-06 17:31:49,960 - Phyluce - INFO - Campylobacter_armoricus_GCF_009036385: 148 uces, 0 dupes, 148 non-dupes, 11 orient drop, 4 length drop, 133 written
2020-02-06 17:31:49,960 - Phyluce - INFO - ------ Working on Campylobacter_avium_GCF_002238335 genome ------
2020-02-06 17:31:49,971 - Phyluce - INFO - Reading Campylobacter_avium_GCF_002238335 genome
2020-02-06 17:31:50,461 - Ph

2020-02-06 17:31:59,696 - Phyluce - INFO - Reading Campylobacter_pinnipediorum_GCF_002021925 genome
2020-02-06 17:32:00,166 - Phyluce - INFO - Campylobacter_pinnipediorum_GCF_002021925: 129 uces, 0 dupes, 129 non-dupes, 4 orient drop, 9 length drop, 116 written
2020-02-06 17:32:00,166 - Phyluce - INFO - ------ Working on Campylobacter_rectus_GCF_000174175 genome -----
2020-02-06 17:32:00,167 - Phyluce - INFO - Reading Campylobacter_rectus_GCF_000174175 genome
2020-02-06 17:32:00,553 - Phyluce - INFO - Campylobacter_rectus_GCF_000174175: 128 uces, 2 dupes, 126 non-dupes, 0 orient drop, 0 length drop, 126 written
2020-02-06 17:32:00,553 - Phyluce - INFO - ------ Working on Campylobacter_showae_GCF_000313615 genome -----
2020-02-06 17:32:00,553 - Phyluce - INFO - Reading Campylobacter_showae_GCF_000313615 genome
2020-02-06 17:32:00,960 - Phyluce - INFO - Campylobacter_showae_GCF_000313615: 133 uces, 5 dupes, 128 non-dupes, 0 orient drop, 0 length drop, 128 written
2020-02-06 17:32:00,960 

In [133]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/campylobacter/extract_probes_from_group/probe_fasta --output results/phyluce/campylobacter/extract_probes_from_group/multifastas.sqlite --base-taxon Campylobacter_jejuni_GCF_000009085;
campylobacter_armoricus_gcf_009036385.
campylobacter_avium_gcf_002238335.
campylobacter_blaseri_gcf_003015205.
campylobacter_coli_gcf_002024185.
campylobacter_concisus_gcf_001298465.
campylobacter_corcagiensis_gcf_000597805.
campylobacter_cuniculorum_gcf_002104335.
campylobacter_curvus_gcf_000017465.
campylobacter_fetus_gcf_008271385.
campylobacter_geochelonis_gcf_900065885.
campylobacter_gracilis_gcf_001190745.
campylobacter_helveticus_gcf_002080395.
campylobacter_hepaticus_gcf_001687475.
campylobacter_hominis_gcf_000017585.
campylobacter_hyointestinalis_gcf_001643955.
campylobacter_iguaniorum_gcf_000736415.
campylobacter_insulaenigrae_gcf_000816185.
campylobacter_lanienae_gcf_002139935.
campylobacter_lari_gcf_000019205.
campylobacter_ornithoc

In [136]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(29)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/campylobacter/extract_probes_from_group/multifastas.sqlite  --base-taxon Campylobacter_jejuni_GCF_000009085 --output results/phyluce/campylobacter/extract_probes_from_group/Campylobacter_jejuni_GCF_000009085+4-back-to-29.conf --specific-counts 29;
Counter({'campylobacter_fetus_gcf_008271385': 110, 'campylobacter_curvus_gcf_000017465': 110, 'campylobacter_insulaenigrae_gcf_000816185': 110, 'campylobacter_jejuni_gcf_000009085': 110, 'campylobacter_peloridis_gcf_000816785': 110, 'campylobacter_lanienae_gcf_002139935': 110, 'campylobacter_cuniculorum_gcf_002104335': 110, 'campylobacter_hepaticus_gcf_001687475': 110, 'campylobacter_coli_gcf_002024185': 110, 'campylobacter_hyointestinalis_gcf_001643955': 110, 'campylobacter_volucris_gcf_000816345': 110, 'campylobacter_corcagiensis_gcf_000597805': 110, 'campylobacter_iguaniorum_gcf_000736415': 110, 'campylobacter_geochelonis_gcf_900065885': 110, 'campylobacter_avium_gcf_002238335': 11

## Final group specific bait design

In [137]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/campylobacter/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/campylobacter/extract_probes_from_group/Campylobacter_jejuni_GCF_000009085+4-back-to-29.conf --probe-prefix uce_campylobacter_ --designer rnplattii --design campylobacter_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [138]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.fasta --query results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  17:33:14
Ended:  Thu Feb 06, 2020  17:33:30
Time for execution:  0.282086964448 minutes


In [139]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.fasta --lastz results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_campylobacter_;
Parsing lastz file...
Screening results...
Screened 6311 fasta sequences.  Filtered 0 duplicates. Kept 6312.


## CDhit to reduce numbers

In [140]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.fasta
         -o
         results/phyluce/campylobacter/final_probe_design/campylobacter_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 17:39:40 2020
                            Output                              
----------------------------------------------------------------
total seq: 6312
longest and shortest : 80 and 80
Total letters: 504960
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3954246
Max number of word counting entries: 89010081

# comparing sequences from          0  to       1052
.---------- new table with      936 representatives
# comparing sequences 