## Prep python

In [2]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

# Work from the project root so that every relative "data/..." and "results/..."
# path used below resolves. Set PATHOGEN_PROBES_DIR to run the notebook against a
# different checkout; the original location remains the default.
os.chdir(os.environ.get("PATHOGEN_PROBES_DIR", "/master/nplatt/pathogen_probes/"))


def wait_on_running_jobs(poll_seconds=60):
    """Block until `qstat` no longer lists any jobs.

    Polls `qstat` through the shell, printing a "." and sleeping between
    polls for as long as jobs are listed. Returns immediately once the
    listing is empty, without a trailing sleep.

    Args:
        poll_seconds: seconds to sleep between polls (default 60).

    Raises:
        subprocess.CalledProcessError: if `qstat` exits with a non-zero status.
    """
    while True:
        # universal_newlines=True yields text (not bytes) on both Python 2 and 3,
        # so the line split below works on either interpreter
        qstat_out = subprocess.check_output("qstat", shell=True, universal_newlines=True)
        listing = [line for line in qstat_out.split("\n") if line.strip()]

        # assumes qstat prints two header lines before the job rows, which is
        # what the original "- 2" adjustment accounted for
        num_jobs = max(len(listing) - 2, 0)
        if num_jobs == 0:
            return

        print(".")
        time.sleep(poll_seconds)

# Chlamydia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high-quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [8]:
# name of the taxonomic group processed below; it is spliced into every data/results path
group = "chlamydia"

In [9]:
# throw away results from any previous run of this group, then start with an empty directory
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

os.makedirs("results/phyluce/{}".format(group))

In [10]:
# non-reference assemblies, each named Genus_species_<assembly accession>
group_taxa = [
    "Chlamydia_abortus_GCF_002895085",
    "Chlamydia_avium_GCF_000583875",
    "Chlamydia_buteonis_GCF_900634605",
    "Chlamydia_caviae_GCF_000007605",
    "Chlamydia_felis_GCF_000009945",
    "Chlamydia_gallinacea_GCF_000471025",
    "Chlamydia_ibidis_GCF_000454725",
    "Chlamydia_muridarum_GCF_000006685",
    "Chlamydia_pecorum_GCF_000204135",
    "Chlamydia_pneumoniae_GCF_000008745",
    "Chlamydia_psittaci_GCF_000204255",
    "Chlamydia_sp_GCF_900239945",
    "Chlamydia_suis_GCF_900169085",
]

# the assembly that every other taxon is compared against
reference_taxon = "Chlamydia_trachomatis_GCF_000008725"

# every taxon in the group, with the reference appended last
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [11]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002895085.1_ASM289508v1_genomic.fna.gz

sent 42 bytes  received 341785 bytes  227884.67 bytes/sec
total size is 341591  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplina

GCA_000204255.1_ASM20425v1_genomic.fna.gz

sent 42 bytes  received 351224 bytes  234177.33 bytes/sec
total size is 351031  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900239945.1_Chlamydia_sp._nov._H15-1957-10C_genomic.fna.gz

sent 42 bytes  received 357880 bytes  238614.67 bytes/sec
total size is 357666  speedup is 1.00


You are accessing a U.S. Government information system which include

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [12]:
# Rebuild results/phyluce/<group>/cleaned_genomes: for every taxon write a FASTA
# whose records have their name/description blanked, then convert that cleaned
# FASTA to 2bit.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # plain "r" instead of "rU": the "U" flag was removed in Python 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, "fasta"):
            # blank everything except the record id before writing the record back out
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # Build the 2bit from the *cleaned* FASTA (the original passed the raw download,
    # although the output is named "_formatted.2bit"). The arguments are passed as a
    # list so paths are never re-split on spaces, and check_call raises
    # CalledProcessError instead of silently ignoring a failed conversion.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 10x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [13]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482630 ("sim_Chlamydia_abortus_GCF_002895085") has been submitted
Your job 5482631 ("sim_Chlamydia_avium_GCF_000583875") has been submitted
Your job 5482632 ("sim_Chlamydia_buteonis_GCF_900634605") has been submitted
Your job 5482633 ("sim_Chlamydia_caviae_GCF_000007605") has been submitted
Your job 5482634 ("sim_Chlamydia_felis_GCF_000009945") has been submitted
Your job 5482635 ("sim_Chlamydia_gallinacea_GCF_000471025") has been submitted
Your job 5482636 ("sim_Chlamydia_ibidis_GCF_000454725") has been submitted
Your job 5482637 ("sim_Chlamydia_muridarum_GCF_000006685") has been submitted
Your job 5482638 ("sim_Chlamydia_pecorum_GCF_000204135") has been submitted
Your job 5482639 ("sim_Chlamydia_pneumoniae_GCF_000008745") has been submitted
Your job 5482640 ("sim_Chlamydia_psittaci_GCF_000204255") has been submitted
Your job 5482641 ("sim_Chlamydia_sp_GCF_900239945") has been submitted
Your job 5482642 ("sim_Chlamydia_suis_GCF_900169085") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [14]:
# start the mapping directory (and its logs/ folder) from scratch
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# build the bbmap index of the reference genome inside the mapping directory
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# left as the final expression so the cell displays bbmap's return code
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/chlamydia/cleaned_genomes/Chlamydia_trachomatis_GCF_000008725_formatted.fas path=results/phyluce/chlamydia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 15% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [15]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482643 ("map_Chlamydia_abortus_GCF_002895085") has been submitted
Your job 5482644 ("map_Chlamydia_avium_GCF_000583875") has been submitted
Your job 5482645 ("map_Chlamydia_buteonis_GCF_900634605") has been submitted
Your job 5482646 ("map_Chlamydia_caviae_GCF_000007605") has been submitted
Your job 5482647 ("map_Chlamydia_felis_GCF_000009945") has been submitted
Your job 5482648 ("map_Chlamydia_gallinacea_GCF_000471025") has been submitted
Your job 5482649 ("map_Chlamydia_ibidis_GCF_000454725") has been submitted
Your job 5482650 ("map_Chlamydia_muridarum_GCF_000006685") has been submitted
Your job 5482651 ("map_Chlamydia_pecorum_GCF_000204135") has been submitted
Your job 5482652 ("map_Chlamydia_pneumoniae_GCF_000008745") has been submitted
Your job 5482653 ("map_Chlamydia_psittaci_GCF_000204255") has been submitted
Your job 5482654 ("map_Chlamydia_sp_GCF_900239945") has been submitted
Your job 5482655 ("map_Chlamydia_suis_GCF_900169085") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [16]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482656 ("merge_Chlamydia_abortus_GCF_002895085") has been submitted
Your job 5482657 ("merge_Chlamydia_avium_GCF_000583875") has been submitted
Your job 5482658 ("merge_Chlamydia_buteonis_GCF_900634605") has been submitted
Your job 5482659 ("merge_Chlamydia_caviae_GCF_000007605") has been submitted
Your job 5482660 ("merge_Chlamydia_felis_GCF_000009945") has been submitted
Your job 5482661 ("merge_Chlamydia_gallinacea_GCF_000471025") has been submitted
Your job 5482662 ("merge_Chlamydia_ibidis_GCF_000454725") has been submitted
Your job 5482663 ("merge_Chlamydia_muridarum_GCF_000006685") has been submitted
Your job 5482664 ("merge_Chlamydia_pecorum_GCF_000204135") has been submitted
Your job 5482665 ("merge_Chlamydia_pneumoniae_GCF_000008745") has been submitted
Your job 5482666 ("merge_Chlamydia_psittaci_GCF_000204255") has been submitted
Your job 5482667 ("merge_Chlamydia_sp_GCF_900239945") has been submitted
Your job 5482668 ("merge_Chlamydia_suis_GCF_900169085") has been 

Remove loci that were masked in the original genome.

In [17]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 49 sequences from Chlamydia_abortus_GCF_002895085_merged.bed.  Filtered 39 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10.
Screened 33 sequences from Chlamydia_avium_GCF_000583875_merged.bed.  Filtered 21 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 12.
Screened 47 sequences from Chlamydia_buteonis_GCF_900634605_merged.bed.  Filtered 37 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10.
Screened 45 sequences from Chlamydia_caviae_GCF_000007605_merged.bed.  Filtered 34 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 11.
Screened 40 sequences from Chlamydia_felis_GCF_000009945_merged.bed.  Filtered 27 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 13.
Screened 36 sequences from Chlamydia_gallinacea_GCF_000471025_merged.bed.  Filtered 24 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 12.
Screened 40 sequences from Chlamydia_ibidis_GCF_000454725_merged.bed.  Filtered 30 with > 25.0%

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [18]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# a "[beds]" header followed by one "<taxon>:<stripped bed path>" line per taxon
conf_lines = ["[beds]\n"]
conf_lines.extend("{0}:{1}{0}_stripped.bed\n".format(sample, o_dir) for sample in group_taxa)

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [19]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/chlamydia/initial_intervals/Chlamydia_suis_GCF_900169085_merged.bed --twobit results/phyluce/chlamydia/cleaned_genomes/Chlamydia_trachomatis_GCF_000008725_formatted.2bit --output results/phyluce/chlamydia/initial_intervals/Chlamydia_suis_GCF_900169085_stripped.bed;
chlamydia_abortus_gcf_002895085.
chlamydia_avium_gcf_000583875.
chlamydia_buteonis_gcf_900634605.
chlamydia_caviae_gcf_000007605.
chlamydia_felis_gcf_000009945.
chlamydia_gallinacea_gcf_000471025.
chlamydia_ibidis_gcf_000454725.
chlamydia_muridarum_gcf_000006685.
chlamydia_pecorum_gcf_000204135.
chlamydia_pneumoniae_gcf_000008745.
chlamydia_psittaci_gcf_000204255.
chlamydia_sp_gcf_900239945.
chlamydia_suis_gcf_900169085.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [20]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/chlamydia/initial_intervals/Chlamydia_suis_GCF_900169085_merged.bed --twobit results/phyluce/chlamydia/cleaned_genomes/Chlamydia_trachomatis_GCF_000008725_formatted.2bit --output results/phyluce/chlamydia/initial_intervals/Chlamydia_suis_GCF_900169085_stripped.bed;
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 0 taxa:	405.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 1 taxa:	405.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 2 taxa:	231.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 3 taxa:	14.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 4 taxa:	13.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 5 taxa:	12.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 6 taxa:	12.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 7 taxa:	10.0
Loci shared by Chlamydia_trachomatis_GCF_000008725 + 8 taxa:	10.0
Loci shared by Chlamydia_trachomatis_G

In [21]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 2
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/chlamydia/initial_intervals/chlamydia-to-Chlamydia_trachomatis_GCF_000008725.sqlite --base-taxon Chlamydia_trachomatis_GCF_000008725 --output results/phyluce/chlamydia/initial_intervals/Chlamydia_trachomatis_GCF_000008725_+2.bed --specific-counts 2;
Counter({'chlamydia_muridarum_gcf_000006685': 230, 'chlamydia_suis_gcf_900169085': 229, 'chlamydia_felis_gcf_000009945': 12, 'chlamydia_psittaci_gcf_000204255': 12, 'chlamydia_avium_gcf_000583875': 11, 'chlamydia_gallinacea_gcf_000471025': 10, 'chlamydia_caviae_gcf_000007605': 10, 'chlamydia_sp_gcf_900239945': 10, 'chlamydia_abortus_gcf_002895085': 10, 'chlamydia_pecorum_gcf_000204135': 10, 'chlamydia_ibidis_gcf_000454725': 10, 'chlamydia_buteonis_gcf_900634605': 9, 'chlamydia_pneumoniae_gcf_000008745': 6})


## Design temp set of baits

In [22]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/chlamydia/initial_intervals/Chlamydia_trachomatis_GCF_000008725_+2.bed --twobit results/phyluce/chlamydia/cleaned_genomes/Chlamydia_trachomatis_GCF_000008725_formatted.2bit --buffer-to 160 --output results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2.fasta;
Screened 231 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 231.


Design the baits.

In [23]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2.fasta --probe-prefix uce_chlamydia_ --design chlamydia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGG


Conserved locus count = 227
Probe Count = 446


## Find duplicate baited regions

In [24]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas --query results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  09:29:01
Ended:  Fri Feb 07, 2020  09:29:01
Time for execution:  0.00403754711151 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas                        --lastz results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes_vself.lastz                       --probe-prefix=uce_chlamydia_;
Parsing lastz file...
Screening results...
Screened 445 fasta sequences.  Filtered 4 duplicates. Kept 438.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [25]:
# Give every genome its own sub-directory that holds a copy of its 2bit file:
#   <cleaned_genomes>/<sample>_formatted.2bit -> <cleaned_genomes>/<sample>/<sample>.2bit
# (the file is copied, not soft-linked)
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)
    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-sample directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [26]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/chlamydia/validate_intervals/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas --scaffoldlist Chlamydia_abortus_GCF_002895085 Chlamydia_avium_GCF_000583875 Chlamydia_buteonis_GCF_900634605 Chlamydia_caviae_GCF_000007605 Chlamydia_felis_GCF_000009945 Chlamydia_gallinacea_GCF_000471025 Chlamydia_ibidis_GCF_000454725 Chlamydia_muridarum_GCF_000006685 Chlamydia_pecorum_GCF_000204135 Chlamydia_pneumoniae_GCF_000008745 Chlamydia_psittaci_GCF_000204255 Chlamydia_sp_GCF_900239945 Chlamydia_suis_GCF_900169085 Chlamydia_trachomatis_GCF_000008725 --genome-base-path results/phyluce/chlamydia/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/chlamydia/validate_intervals/chlamydia-to-Chlamydia_trachomatis_GCF_000008725.sqlite --output results/phyluce/chlamydia/validate_intervals/lastz/;

Running against Chlamydia_abortus_GCF_002895085.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running t

	/tmp/tmpBJesKG.fasta

Writing the results file...
	/tmp/tmpfT00xj.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/chlamydia/validate_intervals/lastz/Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas_v_Chlamydia_trachomatis_GCF_000008725.lastz
Creating Chlamydia_trachomatis_GCF_000008725 table
Inserting data to Chlamydia_trachomatis_GCF_000008725 table


## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [27]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/chlamydia/extract_probes_from_group/chlamydia_genome.conf --lastz results/phyluce/chlamydia/validate_intervals/lastz --probes 120 --probe-prefix uce_chlamydia_ --name-pattern "Chlamydia_trachomatis_GCF_000008725_+2_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/chlamydia/extract_probes_from_group/probe_fasta;
2020-02-07 09:29:20,339 - Phyluce - INFO - ------- Working on Chlamydia_abortus_GCF_002895085 genome -------
2020-02-07 09:29:20,340 - Phyluce - INFO - Reading Chlamydia_abortus_GCF_002895085 genome
2020-02-07 09:29:21,004 - Phyluce - INFO - Chlamydia_abortus_GCF_002895085: 208 uces, 0 dupes, 208 non-dupes, 2 orient drop, 0 length drop, 206 written
2020-02-07 09:29:21,004 - Phyluce - INFO - -------- Working on Chlamydia_avium_GCF_000583875 genome --------
2020-02-07 09:29:21,005 - Phyluce - INFO - Reading Chlamydia_avium_GCF_000583875 genome
2020-02-07 09:29:21,589 - Phyluce - INFO - Chlamydia_avium_GCF_0

In [28]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/chlamydia/extract_probes_from_group/probe_fasta --output results/phyluce/chlamydia/extract_probes_from_group/multifastas.sqlite --base-taxon Chlamydia_trachomatis_GCF_000008725;
chlamydia_abortus_gcf_002895085.
chlamydia_avium_gcf_000583875.
chlamydia_buteonis_gcf_900634605.
chlamydia_caviae_gcf_000007605.
chlamydia_felis_gcf_000009945.
chlamydia_gallinacea_gcf_000471025.
chlamydia_ibidis_gcf_000454725.
chlamydia_muridarum_gcf_000006685.
chlamydia_pecorum_gcf_000204135.
chlamydia_pneumoniae_gcf_000008745.
chlamydia_psittaci_gcf_000204255.
chlamydia_sp_gcf_900239945.
chlamydia_suis_gcf_900169085.
chlamydia_trachomatis_gcf_000008725.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/chlamydia/extract_probes_from_group/multifastas.sqlite --base-taxon Chlamydia_trachomatis_GCF_000008725;
Loci shared by 0 taxa:	226.0
Loci shared by 1 taxa:	226.0
Loci shared by 2 taxa:	226.0
Loci shared 

In [29]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(14)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/chlamydia/extract_probes_from_group/multifastas.sqlite  --base-taxon Chlamydia_trachomatis_GCF_000008725 --output results/phyluce/chlamydia/extract_probes_from_group/Chlamydia_trachomatis_GCF_000008725+2-back-to-14.conf --specific-counts 14;
Counter({'chlamydia_buteonis_gcf_900634605': 156, 'chlamydia_avium_gcf_000583875': 156, 'chlamydia_gallinacea_gcf_000471025': 156, 'chlamydia_felis_gcf_000009945': 156, 'chlamydia_caviae_gcf_000007605': 156, 'chlamydia_sp_gcf_900239945': 156, 'chlamydia_pneumoniae_gcf_000008745': 156, 'chlamydia_trachomatis_gcf_000008725': 156, 'chlamydia_psittaci_gcf_000204255': 156, 'chlamydia_suis_gcf_900169085': 156, 'chlamydia_abortus_gcf_002895085': 156, 'chlamydia_pecorum_gcf_000204135': 156, 'chlamydia_ibidis_gcf_000454725': 156, 'chlamydia_muridarum_gcf_000006685': 156})
Total loci = 156


## Final group specific bait design

In [30]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/chlamydia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/chlamydia/extract_probes_from_group/Chlamydia_trachomatis_GCF_000008725+2-back-to-14.conf --probe-prefix uce_chlamydia_ --designer rnplattii --design chlamydia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 156
Probe Count = 4238


In [31]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.fasta --query results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  09:29:43
Ended:  Fri Feb 07, 2020  09:29:49
Time for execution:  0.0977126995722 minutes


In [32]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.fasta --lastz results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_chlamydia_;
Parsing lastz file...
Screening results...
Screened 4237 fasta sequences.  Filtered 2 duplicates. Kept 4182.


## CDhit to reduce numbers

In [33]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.fasta
         -o
         results/phyluce/chlamydia/final_probe_design/chlamydia_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 09:31:56 2020
                            Output                              
----------------------------------------------------------------
total seq: 4238
longest and shortest : 80 and 80
Total letters: 339040
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3957771
Max number of word counting entries: 89089442

# comparing sequences from          0  to        706
---------- new table with      575 representatives
# comparing sequences from        706  

# Coxiella burnetii

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [34]:
# name of the group being processed; the data/ and results/ paths below are built from it
group = "coxiella"

In [35]:
# Throw away any previous results for this group and start again with an
# empty results directory.
if os.path.exists(os.path.join("results/phyluce", group)):
    shutil.rmtree(os.path.join("results/phyluce", group))

os.makedirs(os.path.join("results/phyluce", group))

In [36]:
# Assembly accessions of every Coxiella burnetii genome in the group; the
# shared "Coxiella_burnetii_" prefix is added below.
_coxiella_accessions = (
    "GCF_000017105",
    "GCF_001572745",
    "GCF_004006195",
    "GCF_000019885",
    "GCF_000168875",
    "GCF_000018745",
    "GCF_000019865",
    "GCF_000367725",
    "GCF_004087735",
    "GCF_002109345",
    "GCF_002094935",
    "GCF_005280755",
    "GCF_002634005",
    "GCF_002633985",
    "GCF_002633965",
    "GCF_002633945",
    "GCF_002633885",
    "GCF_002634025",
    "GCF_002634045",
    "GCF_000767035",
    "GCF_002634065",
    "GCF_002633925",
    "GCF_002633905",
    "GCF_001572765",
    "GCF_002634085",
    "GCF_000612785",
    "GCF_000826165",
    "GCF_000820465",
    "GCF_000967075",
    "GCF_000470495",
    "GCF_002247545",
    "GCF_002247155",
    "GCF_002896835",
    "GCF_000613025",
    "GCF_000723305",
    "GCF_002924395",
    "GCF_002247185",
    "GCF_002896735",
    "GCF_002896755",
    "GCF_002896775",
    "GCF_002247265",
    "GCF_002247205",
    "GCF_002924305",
    "GCF_002924345",
    "GCF_002924425",
    "GCF_000723245",
    "GCF_002247285",
    "GCF_002247305",
    "GCF_002247335",
    "GCF_002249845",
    "GCF_002924325",
    "GCF_003849965",
    "GCF_002247225",
    "GCF_002896795",
    "GCF_002924385",
    "GCF_000751935",
    "GCF_000820825",
    "GCF_000756325",
    "GCF_002591355",
    "GCF_000169495",
    "GCF_000723285",
    "GCF_002896815",
    "GCF_003849885",
    "GCF_003849875",
    "GCF_003849785",
    "GCF_000723265",
    "GCF_000300315",
    "GCA_000359545",
)

# taxon labels follow the Genus_species_ACCESSION convention
group_taxa = list(map("Coxiella_burnetii_{}".format, _coxiella_accessions))

# the genome that all the others are compared to
reference_taxon = "Coxiella_burnetii_GCF_000007765"

# every genome handled by the pipeline: the group members plus the reference
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [37]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000007765.2_ASM776v2_genomic.fna.gz

sent 42 bytes  received 605169 bytes  242084.40 bytes/sec
total size is 604914  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary 

GCA_002109345.1_ASM210934v1_genomic.fna.gz

sent 42 bytes  received 597412 bytes  238981.60 bytes/sec
total size is 597154  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002094935.1_ASM209493v1_genomic.fna.gz

sent 42 bytes  received 597419 bytes  398307.33 bytes/sec
total size is 597161  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_002634065.1_ASM263406v1_genomic.fna.gz

sent 42 bytes  received 608536 bytes  1217156.00 bytes/sec
total size is 608278  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002633925.1_ASM263392v1_genomic.fna.gz

sent 42 bytes  received 601647 bytes  1203378.00 bytes/sec
total size is 601389  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, 

GCA_002247155.1_ASM224715v1_genomic.fna.gz

sent 42 bytes  received 657149 bytes  438127.33 bytes/sec
total size is 656875  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002896835.1_ASM289683v1_genomic.fna.gz

sent 42 bytes  received 602458 bytes  401666.67 bytes/sec
total size is 602200  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_002924305.1_ASM292430v1_genomic.fna.gz

sent 42 bytes  received 601463 bytes  401003.33 bytes/sec
total size is 601205  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002924345.1_ASM292434v1_genomic.fna.gz

sent 42 bytes  received 601623 bytes  401110.00 bytes/sec
total size is 601365  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_002896795.1_ASM289679v1_genomic.fna.gz

sent 42 bytes  received 597094 bytes  398090.67 bytes/sec
total size is 596836  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002924385.1_ASM292438v1_genomic.fna.gz

sent 42 bytes  received 601801 bytes  401228.67 bytes/sec
total size is 601543  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_003849785.1_ASM384978v1_genomic.fna.gz

sent 42 bytes  received 599846 bytes  399925.33 bytes/sec
total size is 599588  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000723265.1_Cb-O184_genomic.fna.gz

sent 42 bytes  received 645790 bytes  430554.67 bytes/sec
total size is 645528  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, networ

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [38]:
# Rebuild the cleaned-genome directory from scratch, then for every genome
# write a FASTA with simplified headers and convert it to 2bit.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Re-write every record with its name and description blanked so that
    # only the record id is left for the header line.
    # Plain "r" is used because the "U" mode flag is deprecated and is
    # rejected by Python 3.11+.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the "_formatted.2bit" file from the cleaned FASTA written above
    # (not from the raw download), and stop with an error if faToTwoBit
    # fails instead of silently leaving a missing or partial 2bit behind.
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    subprocess.check_call(f2bit_cmd.split(" "))

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` in the cell below) because higher coverage should provide finer resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [39]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482669 ("sim_Coxiella_burnetii_GCF_000017105") has been submitted
Your job 5482670 ("sim_Coxiella_burnetii_GCF_001572745") has been submitted
Your job 5482671 ("sim_Coxiella_burnetii_GCF_004006195") has been submitted
Your job 5482672 ("sim_Coxiella_burnetii_GCF_000019885") has been submitted
Your job 5482673 ("sim_Coxiella_burnetii_GCF_000168875") has been submitted
Your job 5482674 ("sim_Coxiella_burnetii_GCF_000018745") has been submitted
Your job 5482675 ("sim_Coxiella_burnetii_GCF_000019865") has been submitted
Your job 5482676 ("sim_Coxiella_burnetii_GCF_000367725") has been submitted
Your job 5482677 ("sim_Coxiella_burnetii_GCF_004087735") has been submitted
Your job 5482678 ("sim_Coxiella_burnetii_GCF_002109345") has been submitted
Your job 5482679 ("sim_Coxiella_burnetii_GCF_002094935") has been submitted
Your job 5482680 ("sim_Coxiella_burnetii_GCF_005280755") has been submitted
Your job 5482681 ("sim_Coxiella_burnetii_GCF_002634005") has been submitted
Your job 548

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [40]:
#input reference genome and the directory that will hold the bbmap index
i_ref_genome = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.fas"
o_dir        = "results/phyluce/" + group + "/map_simulated_reads/"

#start from an empty mapping directory so nothing from a previous run is left behind;
#  the logs/ sub-directory is created at the same time
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir + "logs")

#generate a bbmap index of the reference genome
#  the arguments are kept as a list so a path containing a space is not split apart
bbmap_args = ["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir]
bbmap_cmd  = " ".join(bbmap_args)

print(bbmap_cmd)

#check_call returns 0 on success and raises CalledProcessError on a non-zero exit,
#  so a failed indexing run stops the notebook here instead of being ignored
subprocess.check_call(bbmap_args)

bbmap.sh ref=results/phyluce/coxiella/cleaned_genomes/Coxiella_burnetii_GCF_000007765_formatted.fas path=results/phyluce/coxiella/map_simulated_reads/


0

Now we can use bbmap to align the simulated reads from every taxon to the reference genome. In this case we allow up to 10% divergence (`minid=0.90`) between the reference genome and each simulated read. This threshold may be adjusted for other groups: a smaller value is typical, but given the diversity of some groups a larger value may identify more orthologous regions.

In [41]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482737 ("map_Coxiella_burnetii_GCF_000017105") has been submitted
Your job 5482738 ("map_Coxiella_burnetii_GCF_001572745") has been submitted
Your job 5482739 ("map_Coxiella_burnetii_GCF_004006195") has been submitted
Your job 5482740 ("map_Coxiella_burnetii_GCF_000019885") has been submitted
Your job 5482741 ("map_Coxiella_burnetii_GCF_000168875") has been submitted
Your job 5482742 ("map_Coxiella_burnetii_GCF_000018745") has been submitted
Your job 5482743 ("map_Coxiella_burnetii_GCF_000019865") has been submitted
Your job 5482744 ("map_Coxiella_burnetii_GCF_000367725") has been submitted
Your job 5482745 ("map_Coxiella_burnetii_GCF_004087735") has been submitted
Your job 5482746 ("map_Coxiella_burnetii_GCF_002109345") has been submitted
Your job 5482747 ("map_Coxiella_burnetii_GCF_002094935") has been submitted
Your job 5482748 ("map_Coxiella_burnetii_GCF_005280755") has been submitted
Your job 5482749 ("map_Coxiella_burnetii_GCF_002634005") has been submitted
Your job 548

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [42]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482805 ("merge_Coxiella_burnetii_GCF_000017105") has been submitted
Your job 5482806 ("merge_Coxiella_burnetii_GCF_001572745") has been submitted
Your job 5482807 ("merge_Coxiella_burnetii_GCF_004006195") has been submitted
Your job 5482808 ("merge_Coxiella_burnetii_GCF_000019885") has been submitted
Your job 5482809 ("merge_Coxiella_burnetii_GCF_000168875") has been submitted
Your job 5482810 ("merge_Coxiella_burnetii_GCF_000018745") has been submitted
Your job 5482811 ("merge_Coxiella_burnetii_GCF_000019865") has been submitted
Your job 5482812 ("merge_Coxiella_burnetii_GCF_000367725") has been submitted
Your job 5482813 ("merge_Coxiella_burnetii_GCF_004087735") has been submitted
Your job 5482814 ("merge_Coxiella_burnetii_GCF_002109345") has been submitted
Your job 5482815 ("merge_Coxiella_burnetii_GCF_002094935") has been submitted
Your job 5482816 ("merge_Coxiella_burnetii_GCF_005280755") has been submitted
Your job 5482817 ("merge_Coxiella_burnetii_GCF_002634005") has b

Remove loci that were masked in the original genome.

In [43]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 109 sequences from Coxiella_burnetii_GCF_000017105_merged.bed.  Filtered 2 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 107.
Screened 130 sequences from Coxiella_burnetii_GCF_001572745_merged.bed.  Filtered 4 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 126.
Screened 140 sequences from Coxiella_burnetii_GCF_004006195_merged.bed.  Filtered 5 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 135.
Screened 142 sequences from Coxiella_burnetii_GCF_000019885_merged.bed.  Filtered 8 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 134.
Screened 130 sequences from Coxiella_burnetii_GCF_000168875_merged.bed.  Filtered 8 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 122.
Screened 102 sequences from Coxiella_burnetii_GCF_000018745_merged.bed.  Filtered 6 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 96.
Screened 122 sequences from Coxiella_burnetii_GCF_000019865_merged.bed.  Filtered 4 with 

Screened 153 sequences from Coxiella_burnetii_GCF_000751935_merged.bed.  Filtered 5 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 148.
Screened 152 sequences from Coxiella_burnetii_GCF_000820825_merged.bed.  Filtered 5 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 147.
Screened 140 sequences from Coxiella_burnetii_GCF_000756325_merged.bed.  Filtered 8 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 132.
Screened 137 sequences from Coxiella_burnetii_GCF_002591355_merged.bed.  Filtered 4 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 133.
Screened 214 sequences from Coxiella_burnetii_GCF_000169495_merged.bed.  Filtered 16 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 198.
Screened 130 sequences from Coxiella_burnetii_GCF_000723285_merged.bed.  Filtered 6 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 124.
Screened 131 sequences from Coxiella_burnetii_GCF_002896815_merged.bed.  Filtered 3 wit

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that lists where the stripped bed files (from above) are stored for each taxon:

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [44]:
#conf file listing the stripped bed file of every sample
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

#collect the lines first: the "[beds]" header, then one "<sample>:<bed>" entry per sample
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

#...then write them out in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [45]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCA_000359545_merged.bed --twobit results/phyluce/coxiella/cleaned_genomes/Coxiella_burnetii_GCF_000007765_formatted.2bit --output results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCA_000359545_stripped.bed;
coxiella_burnetii_gcf_000017105.
coxiella_burnetii_gcf_001572745.
coxiella_burnetii_gcf_004006195.
coxiella_burnetii_gcf_000019885.
coxiella_burnetii_gcf_000168875.
coxiella_burnetii_gcf_000018745.
coxiella_burnetii_gcf_000019865.
coxiella_burnetii_gcf_000367725.
coxiella_burnetii_gcf_004087735.
coxiella_burnetii_gcf_002109345.
coxiella_burnetii_gcf_002094935.
coxiella_burnetii_gcf_005280755.
coxiella_burnetii_gcf_002634005.
coxiella_burnetii_gcf_002633985.
coxiella_burnetii_gcf_002633965.
coxiella_burnetii_gcf_002633945.
coxiella_burnetii_gcf_002633885.
coxiella_burnetii_gcf_002634025.
coxiella_burnetii_gcf_002634045.
coxiella_

Quantify probes and the number of targeted taxa for each.

In [46]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCA_000359545_merged.bed --twobit results/phyluce/coxiella/cleaned_genomes/Coxiella_burnetii_GCF_000007765_formatted.2bit --output results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCA_000359545_stripped.bed;
Loci shared by Coxiella_burnetii_GCF_000007765 + 0 taxa:	114.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 1 taxa:	114.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 2 taxa:	113.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 3 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 4 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 5 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 6 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 7 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 8 taxa:	111.0
Loci shared by Coxiella_burnetii_GCF_000007765 + 9 taxa:	111.0
Loci s

In [47]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 68
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/coxiella/initial_intervals/coxiella-to-Coxiella_burnetii_GCF_000007765.sqlite --base-taxon Coxiella_burnetii_GCF_000007765 --output results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCF_000007765_+68.bed --specific-counts 68;
Counter({'coxiella_burnetii_gcf_005280755': 91, 'coxiella_burnetii_gcf_000169495': 91, 'coxiella_burnetii_gcf_001572765': 91, 'coxiella_burnetii_gcf_002094935': 91, 'coxiella_burnetii_gcf_003849885': 91, 'coxiella_burnetii_gcf_000470495': 91, 'coxiella_burnetii_gcf_002247185': 91, 'coxiella_burnetii_gcf_002924395': 91, 'coxiella_burnetii_gcf_004087735': 91, 'coxiella_burnetii_gcf_002247155': 91, 'coxiella_burnetii_gcf_000723265': 91, 'coxiella_burnetii_gcf_001572745': 91, 'coxiella_burnetii_gcf_002247285': 91, 'coxiella_burnetii_gcf_002924385': 91, 'coxiella_burnetii_gcf_000300315': 91, 'coxiella_burnetii_gcf_002633925': 91, 'coxiella_burnetii_gcf_000767035': 91, 'coxiella_burnetii_gcf_002896835

## Design temp set of baits

In [48]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/coxiella/initial_intervals/Coxiella_burnetii_GCF_000007765_+68.bed --twobit results/phyluce/coxiella/cleaned_genomes/Coxiella_burnetii_GCF_000007765_formatted.2bit --buffer-to 160 --output results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68.fasta;
Screened 91 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 91.


Design the baits.

In [49]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68.fasta --probe-prefix uce_coxiella_ --design coxiella_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGG


Conserved locus count = 89
Probe Count = 175


## Find duplicate baited regions

In [50]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas --query results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  09:44:05
Ended:  Fri Feb 07, 2020  09:44:06
Time for execution:  0.0123950004578 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas                        --lastz results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes_vself.lastz                       --probe-prefix=uce_coxiella_;
Parsing lastz file...
Screening results...
Screened 174 fasta sequences.  Filtered 0 duplicates. Kept 175.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [51]:
#give every genome its own directory, laid out as <sample>/<sample>.2bit
#  note: the 2bit files are copied into place (not soft linked)
for sample in all_taxa:

    genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    #rebuild the per-sample directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    shutil.copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [52]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/coxiella/validate_intervals/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas --scaffoldlist Coxiella_burnetii_GCF_000017105 Coxiella_burnetii_GCF_001572745 Coxiella_burnetii_GCF_004006195 Coxiella_burnetii_GCF_000019885 Coxiella_burnetii_GCF_000168875 Coxiella_burnetii_GCF_000018745 Coxiella_burnetii_GCF_000019865 Coxiella_burnetii_GCF_000367725 Coxiella_burnetii_GCF_004087735 Coxiella_burnetii_GCF_002109345 Coxiella_burnetii_GCF_002094935 Coxiella_burnetii_GCF_005280755 Coxiella_burnetii_GCF_002634005 Coxiella_burnetii_GCF_002633985 Coxiella_burnetii_GCF_002633965 Coxiella_burnetii_GCF_002633945 Coxiella_burnetii_GCF_002633885 Coxiella_burnetii_GCF_002634025 Coxiella_burnetii_GCF_002634045 Coxiella_burnetii_GCF_000767035 Coxiella_burnetii_GCF_002634065 Coxiella_burnetii_GCF_002633925 Coxiella_burnetii_GCF_002633905 Coxiella_burnetii_GCF_001572765 Coxiella_burnetii_GCF_002634085 Coxiella_burnetii_GCF_000612785 Coxi

	/tmp/tmpvgAQid.fasta

Writing the results file...
	/tmp/tmpgUZm07.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_002094935.lastz
Creating Coxiella_burnetii_GCF_002094935 table
Inserting data to Coxiella_burnetii_GCF_002094935 table

Running against Coxiella_burnetii_GCF_005280755.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvHw9bH.fasta

Writing the results file...
	/tmp/tmpuqRDF4.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_005280755.lastz
Creating Coxiella_burnetii_GCF_005280755 table
Inserting data to Coxiella_burnetii_GCF_005280755 table

Running against Coxiella_burnetii_GCF_002634005.2bit
R

Running the targets against 1 queries...
	/tmp/tmpWAaJv6.fasta

Writing the results file...
	/tmp/tmpCTHE6q.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_000612785.lastz
Creating Coxiella_burnetii_GCF_000612785 table
Inserting data to Coxiella_burnetii_GCF_000612785 table

Running against Coxiella_burnetii_GCF_000826165.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpQE1G5J.fasta

Writing the results file...
	/tmp/tmp8kxSfw.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_000826165.lastz
Creating Coxiella_burnetii_GCF_000826165 table
Inserting data to Coxiella_burnetii_GCF_000826165 table

Running again

Running the targets against 1 queries...
	/tmp/tmpxk3WK5.fasta

Writing the results file...
	/tmp/tmpOi0BgA.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_002247265.lastz
Creating Coxiella_burnetii_GCF_002247265 table
Inserting data to Coxiella_burnetii_GCF_002247265 table

Running against Coxiella_burnetii_GCF_002247205.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpX_dLCw.fasta

Writing the results file...
	/tmp/tmpPPXQEV.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_002247205.lastz
Creating Coxiella_burnetii_GCF_002247205 table
Inserting data to Coxiella_burnetii_GCF_002247205 table

Running again

	/tmp/tmpILbpUy.fasta

Writing the results file...
	/tmp/tmpJhQNPv.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_000751935.lastz
Creating Coxiella_burnetii_GCF_000751935 table
Inserting data to Coxiella_burnetii_GCF_000751935 table

Running against Coxiella_burnetii_GCF_000820825.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpPlHg7d.fasta

Writing the results file...
	/tmp/tmpbd_v7e.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/coxiella/validate_intervals/lastz/Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_Coxiella_burnetii_GCF_000820825.lastz
Creating Coxiella_burnetii_GCF_000820825 table
Inserting data to Coxiella_burnetii_GCF_000820825 table

Running against Coxiella_burnetii_GCF_000756325.2bit
R

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [53]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/coxiella/extract_probes_from_group/coxiella_genome.conf --lastz results/phyluce/coxiella/validate_intervals/lastz --probes 120 --probe-prefix uce_coxiella_ --name-pattern "Coxiella_burnetii_GCF_000007765_+68_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/coxiella/extract_probes_from_group/probe_fasta;
2020-02-07 09:45:58,473 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_000017105 genome -------
2020-02-07 09:45:58,474 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_000017105 genome
2020-02-07 09:45:59,004 - Phyluce - INFO - Coxiella_burnetii_GCF_000017105: 89 uces, 0 dupes, 89 non-dupes, 2 orient drop, 0 length drop, 87 written
2020-02-07 09:45:59,005 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_001572745 genome -------
2020-02-07 09:45:59,006 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_001572745 genome
2020-02-07 09:45:59,303 - Phyluce - INFO - Coxiella_burnetii_GCF_00157274

2020-02-07 09:46:06,363 - Phyluce - INFO - Coxiella_burnetii_GCF_002633905: 87 uces, 0 dupes, 87 non-dupes, 2 orient drop, 0 length drop, 85 written
2020-02-07 09:46:06,363 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_001572765 genome -------
2020-02-07 09:46:06,364 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_001572765 genome
2020-02-07 09:46:06,664 - Phyluce - INFO - Coxiella_burnetii_GCF_001572765: 87 uces, 0 dupes, 87 non-dupes, 2 orient drop, 0 length drop, 85 written
2020-02-07 09:46:06,664 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_002634085 genome -------
2020-02-07 09:46:06,665 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_002634085 genome
2020-02-07 09:46:06,962 - Phyluce - INFO - Coxiella_burnetii_GCF_002634085: 87 uces, 0 dupes, 87 non-dupes, 0 orient drop, 1 length drop, 86 written
2020-02-07 09:46:06,963 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_000612785 genome -------
2020-02-07 09:46:06,963 - Phyluce - INFO - Rea

2020-02-07 09:46:14,683 - Phyluce - INFO - Coxiella_burnetii_GCF_002247285: 89 uces, 2 dupes, 87 non-dupes, 0 orient drop, 0 length drop, 87 written
2020-02-07 09:46:14,683 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_002247305 genome -------
2020-02-07 09:46:14,684 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_002247305 genome
2020-02-07 09:46:15,032 - Phyluce - INFO - Coxiella_burnetii_GCF_002247305: 89 uces, 2 dupes, 87 non-dupes, 0 orient drop, 0 length drop, 87 written
2020-02-07 09:46:15,032 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_002247335 genome -------
2020-02-07 09:46:15,033 - Phyluce - INFO - Reading Coxiella_burnetii_GCF_002247335 genome
2020-02-07 09:46:15,378 - Phyluce - INFO - Coxiella_burnetii_GCF_002247335: 89 uces, 2 dupes, 87 non-dupes, 0 orient drop, 0 length drop, 87 written
2020-02-07 09:46:15,378 - Phyluce - INFO - ------- Working on Coxiella_burnetii_GCF_002249845 genome -------
2020-02-07 09:46:15,379 - Phyluce - INFO - Rea

In [54]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/coxiella/extract_probes_from_group/probe_fasta --output results/phyluce/coxiella/extract_probes_from_group/multifastas.sqlite --base-taxon Coxiella_burnetii_GCF_000007765;
coxiella_burnetii_gcf_000017105.
coxiella_burnetii_gcf_001572745.
coxiella_burnetii_gcf_004006195.
coxiella_burnetii_gcf_000019885.
coxiella_burnetii_gcf_000168875.
coxiella_burnetii_gcf_000018745.
coxiella_burnetii_gcf_000019865.
coxiella_burnetii_gcf_000367725.
coxiella_burnetii_gcf_004087735.
coxiella_burnetii_gcf_002109345.
coxiella_burnetii_gcf_002094935.
coxiella_burnetii_gcf_005280755.
coxiella_burnetii_gcf_002634005.
coxiella_burnetii_gcf_002633985.
coxiella_burnetii_gcf_002633965.
coxiella_burnetii_gcf_002633945.
coxiella_burnetii_gcf_002633885.
coxiella_burnetii_gcf_002634025.
coxiella_burnetii_gcf_002634045.
coxiella_burnetii_gcf_000767035.
coxiella_burnetii_gcf_002634065.
coxiella_burnetii_gcf_002633925.
coxiella_burnetii_gcf_002633905.
coxiella

In [55]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(68)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/coxiella/extract_probes_from_group/multifastas.sqlite  --base-taxon Coxiella_burnetii_GCF_000007765 --output results/phyluce/coxiella/extract_probes_from_group/Coxiella_burnetii_GCF_000007765+68-back-to-68.conf --specific-counts 68;
Counter({'coxiella_burnetii_gcf_005280755': 81, 'coxiella_burnetii_gcf_001572765': 81, 'coxiella_burnetii_gcf_002094935': 81, 'coxiella_burnetii_gcf_003849885': 81, 'coxiella_burnetii_gcf_000470495': 81, 'coxiella_burnetii_gcf_002247185': 81, 'coxiella_burnetii_gcf_004087735': 81, 'coxiella_burnetii_gcf_000007765': 81, 'coxiella_burnetii_gcf_001572745': 81, 'coxiella_burnetii_gcf_002247285': 81, 'coxiella_burnetii_gcf_002924385': 81, 'coxiella_burnetii_gcf_002633925': 81, 'coxiella_burnetii_gcf_002634045': 81, 'coxiella_burnetii_gcf_002896835': 81, 'coxiella_burnetii_gca_000359545': 81, 'coxiella_burnetii_gcf_002634085': 81, 'coxiella_burnetii_gcf_000019865': 81, 'coxiella_burnetii_gcf_002249845': 8

## Final group specific bait design

In [56]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/coxiella/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/coxiella/extract_probes_from_group/Coxiella_burnetii_GCF_000007765+68-back-to-68.conf --probe-prefix uce_coxiella_ --designer rnplattii --design coxiella_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.fasta;
NNNNGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN


Conserved locus count = 81
Probe Count = 11076


In [57]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.fasta --query results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  09:47:11
Ended:  Fri Feb 07, 2020  09:48:29
Time for execution:  1.3013803641 minutes


In [58]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.fasta --lastz results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_coxiella_;
Parsing lastz file...
Screening results...
Screened 11075 fasta sequences.  Filtered 0 duplicates. Kept 11076.


## CDhit to reduce numbers

In [59]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.fasta
         -o
         results/phyluce/coxiella/final_probe_design/coxiella_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 10:21:16 2020
                            Output                              
----------------------------------------------------------------
total seq: 11076
longest and shortest : 80 and 80
Total letters: 886080
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 89M

Table limit with the given memory limit:
Max number of representatives: 3946150
Max number of word counting entries: 88827838

# comparing sequences from          0  to       1846
.---------- new table with       42 representatives
# comparing sequences from       1846  to

# Ehrlichia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (albeit genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [60]:
# name of the group processed by the cells below; used to build every data/results path
group = "ehrlichia"

In [61]:
# Recreate an empty results directory for this group, discarding any earlier run.
group_dir = "results/phyluce/" + group

if os.path.exists(group_dir):
    shutil.rmtree(group_dir)

os.makedirs(group_dir)

In [62]:
# genome that all the other taxa are compared to
reference_taxon = "Ehrlichia_canis_GCF_000012565"

# remaining genomes of the group, named Genus_species_<assembly accession>
group_taxa = [
    "Ehrlichia_chaffeensis_GCF_000013145",
    "Ehrlichia_minasensis_GCF_000825765",
    "Ehrlichia_muris_GCF_000508225",
    "Ehrlichia_ruminantium_GCF_000026005",
    "Ehrlichia_sp_GCF_000632845",
]

# every genome handled by the pipeline: the group members followed by the reference
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [63]:
#download the genome fasta for every accession listed for this group
#(only the fasta is fetched here; no gff is downloaded by this cell)
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
#the csv is read for its "genus", "species" and "accession" columns
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
#one [accession, ftp, original name, clean name] record per downloaded genome
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    #(IPython shell capture: `ftp` holds the lines printed by the pipeline)
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    #keep the first output line and drop the scheme so it can be used in an rsync:// url
    #NOTE(review): ftp[0] presumably raises IndexError if the lookup printed nothing - confirm
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    #the file name is the last component of the ftp path + "_genomic.fna.gz"
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_GCA_12345678 (the accession version suffix after "." is dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    #rename the downloaded file in place to the standardized name
    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000012565.1_ASM1256v1_genomic.fna.gz

sent 42 bytes  received 373614 bytes  149462.40 bytes/sec
total size is 373414  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [64]:
# Rewrite every genome with minimal fasta headers and convert it to 2bit.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# always start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # plain "r" instead of "rU": the "U" flag is rejected by Python >= 3.11
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                # blank the name/description before writing the record back out
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned fasta written above (previously the raw
    # download was converted, so the cleaned file was never the 2bit source).
    # Arguments are passed as a list so paths are not split on spaces, and
    # check_call raises if faToTwoBit fails instead of continuing silently.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` below) because, with 100 bp reads, it should provide roughly 10 bp resolution on conserved orthologous regions, and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [65]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482997 ("sim_Ehrlichia_chaffeensis_GCF_000013145") has been submitted
Your job 5482998 ("sim_Ehrlichia_minasensis_GCF_000825765") has been submitted
Your job 5482999 ("sim_Ehrlichia_muris_GCF_000508225") has been submitted
Your job 5483000 ("sim_Ehrlichia_ruminantium_GCF_000026005") has been submitted
Your job 5483001 ("sim_Ehrlichia_sp_GCF_000632845") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [66]:
# Build the bbmap index of the reference genome inside a fresh mapping directory.
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

# wipe any earlier run, then recreate the directory together with its logs/ folder
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference fasta to index and the directory the index is written to
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref={} path={}".format(i_ref_genome, o_dir)
bbmap_argv = bbmap_cmd.split(" ")

print(bbmap_cmd)
# run locally (not via the queue); the exit status is the cell's displayed value
subprocess.call(bbmap_argv)

bbmap.sh ref=results/phyluce/ehrlichia/cleaned_genomes/Ehrlichia_canis_GCF_000012565_formatted.fas path=results/phyluce/ehrlichia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence between the reference genome and simulated read (`minid=0.90` below).  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [67]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483002 ("map_Ehrlichia_chaffeensis_GCF_000013145") has been submitted
Your job 5483003 ("map_Ehrlichia_minasensis_GCF_000825765") has been submitted
Your job 5483004 ("map_Ehrlichia_muris_GCF_000508225") has been submitted
Your job 5483005 ("map_Ehrlichia_ruminantium_GCF_000026005") has been submitted
Your job 5483006 ("map_Ehrlichia_sp_GCF_000632845") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [68]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483007 ("merge_Ehrlichia_chaffeensis_GCF_000013145") has been submitted
Your job 5483008 ("merge_Ehrlichia_minasensis_GCF_000825765") has been submitted
Your job 5483009 ("merge_Ehrlichia_muris_GCF_000508225") has been submitted
Your job 5483010 ("merge_Ehrlichia_ruminantium_GCF_000026005") has been submitted
Your job 5483011 ("merge_Ehrlichia_sp_GCF_000632845") has been submitted


Remove loci that were masked in the original genome.

In [69]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 1367 sequences from Ehrlichia_chaffeensis_GCF_000013145_merged.bed.  Filtered 697 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 670.
Screened 1144 sequences from Ehrlichia_minasensis_GCF_000825765_merged.bed.  Filtered 254 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 890.
Screened 1479 sequences from Ehrlichia_muris_GCF_000508225_merged.bed.  Filtered 813 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 666.
Screened 480 sequences from Ehrlichia_ruminantium_GCF_000026005_merged.bed.  Filtered 321 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 159.
Screened 1426 sequences from Ehrlichia_sp_GCF_000632845_merged.bed.  Filtered 777 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 649.


## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [70]:
# Write the [beds] conf that lists the stripped bed file of every entry in group_taxa.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    # section header
    outfile.write("[beds]\n")

    # one "taxon:path/to/taxon_stripped.bed" line per sample
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [71]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/ehrlichia/initial_intervals/Ehrlichia_sp_GCF_000632845_merged.bed --twobit results/phyluce/ehrlichia/cleaned_genomes/Ehrlichia_canis_GCF_000012565_formatted.2bit --output results/phyluce/ehrlichia/initial_intervals/Ehrlichia_sp_GCF_000632845_stripped.bed;
ehrlichia_chaffeensis_gcf_000013145.
ehrlichia_minasensis_gcf_000825765.
ehrlichia_muris_gcf_000508225.
ehrlichia_ruminantium_gcf_000026005.
ehrlichia_sp_gcf_000632845.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [72]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/ehrlichia/initial_intervals/Ehrlichia_sp_GCF_000632845_merged.bed --twobit results/phyluce/ehrlichia/cleaned_genomes/Ehrlichia_canis_GCF_000012565_formatted.2bit --output results/phyluce/ehrlichia/initial_intervals/Ehrlichia_sp_GCF_000632845_stripped.bed;
Loci shared by Ehrlichia_canis_GCF_000012565 + 0 taxa:	1,397.0
Loci shared by Ehrlichia_canis_GCF_000012565 + 1 taxa:	1,397.0
Loci shared by Ehrlichia_canis_GCF_000012565 + 2 taxa:	830.0
Loci shared by Ehrlichia_canis_GCF_000012565 + 3 taxa:	568.0
Loci shared by Ehrlichia_canis_GCF_000012565 + 4 taxa:	426.0
Loci shared by Ehrlichia_canis_GCF_000012565 + 5 taxa:	122.0


In [73]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 4
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/ehrlichia/initial_intervals/ehrlichia-to-Ehrlichia_canis_GCF_000012565.sqlite --base-taxon Ehrlichia_canis_GCF_000012565 --output results/phyluce/ehrlichia/initial_intervals/Ehrlichia_canis_GCF_000012565_+4.bed --specific-counts 4;
Counter({'ehrlichia_minasensis_gcf_000825765': 426, 'ehrlichia_chaffeensis_gcf_000013145': 425, 'ehrlichia_sp_gcf_000632845': 423, 'ehrlichia_muris_gcf_000508225': 418, 'ehrlichia_ruminantium_gcf_000026005': 134})


## Design temp set of baits

In [74]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/ehrlichia/initial_intervals/Ehrlichia_canis_GCF_000012565_+4.bed --twobit results/phyluce/ehrlichia/cleaned_genomes/Ehrlichia_canis_GCF_000012565_formatted.2bit --buffer-to 160 --output results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4.fasta;
Screened 426 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 426.


design the baits

In [75]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4.fasta --probe-prefix uce_ehrlichia_ --design ehrlichia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 345
Probe Count = 611


## Find duplicate baited regions

In [76]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas --query results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  10:28:15
Ended:  Fri Feb 07, 2020  10:28:15
Time for execution:  0.00455739895503 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas                        --lastz results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes_vself.lastz                       --probe-prefix=uce_ehrlichia_;
Parsing lastz file...
Screening results...
Screened 610 fasta sequences.  Filtered 2 duplicates. Kept 607.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [77]:
# Give every taxon its own sub-directory, cleaned_genomes/<taxon>/<taxon>.2bit,
# which is the layout the lastz and conf cells below point at.
# The 2bit files are copied (not symlinked).
genomes_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:
    src_file = genomes_root + sample + "_formatted.2bit"
    dst_dir  = genomes_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [78]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/ehrlichia/validate_intervals/Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas --scaffoldlist Ehrlichia_chaffeensis_GCF_000013145 Ehrlichia_minasensis_GCF_000825765 Ehrlichia_muris_GCF_000508225 Ehrlichia_ruminantium_GCF_000026005 Ehrlichia_sp_GCF_000632845 Ehrlichia_canis_GCF_000012565 --genome-base-path results/phyluce/ehrlichia/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/ehrlichia/validate_intervals/ehrlichia-to-Ehrlichia_canis_GCF_000012565.sqlite --output results/phyluce/ehrlichia/validate_intervals/lastz/;

Running against Ehrlichia_chaffeensis_GCF_000013145.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp3B_2hh.fasta

Writing the results file...
	/tmp/tmpaLFPwf.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/ehrlichia/validate_intervals/lastz/Ehrlichia_canis_GCF_000012565_+4_

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [79]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/ehrlichia/extract_probes_from_group/ehrlichia_genome.conf --lastz results/phyluce/ehrlichia/validate_intervals/lastz --probes 120 --probe-prefix uce_ehrlichia_ --name-pattern "Ehrlichia_canis_GCF_000012565_+4_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/ehrlichia/extract_probes_from_group/probe_fasta;
2020-02-07 10:28:27,922 - Phyluce - INFO - ----- Working on Ehrlichia_chaffeensis_GCF_000013145 genome -----
2020-02-07 10:28:27,923 - Phyluce - INFO - Reading Ehrlichia_chaffeensis_GCF_000013145 genome
2020-02-07 10:28:29,336 - Phyluce - INFO - Ehrlichia_chaffeensis_GCF_000013145: 345 uces, 0 dupes, 345 non-dupes, 4 orient drop, 5 length drop, 336 written
2020-02-07 10:28:29,337 - Phyluce - INFO - ------ Working on Ehrlichia_minasensis_GCF_000825765 genome -----
2020-02-07 10:28:29,337 - Phyluce - INFO - Reading Ehrlichia_minasensis_GCF_000825765 genome
2020-02-07 10:28:30,375 - Phyluce - INFO - Ehrlichia_mina

In [80]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/ehrlichia/extract_probes_from_group/probe_fasta --output results/phyluce/ehrlichia/extract_probes_from_group/multifastas.sqlite --base-taxon Ehrlichia_canis_GCF_000012565;
ehrlichia_chaffeensis_gcf_000013145.
ehrlichia_minasensis_gcf_000825765.
ehrlichia_muris_gcf_000508225.
ehrlichia_ruminantium_gcf_000026005.
ehrlichia_sp_gcf_000632845.
ehrlichia_canis_gcf_000012565.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/ehrlichia/extract_probes_from_group/multifastas.sqlite --base-taxon Ehrlichia_canis_GCF_000012565;
Loci shared by 0 taxa:	339.0
Loci shared by 1 taxa:	339.0
Loci shared by 2 taxa:	337.0
Loci shared by 3 taxa:	335.0
Loci shared by 4 taxa:	335.0
Loci shared by 5 taxa:	329.0
Loci shared by 6 taxa:	312.0


In [81]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(6)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/ehrlichia/extract_probes_from_group/multifastas.sqlite  --base-taxon Ehrlichia_canis_GCF_000012565 --output results/phyluce/ehrlichia/extract_probes_from_group/Ehrlichia_canis_GCF_000012565+4-back-to-6.conf --specific-counts 6;
Counter({'ehrlichia_ruminantium_gcf_000026005': 312, 'ehrlichia_minasensis_gcf_000825765': 312, 'ehrlichia_muris_gcf_000508225': 312, 'ehrlichia_chaffeensis_gcf_000013145': 312, 'ehrlichia_canis_gcf_000012565': 312, 'ehrlichia_sp_gcf_000632845': 312})
Total loci = 312


## Final group specific bait design

In [82]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/ehrlichia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/ehrlichia/extract_probes_from_group/Ehrlichia_canis_GCF_000012565+4-back-to-6.conf --probe-prefix uce_ehrlichia_ --designer rnplattii --design ehrlichia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [83]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.fasta --query results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  10:28:47
Ended:  Fri Feb 07, 2020  10:28:50
Time for execution:  0.0384563008944 minutes


In [84]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.fasta --lastz results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_ehrlichia_;
Parsing lastz file...
Screening results...
Screened 3206 fasta sequences.  Filtered 0 duplicates. Kept 3207.


## CDhit to reduce numbers

In [85]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.fasta
         -o
         results/phyluce/ehrlichia/final_probe_design/ehrlichia_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 10:29:38 2020
                            Output                              
----------------------------------------------------------------
total seq: 3207
longest and shortest : 80 and 80
Total letters: 256560
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3959542
Max number of word counting entries: 89129304

# comparing sequences from          0  to        534
---------- new table with      449 representatives
# comparing sequences from        534  

# Francisella

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [86]:
# Name of the pathogen group processed by the cells that follow; it is used
# to build every data/ and results/ path for this section.
group = "francisella"

In [87]:
# Wipe any previous results for this group and recreate an empty directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [88]:
# (species, assembly accession) for every non-reference Francisella genome.
# NOTE(review): "hispniensis" is kept exactly as spelled in the original
# list; presumably it has to match the downloaded genome file name -- verify
# before correcting it.
francisella_assemblies = [
    ("adeliensis",     "GCF_003290445"),
    ("frigiditurris",  "GCF_001880225"),
    ("halioticida",    "GCF_002211785"),
    ("hispniensis",    "GCF_001885235"),
    ("marina",         "GCF_008369785"),
    ("noatunensis",    "GCF_008330245"),
    ("opportunistica", "GCF_003347135"),
    ("persica",        "GCF_001653955"),
    ("philomiragia",   "GCF_000019285"),
    ("salina",         "GCF_000219045"),
    ("sp",             "GCF_000764555"),
    ("uliginis",       "GCF_001895265"),
]

# Taxon labels follow the Genus_species_accession convention.
group_taxa = ["Francisella_{}_{}".format(species, accession)
              for species, accession in francisella_assemblies]

# Genome that every other taxon in the group is compared against.
reference_taxon = "Francisella_tularensis_GCF_000008985"

# Every genome in the group, with the reference last.
all_taxa = group_taxa + [reference_taxon]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [89]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003290445.1_ASM329044v1_genomic.fna.gz

sent 42 bytes  received 597341 bytes  398255.33 bytes/sec
total size is 597083  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplina

GCA_000764555.1_ASM76455v1_genomic.fna.gz

sent 42 bytes  received 584089 bytes  389420.67 bytes/sec
total size is 583840  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000008985.1_ASM898v1_genomic.fna.gz

sent 42 bytes  received 547773 bytes  365210.00 bytes/sec
total size is 547534  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, networ

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [90]:
# Rebuild results/phyluce/<group>/cleaned_genomes from scratch.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite every record with its name and description blanked so the
    # fasta header is simplified.  SeqIO.parse is handed the file name and
    # opens it itself, which avoids the deprecated "rU" open mode.
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, "fasta"):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # Build the 2bit from the *formatted* fasta so that the _formatted.fas /
    # _formatted.2bit pair is guaranteed to describe the same records (the
    # raw download was converted before).  check_call raises if faToTwoBit
    # fails instead of silently leaving a missing or partial 2bit file.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text.

In [91]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483012 ("sim_Francisella_adeliensis_GCF_003290445") has been submitted
Your job 5483013 ("sim_Francisella_frigiditurris_GCF_001880225") has been submitted
Your job 5483014 ("sim_Francisella_halioticida_GCF_002211785") has been submitted
Your job 5483015 ("sim_Francisella_hispniensis_GCF_001885235") has been submitted
Your job 5483016 ("sim_Francisella_marina_GCF_008369785") has been submitted
Your job 5483017 ("sim_Francisella_noatunensis_GCF_008330245") has been submitted
Your job 5483018 ("sim_Francisella_opportunistica_GCF_003347135") has been submitted
Your job 5483019 ("sim_Francisella_persica_GCF_001653955") has been submitted
Your job 5483020 ("sim_Francisella_philomiragia_GCF_000019285") has been submitted
Your job 5483021 ("sim_Francisella_salina_GCF_000219045") has been submitted
Your job 5483022 ("sim_Francisella_sp_GCF_000764555") has been submitted
Your job 5483023 ("sim_Francisella_uliginis_GCF_001895265") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [92]:
# Build a bbmap index of the reference genome inside a freshly created
# map_simulated_reads/ directory (with its logs/ subdirectory).
map_root = "results/phyluce/" + group + "/map_simulated_reads"

# wipe anything left over from an earlier run before re-creating the tree
if os.path.exists(map_root):
    shutil.rmtree(map_root)

os.makedirs(map_root + "/logs")

#reference genome to index and the directory bbmap writes its index under
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_root + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)

# run locally (not through the queue); the return code is the cell's output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/francisella/cleaned_genomes/Francisella_tularensis_GCF_000008985_formatted.fas path=results/phyluce/francisella/map_simulated_reads/


0

Now we can use bbmap to align the simulated reads from all species to the reference genome.  In this case we set `minid=0.90`, i.e. we allow up to 10% divergence between the reference genome and each simulated read.  Generally the allowed divergence is smaller, and it may be adjusted for other groups; however, given the diversity of some groups, a larger value may identify more orthologous regions.

In [93]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483024 ("map_Francisella_adeliensis_GCF_003290445") has been submitted
Your job 5483025 ("map_Francisella_frigiditurris_GCF_001880225") has been submitted
Your job 5483026 ("map_Francisella_halioticida_GCF_002211785") has been submitted
Your job 5483027 ("map_Francisella_hispniensis_GCF_001885235") has been submitted
Your job 5483028 ("map_Francisella_marina_GCF_008369785") has been submitted
Your job 5483029 ("map_Francisella_noatunensis_GCF_008330245") has been submitted
Your job 5483030 ("map_Francisella_opportunistica_GCF_003347135") has been submitted
Your job 5483031 ("map_Francisella_persica_GCF_001653955") has been submitted
Your job 5483032 ("map_Francisella_philomiragia_GCF_000019285") has been submitted
Your job 5483033 ("map_Francisella_salina_GCF_000219045") has been submitted
Your job 5483034 ("map_Francisella_sp_GCF_000764555") has been submitted
Your job 5483035 ("map_Francisella_uliginis_GCF_001895265") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [94]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483036 ("merge_Francisella_adeliensis_GCF_003290445") has been submitted
Your job 5483037 ("merge_Francisella_frigiditurris_GCF_001880225") has been submitted
Your job 5483038 ("merge_Francisella_halioticida_GCF_002211785") has been submitted
Your job 5483039 ("merge_Francisella_hispniensis_GCF_001885235") has been submitted
Your job 5483040 ("merge_Francisella_marina_GCF_008369785") has been submitted
Your job 5483041 ("merge_Francisella_noatunensis_GCF_008330245") has been submitted
Your job 5483042 ("merge_Francisella_opportunistica_GCF_003347135") has been submitted
Your job 5483043 ("merge_Francisella_persica_GCF_001653955") has been submitted
Your job 5483044 ("merge_Francisella_philomiragia_GCF_000019285") has been submitted
Your job 5483045 ("merge_Francisella_salina_GCF_000219045") has been submitted
Your job 5483046 ("merge_Francisella_sp_GCF_000764555") has been submitted
Your job 5483047 ("merge_Francisella_uliginis_GCF_001895265") has been submitted


Remove loci that were masked in the original genome.

In [95]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 295 sequences from Francisella_adeliensis_GCF_003290445_merged.bed.  Filtered 211 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 84.
Screened 234 sequences from Francisella_frigiditurris_GCF_001880225_merged.bed.  Filtered 162 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 72.
Screened 638 sequences from Francisella_halioticida_GCF_002211785_merged.bed.  Filtered 454 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 184.
Screened 2181 sequences from Francisella_hispniensis_GCF_001885235_merged.bed.  Filtered 430 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1751.
Screened 1263 sequences from Francisella_marina_GCF_008369785_merged.bed.  Filtered 734 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 529.
Screened 1242 sequences from Francisella_noatunensis_GCF_008330245_merged.bed.  Filtered 688 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 554.
Screened 2919 sequences from Francisella_o

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [96]:
# Write the phyluce "[beds]" conf file: a header line followed by one
# "<sample>:<path to stripped bed>" entry per taxon in group_taxa.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#collect the lines first, then write the file in one go
conf_lines = ["[beds]"]

for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.write("\n".join(conf_lines) + "\n")
 

Then merge all of the data into a single ```sqlite``` db.

In [97]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/francisella/initial_intervals/Francisella_uliginis_GCF_001895265_merged.bed --twobit results/phyluce/francisella/cleaned_genomes/Francisella_tularensis_GCF_000008985_formatted.2bit --output results/phyluce/francisella/initial_intervals/Francisella_uliginis_GCF_001895265_stripped.bed;
francisella_adeliensis_gcf_003290445.
francisella_frigiditurris_gcf_001880225.
francisella_halioticida_gcf_002211785.
francisella_hispniensis_gcf_001885235..
francisella_marina_gcf_008369785.
francisella_noatunensis_gcf_008330245.
francisella_opportunistica_gcf_003347135..
francisella_persica_gcf_001653955..
francisella_philomiragia_gcf_000019285.
francisella_salina_gcf_000219045.
francisella_sp_gcf_000764555.
francisella_uliginis_gcf_001895265.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [98]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/francisella/initial_intervals/Francisella_uliginis_GCF_001895265_merged.bed --twobit results/phyluce/francisella/cleaned_genomes/Francisella_tularensis_GCF_000008985_formatted.2bit --output results/phyluce/francisella/initial_intervals/Francisella_uliginis_GCF_001895265_stripped.bed;
Loci shared by Francisella_tularensis_GCF_000008985 + 0 taxa:	2,296.0
Loci shared by Francisella_tularensis_GCF_000008985 + 1 taxa:	2,296.0
Loci shared by Francisella_tularensis_GCF_000008985 + 2 taxa:	1,761.0
Loci shared by Francisella_tularensis_GCF_000008985 + 3 taxa:	1,286.0
Loci shared by Francisella_tularensis_GCF_000008985 + 4 taxa:	720.0
Loci shared by Francisella_tularensis_GCF_000008985 + 5 taxa:	543.0
Loci shared by Francisella_tularensis_GCF_000008985 + 6 taxa:	371.0
Loci shared by Francisella_tularensis_GCF_000008985 + 7 taxa:	269.0
Loci shared by Francisella_tularensis_GCF_000008985 + 8 taxa:	18

In [99]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 7
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/francisella/initial_intervals/francisella-to-Francisella_tularensis_GCF_000008985.sqlite --base-taxon Francisella_tularensis_GCF_000008985 --output results/phyluce/francisella/initial_intervals/Francisella_tularensis_GCF_000008985_+7.bed --specific-counts 7;
Counter({'francisella_philomiragia_gcf_000019285': 254, 'francisella_opportunistica_gcf_003347135': 252, 'francisella_hispniensis_gcf_001885235': 252, 'francisella_noatunensis_gcf_008330245': 247, 'francisella_persica_gcf_001653955': 244, 'francisella_salina_gcf_000219045': 242, 'francisella_marina_gcf_008369785': 241, 'francisella_uliginis_gcf_001895265': 198, 'francisella_sp_gcf_000764555': 168, 'francisella_halioticida_gcf_002211785': 148, 'francisella_adeliensis_gcf_003290445': 64, 'francisella_frigiditurris_gcf_001880225': 47})


## Design temp set of baits

In [100]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/francisella/initial_intervals/Francisella_tularensis_GCF_000008985_+7.bed --twobit results/phyluce/francisella/cleaned_genomes/Francisella_tularensis_GCF_000008985_formatted.2bit --buffer-to 160 --output results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7.fasta;
Screened 269 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 269.


design the baits

In [101]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7.fasta --probe-prefix uce_francisella_ --design francisella_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 253
Probe Count = 470


## Find duplicate baited regions

In [102]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes.fas --query results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  10:33:58
Ended:  Fri Feb 07, 2020  10:34:00
Time for execution:  0.0289617498716 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes.fas                        --lastz results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes_vself.lastz                       --probe-prefix=uce_francisella_;
Parsing lastz file...
Screening results...
Screened 469 fasta sequences.  Filtered 2 duplicates.

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [103]:
# Give every genome its own <sample>/<sample>.2bit directory under
# cleaned_genomes/. The 2bit files are copied (not linked) into place.
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)

    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    #rebuild the per-sample directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [104]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/francisella/validate_intervals/Francisella_tularensis_GCF_000008985_+7_temp_probes.fas --scaffoldlist Francisella_adeliensis_GCF_003290445 Francisella_frigiditurris_GCF_001880225 Francisella_halioticida_GCF_002211785 Francisella_hispniensis_GCF_001885235 Francisella_marina_GCF_008369785 Francisella_noatunensis_GCF_008330245 Francisella_opportunistica_GCF_003347135 Francisella_persica_GCF_001653955 Francisella_philomiragia_GCF_000019285 Francisella_salina_GCF_000219045 Francisella_sp_GCF_000764555 Francisella_uliginis_GCF_001895265 Francisella_tularensis_GCF_000008985 --genome-base-path results/phyluce/francisella/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/francisella/validate_intervals/francisella-to-Francisella_tularensis_GCF_000008985.sqlite --output results/phyluce/francisella/validate_intervals/lastz/;

Running against Francisella_adeliensis_GCF_003290445.2bit
Running with the --huge option.  Chu

Creating Francisella_tularensis_GCF_000008985 table
Inserting data to Francisella_tularensis_GCF_000008985 table


## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [105]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/francisella/extract_probes_from_group/francisella_genome.conf --lastz results/phyluce/francisella/validate_intervals/lastz --probes 120 --probe-prefix uce_francisella_ --name-pattern "Francisella_tularensis_GCF_000008985_+7_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/francisella/extract_probes_from_group/probe_fasta;
2020-02-07 10:34:23,745 - Phyluce - INFO - ----- Working on Francisella_adeliensis_GCF_003290445 genome ----
2020-02-07 10:34:23,746 - Phyluce - INFO - Reading Francisella_adeliensis_GCF_003290445 genome
2020-02-07 10:34:24,610 - Phyluce - INFO - Francisella_adeliensis_GCF_003290445: 245 uces, 0 dupes, 245 non-dupes, 7 orient drop, 2 length drop, 236 written
2020-02-07 10:34:24,610 - Phyluce - INFO - --- Working on Francisella_frigiditurris_GCF_001880225 genome ---
2020-02-07 10:34:24,611 - Phyluce - INFO - Reading Francisella_frigiditurris_GCF_001880225 genome
2020-02-07 10:34:25,358 - Phyluce

In [106]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/francisella/extract_probes_from_group/probe_fasta --output results/phyluce/francisella/extract_probes_from_group/multifastas.sqlite --base-taxon Francisella_tularensis_GCF_000008985;
francisella_adeliensis_gcf_003290445.
francisella_frigiditurris_gcf_001880225.
francisella_halioticida_gcf_002211785.
francisella_hispniensis_gcf_001885235.
francisella_marina_gcf_008369785.
francisella_noatunensis_gcf_008330245.
francisella_opportunistica_gcf_003347135.
francisella_persica_gcf_001653955.
francisella_philomiragia_gcf_000019285.
francisella_salina_gcf_000219045.
francisella_sp_gcf_000764555.
francisella_uliginis_gcf_001895265.
francisella_tularensis_gcf_000008985.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/francisella/extract_probes_from_group/multifastas.sqlite --base-taxon Francisella_tularensis_GCF_000008985;
Loci shared by 0 taxa:	248.0
Loci shared by 1 taxa:	248.0
Loci share

In [107]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(13)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/francisella/extract_probes_from_group/multifastas.sqlite  --base-taxon Francisella_tularensis_GCF_000008985 --output results/phyluce/francisella/extract_probes_from_group/Francisella_tularensis_GCF_000008985+7-back-to-13.conf --specific-counts 13;
Counter({'francisella_persica_gcf_001653955': 201, 'francisella_sp_gcf_000764555': 201, 'francisella_salina_gcf_000219045': 201, 'francisella_frigiditurris_gcf_001880225': 201, 'francisella_opportunistica_gcf_003347135': 201, 'francisella_noatunensis_gcf_008330245': 201, 'francisella_hispniensis_gcf_001885235': 201, 'francisella_marina_gcf_008369785': 201, 'francisella_philomiragia_gcf_000019285': 201, 'francisella_adeliensis_gcf_003290445': 201, 'francisella_halioticida_gcf_002211785': 201, 'francisella_tularensis_gcf_000008985': 201, 'francisella_uliginis_gcf_001895265': 201})
Total loci = 201


## Final group specific bait design

In [108]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/francisella/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/francisella/extract_probes_from_group/Francisella_tularensis_GCF_000008985+7-back-to-13.conf --probe-prefix uce_francisella_ --designer rnplattii --design francisella_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [109]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.fasta --query results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  10:34:49
Ended:  Fri Feb 07, 2020  10:34:56
Time for execution:  0.113882136345 minutes


In [110]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.fasta --lastz results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_francisella_;
Parsing lastz file...
Screening results...
Screened 4670 fasta sequences.  Filtered 0 duplicates. Kept 4671.


## CDhit to reduce numbers

In [111]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.fasta
         -o
         results/phyluce/francisella/final_probe_design/francisella_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 10:37:22 2020
                            Output                              
----------------------------------------------------------------
total seq: 4671
longest and shortest : 80 and 80
Total letters: 373680
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3957054
Max number of word counting entries: 89073306

# comparing sequences from          0  to        778
---------- new table with      618 representatives
# comparing sequences from     

# Leptospira


## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [148]:
# Group being processed; the results/phyluce/ and data/genomes/ paths in the
# cells below are all built from this name.
group = "leptospira"

In [149]:
# Wipe any previous results for this group and recreate an empty directory.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [156]:
# Taxa to compare against the reference, one entry per genome, named
# Genus_species_<assembly accession without version>.  The same strings are
# used further down to build genome file names (data/genomes/<group>/<taxon>.fas),
# so they must match the names produced by the download cell exactly.
group_taxa = [ "Leptospira_adleri_GCF_002811985",
               "Leptospira_alexanderi_GCF_000243815",
               "Leptospira_alstonii_GCF_000347175",
               "Leptospira_andrefontaineae_GCF_004770105",
               "Leptospira_bandrabouensis_GCF_004770905",
               "Leptospira_barantonii_GCF_002811925",
               "Leptospira_biflexa_GCF_000017685",
               "Leptospira_borgpetersenii_GCF_000013945",
               "Leptospira_bourretii_GCF_004769285",
               "Leptospira_bouyouniensis_GCF_004770625",
               "Leptospira_brenneri_GCF_002812125",
               "Leptospira_broomii_GCF_000243715",
               "Leptospira_congkakensis_GCF_004770265",
               "Leptospira_dzianensis_GCF_004770135",
               "Leptospira_dzoumogneensis_GCF_004770895",
               "Leptospira_ellinghausenii_GCF_003114815",
               # NOTE(review): "elllisii" has three l's -- presumably it mirrors the
               # accessions csv / downloaded file name; confirm before correcting.
               "Leptospira_elllisii_GCF_002811955",
               "Leptospira_fainei_GCF_000306235",
               "Leptospira_fletcheri_GCF_004769195",
               "Leptospira_fluminis_GCF_004771275",
               "Leptospira_gomenensis_GCF_004770155",
               "Leptospira_haakeii_GCF_002812045",
               "Leptospira_harrisiae_GCF_002811945",
               "Leptospira_hartskeerlii_GCF_002811475",
               "Leptospira_idonii_GCF_004770995",
               "Leptospira_ilyithenensis_GCF_004771005",
               "Leptospira_inadai_GCF_000243675",
               "Leptospira_jelokensis_GCF_004769775",
               "Leptospira_johnsonii_GCF_003112675",
               "Leptospira_kanakyensis_GCF_004769235",
               "Leptospira_kemamanensis_GCF_004769665",
               "Leptospira_kirschneri_GCF_000243695",
               "Leptospira_kmetyi_GCF_000243735",
               "Leptospira_kobayashii_GCF_003114835",
               "Leptospira_koniamboensis_GCF_004769555",
               "Leptospira_langatensis_GCF_004770615",
               "Leptospira_levettii_GCF_002812085",
               "Leptospira_licerasiae_GCF_000244755",
               "Leptospira_macculloughii_GCF_002811975",
               "Leptospira_mayottensis_GCF_000306675",
               "Leptospira_meyeri_GCF_000304275",
               "Leptospira_montravelensis_GCF_004769455",
               "Leptospira_mtsangambouensis_GCF_004770475",
               "Leptospira_neocaledonica_GCF_002812205",
               "Leptospira_noguchii_GCF_000306255",
               "Leptospira_noumeaensis_GCF_004770765",
               "Leptospira_ognonensis_GCF_004770745",
               "Leptospira_perdikensis_GCF_004769575",
               "Leptospira_perolatii_GCF_002811875",
               "Leptospira_putramalaysiae_GCF_004770035",
               "Leptospira_ryugenii_GCF_003114855",
               "Leptospira_saintgironsiae_GCF_002811765",
               "Leptospira_santarosai_GCF_000313175",
               "Leptospira_sarikeiensis_GCF_004769615",
               "Leptospira_selangorensis_GCF_004769405",
               "Leptospira_semungkisensis_GCF_004770055",
               "Leptospira_sp_GCF_000347035",
               "Leptospira_stimsonii_GCF_003545885",
               "Leptospira_terpstrae_GCF_000332495",
               "Leptospira_tipperaryensis_GCF_001729245",
               "Leptospira_vanthielii_GCF_000332455",
               "Leptospira_venezuelensis_GCF_002150035",
               "Leptospira_weilii_GCF_000244815",
               "Leptospira_wolbachii_GCF_000332515",
               "Leptospira_wolffii_GCF_000306115",
               "Leptospira_yanagawae_GCF_000332475",
               "Leptospira_yasudae_GCF_003545925" ]
                    
# Genome that all other taxa are compared against (reads from every taxon in
# group_taxa are mapped to it).  It is kept out of group_taxa on purpose.
reference_taxon = "Leptospira_interrogans_GCF_000092565"


# Every genome that has to be cleaned/formatted: the group taxa plus the reference.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [166]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002811985.1_ASM281198v1_genomic.fna.gz

sent 42 bytes  received 1422418 bytes  948306.67 bytes/sec
total size is 1421960  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipli

GCA_002812125.1_ASM281212v1_genomic.fna.gz

sent 42 bytes  received 1213988 bytes  809353.33 bytes/sec
total size is 1213578  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000243715.3_gls454050v02_genomic.fna.gz

sent 42 bytes  received 1306348 bytes  870926.67 bytes/sec
total size is 1305921  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute


GCA_002812045.1_ASM281204v1_genomic.fna.gz

sent 42 bytes  received 1240941 bytes  107911.57 bytes/sec
total size is 1240531  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002811945.1_ASM281194v1_genomic.fna.gz

sent 42 bytes  received 1164568 bytes  776406.67 bytes/sec
total size is 1164174  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_000243695.3_gls454049v02_genomic.fna.gz

sent 42 bytes  received 1284980 bytes  856681.33 bytes/sec
total size is 1284553  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000243735.3_gls454052v1.0_genomic.fna.gz

sent 42 bytes  received 1286415 bytes  857638.00 bytes/sec
total size is 1285987  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compu

GCA_004770475.1_ASM477047v1_genomic.fna.gz

sent 42 bytes  received 1203143 bytes  802123.33 bytes/sec
total size is 1202741  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002812205.1_ASM281220v1_genomic.fna.gz

sent 42 bytes  received 1246331 bytes  830915.33 bytes/sec
total size is 1245913  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_004769615.1_ASM476961v1_genomic.fna.gz

sent 42 bytes  received 1293778 bytes  862546.67 bytes/sec
total size is 1293352  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_004769405.1_ASM476940v1_genomic.fna.gz

sent 42 bytes  received 1237738 bytes  825186.67 bytes/sec
total size is 1237328  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_000306115.2_gls454061v02_genomic.fna.gz

sent 42 bytes  received 1301152 bytes  867462.67 bytes/sec
total size is 1300725  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000332475.2_gls454202v02_genomic.fna.gz

sent 42 bytes  received 1195500 bytes  797028.00 bytes/sec
total size is 1195097  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [168]:
#start from an empty cleaned_genomes directory
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    #rewrite every record with its name and description blanked so that only
    #the record id is left in the fasta header
    with open(i_genome_file, "rU") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    #build the 2bit from the cleaned fasta written above (it was previously
    #built from the raw download, which did not match the "_formatted" output
    #name); check_call raises if faToTwoBit fails instead of silently leaving
    #a missing/partial 2bit for later steps to trip over
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    subprocess.check_call(f2bit_cmd.split(" "))

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` in the cell below): with 100 bp reads this works out to roughly one read start every 10 bp across conserved orthologous regions, and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [169]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484051 ("sim_Leptospira_adleri_GCF_002811985") has been submitted
Your job 5484052 ("sim_Leptospira_alexanderi_GCF_000243815") has been submitted
Your job 5484053 ("sim_Leptospira_alstonii_GCF_000347175") has been submitted
Your job 5484054 ("sim_Leptospira_andrefontaineae_GCF_004770105") has been submitted
Your job 5484055 ("sim_Leptospira_bandrabouensis_GCF_004770905") has been submitted
Your job 5484056 ("sim_Leptospira_barantonii_GCF_002811925") has been submitted
Your job 5484057 ("sim_Leptospira_biflexa_GCF_000017685") has been submitted
Your job 5484058 ("sim_Leptospira_borgpetersenii_GCF_000013945") has been submitted
Your job 5484059 ("sim_Leptospira_bourretii_GCF_004769285") has been submitted
Your job 5484060 ("sim_Leptospira_bouyouniensis_GCF_004770625") has been submitted
Your job 5484061 ("sim_Leptospira_brenneri_GCF_002812125") has been submitted
Your job 5484062 ("sim_Leptospira_broomii_GCF_000243715") has been submitted
Your job 5484063 ("sim_Leptospira_congk

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [170]:
#rebuild the mapping directory (and its logs/ subdirectory) from scratch
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#generate a bbmap index of the reference genome
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)

#runs locally (not through qsub); the exit status is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/leptospira/cleaned_genomes/Leptospira_interrogans_GCF_000092565_formatted.fas path=results/phyluce/leptospira/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [171]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484118 ("map_Leptospira_adleri_GCF_002811985") has been submitted
Your job 5484119 ("map_Leptospira_alexanderi_GCF_000243815") has been submitted
Your job 5484120 ("map_Leptospira_alstonii_GCF_000347175") has been submitted
Your job 5484121 ("map_Leptospira_andrefontaineae_GCF_004770105") has been submitted
Your job 5484122 ("map_Leptospira_bandrabouensis_GCF_004770905") has been submitted
Your job 5484123 ("map_Leptospira_barantonii_GCF_002811925") has been submitted
Your job 5484124 ("map_Leptospira_biflexa_GCF_000017685") has been submitted
Your job 5484125 ("map_Leptospira_borgpetersenii_GCF_000013945") has been submitted
Your job 5484126 ("map_Leptospira_bourretii_GCF_004769285") has been submitted
Your job 5484127 ("map_Leptospira_bouyouniensis_GCF_004770625") has been submitted
Your job 5484128 ("map_Leptospira_brenneri_GCF_002812125") has been submitted
Your job 5484129 ("map_Leptospira_broomii_GCF_000243715") has been submitted
Your job 5484130 ("map_Leptospira_congk

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [172]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484185 ("merge_Leptospira_adleri_GCF_002811985") has been submitted
Your job 5484186 ("merge_Leptospira_alexanderi_GCF_000243815") has been submitted
Your job 5484187 ("merge_Leptospira_alstonii_GCF_000347175") has been submitted
Your job 5484188 ("merge_Leptospira_andrefontaineae_GCF_004770105") has been submitted
Your job 5484189 ("merge_Leptospira_bandrabouensis_GCF_004770905") has been submitted
Your job 5484190 ("merge_Leptospira_barantonii_GCF_002811925") has been submitted
Your job 5484191 ("merge_Leptospira_biflexa_GCF_000017685") has been submitted
Your job 5484192 ("merge_Leptospira_borgpetersenii_GCF_000013945") has been submitted
Your job 5484193 ("merge_Leptospira_bourretii_GCF_004769285") has been submitted
Your job 5484194 ("merge_Leptospira_bouyouniensis_GCF_004770625") has been submitted
Your job 5484195 ("merge_Leptospira_brenneri_GCF_002812125") has been submitted
Your job 5484196 ("merge_Leptospira_broomii_GCF_000243715") has been submitted
Your job 548419

Remove loci that were masked in the original genome.

In [173]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 913 sequences from Leptospira_adleri_GCF_002811985_merged.bed.  Filtered 630 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 283.
Screened 2402 sequences from Leptospira_alexanderi_GCF_000243815_merged.bed.  Filtered 1640 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 762.
Screened 2609 sequences from Leptospira_alstonii_GCF_000347175_merged.bed.  Filtered 1806 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 803.
Screened 50 sequences from Leptospira_andrefontaineae_GCF_004770105_merged.bed.  Filtered 44 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 24 sequences from Leptospira_bandrabouensis_GCF_004770905_merged.bed.  Filtered 17 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7.
Screened 1492 sequences from Leptospira_barantonii_GCF_002811925_merged.bed.  Filtered 1000 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 492.
Screened 21 sequences from Leptospira_biflexa_GCF_0

Screened 55 sequences from Leptospira_sarikeiensis_GCF_004769615_merged.bed.  Filtered 49 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 49 sequences from Leptospira_selangorensis_GCF_004769405_merged.bed.  Filtered 43 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 44 sequences from Leptospira_semungkisensis_GCF_004770055_merged.bed.  Filtered 37 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7.
Screened 41 sequences from Leptospira_sp_GCF_000347035_merged.bed.  Filtered 34 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7.
Screened 897 sequences from Leptospira_stimsonii_GCF_003545885_merged.bed.  Filtered 608 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 289.
Screened 40 sequences from Leptospira_terpstrae_GCF_000332495_merged.bed.  Filtered 20 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 20.
Screened 1035 sequences from Leptospira_tipperaryensis_GCF_001729245_mer

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [174]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# Assemble the conf in memory first: a "[beds]" header followed by one
# "<taxon>:<stripped bed path>" line per sample.
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

# ...then write it out in one go.
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [175]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/leptospira/initial_intervals/Leptospira_yasudae_GCF_003545925_merged.bed --twobit results/phyluce/leptospira/cleaned_genomes/Leptospira_interrogans_GCF_000092565_formatted.2bit --output results/phyluce/leptospira/initial_intervals/Leptospira_yasudae_GCF_003545925_stripped.bed;
leptospira_adleri_gcf_002811985.
leptospira_alexanderi_gcf_000243815.
leptospira_alstonii_gcf_000347175.
leptospira_andrefontaineae_gcf_004770105.
leptospira_bandrabouensis_gcf_004770905.
leptospira_barantonii_gcf_002811925.
leptospira_biflexa_gcf_000017685.
leptospira_borgpetersenii_gcf_000013945.
leptospira_bourretii_gcf_004769285.
leptospira_bouyouniensis_gcf_004770625.
leptospira_brenneri_gcf_002812125.
leptospira_broomii_gcf_000243715.
leptospira_congkakensis_gcf_004770265.
leptospira_dzianensis_gcf_004770135.
leptospira_dzoumogneensis_gcf_004770895.
leptospira_ellinghausenii_gcf_003114815.
leptospira_elllisii_

Quantify probes and the number of targeted taxa for each.

In [176]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/leptospira/initial_intervals/Leptospira_yasudae_GCF_003545925_merged.bed --twobit results/phyluce/leptospira/cleaned_genomes/Leptospira_interrogans_GCF_000092565_formatted.2bit --output results/phyluce/leptospira/initial_intervals/Leptospira_yasudae_GCF_003545925_stripped.bed;
Loci shared by Leptospira_interrogans_GCF_000092565 + 0 taxa:	5,335.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 1 taxa:	5,335.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 2 taxa:	3,863.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 3 taxa:	1,553.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 4 taxa:	1,072.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 5 taxa:	812.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 6 taxa:	660.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 7 taxa:	570.0
Loci shared by Leptospira_interrogans_GCF_000092565 + 8 taxa:	466.0
L

In [177]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 10
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/leptospira/initial_intervals/leptospira-to-Leptospira_interrogans_GCF_000092565.sqlite --base-taxon Leptospira_interrogans_GCF_000092565 --output results/phyluce/leptospira/initial_intervals/Leptospira_interrogans_GCF_000092565_+10.bed --specific-counts 10;
Counter({'leptospira_noguchii_gcf_000306255': 353, 'leptospira_kirschneri_gcf_000243695': 352, 'leptospira_alstonii_gcf_000347175': 318, 'leptospira_weilii_gcf_000244815': 315, 'leptospira_kmetyi_gcf_000243735': 314, 'leptospira_dzianensis_gcf_004770135': 312, 'leptospira_yasudae_gcf_003545925': 311, 'leptospira_mayottensis_gcf_000306675': 308, 'leptospira_alexanderi_gcf_000243815': 306, 'leptospira_barantonii_gcf_002811925': 301, 'leptospira_santarosai_gcf_000313175': 300, 'leptospira_borgpetersenii_gcf_000013945': 298, 'leptospira_putramalaysiae_gcf_004770035': 246, 'leptospira_stimsonii_gcf_003545885': 244, 'leptospira_tipperaryensis_gcf_001729245': 233, 'leptospira_adler

## Design temp set of baits

In [178]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/leptospira/initial_intervals/Leptospira_interrogans_GCF_000092565_+10.bed --twobit results/phyluce/leptospira/cleaned_genomes/Leptospira_interrogans_GCF_000092565_formatted.2bit --buffer-to 160 --output results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10.fasta;
Screened 356 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 356.


Design the baits.

In [179]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10.fasta --probe-prefix uce_leptospira_ --design leptospira_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGG


Conserved locus count = 354
Probe Count = 702


## Find duplicate baited regions

In [180]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas --query results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  13:12:57
Ended:  Fri Feb 07, 2020  13:12:57
Time for execution:  0.00522816578547 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas                        --lastz results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes_vself.lastz                       --probe-prefix=uce_leptospira_;
Parsing lastz file...
Screening results...
Screened 701 fasta sequences.  Filtered 12 duplicates

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [181]:
# Give every taxon its own sub-directory holding "<taxon>.2bit".
# The 2bit files are copied (not linked) from "<taxon>_formatted.2bit".
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [182]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/leptospira/validate_intervals/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas --scaffoldlist Leptospira_adleri_GCF_002811985 Leptospira_alexanderi_GCF_000243815 Leptospira_alstonii_GCF_000347175 Leptospira_andrefontaineae_GCF_004770105 Leptospira_bandrabouensis_GCF_004770905 Leptospira_barantonii_GCF_002811925 Leptospira_biflexa_GCF_000017685 Leptospira_borgpetersenii_GCF_000013945 Leptospira_bourretii_GCF_004769285 Leptospira_bouyouniensis_GCF_004770625 Leptospira_brenneri_GCF_002812125 Leptospira_broomii_GCF_000243715 Leptospira_congkakensis_GCF_004770265 Leptospira_dzianensis_GCF_004770135 Leptospira_dzoumogneensis_GCF_004770895 Leptospira_ellinghausenii_GCF_003114815 Leptospira_elllisii_GCF_002811955 Leptospira_fainei_GCF_000306235 Leptospira_fletcheri_GCF_004769195 Leptospira_fluminis_GCF_004771275 Leptospira_gomenensis_GCF_004770155 Leptospira_haakeii_GCF_002812045 Leptospira_harrisiae_GCF_002811945 Lep

	/tmp/tmpVjo6lW.fasta

Writing the results file...
	/tmp/tmpTJgfFk.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_bouyouniensis_GCF_004770625.lastz
Creating Leptospira_bouyouniensis_GCF_004770625 table
Inserting data to Leptospira_bouyouniensis_GCF_004770625 table

Running against Leptospira_brenneri_GCF_002812125.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp4u1Bej.fasta

Writing the results file...
	/tmp/tmpcNfbl7.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_brenneri_GCF_002812125.lastz
Creating Leptospira_brenneri_GCF_002812125 table
Inserting data to Leptospira_brenneri_GCF_002812125 table

Running aga

Inserting data to Leptospira_hartskeerlii_GCF_002811475 table

Running against Leptospira_idonii_GCF_004770995.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpJ1gRbP.fasta

Writing the results file...
	/tmp/tmpQWGWkS.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_idonii_GCF_004770995.lastz
Creating Leptospira_idonii_GCF_004770995 table
Inserting data to Leptospira_idonii_GCF_004770995 table

Running against Leptospira_ilyithenensis_GCF_004771005.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpY7jAI7.fasta

Writing the results file...
	/tmp/tmp3L_maL.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospir

Running the targets against 1 queries...
	/tmp/tmppNOxe3.fasta

Writing the results file...
	/tmp/tmpxrEl_b.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_macculloughii_GCF_002811975.lastz
Creating Leptospira_macculloughii_GCF_002811975 table
Inserting data to Leptospira_macculloughii_GCF_002811975 table

Running against Leptospira_mayottensis_GCF_000306675.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpxHLVVn.fasta

Writing the results file...
	/tmp/tmpHQZK1a.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_mayottensis_GCF_000306675.lastz
Creating Leptospira_mayottensis_GCF_000306675 table
Inserting data to Le

Inserting data to Leptospira_santarosai_GCF_000313175 table

Running against Leptospira_sarikeiensis_GCF_004769615.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpOUJA2J.fasta

Writing the results file...
	/tmp/tmp2hF7HY.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_sarikeiensis_GCF_004769615.lastz
Creating Leptospira_sarikeiensis_GCF_004769615 table
Inserting data to Leptospira_sarikeiensis_GCF_004769615 table

Running against Leptospira_selangorensis_GCF_004769405.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpfdOszP.fasta

Writing the results file...
	/tmp/tmp5hPhHr.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_int

Running the targets against 1 queries...
	/tmp/tmpP9IoQC.fasta

Writing the results file...
	/tmp/tmpomayrZ.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/leptospira/validate_intervals/lastz/Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_Leptospira_interrogans_GCF_000092565.lastz
Creating Leptospira_interrogans_GCF_000092565 table
Inserting data to Leptospira_interrogans_GCF_000092565 table


## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [183]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/leptospira/extract_probes_from_group/leptospira_genome.conf --lastz results/phyluce/leptospira/validate_intervals/lastz --probes 120 --probe-prefix uce_leptospira_ --name-pattern "Leptospira_interrogans_GCF_000092565_+10_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/leptospira/extract_probes_from_group/probe_fasta;
2020-02-07 13:16:05,349 - Phyluce - INFO - ------- Working on Leptospira_adleri_GCF_002811985 genome -------
2020-02-07 13:16:05,350 - Phyluce - INFO - Reading Leptospira_adleri_GCF_002811985 genome
2020-02-07 13:16:06,627 - Phyluce - INFO - Leptospira_adleri_GCF_002811985: 354 uces, 16 dupes, 338 non-dupes, 0 orient drop, 0 length drop, 338 written
2020-02-07 13:16:06,627 - Phyluce - INFO - ----- Working on Leptospira_alexanderi_GCF_000243815 genome -----
2020-02-07 13:16:06,628 - Phyluce - INFO - Reading Leptospira_alexanderi_GCF_000243815 genome
2020-02-07 13:16:07,819 - Phyluce - INFO - Leptosp

2020-02-07 13:16:26,545 - Phyluce - INFO - Leptospira_haakeii_GCF_002812045: 301 uces, 9 dupes, 292 non-dupes, 0 orient drop, 6 length drop, 286 written
2020-02-07 13:16:26,546 - Phyluce - INFO - ------ Working on Leptospira_harrisiae_GCF_002811945 genome -----
2020-02-07 13:16:26,546 - Phyluce - INFO - Reading Leptospira_harrisiae_GCF_002811945 genome
2020-02-07 13:16:27,187 - Phyluce - INFO - Leptospira_harrisiae_GCF_002811945: 216 uces, 6 dupes, 210 non-dupes, 6 orient drop, 3 length drop, 201 written
2020-02-07 13:16:27,187 - Phyluce - INFO - ---- Working on Leptospira_hartskeerlii_GCF_002811475 genome ----
2020-02-07 13:16:27,188 - Phyluce - INFO - Reading Leptospira_hartskeerlii_GCF_002811475 genome
2020-02-07 13:16:28,109 - Phyluce - INFO - Leptospira_hartskeerlii_GCF_002811475: 298 uces, 5 dupes, 293 non-dupes, 4 orient drop, 3 length drop, 286 written
2020-02-07 13:16:28,109 - Phyluce - INFO - ------- Working on Leptospira_idonii_GCF_004770995 genome -------
2020-02-07 13:16:2

2020-02-07 13:16:46,217 - Phyluce - INFO - Leptospira_noguchii_GCF_000306255: 353 uces, 15 dupes, 338 non-dupes, 1 orient drop, 1 length drop, 336 written
2020-02-07 13:16:46,217 - Phyluce - INFO - ----- Working on Leptospira_noumeaensis_GCF_004770765 genome ----
2020-02-07 13:16:46,218 - Phyluce - INFO - Reading Leptospira_noumeaensis_GCF_004770765 genome
2020-02-07 13:16:46,849 - Phyluce - INFO - Leptospira_noumeaensis_GCF_004770765: 216 uces, 12 dupes, 204 non-dupes, 0 orient drop, 2 length drop, 202 written
2020-02-07 13:16:46,849 - Phyluce - INFO - ----- Working on Leptospira_ognonensis_GCF_004770745 genome -----
2020-02-07 13:16:46,852 - Phyluce - INFO - Reading Leptospira_ognonensis_GCF_004770745 genome
2020-02-07 13:16:47,482 - Phyluce - INFO - Leptospira_ognonensis_GCF_004770745: 216 uces, 10 dupes, 206 non-dupes, 0 orient drop, 0 length drop, 206 written
2020-02-07 13:16:47,483 - Phyluce - INFO - ----- Working on Leptospira_perdikensis_GCF_004769575 genome ----
2020-02-07 13:

2020-02-07 13:17:07,595 - Phyluce - INFO - Leptospira_interrogans_GCF_000092565: 354 uces, 0 dupes, 354 non-dupes, 16 orient drop, 5 length drop, 333 written


In [184]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/leptospira/extract_probes_from_group/probe_fasta --output results/phyluce/leptospira/extract_probes_from_group/multifastas.sqlite --base-taxon Leptospira_interrogans_GCF_000092565;
leptospira_adleri_gcf_002811985.
leptospira_alexanderi_gcf_000243815.
leptospira_alstonii_gcf_000347175.
leptospira_andrefontaineae_gcf_004770105.
leptospira_bandrabouensis_gcf_004770905.
leptospira_barantonii_gcf_002811925.
leptospira_biflexa_gcf_000017685.
leptospira_borgpetersenii_gcf_000013945.
leptospira_bourretii_gcf_004769285.
leptospira_bouyouniensis_gcf_004770625.
leptospira_brenneri_gcf_002812125.
leptospira_broomii_gcf_000243715.
leptospira_congkakensis_gcf_004770265.
leptospira_dzianensis_gcf_004770135.
leptospira_dzoumogneensis_gcf_004770895.
leptospira_ellinghausenii_gcf_003114815.
leptospira_elllisii_gcf_002811955.
leptospira_fainei_gcf_000306235.
leptospira_fletcheri_gcf_004769195.
leptospira_fluminis_gcf_004771275.
leptospira_gomen

In [186]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(67)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/leptospira/extract_probes_from_group/multifastas.sqlite  --base-taxon Leptospira_interrogans_GCF_000092565 --output results/phyluce/leptospira/extract_probes_from_group/Leptospira_interrogans_GCF_000092565+10-back-to-67.conf --specific-counts 67;
Counter({'leptospira_tipperaryensis_gcf_001729245': 115, 'leptospira_kirschneri_gcf_000243695': 115, 'leptospira_jelokensis_gcf_004769775': 115, 'leptospira_selangorensis_gcf_004769405': 115, 'leptospira_neocaledonica_gcf_002812205': 115, 'leptospira_johnsonii_gcf_003112675': 115, 'leptospira_venezuelensis_gcf_002150035': 115, 'leptospira_broomii_gcf_000243715': 115, 'leptospira_inadai_gcf_000243675': 115, 'leptospira_gomenensis_gcf_004770155': 115, 'leptospira_bouyouniensis_gcf_004770625': 115, 'leptospira_sp_gcf_000347035': 115, 'leptospira_noguchii_gcf_000306255': 115, 'leptospira_ryugenii_gcf_003114855': 115, 'leptospira_fletcheri_gcf_004769195': 115, 'leptospira_brenneri_gcf_00281

## Final group specific bait design

In [187]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/leptospira/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/leptospira/extract_probes_from_group/Leptospira_interrogans_GCF_000092565+10-back-to-67.conf --probe-prefix uce_leptospira_ --designer rnplattii --design leptospira_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 115
Probe Count = 15390


In [188]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.fasta --query results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  13:19:30
Ended:  Fri Feb 07, 2020  13:20:54
Time for execution:  1.39971166452 minutes


In [189]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.fasta --lastz results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_leptospira_;
Parsing lastz file...
Screening results...
Screened 15389 fasta sequences.  Filtered 0 duplicates. Kept 15390.


## CDhit to reduce numbers

In [190]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.fasta
         -o
         results/phyluce/leptospira/final_probe_design/leptospira_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 13:54:26 2020
                            Output                              
----------------------------------------------------------------
total seq: 15390
longest and shortest : 80 and 80
Total letters: 1231200
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 48M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3938769
Max number of word counting entries: 88661707

# comparing sequences from          0  to       2565
..---------- new table with     1422 representatives
# comparing sequences from     

# Listeria

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [112]:
# Name of the group being processed; it is used to build every data/ and
# results/ path in the cells that follow.
group = "listeria"

In [113]:
# Wipe any previous results for this group, then recreate an empty directory.
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

os.makedirs("results/phyluce/{}".format(group))

In [114]:
# Taxa whose simulated reads are mapped against the reference (Genus_species_accession).
group_taxa = [
    "Listeria_aquatica_GCF_000525795",
    "Listeria_booriae_GCF_000766865",
    "Listeria_cornellensis_GCF_000525855",
    "Listeria_costaricensis_GCF_900183385",
    "Listeria_fleischmannii_GCF_000252625",
    "Listeria_floridensis_GCF_000525875",
    "Listeria_goaensis_GCF_900186125",
    "Listeria_grandensis_GCF_000525835",
    "Listeria_grayi_GCF_000148995",
    "Listeria_innocua_GCF_000195795",
    "Listeria_ivanovii_GCF_900637745",
    "Listeria_kieliensis_GCF_003369925",
    "Listeria_marthii_GCF_000183865",
    "Listeria_newyorkensis_GCF_900461625",
    "Listeria_riparia_GCF_000525995",
    "Listeria_rocourtiae_GCF_000525975",
    "Listeria_seeligeri_GCF_000027145",
    "Listeria_sp_GCF_003856515",
    "Listeria_thailandensis_GCF_900576925",
    "Listeria_weihenstephanensis_GCF_000525955",
    "Listeria_welshimeri_GCF_900187315",
]

# Genome that every other taxon is compared to.
reference_taxon = "Listeria_monocytogenes_GCF_000196035"

# Every genome handled by the pipeline: the group taxa followed by the reference.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [115]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000525795.1_Listeriaceae_bacterium_FSL_S10-1188_genomic.fna.gz

sent 42 bytes  received 773875 bytes  515944.67 bytes/sec
total size is 773553  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system 

GCA_900637745.1_52551_F01_genomic.fna.gz

sent 42 bytes  received 885882 bytes  590616.00 bytes/sec
total size is 885554  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003369925.1_ASM336992v1_genomic.fna.gz

sent 42 bytes  received 773749 bytes  515860.67 bytes/sec
total size is 773451  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_900187315.1_52087_C01_genomic.fna.gz

sent 42 bytes  received 827718 bytes  551840.00 bytes/sec
total size is 827406  speedup is 1.00


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [116]:
# Rewrite every genome with bare ">id" FASTA headers and convert it to 2bit.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Hand the path to SeqIO.parse and let Biopython open it; this drops the
    # explicit "rU" mode, which is deprecated in Python 3.
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            # blank name/description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the "_formatted.2bit" from the formatted FASTA (it used to be built
    # from the raw download), and raise if faToTwoBit exits non-zero instead of
    # silently ignoring the failure.
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    subprocess.check_call(f2bit_cmd.split(" "))

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage (`--fcov 10` in the cell below) of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. Higher coverage gives finer resolution on conserved orthologous regions (25x coverage should provide 4bp resolution), and since my genomes are relatively small I can trade compute time for resolution. Note that the cell below is currently set to 10x rather than 25x.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [117]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483096 ("sim_Listeria_aquatica_GCF_000525795") has been submitted
Your job 5483097 ("sim_Listeria_booriae_GCF_000766865") has been submitted
Your job 5483098 ("sim_Listeria_cornellensis_GCF_000525855") has been submitted
Your job 5483099 ("sim_Listeria_costaricensis_GCF_900183385") has been submitted
Your job 5483100 ("sim_Listeria_fleischmannii_GCF_000252625") has been submitted
Your job 5483101 ("sim_Listeria_floridensis_GCF_000525875") has been submitted
Your job 5483102 ("sim_Listeria_goaensis_GCF_900186125") has been submitted
Your job 5483103 ("sim_Listeria_grandensis_GCF_000525835") has been submitted
Your job 5483104 ("sim_Listeria_grayi_GCF_000148995") has been submitted
Your job 5483105 ("sim_Listeria_innocua_GCF_000195795") has been submitted
Your job 5483106 ("sim_Listeria_ivanovii_GCF_900637745") has been submitted
Your job 5483107 ("sim_Listeria_kieliensis_GCF_003369925") has been submitted
Your job 5483108 ("sim_Listeria_marthii_GCF_000183865") has been submitt

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [118]:
# Recreate the mapping directory (with its logs/ folder) and build the bbmap
# index of the reference genome inside it.
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# reference fasta to index, and the directory that receives the index
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# run locally; the return code is the cell's displayed value
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/listeria/cleaned_genomes/Listeria_monocytogenes_GCF_000196035_formatted.fas path=results/phyluce/listeria/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we require at least 90% identity (`minid=0.90` in the cell below), i.e. we allow up to 10% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [119]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483117 ("map_Listeria_aquatica_GCF_000525795") has been submitted
Your job 5483118 ("map_Listeria_booriae_GCF_000766865") has been submitted
Your job 5483119 ("map_Listeria_cornellensis_GCF_000525855") has been submitted
Your job 5483120 ("map_Listeria_costaricensis_GCF_900183385") has been submitted
Your job 5483121 ("map_Listeria_fleischmannii_GCF_000252625") has been submitted
Your job 5483122 ("map_Listeria_floridensis_GCF_000525875") has been submitted
Your job 5483123 ("map_Listeria_goaensis_GCF_900186125") has been submitted
Your job 5483124 ("map_Listeria_grandensis_GCF_000525835") has been submitted
Your job 5483125 ("map_Listeria_grayi_GCF_000148995") has been submitted
Your job 5483126 ("map_Listeria_innocua_GCF_000195795") has been submitted
Your job 5483127 ("map_Listeria_ivanovii_GCF_900637745") has been submitted
Your job 5483128 ("map_Listeria_kieliensis_GCF_003369925") has been submitted
Your job 5483129 ("map_Listeria_marthii_GCF_000183865") has been submitt

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [120]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5483138 ("merge_Listeria_aquatica_GCF_000525795") has been submitted
Your job 5483139 ("merge_Listeria_booriae_GCF_000766865") has been submitted
Your job 5483140 ("merge_Listeria_cornellensis_GCF_000525855") has been submitted
Your job 5483141 ("merge_Listeria_costaricensis_GCF_900183385") has been submitted
Your job 5483142 ("merge_Listeria_fleischmannii_GCF_000252625") has been submitted
Your job 5483143 ("merge_Listeria_floridensis_GCF_000525875") has been submitted
Your job 5483144 ("merge_Listeria_goaensis_GCF_900186125") has been submitted
Your job 5483145 ("merge_Listeria_grandensis_GCF_000525835") has been submitted
Your job 5483146 ("merge_Listeria_grayi_GCF_000148995") has been submitted
Your job 5483147 ("merge_Listeria_innocua_GCF_000195795") has been submitted
Your job 5483148 ("merge_Listeria_ivanovii_GCF_900637745") has been submitted
Your job 5483149 ("merge_Listeria_kieliensis_GCF_003369925") has been submitted
Your job 5483150 ("merge_Listeria_marthii_GCF_00

Remove loci that were masked in the original genome.

In [121]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 196 sequences from Listeria_aquatica_GCF_000525795_merged.bed.  Filtered 131 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 65.
Screened 308 sequences from Listeria_booriae_GCF_000766865_merged.bed.  Filtered 205 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 103.
Screened 220 sequences from Listeria_cornellensis_GCF_000525855_merged.bed.  Filtered 138 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 82.
Screened 248 sequences from Listeria_costaricensis_GCF_900183385_merged.bed.  Filtered 164 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 84.
Screened 268 sequences from Listeria_fleischmannii_GCF_000252625_merged.bed.  Filtered 176 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 92.
Screened 242 sequences from Listeria_floridensis_GCF_000525875_merged.bed.  Filtered 167 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 75.
Screened 263 sequences from Listeria_goaensis_GCF_900186125_merge

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [122]:
# Write the conf file that maps each taxon to its stripped BED file.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    # section header
    outfile.write("[beds]\n")

    # one "taxon:path" entry per non-reference taxon
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [123]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/listeria/initial_intervals/Listeria_welshimeri_GCF_900187315_merged.bed --twobit results/phyluce/listeria/cleaned_genomes/Listeria_monocytogenes_GCF_000196035_formatted.2bit --output results/phyluce/listeria/initial_intervals/Listeria_welshimeri_GCF_900187315_stripped.bed;
listeria_aquatica_gcf_000525795.
listeria_booriae_gcf_000766865.
listeria_cornellensis_gcf_000525855.
listeria_costaricensis_gcf_900183385.
listeria_fleischmannii_gcf_000252625.
listeria_floridensis_gcf_000525875.
listeria_goaensis_gcf_900186125.
listeria_grandensis_gcf_000525835.
listeria_grayi_gcf_000148995.
listeria_innocua_gcf_000195795...
listeria_ivanovii_gcf_900637745..
listeria_kieliensis_gcf_003369925.
listeria_marthii_gcf_000183865...
listeria_newyorkensis_gcf_900461625.
listeria_riparia_gcf_000525995.
listeria_rocourtiae_gcf_000525975.
listeria_seeligeri_gcf_000027145..
listeria_sp_gcf_003856515.
listeria_tha

Quantify probes and the number of targeted taxa for each.

In [124]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/listeria/initial_intervals/Listeria_welshimeri_GCF_900187315_merged.bed --twobit results/phyluce/listeria/cleaned_genomes/Listeria_monocytogenes_GCF_000196035_formatted.2bit --output results/phyluce/listeria/initial_intervals/Listeria_welshimeri_GCF_900187315_stripped.bed;
Loci shared by Listeria_monocytogenes_GCF_000196035 + 0 taxa:	3,661.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 1 taxa:	3,661.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 2 taxa:	2,643.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 3 taxa:	2,008.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 4 taxa:	1,323.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 5 taxa:	913.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 6 taxa:	214.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 7 taxa:	156.0
Loci shared by Listeria_monocytogenes_GCF_000196035 + 8 taxa:	132.0
Loci 

In [125]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 6
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/listeria/initial_intervals/listeria-to-Listeria_monocytogenes_GCF_000196035.sqlite --base-taxon Listeria_monocytogenes_GCF_000196035 --output results/phyluce/listeria/initial_intervals/Listeria_monocytogenes_GCF_000196035_+6.bed --specific-counts 6;
Counter({'listeria_welshimeri_gcf_900187315': 213, 'listeria_seeligeri_gcf_000027145': 212, 'listeria_innocua_gcf_000195795': 211, 'listeria_ivanovii_gcf_900637745': 210, 'listeria_marthii_gcf_000183865': 199, 'listeria_goaensis_gcf_900186125': 100, 'listeria_grayi_gcf_000148995': 95, 'listeria_newyorkensis_gcf_900461625': 92, 'listeria_fleischmannii_gcf_000252625': 90, 'listeria_weihenstephanensis_gcf_000525955': 89, 'listeria_booriae_gcf_000766865': 89, 'listeria_sp_gcf_003856515': 87, 'listeria_cornellensis_gcf_000525855': 85, 'listeria_costaricensis_gcf_900183385': 83, 'listeria_grandensis_gcf_000525835': 80, 'listeria_rocourtiae_gcf_000525975': 78, 'listeria_riparia_gcf_0005259

## Design temp set of baits

In [126]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/listeria/initial_intervals/Listeria_monocytogenes_GCF_000196035_+6.bed --twobit results/phyluce/listeria/cleaned_genomes/Listeria_monocytogenes_GCF_000196035_formatted.2bit --buffer-to 160 --output results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6.fasta;
Screened 214 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 214.


Design the baits.

In [127]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6.fasta --probe-prefix uce_listeria_ --design listeria_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGG


Conserved locus count = 213
Probe Count = 424


## Find duplicate baited regions

In [128]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas --query results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  10:43:07
Ended:  Fri Feb 07, 2020  10:43:07
Time for execution:  0.00404454867045 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas                        --lastz results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes_vself.lastz                       --probe-prefix=uce_listeria_;
Parsing lastz file...
Screening results...
Screened 423 fasta sequences.  Filtered 19 duplicates. Kept 386.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [129]:
# Give every taxon its own sub-directory holding a copy of its .2bit genome:
#   cleaned_genomes/<taxon>/<taxon>.2bit
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [130]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/listeria/validate_intervals/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas --scaffoldlist Listeria_aquatica_GCF_000525795 Listeria_booriae_GCF_000766865 Listeria_cornellensis_GCF_000525855 Listeria_costaricensis_GCF_900183385 Listeria_fleischmannii_GCF_000252625 Listeria_floridensis_GCF_000525875 Listeria_goaensis_GCF_900186125 Listeria_grandensis_GCF_000525835 Listeria_grayi_GCF_000148995 Listeria_innocua_GCF_000195795 Listeria_ivanovii_GCF_900637745 Listeria_kieliensis_GCF_003369925 Listeria_marthii_GCF_000183865 Listeria_newyorkensis_GCF_900461625 Listeria_riparia_GCF_000525995 Listeria_rocourtiae_GCF_000525975 Listeria_seeligeri_GCF_000027145 Listeria_sp_GCF_003856515 Listeria_thailandensis_GCF_900576925 Listeria_weihenstephanensis_GCF_000525955 Listeria_welshimeri_GCF_900187315 Listeria_monocytogenes_GCF_000196035 --genome-base-path results/phyluce/listeria/cleaned_genomes --identity 30 --cores 4 --db re

Inserting data to Listeria_marthii_GCF_000183865 table

Running against Listeria_newyorkensis_GCF_900461625.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpaTHJLW.fasta

Writing the results file...
	/tmp/tmpoe7aR1.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/listeria/validate_intervals/lastz/Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas_v_Listeria_newyorkensis_GCF_900461625.lastz
Creating Listeria_newyorkensis_GCF_900461625 table
Inserting data to Listeria_newyorkensis_GCF_900461625 table

Running against Listeria_riparia_GCF_000525995.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmptsfym0.fasta

Writing the results file...
	/tmp/tmpBH4ick.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/listeria/validate_intervals/lastz/Listeria_mono

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [131]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/listeria/extract_probes_from_group/listeria_genome.conf --lastz results/phyluce/listeria/validate_intervals/lastz --probes 120 --probe-prefix uce_listeria_ --name-pattern "Listeria_monocytogenes_GCF_000196035_+6_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/listeria/extract_probes_from_group/probe_fasta;
2020-02-07 10:43:50,291 - Phyluce - INFO - ------- Working on Listeria_aquatica_GCF_000525795 genome -------
2020-02-07 10:43:50,292 - Phyluce - INFO - Reading Listeria_aquatica_GCF_000525795 genome
2020-02-07 10:43:51,135 - Phyluce - INFO - Listeria_aquatica_GCF_000525795: 210 uces, 25 dupes, 185 non-dupes, 0 orient drop, 0 length drop, 185 written
2020-02-07 10:43:51,135 - Phyluce - INFO - -------- Working on Listeria_booriae_GCF_000766865 genome -------
2020-02-07 10:43:51,135 - Phyluce - INFO - Reading Listeria_booriae_GCF_000766865 genome
2020-02-07 10:43:51,898 - Phyluce - INFO - Listeria_booriae_GCF_00

In [132]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/listeria/extract_probes_from_group/probe_fasta --output results/phyluce/listeria/extract_probes_from_group/multifastas.sqlite --base-taxon Listeria_monocytogenes_GCF_000196035;
listeria_aquatica_gcf_000525795.
listeria_booriae_gcf_000766865.
listeria_cornellensis_gcf_000525855.
listeria_costaricensis_gcf_900183385.
listeria_fleischmannii_gcf_000252625.
listeria_floridensis_gcf_000525875.
listeria_goaensis_gcf_900186125.
listeria_grandensis_gcf_000525835.
listeria_grayi_gcf_000148995.
listeria_innocua_gcf_000195795.
listeria_ivanovii_gcf_900637745.
listeria_kieliensis_gcf_003369925.
listeria_marthii_gcf_000183865.
listeria_newyorkensis_gcf_900461625.
listeria_riparia_gcf_000525995.
listeria_rocourtiae_gcf_000525975.
listeria_seeligeri_gcf_000027145.
listeria_sp_gcf_003856515.
listeria_thailandensis_gcf_900576925.
listeria_weihenstephanensis_gcf_000525955.
listeria_welshimeri_gcf_900187315.
listeria_monocytogenes_gcf_000196035.

In [133]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(20)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/listeria/extract_probes_from_group/multifastas.sqlite  --base-taxon Listeria_monocytogenes_GCF_000196035 --output results/phyluce/listeria/extract_probes_from_group/Listeria_monocytogenes_GCF_000196035+6-back-to-20.conf --specific-counts 20;
Counter({'listeria_goaensis_gcf_900186125': 145, 'listeria_floridensis_gcf_000525875': 145, 'listeria_fleischmannii_gcf_000252625': 145, 'listeria_welshimeri_gcf_900187315': 145, 'listeria_aquatica_gcf_000525795': 145, 'listeria_cornellensis_gcf_000525855': 145, 'listeria_weihenstephanensis_gcf_000525955': 145, 'listeria_kieliensis_gcf_003369925': 145, 'listeria_thailandensis_gcf_900576925': 145, 'listeria_rocourtiae_gcf_000525975': 145, 'listeria_grayi_gcf_000148995': 145, 'listeria_riparia_gcf_000525995': 145, 'listeria_sp_gcf_003856515': 145, 'listeria_monocytogenes_gcf_000196035': 145, 'listeria_newyorkensis_gcf_900461625': 145, 'listeria_seeligeri_gcf_000027145': 144, 'listeria_ivanovi

## Final group specific bait design

In [134]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/listeria/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/listeria/extract_probes_from_group/Listeria_monocytogenes_GCF_000196035+6-back-to-20.conf --probe-prefix uce_listeria_ --designer rnplattii --design listeria_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.fasta;
GGGGGGGGGGGNNGGGGGNGGGNNNNNGGGGGGNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 145
Probe Count = 6264


In [135]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.fasta --query results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  10:44:32
Ended:  Fri Feb 07, 2020  10:44:47
Time for execution:  0.244520934423 minutes


In [136]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.fasta --lastz results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_listeria_;
Parsing lastz file...
Screening results...
Screened 6263 fasta sequences.  Filtered 2 duplicates. Kept 6180.


## CD-HIT to reduce probe numbers

In [137]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.fasta
         -o
         results/phyluce/listeria/final_probe_design/listeria_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 10:50:30 2020
                            Output                              
----------------------------------------------------------------
total seq: 6264
longest and shortest : 80 and 80
Total letters: 501120
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3954331
Max number of word counting entries: 89011995

# comparing sequences from          0  to       1044
.---------- new table with      645 representatives
# comparing sequences from       1044  to 