## Prep python

In [1]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

# All "data/..." and "results/..." paths used in this notebook are relative and resolve
# against this working directory.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
os.chdir("/master/nplatt/pathogen_probes/")


def wait_on_running_jobs(poll_interval=60):
    """Block until `qstat` reports no remaining jobs.

    `qstat` is polled through the shell; every line after the first two
    (treated as the header, as in the original implementation) is counted as
    one job.  While jobs remain, a "." is printed after each wait so the
    notebook shows progress.

    Parameters
    ----------
    poll_interval : int or float, optional
        Seconds to sleep between two polls (default 60).

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If `qstat` exits with a non-zero status.
    """
    while True:
        qstat_out = subprocess.check_output('qstat', shell=True)

        # check_output returns bytes on Python 3; decode before splitting on
        # newlines, otherwise str/bytes mixing raises TypeError.
        if isinstance(qstat_out, bytes):
            qstat_out = qstat_out.decode()

        # splitlines() does not yield a trailing empty element, so the count
        # is exact; an empty qstat output means the queue is empty.
        num_jobs = max(len(qstat_out.splitlines()) - 2, 0)

        # Return as soon as the queue is empty instead of sleeping once more.
        if num_jobs == 0:
            return

        time.sleep(poll_interval)
        print(".")

# Apicomplexa

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon label should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [31]:
# Name of the group being processed; the cells below use it to build every
# data/ and results/ path.
group = 'apicomplexa'

In [87]:
# Start from an empty results directory for this group: drop any previous run,
# then recreate the directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)
os.makedirs(group_results_dir)

In [32]:
# Taxa compared against the reference genome, one label per assembly in the form
# Genus_species_<accession without the version suffix>.
# NOTE(review): several genus spellings below (Cryptospridium, Cyclospra,
# Cystoisospra, Neospra) differ from the usual spellings (Cryptosporidium,
# Cyclospora, Cystoisospora, Neospora).  They are left untouched because these
# labels are used to build genome file names in later cells -- confirm against
# the files on disk / the accession table before correcting them.
group_taxa = [ "Ascogregarina_taiwanensis_GCA_000172235",
               "Babesia_bigemina_GCF_000981445",
               "Babesia_bovis_GCF_000165395",
               "Babesia_divergens_GCA_001077455",
               "Babesia_microti_GCF_000691945",
               "Babesia_ovata_GCF_002897235",
               "Babesia_sp_GCF_002095265",
               "Besnoitia_besnoiti_GCF_002563875",
               "Cryptospridium_andersoni_GCA_001865355",
               "Cryptospridium_baileyi_GCA_001593455",
               "Cryptospridium_bovis_GCA_009768925",
               "Cryptospridium_cuniculus_GCA_004337835",
               "Cryptospridium_hominis_GCA_000006425",
               "Cryptospridium_meleagridis_GCA_001593445",
               "Cryptospridium_muris_GCF_000006515",
               "Cryptospridium_parvum_GCF_000165345",
               "Cryptospridium_ryanae_GCA_009792415",
               "Cryptospridium_sp_GCA_004936735",
               "Cryptospridium_tyzzeri_GCA_007210665",
               "Cryptospridium_ubiquitum_GCF_001865345",
               "Cryptospridium_viatorum_GCA_004337795",
               "Cyclospra_cayetanensis_GCF_002999335",
               "Cystoisospra_suis_GCA_002600585",
               "Eimeria_acervulina_GCF_000499425",
               "Eimeria_brunetti_GCA_000499725",
               "Eimeria_falciformis_GCA_002271815",
               "Eimeria_maxima_GCF_000499605",
               "Eimeria_mitis_GCA_000499745",
               "Eimeria_necatrix_GCF_000499385",
               "Eimeria_nieschulzi_GCA_000826945",
               "Eimeria_praecox_GCA_000499445",
               "Eimeria_tenella_GCF_000499545",
               "Gregarina_niphandrodes_GCF_000223845",
               "Haemoproteus_tartakovskyi_GCA_001625125",
               "Hammondia_hammondi_GCA_000447165",
               "Neospra_caninum_GCF_000208865",
               "Nephromyces_sp_GCA_004523865",
               "Plasmodium_berghei_GCA_900002375",
               "Plasmodium_brasilianum_GCA_001885115",
               "Plasmodium_chabaudi_GCA_900002335",
               "Plasmodium_coatneyi_GCF_001680005",
               "Plasmodium_cynomolgi_GCF_000321355",
               "Plasmodium_fragile_GCF_000956335",
               "Plasmodium_gaboni_GCF_001602025",
               "Plasmodium_gallinaceum_GCF_900005855",
               "Plasmodium_gonderi_GCF_002157705",
               "Plasmodium_inui_GCF_000524495",
               "Plasmodium_knowlesi_GCF_000006355",
               "Plasmodium_malariae_GCF_900090045",
               "Plasmodium_ovale_GCA_900090025",
               "Plasmodium_reichenowi_GCF_001601855",
               "Plasmodium_relictum_GCF_900005765",
               "Plasmodium_sp_GCF_900097015",
               "Plasmodium_vinckei_GCF_000709005",
               "Plasmodium_vivax_GCF_000002415",
               "Plasmodium_yoelii_GCA_900002385",
               "Sarcocystis_neurona_GCA_000727475",
               "Theileria_annulata_GCF_000003225",
               "Theileria_equi_GCF_000342415",
               "Theileria_orientalis_GCF_000740895",
               "Theileria_parva_GCF_000165365",
               "Toxoplasma_gondii_GCF_000006565" ]

# Genome that the simulated reads of every taxon in group_taxa are mapped against.
reference_taxon = "Plasmodium_falciparum_GCA_000002765"

# Reference included: used by steps that must process every genome.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [89]:
# Download the genome FASTA for every accession listed for this group and rename
# each file to the standard Genus_species_<accession> form (no GFFs are fetched here).
# Any previous download directory for the group is deleted first.
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

# NOTE(review): the surrounding text refers to data/genomes/*_accessions.csv, but this
# reads data/<group>_accessions.csv -- confirm which location is correct.
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
# one [accession, ftp, original name, new name] record per downloaded genome
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession (the FtpPath_GenBank element of the assembly summary)
    #NOTE(review): ftp[0] fails with IndexError if the query prints nothing -- no guard here
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to <assembly dir>/<assembly dir name>_genomic.fna.gz and download it with rsync
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas: the accession's version suffix is dropped
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp directory, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000172235.1_ASM17223v1_genomic.fna.gz

sent 42 bytes  received 1911069 bytes  764444.40 bytes/sec
total size is 1910492  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

GCA_009768925.1_ASM976892v1_genomic.fna.gz

sent 42 bytes  received 2893793 bytes  1157534.00 bytes/sec
total size is 2892975  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_004337835.1_UKCU2.v0_genomic.fna.gz

sent 42 bytes  received 2384552 bytes  953837.60 bytes/sec
total size is 2383865  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, 

GCA_002999335.1_CcayRef3_genomic.fna.gz

sent 42 bytes  received 14143402 bytes  5657377.60 bytes/sec
total size is 14139842  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002600585.1_ASM260058v1_genomic.fna.gz

sent 42 bytes  received 26643909 bytes  5920878.00 bytes/sec
total size is 26637298  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compu

GCA_000223845.4_GNI3_genomic.fna.gz

sent 42 bytes  received 4126417 bytes  2750972.67 bytes/sec
total size is 4125310  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001625125.1_ASM162512v1_genomic.fna.gz

sent 42 bytes  received 7082052 bytes  2023455.43 bytes/sec
total size is 7080210  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, net

GCA_000956335.1_Plas_frag_nilgiri_V1_genomic.fna.gz

sent 42 bytes  received 7428007 bytes  2971219.60 bytes/sec
total size is 7426076  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001602025.1_ASM160202v1_genomic.fna.gz

sent 42 bytes  received 5758622 bytes  2303465.60 bytes/sec
total size is 5757108  speedup is 1.00


You are accessing a U.S. Government information system which includes th

GCA_000709005.1_Plas_vinc_vinckei_V1_genomic.fna.gz

sent 42 bytes  received 5574903 bytes  2229978.00 bytes/sec
total size is 5573420  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000002415.2_ASM241v2_genomic.fna.gz

sent 42 bytes  received 8568852 bytes  3427557.60 bytes/sec
total size is 8566652  speedup is 1.00


You are accessing a U.S. Government information system which includes this


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [90]:
# Rebuild the cleaned-genome directory from scratch, then for every genome
# (reference included) write
#   <sample>_formatted.fas  : same records, FASTA header reduced to the record id
#   <sample>_formatted.2bit : 2bit version of that cleaned FASTA
cleaned_genomes_dir = "results/phyluce/" + group + "/cleaned_genomes"

if os.path.exists(cleaned_genomes_dir):
    shutil.rmtree(cleaned_genomes_dir)

os.makedirs(cleaned_genomes_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_genomes_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_genomes_dir + "/" + sample + "_formatted.2bit"

    # Plain "r" mode: the old "U" flag is rejected by Python 3.11+ and text mode
    # already handles universal newlines.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            # blank name/description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Convert the *cleaned* FASTA (not the raw download) so the 2bit matches the
    # "_formatted" file it is named after.  Arguments are passed as a list, so
    # paths are never split on spaces, and check_call stops the notebook if
    # faToTwoBit fails instead of silently leaving a missing/partial 2bit.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text.

In [91]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484306 ("sim_Ascogregarina_taiwanensis_GCA_000172235") has been submitted
Your job 5484307 ("sim_Babesia_bigemina_GCF_000981445") has been submitted
Your job 5484308 ("sim_Babesia_bovis_GCF_000165395") has been submitted
Your job 5484309 ("sim_Babesia_divergens_GCA_001077455") has been submitted
Your job 5484310 ("sim_Babesia_microti_GCF_000691945") has been submitted
Your job 5484311 ("sim_Babesia_ovata_GCF_002897235") has been submitted
Your job 5484312 ("sim_Babesia_sp_GCF_002095265") has been submitted
Your job 5484313 ("sim_Besnoitia_besnoiti_GCF_002563875") has been submitted
Your job 5484314 ("sim_Cryptospridium_andersoni_GCA_001865355") has been submitted
Your job 5484315 ("sim_Cryptospridium_baileyi_GCA_001593455") has been submitted
Your job 5484316 ("sim_Cryptospridium_bovis_GCA_009768925") has been submitted
Your job 5484317 ("sim_Cryptospridium_cuniculus_GCA_004337835") has been submitted
Your job 5484318 ("sim_Cryptospridium_hominis_GCA_000006425") has been subm

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [92]:
# Reset the mapping directory (bbmap index + per-job logs) for this group.
map_reads_dir = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_reads_dir):
    shutil.rmtree(map_reads_dir)
os.makedirs(map_reads_dir + "/logs")

# Index the cleaned reference genome with bbmap; path= is the directory handed to bbmap.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_reads_dir + "/"

bbmap_args = ["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir]
bbmap_cmd  = " ".join(bbmap_args)

print(bbmap_cmd)
# split on single spaces, exactly as the command string is printed above
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/apicomplexa/cleaned_genomes/Plasmodium_falciparum_GCA_000002765_formatted.fas path=results/phyluce/apicomplexa/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence between the reference genome and simulated read (`minid=0.90` in the command below).  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [93]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484368 ("map_Ascogregarina_taiwanensis_GCA_000172235") has been submitted
Your job 5484369 ("map_Babesia_bigemina_GCF_000981445") has been submitted
Your job 5484370 ("map_Babesia_bovis_GCF_000165395") has been submitted
Your job 5484371 ("map_Babesia_divergens_GCA_001077455") has been submitted
Your job 5484372 ("map_Babesia_microti_GCF_000691945") has been submitted
Your job 5484373 ("map_Babesia_ovata_GCF_002897235") has been submitted
Your job 5484374 ("map_Babesia_sp_GCF_002095265") has been submitted
Your job 5484375 ("map_Besnoitia_besnoiti_GCF_002563875") has been submitted
Your job 5484376 ("map_Cryptospridium_andersoni_GCA_001865355") has been submitted
Your job 5484377 ("map_Cryptospridium_baileyi_GCA_001593455") has been submitted
Your job 5484378 ("map_Cryptospridium_bovis_GCA_009768925") has been submitted
Your job 5484379 ("map_Cryptospridium_cuniculus_GCA_004337835") has been submitted
Your job 5484380 ("map_Cryptospridium_hominis_GCA_000006425") has been subm

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [94]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484430 ("merge_Ascogregarina_taiwanensis_GCA_000172235") has been submitted
Your job 5484431 ("merge_Babesia_bigemina_GCF_000981445") has been submitted
Your job 5484432 ("merge_Babesia_bovis_GCF_000165395") has been submitted
Your job 5484433 ("merge_Babesia_divergens_GCA_001077455") has been submitted
Your job 5484434 ("merge_Babesia_microti_GCF_000691945") has been submitted
Your job 5484435 ("merge_Babesia_ovata_GCF_002897235") has been submitted
Your job 5484436 ("merge_Babesia_sp_GCF_002095265") has been submitted
Your job 5484437 ("merge_Besnoitia_besnoiti_GCF_002563875") has been submitted
Your job 5484438 ("merge_Cryptospridium_andersoni_GCA_001865355") has been submitted
Your job 5484439 ("merge_Cryptospridium_baileyi_GCA_001593455") has been submitted
Your job 5484440 ("merge_Cryptospridium_bovis_GCA_009768925") has been submitted
Your job 5484441 ("merge_Cryptospridium_cuniculus_GCA_004337835") has been submitted
Your job 5484442 ("merge_Cryptospridium_hominis_GCA

remove loci that were masked in the original genome

In [33]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 15 sequences from Ascogregarina_taiwanensis_GCA_000172235_merged.bed.  Filtered 13 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2.
Screened 23 sequences from Babesia_bigemina_GCF_000981445_merged.bed.  Filtered 18 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 5.
Screened 28 sequences from Babesia_bovis_GCF_000165395_merged.bed.  Filtered 24 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 21 sequences from Babesia_divergens_GCA_001077455_merged.bed.  Filtered 18 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3.
Screened 14 sequences from Babesia_microti_GCF_000691945_merged.bed.  Filtered 11 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3.
Screened 19 sequences from Babesia_ovata_GCF_002897235_merged.bed.  Filtered 15 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 31 sequences from Babesia_sp_GCF_002095265_merged.bed.  Filtered 28 with > 25.0% masked bases 

Screened 314 sequences from Plasmodium_vivax_GCF_000002415_merged.bed.  Filtered 271 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 43.
Screened 3160 sequences from Plasmodium_yoelii_GCA_900002385_merged.bed.  Filtered 2738 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 422.
Screened 62 sequences from Sarcocystis_neurona_GCA_000727475_merged.bed.  Filtered 60 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2.
Screened 41 sequences from Theileria_annulata_GCF_000003225_merged.bed.  Filtered 37 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 30 sequences from Theileria_equi_GCF_000342415_merged.bed.  Filtered 26 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 36 sequences from Theileria_orientalis_GCF_000740895_merged.bed.  Filtered 32 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 25 sequences from Theileria_parva_GCF_000165365_merged.bed.  Filtered 21 with > 

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [34]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

# Conf layout: a "[beds]" header line, then one "<taxon>:<stripped bed path>" line
# per taxon in the group.
conf_lines = ["[beds]\n"]
conf_lines.extend("{0}:{1}{0}_stripped.bed\n".format(sample, o_dir) for sample in group_taxa)

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [35]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/apicomplexa/initial_intervals/Toxoplasma_gondii_GCF_000006565_merged.bed --twobit results/phyluce/apicomplexa/cleaned_genomes/Plasmodium_falciparum_GCA_000002765_formatted.2bit --output results/phyluce/apicomplexa/initial_intervals/Toxoplasma_gondii_GCF_000006565_stripped.bed;
ascogregarina_taiwanensis_gca_000172235.
babesia_bigemina_gcf_000981445.
babesia_bovis_gcf_000165395.
babesia_divergens_gca_001077455.
babesia_microti_gcf_000691945.
babesia_ovata_gcf_002897235.
babesia_sp_gcf_002095265.
besnoitia_besnoiti_gcf_002563875.
cryptospridium_andersoni_gca_001865355.
cryptospridium_baileyi_gca_001593455.
cryptospridium_bovis_gca_009768925.
cryptospridium_cuniculus_gca_004337835.
cryptospridium_hominis_gca_000006425.
cryptospridium_meleagridis_gca_001593445.
cryptospridium_muris_gcf_000006515.
cryptospridium_parvum_gcf_000165345.
cryptospridium_ryanae_gca_009792415.
cryptospridium_sp_gca_00

Quantify probes and the number of targeted taxa for each.

In [36]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/apicomplexa/initial_intervals/apicomplexa-to-Plasmodium_falciparum_GCA_000002765.sqlite --base-taxon Plasmodium_falciparum_GCA_000002765
Loci shared by Plasmodium_falciparum_GCA_000002765 + 0 taxa:	2,585.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 1 taxa:	2,585.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 2 taxa:	1,695.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 3 taxa:	836.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 4 taxa:	604.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 5 taxa:	461.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 6 taxa:	356.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 7 taxa:	278.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 8 taxa:	210.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 9 taxa:	166.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 10 taxa:	135.0
Loci shared by Plasmodium_falciparum_GCA_000002765 + 11 taxa

In [38]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 2
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/apicomplexa/initial_intervals/apicomplexa-to-Plasmodium_falciparum_GCA_000002765.sqlite --base-taxon Plasmodium_falciparum_GCA_000002765 --output results/phyluce/apicomplexa/initial_intervals/Plasmodium_falciparum_GCA_000002765_+2.bed --specific-counts 2;
Counter({'plasmodium_sp_gcf_900097015': 1239, 'plasmodium_gaboni_gcf_001602025': 1150, 'plasmodium_gallinaceum_gcf_900005855': 490, 'plasmodium_relictum_gcf_900005765': 471, 'plasmodium_yoelii_gca_900002385': 428, 'plasmodium_malariae_gcf_900090045': 414, 'plasmodium_vinckei_gcf_000709005': 409, 'plasmodium_brasilianum_gca_001885115': 404, 'plasmodium_chabaudi_gca_900002335': 387, 'plasmodium_berghei_gca_900002375': 341, 'plasmodium_ovale_gca_900090025': 257, 'plasmodium_gonderi_gcf_002157705': 238, 'plasmodium_reichenowi_gcf_001601855': 180, 'haemoproteus_tartakovskyi_gca_001625125': 66, 'plasmodium_coatneyi_gcf_001680005': 47, 'plasmodium_vivax_gcf_000002415': 45, 'plasmodiu

## Design temp set of baits

In [39]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/apicomplexa/initial_intervals/Plasmodium_falciparum_GCA_000002765_+2.bed --twobit results/phyluce/apicomplexa/cleaned_genomes/Plasmodium_falciparum_GCA_000002765_formatted.2bit --buffer-to 160 --output results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2.fasta;
Screened 1695 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 1695.


design the baits

In [40]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2.fasta --probe-prefix uce_apicomplexa_ --design apicomplexa_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

## Find duplicate baited regions

In [41]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas --query results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  08:31:36
Ended:  Mon Feb 10, 2020  08:31:39
Time for execution:  0.0589553833008 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas                        --lastz results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes_vself.lastz                       --probe-prefix=uce_apicomplexa_;
Parsing lastz file...
Screening results...
Screened 1483 fasta sequences.  Filtered 117 duplicates. K

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [42]:
# Give every taxon its own cleaned_genomes/<taxon>/<taxon>.2bit file.
# NOTE: the 2bit files are copied here, not soft-linked.
for sample in all_taxa:

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [43]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/apicomplexa/validate_intervals/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas --scaffoldlist Ascogregarina_taiwanensis_GCA_000172235 Babesia_bigemina_GCF_000981445 Babesia_bovis_GCF_000165395 Babesia_divergens_GCA_001077455 Babesia_microti_GCF_000691945 Babesia_ovata_GCF_002897235 Babesia_sp_GCF_002095265 Besnoitia_besnoiti_GCF_002563875 Cryptospridium_andersoni_GCA_001865355 Cryptospridium_baileyi_GCA_001593455 Cryptospridium_bovis_GCA_009768925 Cryptospridium_cuniculus_GCA_004337835 Cryptospridium_hominis_GCA_000006425 Cryptospridium_meleagridis_GCA_001593445 Cryptospridium_muris_GCF_000006515 Cryptospridium_parvum_GCF_000165345 Cryptospridium_ryanae_GCA_009792415 Cryptospridium_sp_GCA_004936735 Cryptospridium_tyzzeri_GCA_007210665 Cryptospridium_ubiquitum_GCF_001865345 Cryptospridium_viatorum_GCA_004337795 Cyclospra_cayetanensis_GCF_002999335 Cystoisospra_suis_GCA_002600585 Eimeria_acervulina_GCF_000499425 

Creating Cryptospridium_baileyi_GCA_001593455 table
Inserting data to Cryptospridium_baileyi_GCA_001593455 table

Running against Cryptospridium_bovis_GCA_009768925.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpl4qUO5.fasta

Writing the results file...
	/tmp/tmp4koWzP.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/apicomplexa/validate_intervals/lastz/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_Cryptospridium_bovis_GCA_009768925.lastz
Creating Cryptospridium_bovis_GCA_009768925 table
Inserting data to Cryptospridium_bovis_GCA_009768925 table

Running against Cryptospridium_cuniculus_GCA_004337835.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp_zQ2Ke.fasta

Writing the results file...
	/tmp/tmpXY1DDd.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes

Running the targets against 5 queries...
	/tmp/tmpywt0HR.fasta
	/tmp/tmpP_basr.fasta
	/tmp/tmpLIKqXn.fasta
	/tmp/tmpBMykat.fasta
	/tmp/tmpHwjqO5.fasta

Writing the results file...
	/tmp/tmpCCxX_q.lastz
	/tmp/tmpVE_Hw7.lastz
	/tmp/tmpD7Pb_T.lastz
	/tmp/tmpwEmPWf.lastz
	/tmp/tmpXF6pYE.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/apicomplexa/validate_intervals/lastz/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_Eimeria_acervulina_GCF_000499425.lastz
Creating Eimeria_acervulina_GCF_000499425 table
Inserting data to Eimeria_acervulina_GCF_000499425 table

Running against Eimeria_brunetti_GCA_000499725.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 7 queries...
	/tmp/tmpMx9z_E.fasta
	/tmp/tmpnSTqGP.fasta
	/tmp/tmpIp0k9_.fasta
	/tmp/tmp3XXICS.fasta
	/tmp/tmpsTNCxJ.fasta
	/tmp/tmptxCEEI.fasta
	/tmp/tmplBBZKT.fasta

Writing the results file...
	/tmp/tmphg_SYO.lastz
	/tmp/tmpyEA8sH.


Running against Hammondia_hammondi_GCA_000447165.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 6 queries...
	/tmp/tmpyhceI_.fasta
	/tmp/tmptoOM3S.fasta
	/tmp/tmpWw6lym.fasta
	/tmp/tmpVyWe3a.fasta
	/tmp/tmpTjySHt.fasta
	/tmp/tmpBncjq8.fasta

Writing the results file...
	/tmp/tmp3ClmzQ.lastz
	/tmp/tmprjzB7H.lastz
	/tmp/tmp04LXPs.lastz
	/tmp/tmpqBy3v9.lastz
	/tmp/tmptaIndL.lastz
	/tmp/tmp2oVDYe.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/apicomplexa/validate_intervals/lastz/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_Hammondia_hammondi_GCA_000447165.lastz
Creating Hammondia_hammondi_GCA_000447165 table
Inserting data to Hammondia_hammondi_GCA_000447165 table

Running against Neospra_caninum_GCF_000208865.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 5 queries...
	/tmp/tmpQ3V5N0.fasta
	/tmp/tmpjVtAfe.fasta
	/tmp/tmp_q

Creating Plasmodium_fragile_GCF_000956335 table
Inserting data to Plasmodium_fragile_GCF_000956335 table

Running against Plasmodium_gaboni_GCF_001602025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpjDm1by.fasta
	/tmp/tmpHZmECQ.fasta
	/tmp/tmpUTMR7c.fasta

Writing the results file...
	/tmp/tmpyssxrD.lastz
	/tmp/tmphFeXjt.lastz
	/tmp/tmpIUBquQ.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/apicomplexa/validate_intervals/lastz/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_Plasmodium_gaboni_GCF_001602025.lastz
Creating Plasmodium_gaboni_GCF_001602025 table
Inserting data to Plasmodium_gaboni_GCF_001602025 table

Running against Plasmodium_gallinaceum_GCF_900005855.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpJXCDmj.fasta
	/tmp/tmpBnnn6z.fasta
	/tmp/tmpLa6w3R.fasta

Writing the 

	/tmp/tmpiEfxAJ.lastz
	/tmp/tmpN80c3H.lastz
	/tmp/tmpxNKQAL.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/apicomplexa/validate_intervals/lastz/Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_Plasmodium_yoelii_GCA_900002385.lastz
Creating Plasmodium_yoelii_GCA_900002385 table
Inserting data to Plasmodium_yoelii_GCA_900002385 table

Running against Sarcocystis_neurona_GCA_000727475.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 11 queries...
	/tmp/tmpJSkzBj.fasta
	/tmp/tmpVi2MzA.fasta
	/tmp/tmpRJuqUu.fasta
	/tmp/tmp1nlll1.fasta
	/tmp/tmpMz08OI.fasta
	/tmp/tmpoRecPW.fasta
	/tmp/tmpAtGhDv.fasta
	/tmp/tmpDaUMA0.fasta
	/tmp/tmphDl4uh.fasta
	/tmp/tmpZf58GE.fasta
	/tmp/tmpiiJXbv.fasta

Writing the results file...
	/tmp/tmpuqJcqa.lastz
	/tmp/tmpgo4c2Y.lastz
	/tmp/tmp0vidyL.lastz
	/tmp/tmpjguxoL.lastz
	/tmp/tmpOUMYTv.lastz
	/tmp/tmprQwaE6.lastz
	/tmp/tmp4Rdirm.lastz
	/tmp/tmppL6Sjl.las

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [44]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/apicomplexa/extract_probes_from_group/apicomplexa_genome.conf --lastz results/phyluce/apicomplexa/validate_intervals/lastz --probes 120 --probe-prefix uce_apicomplexa_ --name-pattern "Plasmodium_falciparum_GCA_000002765_+2_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/apicomplexa/extract_probes_from_group/probe_fasta;
2020-02-10 09:02:58,570 - Phyluce - INFO - --- Working on Ascogregarina_taiwanensis_GCA_000172235 genome ---
2020-02-10 09:02:58,584 - Phyluce - INFO - Reading Ascogregarina_taiwanensis_GCA_000172235 genome
2020-02-10 09:03:01,175 - Phyluce - INFO - Ascogregarina_taiwanensis_GCA_000172235: 110 uces, 35 dupes, 75 non-dupes, 0 orient drop, 0 length drop, 74 written
2020-02-10 09:03:01,176 - Phyluce - INFO - -------- Working on Babesia_bigemina_GCF_000981445 genome -------
2020-02-10 09:03:01,177 - Phyluce - INFO - Reading Babesia_bigemina_GCF_000981445 genome
2020-02-10 09:03:01,927 - Phyluce - IN

2020-02-10 09:03:29,204 - Phyluce - INFO - Cyclospra_cayetanensis_GCF_002999335: 205 uces, 42 dupes, 163 non-dupes, 0 orient drop, 0 length drop, 163 written
2020-02-10 09:03:29,204 - Phyluce - INFO - ------- Working on Cystoisospra_suis_GCA_002600585 genome -------
2020-02-10 09:03:29,209 - Phyluce - INFO - Reading Cystoisospra_suis_GCA_002600585 genome
2020-02-10 09:03:30,287 - Phyluce - INFO - Cystoisospra_suis_GCA_002600585: 243 uces, 70 dupes, 173 non-dupes, 0 orient drop, 3 length drop, 170 written
2020-02-10 09:03:30,287 - Phyluce - INFO - ------- Working on Eimeria_acervulina_GCF_000499425 genome ------
2020-02-10 09:03:30,337 - Phyluce - INFO - Reading Eimeria_acervulina_GCF_000499425 genome
2020-02-10 09:03:31,018 - Phyluce - INFO - Eimeria_acervulina_GCF_000499425: 211 uces, 42 dupes, 169 non-dupes, 0 orient drop, 1 length drop, 168 written
2020-02-10 09:03:31,018 - Phyluce - INFO - -------- Working on Eimeria_brunetti_GCA_000499725 genome -------
2020-02-10 09:03:31,019 - P

2020-02-10 09:04:57,746 - Phyluce - INFO - Plasmodium_gonderi_GCF_002157705: 742 uces, 124 dupes, 618 non-dupes, 6 orient drop, 7 length drop, 605 written
2020-02-10 09:04:57,746 - Phyluce - INFO - -------- Working on Plasmodium_inui_GCF_000524495 genome --------
2020-02-10 09:04:57,749 - Phyluce - INFO - Reading Plasmodium_inui_GCF_000524495 genome
2020-02-10 09:05:01,333 - Phyluce - INFO - Plasmodium_inui_GCF_000524495: 733 uces, 108 dupes, 625 non-dupes, 6 orient drop, 6 length drop, 613 written
2020-02-10 09:05:01,333 - Phyluce - INFO - ------ Working on Plasmodium_knowlesi_GCF_000006355 genome ------
2020-02-10 09:05:01,379 - Phyluce - INFO - Reading Plasmodium_knowlesi_GCF_000006355 genome
2020-02-10 09:05:04,899 - Phyluce - INFO - Plasmodium_knowlesi_GCF_000006355: 726 uces, 114 dupes, 612 non-dupes, 6 orient drop, 6 length drop, 600 written
2020-02-10 09:05:04,900 - Phyluce - INFO - ------ Working on Plasmodium_malariae_GCF_900090045 genome ------
2020-02-10 09:05:04,902 - Phyl

In [45]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/apicomplexa/extract_probes_from_group/probe_fasta --output results/phyluce/apicomplexa/extract_probes_from_group/multifastas.sqlite --base-taxon Plasmodium_falciparum_GCA_000002765;
ascogregarina_taiwanensis_gca_000172235.
babesia_bigemina_gcf_000981445.
babesia_bovis_gcf_000165395.
babesia_divergens_gca_001077455.
babesia_microti_gcf_000691945.
babesia_ovata_gcf_002897235.
babesia_sp_gcf_002095265.
besnoitia_besnoiti_gcf_002563875.
cryptospridium_andersoni_gca_001865355.
cryptospridium_baileyi_gca_001593455.
cryptospridium_bovis_gca_009768925.
cryptospridium_cuniculus_gca_004337835.
cryptospridium_hominis_gca_000006425.
cryptospridium_meleagridis_gca_001593445.
cryptospridium_muris_gcf_000006515.
cryptospridium_parvum_gcf_000165345.
cryptospridium_ryanae_gca_009792415.
cryptospridium_sp_gca_004936735.
cryptospridium_tyzzeri_gca_007210665.
cryptospridium_ubiquitum_gcf_001865345.
cryptospridium_viatorum_gca_004337795.
cyclospr

In [47]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(44)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/apicomplexa/extract_probes_from_group/multifastas.sqlite  --base-taxon Plasmodium_falciparum_GCA_000002765 --output results/phyluce/apicomplexa/extract_probes_from_group/Plasmodium_falciparum_GCA_000002765+2-back-to-44.conf --specific-counts 44;
Counter({'plasmodium_sp_gcf_900097015': 100, 'plasmodium_coatneyi_gcf_001680005': 100, 'plasmodium_inui_gcf_000524495': 100, 'plasmodium_gonderi_gcf_002157705': 99, 'plasmodium_vinckei_gcf_000709005': 99, 'plasmodium_knowlesi_gcf_000006355': 99, 'plasmodium_relictum_gcf_900005765': 98, 'plasmodium_reichenowi_gcf_001601855': 98, 'plasmodium_falciparum_gca_000002765': 98, 'plasmodium_cynomolgi_gcf_000321355': 98, 'plasmodium_chabaudi_gca_900002335': 97, 'plasmodium_malariae_gcf_900090045': 97, 'plasmodium_yoelii_gca_900002385': 97, 'plasmodium_gaboni_gcf_001602025': 97, 'plasmodium_fragile_gcf_000956335': 97, 'plasmodium_vivax_gcf_000002415': 97, 'plasmodium_ovale_gca_900090025': 97, 'pla

## Final group specific bait design

In [48]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/apicomplexa/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/apicomplexa/extract_probes_from_group/Plasmodium_falciparum_GCA_000002765+2-back-to-44.conf --probe-prefix uce_apicomplexa_ --designer rnplattii --design apicomplexa_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [49]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.fasta --query results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Mon Feb 10, 2020  09:09:19
Ended:  Mon Feb 10, 2020  09:09:46
Time for execution:  0.454900900523 minutes


In [50]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.fasta --lastz results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_apicomplexa_;
Parsing lastz file...
Screening results...
Screened 9288 fasta sequences.  Filtered 0 duplicates. Kept 9289.


## CDhit to reduce numbers

In [51]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.fasta
         -o
         results/phyluce/apicomplexa/final_probe_design/apicomplexa_v1-master_probe_list.95P_cdhit

Started: Mon Feb 10 09:19:36 2020
                            Output                              
----------------------------------------------------------------
total seq: 9289
longest and shortest : 80 and 80
Total letters: 743120
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3949185
Max number of word counting entries: 88896166

# comparing sequences from          0  to       1548
.---------- new table with     1265 representatives
# comparing sequences from    

# Cestodes

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so: 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [56]:
# Group name for the cestode run; the cells below build their results/ and data/ paths from it
group = 'cestoda'

In [96]:
# Start the group from an empty results directory.
# WARNING: this deletes any existing results for the group.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [57]:
# Non-reference taxa for the group, named Genus_species_<assembly accession>
group_taxa = [
    "Dibothriocephalus_latus_GCA_900617775",
    "Echinococcus_canadensis_GCA_900004735",
    "Echinococcus_granulosus_GCF_000524195",
    "Echinococcus_multilocularis_GCA_000469725",
    "Echinococcus_oligarthrus_GCA_900683695",
    "Hydatigera_taeniaeformis_GCA_900622495",
    "Hymenolepis_diminuta_GCA_900708905",
    "Hymenolepis_microstoma_GCA_000469805",
    "Mesocestoides_corti_GCA_900604375",
    "Rodentolepis_nana_GCA_900617975",
    "Schistocephalus_solidus_GCA_900618435",
    "Sparganum_proliferum_GCA_902702955",
    "Spirometra_erinaceieuropaei_GCA_000951995",
    "Taenia_asiatica_GCA_001693035",
    "Taenia_saginata_GCA_001693075",
    "Taenia_solium_GCA_001870725",
]

# Genome that every other taxon is compared against
reference_taxon = "Taenia_multiceps_GCA_001923025"

# Every genome in the run: the group taxa followed by the reference
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [98]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900617775.1_D_latum_Geneva_0011_upd_genomic.fna.gz

sent 42 bytes  received 167490987 bytes  9570915.94 bytes/sec
total size is 167449980  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may r

GCA_900618435.1_S_solidus_NST_G2_0011_upd_genomic.fna.gz

sent 42 bytes  received 171382592 bytes  9263926.16 bytes/sec
total size is 171340639  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_902702955.1_SprV_v2_2_genomic.fna.gz

sent 42 bytes  received 197599344 bytes  8782194.93 bytes/sec
total size is 197551007  speedup is 1.00


You are accessing a U.S. Government information system which 

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [99]:
#rewrite every genome with minimal fasta headers and build a 2bit file for it
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

#start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    #blank the name and description of each record so only the sequence id is
    #written to the header.  SeqIO.parse is handed the path and opens the file
    #itself; the previous open(..., "rU") fails on Python >= 3.11, where the
    #"U" mode was removed
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    #convert to 2bit.  check_call raises when faToTwoBit fails, so a missing or
    #broken 2bit file is noticed here and not in a later step; the arguments
    #are passed as a list so they are never split on spaces
    #NOTE(review): the 2bit is built from the downloaded fasta, not from the
    #reformatted one written above - confirm this is intended
    subprocess.check_call(["faToTwoBit", i_genome_file, o_2bit_file])

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [100]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484492 ("sim_Dibothriocephalus_latus_GCA_900617775") has been submitted
Your job 5484493 ("sim_Echinococcus_canadensis_GCA_900004735") has been submitted
Your job 5484494 ("sim_Echinococcus_granulosus_GCF_000524195") has been submitted
Your job 5484495 ("sim_Echinococcus_multilocularis_GCA_000469725") has been submitted
Your job 5484496 ("sim_Echinococcus_oligarthrus_GCA_900683695") has been submitted
Your job 5484497 ("sim_Hydatigera_taeniaeformis_GCA_900622495") has been submitted
Your job 5484498 ("sim_Hymenolepis_diminuta_GCA_900708905") has been submitted
Your job 5484499 ("sim_Hymenolepis_microstoma_GCA_000469805") has been submitted
Your job 5484500 ("sim_Mesocestoides_corti_GCA_900604375") has been submitted
Your job 5484501 ("sim_Rodentolepis_nana_GCA_900617975") has been submitted
Your job 5484502 ("sim_Schistocephalus_solidus_GCA_900618435") has been submitted
Your job 5484503 ("sim_Sparganum_proliferum_GCA_902702955") has been submitted
Your job 5484504 ("sim_Spir

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [101]:
#build the bbmap index of the reference genome inside the mapping directory
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

#start from an empty mapping directory with a logs/ subfolder
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference fasta to index and the folder that will hold the index
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

#show the command, then run it locally; the exit status is the cell's output
print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/cestoda/cleaned_genomes/Taenia_multiceps_GCA_001923025_formatted.fas path=results/phyluce/cestoda/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 15% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [102]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484508 ("map_Dibothriocephalus_latus_GCA_900617775") has been submitted
Your job 5484509 ("map_Echinococcus_canadensis_GCA_900004735") has been submitted
Your job 5484510 ("map_Echinococcus_granulosus_GCF_000524195") has been submitted
Your job 5484511 ("map_Echinococcus_multilocularis_GCA_000469725") has been submitted
Your job 5484512 ("map_Echinococcus_oligarthrus_GCA_900683695") has been submitted
Your job 5484513 ("map_Hydatigera_taeniaeformis_GCA_900622495") has been submitted
Your job 5484514 ("map_Hymenolepis_diminuta_GCA_900708905") has been submitted
Your job 5484515 ("map_Hymenolepis_microstoma_GCA_000469805") has been submitted
Your job 5484516 ("map_Mesocestoides_corti_GCA_900604375") has been submitted
Your job 5484517 ("map_Rodentolepis_nana_GCA_900617975") has been submitted
Your job 5484518 ("map_Schistocephalus_solidus_GCA_900618435") has been submitted
Your job 5484519 ("map_Sparganum_proliferum_GCA_902702955") has been submitted
Your job 5484520 ("map_Spir

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [9]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5488019 ("merge_Dibothriocephalus_latus_GCA_900617775") has been submitted
Your job 5488020 ("merge_Echinococcus_canadensis_GCA_900004735") has been submitted
Your job 5488021 ("merge_Echinococcus_granulosus_GCF_000524195") has been submitted
Your job 5488022 ("merge_Echinococcus_multilocularis_GCA_000469725") has been submitted
Your job 5488023 ("merge_Echinococcus_oligarthrus_GCA_900683695") has been submitted
Your job 5488024 ("merge_Hydatigera_taeniaeformis_GCA_900622495") has been submitted
Your job 5488025 ("merge_Hymenolepis_diminuta_GCA_900708905") has been submitted
Your job 5488026 ("merge_Hymenolepis_microstoma_GCA_000469805") has been submitted
Your job 5488027 ("merge_Mesocestoides_corti_GCA_900604375") has been submitted
Your job 5488028 ("merge_Rodentolepis_nana_GCA_900617975") has been submitted
Your job 5488029 ("merge_Schistocephalus_solidus_GCA_900618435") has been submitted
Your job 5488030 ("merge_Sparganum_proliferum_GCA_902702955") has been submitted
You

remove loci that were masked in the original genome

In [58]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 231 sequences from Dibothriocephalus_latus_GCA_900617775_merged.bed.  Filtered 225 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 47288 sequences from Echinococcus_canadensis_GCA_900004735_merged.bed.  Filtered 26600 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 20688.
Screened 46393 sequences from Echinococcus_granulosus_GCF_000524195_merged.bed.  Filtered 26290 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 20103.
Screened 43636 sequences from Echinococcus_multilocularis_GCA_000469725_merged.bed.  Filtered 25669 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 17967.
Screened 26722 sequences from Echinococcus_oligarthrus_GCA_900683695_merged.bed.  Filtered 18324 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 8398.
Screened 34187 sequences from Hydatigera_taeniaeformis_GCA_900622495_merged.bed.  Filtered 21464 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 12723.
Screene

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [59]:
#write the conf file that lists the stripped bed file of every taxon
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#a [beds] header followed by one "taxon:path" line per taxon
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    conf_lines.append("{}:{}{}_stripped.bed\n".format(sample, o_dir, sample))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [60]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/cestoda/initial_intervals/Taenia_solium_GCA_001870725_merged.bed --twobit results/phyluce/cestoda/cleaned_genomes/Taenia_multiceps_GCA_001923025_formatted.2bit --output results/phyluce/cestoda/initial_intervals/Taenia_solium_GCA_001870725_stripped.bed;
dibothriocephalus_latus_gca_900617775.
echinococcus_canadensis_gca_900004735.....................
echinococcus_granulosus_gcf_000524195.....................
echinococcus_multilocularis_gca_000469725..................
echinococcus_oligarthrus_gca_900683695.........
hydatigera_taeniaeformis_gca_900622495.............
hymenolepis_diminuta_gca_900708905.
hymenolepis_microstoma_gca_000469805.
mesocestoides_corti_gca_900604375.
rodentolepis_nana_gca_900617975.
schistocephalus_solidus_gca_900618435.
sparganum_proliferum_gca_902702955.
spirometra_erinaceieuropaei_gca_000951995.
taenia_asiatica_gca_001693035..........................................

Quantify probes and the number of targeted taxa for each.

In [61]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/cestoda/initial_intervals/cestoda-to-Taenia_multiceps_GCA_001923025.sqlite --base-taxon Taenia_multiceps_GCA_001923025
Loci shared by Taenia_multiceps_GCA_001923025 + 0 taxa:	172,255.0
Loci shared by Taenia_multiceps_GCA_001923025 + 1 taxa:	172,255.0
Loci shared by Taenia_multiceps_GCA_001923025 + 2 taxa:	111,926.0
Loci shared by Taenia_multiceps_GCA_001923025 + 3 taxa:	60,355.0
Loci shared by Taenia_multiceps_GCA_001923025 + 4 taxa:	26,818.0
Loci shared by Taenia_multiceps_GCA_001923025 + 5 taxa:	21,044.0
Loci shared by Taenia_multiceps_GCA_001923025 + 6 taxa:	17,362.0
Loci shared by Taenia_multiceps_GCA_001923025 + 7 taxa:	10,785.0
Loci shared by Taenia_multiceps_GCA_001923025 + 8 taxa:	3,350.0
Loci shared by Taenia_multiceps_GCA_001923025 + 9 taxa:	28.0
Loci shared by Taenia_multiceps_GCA_001923025 + 10 taxa:	12.0
Loci shared by Taenia_multiceps_GCA_001923025 + 11 taxa:	9.0
Loci shared by Taenia_multiceps_GCA_001923025 + 12 

In [62]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 8
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/cestoda/initial_intervals/cestoda-to-Taenia_multiceps_GCA_001923025.sqlite --base-taxon Taenia_multiceps_GCA_001923025 --output results/phyluce/cestoda/initial_intervals/Taenia_multiceps_GCA_001923025_+8.bed --specific-counts 8;
Counter({'echinococcus_multilocularis_gca_000469725': 3350, 'echinococcus_canadensis_gca_900004735': 3350, 'taenia_saginata_gca_001693075': 3349, 'taenia_solium_gca_001870725': 3349, 'taenia_asiatica_gca_001693035': 3349, 'echinococcus_granulosus_gcf_000524195': 3348, 'hydatigera_taeniaeformis_gca_900622495': 3344, 'echinococcus_oligarthrus_gca_900683695': 3326, 'mesocestoides_corti_gca_900604375': 33, 'hymenolepis_diminuta_gca_900708905': 12, 'hymenolepis_microstoma_gca_000469805': 12, 'rodentolepis_nana_gca_900617975': 12, 'sparganum_proliferum_gca_902702955': 9, 'spirometra_erinaceieuropaei_gca_000951995': 9, 'dibothriocephalus_latus_gca_900617775': 6, 'schistocephalus_solidus_gca_900618435': 6})


## Design temp set of baits

In [63]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/cestoda/initial_intervals/Taenia_multiceps_GCA_001923025_+8.bed --twobit results/phyluce/cestoda/cleaned_genomes/Taenia_multiceps_GCA_001923025_formatted.2bit --buffer-to 160 --output results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8.fasta;
Screened 3350 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 3350.


design the baits

In [64]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8.fasta --probe-prefix uce_cestoda_ --design cestoda_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGG


Conserved locus count = 3350
Probe Count = 6696


## Find duplicate baited regions

In [65]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas --query results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  14:23:34
Ended:  Tue Feb 11, 2020  14:23:37
Time for execution:  0.0525618354479 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas                        --lastz results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes_vself.lastz                       --probe-prefix=uce_cestoda_;
Parsing lastz file...
Screening results...
Screened 6695 fasta sequences.  Filtered 175 duplicates. Kept 6346.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [66]:
# Lay the genomes out as <cleaned_genomes>/<taxon>/<taxon>.2bit.
# Each taxon gets its own freshly created directory that holds a COPY of its
# "<taxon>_formatted.2bit" file (files are copied, not soft-linked).
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    dst_dir  = genome_root + sample + "/"
    src_file = genome_root + sample + "_formatted.2bit"
    dst_file = dst_dir + sample + ".2bit"

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [67]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/cestoda/validate_intervals/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas --scaffoldlist Dibothriocephalus_latus_GCA_900617775 Echinococcus_canadensis_GCA_900004735 Echinococcus_granulosus_GCF_000524195 Echinococcus_multilocularis_GCA_000469725 Echinococcus_oligarthrus_GCA_900683695 Hydatigera_taeniaeformis_GCA_900622495 Hymenolepis_diminuta_GCA_900708905 Hymenolepis_microstoma_GCA_000469805 Mesocestoides_corti_GCA_900604375 Rodentolepis_nana_GCA_900617975 Schistocephalus_solidus_GCA_900618435 Sparganum_proliferum_GCA_902702955 Spirometra_erinaceieuropaei_GCA_000951995 Taenia_asiatica_GCA_001693035 Taenia_saginata_GCA_001693075 Taenia_solium_GCA_001870725 Taenia_multiceps_GCA_001923025 --genome-base-path results/phyluce/cestoda/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/cestoda/validate_intervals/cestoda-to-Taenia_multiceps_GCA_001923025.sqlite --output results/phyluce/cestoda/validate_intervals/l

	/tmp/tmpKUkawo.fasta
	/tmp/tmpdPFjml.fasta
	/tmp/tmpvYVO_i.fasta

Writing the results file...
	/tmp/tmpEAo7Pj.lastz
	/tmp/tmpbZnU61.lastz
	/tmp/tmpaJr9iH.lastz
	/tmp/tmp_ZWgnl.lastz
	/tmp/tmpv34SIc.lastz
	/tmp/tmpes_zpi.lastz
	/tmp/tmpIXYYjS.lastz
	/tmp/tmpw6k8So.lastz
	/tmp/tmp4LHPLK.lastz
	/tmp/tmpsPXZeK.lastz
	/tmp/tmpgtYQrT.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/cestoda/validate_intervals/lastz/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas_v_Hydatigera_taeniaeformis_GCA_900622495.lastz
Creating Hydatigera_taeniaeformis_GCA_900622495 table
Inserting data to Hydatigera_taeniaeformis_GCA_900622495 table

Running against Hymenolepis_diminuta_GCA_900708905.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 16 queries...
	/tmp/tmpSxImsi.fasta
	/tmp/tmpODIMXA.fasta
	/tmp/tmpMDIlHf.fasta
	/tmp/tmphkaBYA.fasta
	/tmp/tmpeVIfSe.fasta
	/tmp/tmpvx9dTj.fasta
	/tmp/tmpxfH_5_.fasta
	/tmp/

	/tmp/tmpqpcT8o.fasta
	/tmp/tmpcKouOZ.fasta
	/tmp/tmp786ygv.fasta
	/tmp/tmpy26qT_.fasta
	/tmp/tmpUaQ4rz.fasta
	/tmp/tmp__3Ju6.fasta
	/tmp/tmppUmNH6.fasta
	/tmp/tmpxKewCG.fasta
	/tmp/tmpGvqOYM.fasta
	/tmp/tmpwbfQBH.fasta
	/tmp/tmpFceksi.fasta
	/tmp/tmpprKbMO.fasta
	/tmp/tmpQdShzD.fasta
	/tmp/tmpOL1awv.fasta
	/tmp/tmpNaCRJH.fasta
	/tmp/tmpuLJOgz.fasta
	/tmp/tmpZZ9BNr.fasta
	/tmp/tmpcSejsf.fasta
	/tmp/tmpr_BwCn.fasta
	/tmp/tmpteNs6N.fasta
	/tmp/tmp4wmaWN.fasta
	/tmp/tmp94ZwZ1.fasta
	/tmp/tmp7OPs65.fasta
	/tmp/tmpkeLZS2.fasta
	/tmp/tmpomGaDH.fasta
	/tmp/tmpGS9lQG.fasta
	/tmp/tmpKJCZRK.fasta
	/tmp/tmpgxgz93.fasta
	/tmp/tmpk3c3vK.fasta
	/tmp/tmpscP9vy.fasta
	/tmp/tmpiSh907.fasta
	/tmp/tmpn8gFNd.fasta
	/tmp/tmpF5uEea.fasta
	/tmp/tmpa3q5kJ.fasta
	/tmp/tmpCwW4yA.fasta
	/tmp/tmp2e3i97.fasta
	/tmp/tmpnAtcoV.fasta
	/tmp/tmpYB63QV.fasta
	/tmp/tmpcHgsUt.fasta
	/tmp/tmpbHa_3T.fasta
	/tmp/tmplprHFg.fasta
	/tmp/tmpQguMdg.fasta
	/tmp/tmpzI9tYs.fasta
	/tmp/tmpcbXY4b.fasta
	/tmp/tmpqtAJro.fasta
	/tmp/tmpe

Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/cestoda/validate_intervals/lastz/Taenia_multiceps_GCA_001923025_+8_temp_probes.fas_v_Spirometra_erinaceieuropaei_GCA_000951995.lastz
Creating Spirometra_erinaceieuropaei_GCA_000951995 table
Inserting data to Spirometra_erinaceieuropaei_GCA_000951995 table

Running against Taenia_asiatica_GCA_001693035.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 17 queries...
	/tmp/tmpzO_lrh.fasta
	/tmp/tmpl60uqi.fasta
	/tmp/tmphtlpvg.fasta
	/tmp/tmpikXAEX.fasta
	/tmp/tmpu0u55i.fasta
	/tmp/tmpLR5e4n.fasta
	/tmp/tmpqcjaHs.fasta
	/tmp/tmpUT7q5Q.fasta
	/tmp/tmpfQTcDX.fasta
	/tmp/tmp6ziVjn.fasta
	/tmp/tmp3zI_TI.fasta
	/tmp/tmpjf8564.fasta
	/tmp/tmpk8ls5K.fasta
	/tmp/tmp0l8XC_.fasta
	/tmp/tmp3zY9hZ.fasta
	/tmp/tmpJt0pno.fasta
	/tmp/tmp2DkN2D.fasta

Writing the results file...
	/tmp/tmp79n__V.lastz
	/tmp/tmp35Mliw.lastz
	/tmp/tmpcs1Vll.lastz
	/tmp/tmp8Jcwiv.lastz
	/

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [68]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/cestoda/extract_probes_from_group/cestoda_genome.conf --lastz results/phyluce/cestoda/validate_intervals/lastz --probes 120 --probe-prefix uce_cestoda_ --name-pattern "Taenia_multiceps_GCA_001923025_+8_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/cestoda/extract_probes_from_group/probe_fasta;
2020-02-11 14:50:49,343 - Phyluce - INFO - ---- Working on Dibothriocephalus_latus_GCA_900617775 genome ----
2020-02-11 14:50:49,344 - Phyluce - INFO - Reading Dibothriocephalus_latus_GCA_900617775 genome
2020-02-11 14:51:02,766 - Phyluce - INFO - Dibothriocephalus_latus_GCA_900617775: 1907 uces, 390 dupes, 1517 non-dupes, 0 orient drop, 1 length drop, 1513 written
2020-02-11 14:51:02,766 - Phyluce - INFO - ---- Working on Echinococcus_canadensis_GCA_900004735 genome ----
2020-02-11 14:51:02,803 - Phyluce - INFO - Reading Echinococcus_canadensis_GCA_900004735 genome
2020-02-11 14:51:17,651 - Phyluce - INFO - Echinococcu

In [69]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/cestoda/extract_probes_from_group/probe_fasta --output results/phyluce/cestoda/extract_probes_from_group/multifastas.sqlite --base-taxon Taenia_multiceps_GCA_001923025;
dibothriocephalus_latus_gca_900617775..
echinococcus_canadensis_gca_900004735...
echinococcus_granulosus_gcf_000524195...
echinococcus_multilocularis_gca_000469725...
echinococcus_oligarthrus_gca_900683695...
hydatigera_taeniaeformis_gca_900622495...
hymenolepis_diminuta_gca_900708905...
hymenolepis_microstoma_gca_000469805...
mesocestoides_corti_gca_900604375...
rodentolepis_nana_gca_900617975...
schistocephalus_solidus_gca_900618435..
sparganum_proliferum_gca_902702955..
spirometra_erinaceieuropaei_gca_000951995.
taenia_asiatica_gca_001693035...
taenia_saginata_gca_001693075...
taenia_solium_gca_001870725...
taenia_multiceps_gca_001923025...
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/cestoda/extract_probes_

In [70]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(17)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/cestoda/extract_probes_from_group/multifastas.sqlite  --base-taxon Taenia_multiceps_GCA_001923025 --output results/phyluce/cestoda/extract_probes_from_group/Taenia_multiceps_GCA_001923025+8-back-to-17.conf --specific-counts 17;
Counter({'dibothriocephalus_latus_gca_900617775': 334, 'echinococcus_canadensis_gca_900004735': 334, 'taenia_saginata_gca_001693075': 334, 'echinococcus_multilocularis_gca_000469725': 334, 'echinococcus_oligarthrus_gca_900683695': 334, 'hymenolepis_diminuta_gca_900708905': 334, 'hymenolepis_microstoma_gca_000469805': 334, 'rodentolepis_nana_gca_900617975': 334, 'taenia_multiceps_gca_001923025': 334, 'mesocestoides_corti_gca_900604375': 334, 'schistocephalus_solidus_gca_900618435': 334, 'echinococcus_granulosus_gcf_000524195': 334, 'sparganum_proliferum_gca_902702955': 334, 'taenia_solium_gca_001870725': 334, 'taenia_asiatica_gca_001693035': 334, 'spirometra_erinaceieuropaei_gca_000951995': 334, 'hydatige

## Final group specific bait design

In [71]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/cestoda/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/cestoda/extract_probes_from_group/Taenia_multiceps_GCA_001923025+8-back-to-17.conf --probe-prefix uce_cestoda_ --designer rnplattii --design cestoda_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.fasta;
NNNNNNGGGGGGGNGGGGNGG


Conserved locus count = 334
Probe Count = 11335


In [72]:
i_design_txt   = group + "_v1"
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

i_probes_fas   = o_dir + i_design_txt + "-master_probe_list.fasta"
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.fasta --query results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  14:55:22
Ended:  Tue Feb 11, 2020  14:55:40
Time for execution:  0.292155579726 minutes


In [73]:
i_lastz_tbl = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"
i_probe_prefix = "uce_" + group + "_"


#i_lastz_tbl = o_lastz_tbl
#i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.fasta --lastz results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_cestoda_;
Parsing lastz file...
Screening results...
Screened 11334 fasta sequences.  Filtered 0 duplicates. Kept 11335.


## CDhit to reduce numbers

In [74]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.fasta
         -o
         results/phyluce/cestoda/final_probe_design/cestoda_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 15:01:48 2020
                            Output                              
----------------------------------------------------------------
total seq: 11335
longest and shortest : 80 and 80
Total letters: 906800
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 89M

Table limit with the given memory limit:
Max number of representatives: 3945719
Max number of word counting entries: 88818154

# comparing sequences from          0  to       1889
.---------- new table with     1201 representatives
# comparing sequences from       1889  to    

# Eurotiales

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [102]:
# Group label; the cells below use it to build every per-group path.
group = "eurotiales"

In [105]:
# Throw away any previous results for this group and recreate an empty
# results directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [103]:
# Every non-reference taxon in the group, named Genus_species_<assembly accession
# without its version suffix>.
# NOTE(review): several names look like they lost a letter after "sp"
# (e.g. "Asprgillus", "Basipetospra", "spctabilis", "bisprus"). They are kept
# exactly as-is because they presumably have to match the genome file names on
# disk - confirm against the accessions csv / data/genomes before correcting.
group_taxa = [ "Asprgillus_aculeatinus_GCF_003184765",
               "Asprgillus_aculeatus_GCF_001890905",
               "Asprgillus_aff_GCA_001931935",
               "Asprgillus_albertensis_GCA_009193605",
               "Asprgillus_alliaceus_GCF_009176365",
               "Asprgillus_amoenus_GCA_009812435",
               "Asprgillus_arachidicola_GCA_009193545",
               "Asprgillus_avenaceus_GCA_009193465",
               "Asprgillus_awamori_GCA_003850985",
               "Asprgillus_bertholletius_GCA_009193595",
               "Asprgillus_bombycis_GCF_001792695",
               "Asprgillus_brasiliensis_GCA_001889945",
               "Asprgillus_brunneoviolaceus_GCF_003184695",
               "Asprgillus_caelatus_GCF_009193585",
               "Asprgillus_calidoustus_GCA_001511075",
               "Asprgillus_campestris_GCF_002847485",
               "Asprgillus_candidus_GCF_002847045",
               "Asprgillus_carbonarius_GCA_001990825",
               "Asprgillus_cejpii_GCA_004769165",
               "Asprgillus_chevalieri_GCA_001599875",
               "Asprgillus_clavatus_GCF_000002715",
               "Asprgillus_coremiiformis_GCA_009193565",
               "Asprgillus_costaricaensis_GCF_003184835",
               "Asprgillus_cristatus_GCA_001693355",
               "Asprgillus_ellipticus_GCA_003184645",
               "Asprgillus_eucalypticola_GCF_003184535",
               "Asprgillus_fijiensis_GCA_003184825",
               "Asprgillus_fischeri_GCF_000149645",
               "Asprgillus_flavus_GCF_000006275",
               "Asprgillus_fumigatus_GCF_000002655",
               "Asprgillus_glaucus_GCF_001890805",
               "Asprgillus_hancockii_GCA_001696595",
               "Asprgillus_heteromorphus_GCF_003184545",
               "Asprgillus_homomorphus_GCF_003184865",
               "Asprgillus_ibericus_GCF_003184845",
               "Asprgillus_indologenus_GCA_003184685",
               "Asprgillus_japonicus_GCF_003184785",
               "Asprgillus_kawachii_GCA_000239835",
               "Asprgillus_lentulus_GCA_001445615",
               "Asprgillus_leporis_GCA_009176345",
               "Asprgillus_luchuensis_GCA_001890685",
               "Asprgillus_minisclerotigenes_GCA_009176455",
               "Asprgillus_mulundensis_GCF_003369625",
               "Asprgillus_neoellipticus_GCA_003116565",
               "Asprgillus_neoniger_GCF_003184625",
               "Asprgillus_nidulans_GCF_000149205",
               "Asprgillus_niger_GCF_000002855",
               "Asprgillus_nomius_GCF_001204775",
               "Asprgillus_novofumigatus_GCF_002847465",
               "Asprgillus_novoparasiticus_GCA_009176405",
               "Asprgillus_ochraceoroseus_GCA_002846915",
               "Asprgillus_ochraceus_GCA_004849945",
               "Asprgillus_olivimuriae_GCA_003719415",
               "Asprgillus_oryzae_GCF_000184455",
               "Asprgillus_parasiticus_GCA_009176385",
               "Asprgillus_persii_GCA_002215965",
               "Asprgillus_phoenicis_GCA_003344505",
               "Asprgillus_piperis_GCF_003184755",
               "Asprgillus_pseudocaelatus_GCA_009193665",
               "Asprgillus_pseudotamarii_GCF_009193445",
               "Asprgillus_pseudoterreus_GCA_002927005",
               "Asprgillus_rambellii_GCA_000986645",
               "Asprgillus_ruber_GCA_000600275",
               "Asprgillus_saccharolyticus_GCF_003184585",
               "Asprgillus_sclerotialis_GCA_003589665",
               "Asprgillus_sclerotiicarbonarius_GCA_003184635",
               "Asprgillus_sclerotioniger_GCF_003184525",
               "Asprgillus_sclerotiorum_GCA_000530345",
               "Asprgillus_sergii_GCA_009193525",
               "Asprgillus_sojae_GCA_008274985",
               "Asprgillus_sp_GCA_001044295",
               "Asprgillus_spnulosprus_GCA_003574815",
               "Asprgillus_steynii_GCF_002849105",
               "Asprgillus_sydowii_GCA_001890705",
               "Asprgillus_taichungensis_GCA_002850765",
               "Asprgillus_tamarii_GCA_009193485",
               "Asprgillus_tanneri_GCA_003426965",
               "Asprgillus_terreus_GCF_000149615",
               "Asprgillus_thermomutatus_GCF_002237265",
               "Asprgillus_transmontanensis_GCA_009193505",
               "Asprgillus_tritici_GCA_009812425",
               "Asprgillus_tubingensis_GCA_001890745",
               "Asprgillus_turcosus_GCA_002234965",
               "Asprgillus_udagawae_GCA_001078395",
               "Asprgillus_unguis_GCA_003324175",
               "Asprgillus_ustus_GCA_000812125",
               "Asprgillus_uvarum_GCF_003184745",
               "Asprgillus_vadensis_GCF_003184925",
               "Asprgillus_versicolor_GCA_001890125",
               "Asprgillus_violaceofuscus_GCA_003184705",
               "Asprgillus_viridinutans_GCA_004368095",
               "Asprgillus_welwitschiae_GCF_003344945",
               "Asprgillus_wentii_GCA_001890725",
               "Asprgillus_westerdijkiae_GCA_001307345",
               "Basipetospra_chlamydospra_GCA_001599675",
               "Byssochlamys_nivea_GCA_003116535",
               "Byssochlamys_sp_GCA_002914405",
               "Byssochlamys_spctabilis_GCF_004022145",
               "Elaphomyces_granulatus_GCA_002240705",
               "Monascus_purpureus_GCA_003184285",
               "Monascus_ruber_GCA_002976275",
               "Penicilliopsis_zonata_GCF_001890105",
               "Penicillium_antarcticum_GCA_002072345",
               "Penicillium_arizonense_GCF_001773325",
               "Penicillium_biforme_GCA_000577785",
               "Penicillium_brasilianum_GCA_001048715",
               "Penicillium_camemberti_GCA_000513335",
               "Penicillium_capsulatum_GCA_000943775",
               "Penicillium_carneum_GCA_000577495",
               "Penicillium_chrysogenum_GCA_000710275",
               "Penicillium_citrinum_GCA_001950535",
               "Penicillium_coprophilum_GCA_002072405",
               "Penicillium_decumbens_GCA_002072245",
               "Penicillium_digitatum_GCF_000315645",
               "Penicillium_expansum_GCF_000769745",
               "Penicillium_flavigenum_GCA_002072365",
               "Penicillium_freii_GCA_001513925",
               "Penicillium_fuscoglaucum_GCA_000576735",
               "Penicillium_griseofulvum_GCA_001561935",
               "Penicillium_italicum_GCA_002116305",
               "Penicillium_janthinellum_GCA_002369805",
               "Penicillium_nalgiovense_GCA_002072425",
               "Penicillium_nordicum_GCA_000733025",
               "Penicillium_oxalicum_GCA_000346795",
               "Penicillium_paneum_GCA_000577715",
               "Penicillium_paxilli_GCA_000347475",
               "Penicillium_polonicum_GCA_003344595",
               "Penicillium_roqueforti_GCA_000513255",
               "Penicillium_rubens_GCF_000226395",
               "Penicillium_sclerotiorum_GCA_001750025",
               "Penicillium_solitum_GCA_000952775",
               "Penicillium_sp_GCA_003800485",
               "Penicillium_steckii_GCA_002072375",
               "Penicillium_subrubescens_GCA_001908125",
               "Penicillium_verrucosum_GCA_000970515",
               "Penicillium_vulpinum_GCA_002072255",
               "Phialosimplex_sp_GCA_003698115",
               "Rasamsonia_emersonii_GCF_000968595",
               "Talaromyces_adpressus_GCA_002775195",
               "Talaromyces_amestolkiae_GCA_001896365",
               "Talaromyces_atroroseus_GCF_001907595",
               "Talaromyces_borbonicus_GCA_002916415",
               "Talaromyces_cellulolyticus_GCA_000829775",
               "Talaromyces_funiculosus_GCA_004299765",
               "Talaromyces_islandicus_GCA_000985935",          
               "Talaromyces_piceae_GCA_001657655",
               "Talaromyces_pinophilus_GCA_001571465",
               "Talaromyces_purpureogenus_GCA_001270325",
               "Talaromyces_stipitatus_GCF_000003125",
               "Talaromyces_verruculosus_GCA_001305275",
               "Talaromyces_wortmannii_GCA_001939245",
               "Thermoascaceae_sp_GCA_003123655",
               "Thermoascus_crustaceus_GCA_001599835",
               "Thermomyces_lanuginosus_GCA_000315935",
               "Xeromyces_bisprus_GCA_900006255" ]
                    
# Reference genome for the group; kept out of group_taxa and appended below.
reference_taxon = "Talaromyces_marneffei_GCF_000001985"


# Full taxon list: every group member followed by the reference taxon (last).
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*/ftp_info.csv`

In [107]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184765.1_Aspacu1_genomic.fna.gz

sent 42 bytes  received 11418006 bytes  4567219.20 bytes/sec
total size is 11415111  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

GCA_001792695.1_ASM179269v1_genomic.fna.gz

sent 42 bytes  received 11814876 bytes  3375690.86 bytes/sec
total size is 11811881  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001889945.1_Aspbr1_genomic.fna.gz

sent 42 bytes  received 11291100 bytes  4516456.80 bytes/sec
total size is 11288238  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_009193565.1_Aspcor1_genomic.fna.gz

sent 42 bytes  received 9304271 bytes  3721725.20 bytes/sec
total size is 9301896  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184835.1_Aspcos1_genomic.fna.gz

sent 42 bytes  received 11631406 bytes  2114808.73 bytes/sec
total size is 11628463  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, ne

GCA_003184545.1_Asphet1_genomic.fna.gz

sent 42 bytes  received 10892839 bytes  3112251.71 bytes/sec
total size is 10890072  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184865.1_Asphom1_genomic.fna.gz

sent 42 bytes  received 10685415 bytes  4274182.80 bytes/sec
total size is 10682696  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, 

GCA_003116565.1_ASM311656v1_genomic.fna.gz

sent 42 bytes  received 8640785 bytes  3456330.80 bytes/sec
total size is 8638566  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184625.1_Aspneo1_genomic.fna.gz

sent 42 bytes  received 11166112 bytes  3190329.71 bytes/sec
total size is 11163281  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_009176385.1_Asppar1_genomic.fna.gz

sent 42 bytes  received 12042054 bytes  3440598.86 bytes/sec
total size is 12039007  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002215965.1_ASM221596v1_genomic.fna.gz

sent 42 bytes  received 11949324 bytes  4779746.40 bytes/sec
total size is 11946297  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_003589665.1_phiScl_1.0_genomic.fna.gz

sent 42 bytes  received 8936634 bytes  2553336.00 bytes/sec
total size is 8934344  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184635.1_Aspscle1_genomic.fna.gz

sent 42 bytes  received 11760199 bytes  2613386.89 bytes/sec
total size is 11757223  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_009193485.1_Asptam1_genomic.fna.gz

sent 42 bytes  received 12136008 bytes  4854420.00 bytes/sec
total size is 12132937  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003426965.1_ASM342696v1_genomic.fna.gz

sent 42 bytes  received 12167707 bytes  3476499.71 bytes/sec
total size is 12164624  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_003184745.1_Aspuva1_genomic.fna.gz

sent 42 bytes  received 11312596 bytes  3232182.29 bytes/sec
total size is 11309725  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003184925.1_Aspvad1_genomic.fna.gz

sent 42 bytes  received 11277655 bytes  3222199.14 bytes/sec
total size is 11274792  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, 

GCA_004022145.1_Paevar1_genomic.fna.gz

sent 42 bytes  received 9374786 bytes  2678522.29 bytes/sec
total size is 9372387  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002240705.1_PX439_genomic.fna.gz

sent 42 bytes  received 17344292 bytes  4955524.00 bytes/sec
total size is 17339951  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000577495.1_PCARLCP05634_20131217_genomic.fna.gz

sent 42 bytes  received 8301986 bytes  3320811.20 bytes/sec
total size is 8299838  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000710275.1_ASM71027v1_genomic.fna.gz

sent 42 bytes  received 10177663 bytes  4071082.00 bytes/sec
total size is 10175069  speedup is 1.00


You are accessing a U.S. Government information system which includes 

GCA_002116305.1_ASM211630v1_genomic.fna.gz

sent 42 bytes  received 9647389 bytes  3858972.40 bytes/sec
total size is 9644922  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002369805.1_ASM236980v1_genomic.fna.gz

sent 42 bytes  received 11797809 bytes  4719140.40 bytes/sec
total size is 11794822  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comp

GCA_000952775.2_ASM95277v2_genomic.fna.gz

sent 42 bytes  received 10974357 bytes  4389759.60 bytes/sec
total size is 10971571  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003800485.1_ASM380048v1_genomic.fna.gz

sent 42 bytes  received 11340985 bytes  3240293.43 bytes/sec
total size is 11338102  speedup is 1.00


You are accessing a U.S. Government information system which includes this
com

GCA_002916415.1_ASM291641v1_genomic.fna.gz

sent 42 bytes  received 8552377 bytes  3420967.60 bytes/sec
total size is 8550182  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000829775.1_TcelY94_1.0_genomic.fna.gz

sent 42 bytes  received 11273918 bytes  4509584.00 bytes/sec
total size is 11271059  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comp

GCA_003123655.1_ASM312365v1_genomic.fna.gz

sent 42 bytes  received 10000311 bytes  4000141.20 bytes/sec
total size is 9997756  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001599835.1_JCM_12817_assembly_v001_genomic.fna.gz

sent 42 bytes  received 9870422 bytes  3948185.60 bytes/sec
total size is 9867887  speedup is 1.00


You are accessing a U.S. Government information system which include

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [108]:
#rewrite every genome with minimal fasta headers (record id only) and build a
# 2bit file for it; output goes to a freshly created cleaned_genomes directory
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    #plain "r": the "U" flag was deprecated and removed in Python 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            #blank the name/description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    #pass the arguments as a list so paths containing spaces are not split apart
    # NOTE(review): the 2bit is built from the unformatted input file even though
    # the output is named *_formatted.2bit -- confirm this is intended
    f2bit_cmd = ["faToTwoBit", i_genome_file, o_2bit_file]
    f2bit_status = subprocess.call(f2bit_cmd)

    #surface conversion failures instead of silently discarding the exit status
    if f2bit_status != 0:
        print("WARNING: faToTwoBit exited with status {} for {}".format(f2bit_status, sample))

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [109]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484540 ("sim_Asprgillus_aculeatinus_GCF_003184765") has been submitted
Your job 5484541 ("sim_Asprgillus_aculeatus_GCF_001890905") has been submitted
Your job 5484542 ("sim_Asprgillus_aff_GCA_001931935") has been submitted
Your job 5484543 ("sim_Asprgillus_albertensis_GCA_009193605") has been submitted
Your job 5484544 ("sim_Asprgillus_alliaceus_GCF_009176365") has been submitted
Your job 5484545 ("sim_Asprgillus_amoenus_GCA_009812435") has been submitted
Your job 5484546 ("sim_Asprgillus_arachidicola_GCA_009193545") has been submitted
Your job 5484547 ("sim_Asprgillus_avenaceus_GCA_009193465") has been submitted
Your job 5484548 ("sim_Asprgillus_awamori_GCA_003850985") has been submitted
Your job 5484549 ("sim_Asprgillus_bertholletius_GCA_009193595") has been submitted
Your job 5484550 ("sim_Asprgillus_bombycis_GCF_001792695") has been submitted
Your job 5484551 ("sim_Asprgillus_brasiliensis_GCA_001889945") has been submitted
Your job 5484552 ("sim_Asprgillus_brunneoviolaceu

Your job 5484643 ("sim_Penicillium_arizonense_GCF_001773325") has been submitted
Your job 5484644 ("sim_Penicillium_biforme_GCA_000577785") has been submitted
Your job 5484645 ("sim_Penicillium_brasilianum_GCA_001048715") has been submitted
Your job 5484646 ("sim_Penicillium_camemberti_GCA_000513335") has been submitted
Your job 5484647 ("sim_Penicillium_capsulatum_GCA_000943775") has been submitted
Your job 5484648 ("sim_Penicillium_carneum_GCA_000577495") has been submitted
Your job 5484649 ("sim_Penicillium_chrysogenum_GCA_000710275") has been submitted
Your job 5484650 ("sim_Penicillium_citrinum_GCA_001950535") has been submitted
Your job 5484651 ("sim_Penicillium_coprophilum_GCA_002072405") has been submitted
Your job 5484652 ("sim_Penicillium_decumbens_GCA_002072245") has been submitted
Your job 5484653 ("sim_Penicillium_digitatum_GCF_000315645") has been submitted
Your job 5484654 ("sim_Penicillium_expansum_GCF_000769745") has been submitted
Your job 5484655 ("sim_Penicillium_fl

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [110]:
#build the bbmap index of the reference taxon genome in a fresh
# map_simulated_reads directory (its logs/ subdirectory is created here too)
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference genome to index and the directory bbmap writes its index into
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
#runs locally (not through the queue); the exit status is the cell's displayed value
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/eurotiales/cleaned_genomes/Talaromyces_marneffei_GCF_000001985_formatted.fas path=results/phyluce/eurotiales/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and the simulated reads.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [111]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484695 ("map_Asprgillus_aculeatinus_GCF_003184765") has been submitted
Your job 5484696 ("map_Asprgillus_aculeatus_GCF_001890905") has been submitted
Your job 5484697 ("map_Asprgillus_aff_GCA_001931935") has been submitted
Your job 5484698 ("map_Asprgillus_albertensis_GCA_009193605") has been submitted
Your job 5484699 ("map_Asprgillus_alliaceus_GCF_009176365") has been submitted
Your job 5484700 ("map_Asprgillus_amoenus_GCA_009812435") has been submitted
Your job 5484701 ("map_Asprgillus_arachidicola_GCA_009193545") has been submitted
Your job 5484702 ("map_Asprgillus_avenaceus_GCA_009193465") has been submitted
Your job 5484703 ("map_Asprgillus_awamori_GCA_003850985") has been submitted
Your job 5484704 ("map_Asprgillus_bertholletius_GCA_009193595") has been submitted
Your job 5484705 ("map_Asprgillus_bombycis_GCF_001792695") has been submitted
Your job 5484706 ("map_Asprgillus_brasiliensis_GCA_001889945") has been submitted
Your job 5484707 ("map_Asprgillus_brunneoviolaceu

Your job 5484798 ("map_Penicillium_arizonense_GCF_001773325") has been submitted
Your job 5484799 ("map_Penicillium_biforme_GCA_000577785") has been submitted
Your job 5484800 ("map_Penicillium_brasilianum_GCA_001048715") has been submitted
Your job 5484801 ("map_Penicillium_camemberti_GCA_000513335") has been submitted
Your job 5484802 ("map_Penicillium_capsulatum_GCA_000943775") has been submitted
Your job 5484803 ("map_Penicillium_carneum_GCA_000577495") has been submitted
Your job 5484804 ("map_Penicillium_chrysogenum_GCA_000710275") has been submitted
Your job 5484805 ("map_Penicillium_citrinum_GCA_001950535") has been submitted
Your job 5484806 ("map_Penicillium_coprophilum_GCA_002072405") has been submitted
Your job 5484807 ("map_Penicillium_decumbens_GCA_002072245") has been submitted
Your job 5484808 ("map_Penicillium_digitatum_GCF_000315645") has been submitted
Your job 5484809 ("map_Penicillium_expansum_GCF_000769745") has been submitted
Your job 5484810 ("map_Penicillium_fl

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [112]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5484850 ("merge_Asprgillus_aculeatinus_GCF_003184765") has been submitted
Your job 5484851 ("merge_Asprgillus_aculeatus_GCF_001890905") has been submitted
Your job 5484852 ("merge_Asprgillus_aff_GCA_001931935") has been submitted
Your job 5484853 ("merge_Asprgillus_albertensis_GCA_009193605") has been submitted
Your job 5484854 ("merge_Asprgillus_alliaceus_GCF_009176365") has been submitted
Your job 5484855 ("merge_Asprgillus_amoenus_GCA_009812435") has been submitted
Your job 5484856 ("merge_Asprgillus_arachidicola_GCA_009193545") has been submitted
Your job 5484857 ("merge_Asprgillus_avenaceus_GCA_009193465") has been submitted
Your job 5484858 ("merge_Asprgillus_awamori_GCA_003850985") has been submitted
Your job 5484859 ("merge_Asprgillus_bertholletius_GCA_009193595") has been submitted
Your job 5484860 ("merge_Asprgillus_bombycis_GCF_001792695") has been submitted
Your job 5484861 ("merge_Asprgillus_brasiliensis_GCA_001889945") has been submitted
Your job 5484862 ("merge_

Your job 5484951 ("merge_Penicilliopsis_zonata_GCF_001890105") has been submitted
Your job 5484952 ("merge_Penicillium_antarcticum_GCA_002072345") has been submitted
Your job 5484953 ("merge_Penicillium_arizonense_GCF_001773325") has been submitted
Your job 5484954 ("merge_Penicillium_biforme_GCA_000577785") has been submitted
Your job 5484955 ("merge_Penicillium_brasilianum_GCA_001048715") has been submitted
Your job 5484956 ("merge_Penicillium_camemberti_GCA_000513335") has been submitted
Your job 5484957 ("merge_Penicillium_capsulatum_GCA_000943775") has been submitted
Your job 5484958 ("merge_Penicillium_carneum_GCA_000577495") has been submitted
Your job 5484959 ("merge_Penicillium_chrysogenum_GCA_000710275") has been submitted
Your job 5484960 ("merge_Penicillium_citrinum_GCA_001950535") has been submitted
Your job 5484961 ("merge_Penicillium_coprophilum_GCA_002072405") has been submitted
Your job 5484962 ("merge_Penicillium_decumbens_GCA_002072245") has been submitted
Your job 5

Remove intervals that are more than 25% masked, contain N bases, or are shorter than 160 bp, based on the reference taxon's 2bit genome.

In [104]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 419 sequences from Asprgillus_aculeatinus_GCF_003184765_merged.bed.  Filtered 310 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 109.
Screened 467 sequences from Asprgillus_aculeatus_GCF_001890905_merged.bed.  Filtered 335 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 132.
Screened 461 sequences from Asprgillus_aff_GCA_001931935_merged.bed.  Filtered 333 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 128.
Screened 341 sequences from Asprgillus_albertensis_GCA_009193605_merged.bed.  Filtered 256 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 85.
Screened 355 sequences from Asprgillus_alliaceus_GCF_009176365_merged.bed.  Filtered 268 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 87.
Screened 394 sequences from Asprgillus_amoenus_GCA_009812435_merged.bed.  Filtered 284 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 110.
Screened 394 sequences from Asprgillus_arachidicola_GCA_00919354

Screened 421 sequences from Asprgillus_olivimuriae_GCA_003719415_merged.bed.  Filtered 300 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 121.
Screened 464 sequences from Asprgillus_oryzae_GCF_000184455_merged.bed.  Filtered 320 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 144.
Screened 378 sequences from Asprgillus_parasiticus_GCA_009176385_merged.bed.  Filtered 276 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 102.
Screened 398 sequences from Asprgillus_persii_GCA_002215965_merged.bed.  Filtered 283 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 115.
Screened 453 sequences from Asprgillus_phoenicis_GCA_003344505_merged.bed.  Filtered 330 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 123.
Screened 444 sequences from Asprgillus_piperis_GCF_003184755_merged.bed.  Filtered 327 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 117.
Screened 375 sequences from Asprgillus_pseudocaelatus_GCA_0091

Screened 506 sequences from Penicillium_brasilianum_GCA_001048715_merged.bed.  Filtered 349 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 157.
Screened 420 sequences from Penicillium_camemberti_GCA_000513335_merged.bed.  Filtered 281 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 139.
Screened 514 sequences from Penicillium_capsulatum_GCA_000943775_merged.bed.  Filtered 347 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 167.
Screened 355 sequences from Penicillium_carneum_GCA_000577495_merged.bed.  Filtered 251 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 104.
Screened 452 sequences from Penicillium_chrysogenum_GCA_000710275_merged.bed.  Filtered 312 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 140.
Screened 369 sequences from Penicillium_citrinum_GCA_001950535_merged.bed.  Filtered 265 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 104.
Screened 432 sequences from Penicillium_coprophil

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [105]:
#directory holding the stripped bed files and the conf file that lists them
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

#header line first, then one "taxon:stripped_bed" entry per taxon in the group
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

#write the whole conf file in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [106]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/eurotiales/initial_intervals/Xeromyces_bisprus_GCA_900006255_merged.bed --twobit results/phyluce/eurotiales/cleaned_genomes/Talaromyces_marneffei_GCF_000001985_formatted.2bit --output results/phyluce/eurotiales/initial_intervals/Xeromyces_bisprus_GCA_900006255_stripped.bed;
asprgillus_aculeatinus_gcf_003184765.
asprgillus_aculeatus_gcf_001890905.
asprgillus_aff_gca_001931935.
asprgillus_albertensis_gca_009193605.
asprgillus_alliaceus_gcf_009176365.
asprgillus_amoenus_gca_009812435.
asprgillus_arachidicola_gca_009193545.
asprgillus_avenaceus_gca_009193465.
asprgillus_awamori_gca_003850985.
asprgillus_bertholletius_gca_009193595.
asprgillus_bombycis_gcf_001792695.
asprgillus_brasiliensis_gca_001889945.
asprgillus_brunneoviolaceus_gcf_003184695.
asprgillus_caelatus_gcf_009193585.
asprgillus_calidoustus_gca_001511075.
asprgillus_campestris_gcf_002847485.
asprgillus_candidus_gcf_002847045.
asp

Quantify probes and the number of targeted taxa for each.

In [107]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/eurotiales/initial_intervals/eurotiales-to-Talaromyces_marneffei_GCF_000001985.sqlite --base-taxon Talaromyces_marneffei_GCF_000001985
Loci shared by Talaromyces_marneffei_GCF_000001985 + 0 taxa:	26,540.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 1 taxa:	26,540.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 2 taxa:	22,589.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 3 taxa:	19,605.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 4 taxa:	16,932.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 5 taxa:	12,812.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 6 taxa:	8,444.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 7 taxa:	3,977.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 8 taxa:	999.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 9 taxa:	612.0
Loci shared by Talaromyces_marneffei_GCF_000001985 + 10 taxa:	491.0
Loci shared by Talaromyces_marneffei_GCF_00000

In [108]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 50
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/eurotiales/initial_intervals/eurotiales-to-Talaromyces_marneffei_GCF_000001985.sqlite --base-taxon Talaromyces_marneffei_GCF_000001985 --output results/phyluce/eurotiales/initial_intervals/Talaromyces_marneffei_GCF_000001985_+50.bed --specific-counts 50;
Counter({'talaromyces_pinophilus_gca_001571465': 164, 'talaromyces_adpressus_gca_002775195': 163, 'talaromyces_amestolkiae_gca_001896365': 163, 'talaromyces_stipitatus_gcf_000003125': 163, 'talaromyces_verruculosus_gca_001305275': 160, 'talaromyces_funiculosus_gca_004299765': 150, 'talaromyces_wortmannii_gca_001939245': 144, 'talaromyces_islandicus_gca_000985935': 139, 'talaromyces_cellulolyticus_gca_000829775': 137, 'asprgillus_awamori_gca_003850985': 129, 'talaromyces_atroroseus_gcf_001907595': 126, 'penicillium_capsulatum_gca_000943775': 124, 'thermoascaceae_sp_gca_003123655': 124, 'talaromyces_borbonicus_gca_002916415': 123, 'penicillium_brasilianum_gca_001048715': 122, 'pe

## Design temp set of baits

In [109]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/eurotiales/initial_intervals/Talaromyces_marneffei_GCF_000001985_+50.bed --twobit results/phyluce/eurotiales/cleaned_genomes/Talaromyces_marneffei_GCF_000001985_formatted.2bit --buffer-to 160 --output results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50.fasta;
Screened 166 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 166.


design the baits

In [110]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50.fasta --probe-prefix uce_eurotiales_ --design eurotiales_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGG


Conserved locus count = 163
Probe Count = 323


## Find duplicate baited regions

In [111]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas --query results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  10:31:31
Ended:  Mon Feb 10, 2020  10:31:31
Time for execution:  0.0032263358434 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas                        --lastz results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes_vself.lastz                       --probe-prefix=uce_eurotiales_;
Parsing lastz file...
Screening results...
Screened 322 fasta sequences.  Filtered 22 duplicates. Kept

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [112]:
#give every genome its own <taxon>/<taxon>.2bit directory
# (the 2bit file is copied into place, not linked)
genome_dir = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = genome_dir + sample + "_formatted.2bit"
    dst_dir  = genome_dir + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    #rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [116]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/eurotiales/validate_intervals/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas --scaffoldlist Asprgillus_aculeatinus_GCF_003184765 Asprgillus_aculeatus_GCF_001890905 Asprgillus_aff_GCA_001931935 Asprgillus_albertensis_GCA_009193605 Asprgillus_alliaceus_GCF_009176365 Asprgillus_amoenus_GCA_009812435 Asprgillus_arachidicola_GCA_009193545 Asprgillus_avenaceus_GCA_009193465 Asprgillus_awamori_GCA_003850985 Asprgillus_bertholletius_GCA_009193595 Asprgillus_bombycis_GCF_001792695 Asprgillus_brasiliensis_GCA_001889945 Asprgillus_brunneoviolaceus_GCF_003184695 Asprgillus_caelatus_GCF_009193585 Asprgillus_calidoustus_GCA_001511075 Asprgillus_campestris_GCF_002847485 Asprgillus_candidus_GCF_002847045 Asprgillus_carbonarius_GCA_001990825 Asprgillus_cejpii_GCA_004769165 Asprgillus_chevalieri_GCA_001599875 Asprgillus_clavatus_GCF_000002715 Asprgillus_coremiiformis_GCA_009193565 Asprgillus_costaricaensis_GCF_003184835 Asprgi

Running the targets against 4 queries...
	/tmp/tmpYuzqUB.fasta
	/tmp/tmpi7Q2Wl.fasta
	/tmp/tmpRqs2Xk.fasta
	/tmp/tmp364dyt.fasta

Writing the results file...
	/tmp/tmpaR8Nfs.lastz
	/tmp/tmp8TOxwV.lastz
	/tmp/tmpWt3ObF.lastz
	/tmp/tmpF_BN8e.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_albertensis_GCA_009193605.lastz
Creating Asprgillus_albertensis_GCA_009193605 table
Inserting data to Asprgillus_albertensis_GCA_009193605 table

Running against Asprgillus_alliaceus_GCF_009176365.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpZOF58S.fasta
	/tmp/tmpVgvuSB.fasta
	/tmp/tmpjC45cx.fasta
	/tmp/tmpZRVAbf.fasta

Writing the results file...
	/tmp/tmp1jwhNi.lastz
	/tmp/tmp7uZTQI.lastz
	/tmp/tmpugOssw.lastz
	/tmp/tmpKoqEVn.lastz
Cleaning up the chunked files...
Clea


Running against Asprgillus_campestris_GCF_002847485.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpmCOyT3.fasta
	/tmp/tmp25ICUY.fasta
	/tmp/tmpCkArp6.fasta

Writing the results file...
	/tmp/tmp7xQNZv.lastz
	/tmp/tmpFKnbep.lastz
	/tmp/tmpX2czi3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_campestris_GCF_002847485.lastz
Creating Asprgillus_campestris_GCF_002847485 table
Inserting data to Asprgillus_campestris_GCF_002847485 table

Running against Asprgillus_candidus_GCF_002847045.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmprwp_8N.fasta
	/tmp/tmpBFylNw.fasta
	/tmp/tmpEz69sj.fasta

Writing the results file...
	/tmp/tmp41eh0G.lastz
	/tmp/tmpx5SIug.lastz
	/tmp/tmpUMBobR.lastz
Cle


Running against Asprgillus_fischeri_GCF_000149645.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpP43o_Z.fasta
	/tmp/tmpDpVbMA.fasta
	/tmp/tmpRuuwnL.fasta

Writing the results file...
	/tmp/tmpvbXQtO.lastz
	/tmp/tmpBXfoY8.lastz
	/tmp/tmpKkh6t0.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_fischeri_GCF_000149645.lastz
Creating Asprgillus_fischeri_GCF_000149645 table
Inserting data to Asprgillus_fischeri_GCF_000149645 table

Running against Asprgillus_flavus_GCF_000006275.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmp3sDgZm.fasta
	/tmp/tmpzabSUS.fasta
	/tmp/tmpKOQZ6T.fasta
	/tmp/tmpHO2BGO.fasta

Writing the results file...
	/tmp/tmp980ZJq.lastz
	/tmp/tmpzAQhor.lastz
	/tmp/tmpuEaa


Running against Asprgillus_leporis_GCA_009176345.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpR8zoX9.fasta
	/tmp/tmp5W1cHq.fasta
	/tmp/tmpnlD0mn.fasta
	/tmp/tmpxnujJS.fasta

Writing the results file...
	/tmp/tmpfON7GH.lastz
	/tmp/tmp61muzU.lastz
	/tmp/tmpdNzDTL.lastz
	/tmp/tmpHpCjFf.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_leporis_GCA_009176345.lastz
Creating Asprgillus_leporis_GCA_009176345 table
Inserting data to Asprgillus_leporis_GCA_009176345 table

Running against Asprgillus_luchuensis_GCA_001890685.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpaL2fzB.fasta
	/tmp/tmpNLvVQw.fasta
	/tmp/tmpAJ5Vj6.fasta
	/tmp/tmpY0GxJH.fasta

Writing the results file...
	/tmp/tmp_v1S

Inserting data to Asprgillus_ochraceoroseus_GCA_002846915 table

Running against Asprgillus_ochraceus_GCA_004849945.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpaBSzHm.fasta
	/tmp/tmpWWJEnx.fasta
	/tmp/tmp5yiSyE.fasta

Writing the results file...
	/tmp/tmp2fD82K.lastz
	/tmp/tmpA5qUBH.lastz
	/tmp/tmpw6biA6.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_ochraceus_GCA_004849945.lastz
Creating Asprgillus_ochraceus_GCA_004849945 table
Inserting data to Asprgillus_ochraceus_GCA_004849945 table

Running against Asprgillus_olivimuriae_GCA_003719415.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmp6Gp5Tt.fasta
	/tmp/tmpy6AVYS.fasta
	/tmp/tmpGWOcnj.fasta
	/tmp/tmpAEeUPi.fasta

Writing the 

Inserting data to Asprgillus_ruber_GCA_000600275 table

Running against Asprgillus_saccharolyticus_GCF_003184585.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpsZeQmG.fasta
	/tmp/tmpmvEGoh.fasta
	/tmp/tmpbhwBbW.fasta
	/tmp/tmpkDFCxh.fasta

Writing the results file...
	/tmp/tmpjT7zyj.lastz
	/tmp/tmpFGdvoA.lastz
	/tmp/tmpHZJw9O.lastz
	/tmp/tmpbeuh2f.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_saccharolyticus_GCF_003184585.lastz
Creating Asprgillus_saccharolyticus_GCF_003184585 table
Inserting data to Asprgillus_saccharolyticus_GCF_003184585 table

Running against Asprgillus_sclerotialis_GCA_003589665.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpMi3IJg.fasta
	/tmp/tmpSwtrmn.fas

Creating Asprgillus_taichungensis_GCA_002850765 table
Inserting data to Asprgillus_taichungensis_GCA_002850765 table

Running against Asprgillus_tamarii_GCA_009193485.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpjg_XE_.fasta
	/tmp/tmpUhYGSb.fasta
	/tmp/tmpT7FjZi.fasta
	/tmp/tmpGf4Ayi.fasta

Writing the results file...
	/tmp/tmpLnksZr.lastz
	/tmp/tmpXgW34a.lastz
	/tmp/tmphG_QRH.lastz
	/tmp/tmp54nViJ.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_tamarii_GCA_009193485.lastz
Creating Asprgillus_tamarii_GCA_009193485 table
Inserting data to Asprgillus_tamarii_GCA_009193485 table

Running against Asprgillus_tanneri_GCA_003426965.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpuACHUC.

Inserting data to Asprgillus_uvarum_GCF_003184745 table

Running against Asprgillus_vadensis_GCF_003184925.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpjdCW5j.fasta
	/tmp/tmpBu2YgZ.fasta
	/tmp/tmpIKOczp.fasta
	/tmp/tmpOmpFVA.fasta

Writing the results file...
	/tmp/tmpTnmIaz.lastz
	/tmp/tmpNFVr91.lastz
	/tmp/tmpyE4BwN.lastz
	/tmp/tmp_4aMDE.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Asprgillus_vadensis_GCF_003184925.lastz
Creating Asprgillus_vadensis_GCF_003184925 table
Inserting data to Asprgillus_vadensis_GCF_003184925 table

Running against Asprgillus_versicolor_GCA_001890125.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpygtUxf.fasta
	/tmp/tmpgngklV.fasta
	/tmp/tmpXMaRQu.fasta

Wri

Inserting data to Elaphomyces_granulatus_GCA_002240705 table

Running against Monascus_purpureus_GCA_003184285.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpuGMOqU.fasta
	/tmp/tmpEoqSuB.fasta
	/tmp/tmp8v3Of4.fasta

Writing the results file...
	/tmp/tmp4txp5l.lastz
	/tmp/tmpO55uSC.lastz
	/tmp/tmpuxfdC2.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Monascus_purpureus_GCA_003184285.lastz
Creating Monascus_purpureus_GCA_003184285 table
Inserting data to Monascus_purpureus_GCA_003184285 table

Running against Monascus_ruber_GCA_002976275.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpAezBtt.fasta
	/tmp/tmpx6V6qE.fasta
	/tmp/tmpjOk8Zu.fasta

Writing the results file...
	/tmp/tmp14Iq5M.lastz
	/t


Running against Penicillium_coprophilum_GCA_002072405.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpScZ4Ho.fasta
	/tmp/tmpXulggh.fasta
	/tmp/tmpu4_nNg.fasta

Writing the results file...
	/tmp/tmpMFq62Y.lastz
	/tmp/tmp5iJuv8.lastz
	/tmp/tmpk0JLFO.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Penicillium_coprophilum_GCA_002072405.lastz
Creating Penicillium_coprophilum_GCA_002072405 table
Inserting data to Penicillium_coprophilum_GCA_002072405 table

Running against Penicillium_decumbens_GCA_002072245.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmp1UxKMU.fasta
	/tmp/tmpnzEnM4.fasta
	/tmp/tmpBEqQx_.fasta

Writing the results file...
	/tmp/tmpBmeIWx.lastz
	/tmp/tmpCWgkL9.lastz
	/tmp/tmpDhIeII

Inserting data to Penicillium_nordicum_GCA_000733025 table

Running against Penicillium_oxalicum_GCA_000346795.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpY9cUFM.fasta
	/tmp/tmp3HjynV.fasta
	/tmp/tmp9BCFzv.fasta

Writing the results file...
	/tmp/tmpC3aNUX.lastz
	/tmp/tmpzfwSgZ.lastz
	/tmp/tmpD8E7wQ.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Penicillium_oxalicum_GCA_000346795.lastz
Creating Penicillium_oxalicum_GCA_000346795 table
Inserting data to Penicillium_oxalicum_GCA_000346795 table

Running against Penicillium_paneum_GCA_000577715.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpDaZfka.fasta
	/tmp/tmpKTCEh7.fasta
	/tmp/tmpidqnFw.fasta

Writing the results file...
	/tmp/tmpT25hF1


Running against Penicillium_vulpinum_GCA_002072255.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpQOJoke.fasta
	/tmp/tmpqu_qF5.fasta
	/tmp/tmpqarVJj.fasta

Writing the results file...
	/tmp/tmpGqXuaN.lastz
	/tmp/tmpftUkzh.lastz
	/tmp/tmp7eAo7v.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/eurotiales/extract_probes_from_group/lastz/Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_Penicillium_vulpinum_GCA_002072255.lastz
Creating Penicillium_vulpinum_GCA_002072255 table
Inserting data to Penicillium_vulpinum_GCA_002072255 table

Running against Phialosimplex_sp_GCA_003698115.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmp4paltM.fasta
	/tmp/tmpcPg78V.fasta
	/tmp/tmpneXiCj.fasta

Writing the results file...
	/tmp/tmpYdgR32.lastz
	/tmp/tmpqIhb1q.lastz
	/tmp/tmpgUGtJd.lastz
Cleaning u

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [117]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/eurotiales/extract_probes_from_group/eurotiales_genome.conf --lastz results/phyluce/eurotiales/validate_intervals/lastz --probes 120 --probe-prefix uce_eurotiales_ --name-pattern "Talaromyces_marneffei_GCF_000001985_+50_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/eurotiales/extract_probes_from_group/probe_fasta;
2020-02-10 10:59:01,555 - Phyluce - INFO - ----- Working on Asprgillus_aculeatinus_GCF_003184765 genome ----
2020-02-10 10:59:01,556 - Phyluce - INFO - Reading Asprgillus_aculeatinus_GCF_003184765 genome
2020-02-10 10:59:03,181 - Phyluce - INFO - Asprgillus_aculeatinus_GCF_003184765: 142 uces, 41 dupes, 101 non-dupes, 1 orient drop, 0 length drop, 100 written
2020-02-10 10:59:03,181 - Phyluce - INFO - ------ Working on Asprgillus_aculeatus_GCF_001890905 genome -----
2020-02-10 10:59:03,182 - Phyluce - INFO - Reading Asprgillus_aculeatus_GCF_001890905 genome
2020-02-10 10:59:04,276 - Phyluce - INFO -

2020-02-10 10:59:23,672 - Phyluce - INFO - Asprgillus_coremiiformis_GCA_009193565: 133 uces, 44 dupes, 89 non-dupes, 0 orient drop, 0 length drop, 89 written
2020-02-10 10:59:23,672 - Phyluce - INFO - --- Working on Asprgillus_costaricaensis_GCF_003184835 genome ---
2020-02-10 10:59:23,674 - Phyluce - INFO - Reading Asprgillus_costaricaensis_GCF_003184835 genome
2020-02-10 10:59:24,526 - Phyluce - INFO - Asprgillus_costaricaensis_GCF_003184835: 133 uces, 42 dupes, 91 non-dupes, 0 orient drop, 0 length drop, 91 written
2020-02-10 10:59:24,526 - Phyluce - INFO - ------ Working on Asprgillus_cristatus_GCA_001693355 genome -----
2020-02-10 10:59:24,527 - Phyluce - INFO - Reading Asprgillus_cristatus_GCA_001693355 genome
2020-02-10 10:59:25,903 - Phyluce - INFO - Asprgillus_cristatus_GCA_001693355: 158 uces, 46 dupes, 112 non-dupes, 18 orient drop, 0 length drop, 94 written
2020-02-10 10:59:25,903 - Phyluce - INFO - ----- Working on Asprgillus_ellipticus_GCA_003184645 genome -----
2020-02-1

2020-02-10 10:59:45,836 - Phyluce - INFO - Asprgillus_neoniger_GCF_003184625: 136 uces, 40 dupes, 96 non-dupes, 1 orient drop, 0 length drop, 95 written
2020-02-10 10:59:45,836 - Phyluce - INFO - ------ Working on Asprgillus_nidulans_GCF_000149205 genome ------
2020-02-10 10:59:45,847 - Phyluce - INFO - Reading Asprgillus_nidulans_GCF_000149205 genome
2020-02-10 10:59:46,819 - Phyluce - INFO - Asprgillus_nidulans_GCF_000149205: 149 uces, 52 dupes, 97 non-dupes, 1 orient drop, 0 length drop, 96 written
2020-02-10 10:59:46,820 - Phyluce - INFO - -------- Working on Asprgillus_niger_GCF_000002855 genome -------
2020-02-10 10:59:46,828 - Phyluce - INFO - Reading Asprgillus_niger_GCF_000002855 genome
2020-02-10 10:59:47,904 - Phyluce - INFO - Asprgillus_niger_GCF_000002855: 149 uces, 39 dupes, 110 non-dupes, 1 orient drop, 17 length drop, 92 written
2020-02-10 10:59:47,904 - Phyluce - INFO - ------- Working on Asprgillus_nomius_GCF_001204775 genome -------
2020-02-10 10:59:47,905 - Phyluce 

2020-02-10 11:00:08,870 - Phyluce - INFO - Asprgillus_sclerotiorum_GCA_000530345: 160 uces, 45 dupes, 115 non-dupes, 0 orient drop, 0 length drop, 115 written
2020-02-10 11:00:08,870 - Phyluce - INFO - ------- Working on Asprgillus_sergii_GCA_009193525 genome -------
2020-02-10 11:00:08,877 - Phyluce - INFO - Reading Asprgillus_sergii_GCA_009193525 genome
2020-02-10 11:00:09,930 - Phyluce - INFO - Asprgillus_sergii_GCA_009193525: 137 uces, 49 dupes, 88 non-dupes, 0 orient drop, 0 length drop, 88 written
2020-02-10 11:00:09,930 - Phyluce - INFO - -------- Working on Asprgillus_sojae_GCA_008274985 genome -------
2020-02-10 11:00:09,944 - Phyluce - INFO - Reading Asprgillus_sojae_GCA_008274985 genome
2020-02-10 11:00:10,839 - Phyluce - INFO - Asprgillus_sojae_GCA_008274985: 137 uces, 45 dupes, 92 non-dupes, 5 orient drop, 1 length drop, 86 written
2020-02-10 11:00:10,839 - Phyluce - INFO - --------- Working on Asprgillus_sp_GCA_001044295 genome ---------
2020-02-10 11:00:10,840 - Phyluce 

2020-02-10 11:00:32,168 - Phyluce - INFO - Asprgillus_welwitschiae_GCF_003344945: 135 uces, 43 dupes, 92 non-dupes, 0 orient drop, 0 length drop, 92 written
2020-02-10 11:00:32,169 - Phyluce - INFO - ------- Working on Asprgillus_wentii_GCA_001890725 genome -------
2020-02-10 11:00:32,177 - Phyluce - INFO - Reading Asprgillus_wentii_GCA_001890725 genome
2020-02-10 11:00:33,049 - Phyluce - INFO - Asprgillus_wentii_GCA_001890725: 134 uces, 49 dupes, 85 non-dupes, 2 orient drop, 0 length drop, 83 written
2020-02-10 11:00:33,050 - Phyluce - INFO - ---- Working on Asprgillus_westerdijkiae_GCA_001307345 genome ---
2020-02-10 11:00:33,059 - Phyluce - INFO - Reading Asprgillus_westerdijkiae_GCA_001307345 genome
2020-02-10 11:00:33,989 - Phyluce - INFO - Asprgillus_westerdijkiae_GCA_001307345: 145 uces, 42 dupes, 103 non-dupes, 0 orient drop, 0 length drop, 103 written
2020-02-10 11:00:33,990 - Phyluce - INFO - --- Working on Basipetospra_chlamydospra_GCA_001599675 genome ---
2020-02-10 11:00:3

2020-02-10 11:00:53,611 - Phyluce - INFO - Penicillium_expansum_GCF_000769745: 135 uces, 42 dupes, 93 non-dupes, 0 orient drop, 0 length drop, 93 written
2020-02-10 11:00:53,612 - Phyluce - INFO - ----- Working on Penicillium_flavigenum_GCA_002072365 genome ----
2020-02-10 11:00:53,619 - Phyluce - INFO - Reading Penicillium_flavigenum_GCA_002072365 genome
2020-02-10 11:00:54,662 - Phyluce - INFO - Penicillium_flavigenum_GCA_002072365: 163 uces, 48 dupes, 115 non-dupes, 0 orient drop, 0 length drop, 115 written
2020-02-10 11:00:54,663 - Phyluce - INFO - ------- Working on Penicillium_freii_GCA_001513925 genome -------
2020-02-10 11:00:54,671 - Phyluce - INFO - Reading Penicillium_freii_GCA_001513925 genome
2020-02-10 11:00:55,582 - Phyluce - INFO - Penicillium_freii_GCA_001513925: 161 uces, 48 dupes, 113 non-dupes, 0 orient drop, 0 length drop, 113 written
2020-02-10 11:00:55,582 - Phyluce - INFO - ---- Working on Penicillium_fuscoglaucum_GCA_000576735 genome ---
2020-02-10 11:00:55,589

2020-02-10 11:01:15,353 - Phyluce - INFO - Rasamsonia_emersonii_GCF_000968595: 137 uces, 42 dupes, 95 non-dupes, 0 orient drop, 0 length drop, 95 written
2020-02-10 11:01:15,353 - Phyluce - INFO - ----- Working on Talaromyces_adpressus_GCA_002775195 genome -----
2020-02-10 11:01:15,354 - Phyluce - INFO - Reading Talaromyces_adpressus_GCA_002775195 genome
2020-02-10 11:01:16,298 - Phyluce - INFO - Talaromyces_adpressus_GCA_002775195: 162 uces, 47 dupes, 115 non-dupes, 0 orient drop, 0 length drop, 115 written
2020-02-10 11:01:16,298 - Phyluce - INFO - ---- Working on Talaromyces_amestolkiae_GCA_001896365 genome ----
2020-02-10 11:01:16,300 - Phyluce - INFO - Reading Talaromyces_amestolkiae_GCA_001896365 genome
2020-02-10 11:01:17,103 - Phyluce - INFO - Talaromyces_amestolkiae_GCA_001896365: 163 uces, 55 dupes, 108 non-dupes, 0 orient drop, 1 length drop, 107 written
2020-02-10 11:01:17,103 - Phyluce - INFO - ----- Working on Talaromyces_atroroseus_GCF_001907595 genome ----
2020-02-10 11

In [118]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/eurotiales/extract_probes_from_group/probe_fasta --output results/phyluce/eurotiales/extract_probes_from_group/multifastas.sqlite --base-taxon Talaromyces_marneffei_GCF_000001985;
asprgillus_aculeatinus_gcf_003184765.
asprgillus_aculeatus_gcf_001890905.
asprgillus_aff_gca_001931935.
asprgillus_albertensis_gca_009193605.
asprgillus_alliaceus_gcf_009176365.
asprgillus_amoenus_gca_009812435.
asprgillus_arachidicola_gca_009193545.
asprgillus_avenaceus_gca_009193465.
asprgillus_awamori_gca_003850985.
asprgillus_bertholletius_gca_009193595.
asprgillus_bombycis_gcf_001792695.
asprgillus_brasiliensis_gca_001889945.
asprgillus_brunneoviolaceus_gcf_003184695.
asprgillus_caelatus_gcf_009193585.
asprgillus_calidoustus_gca_001511075.
asprgillus_campestris_gcf_002847485.
asprgillus_candidus_gcf_002847045.
asprgillus_carbonarius_gca_001990825.
asprgillus_cejpii_gca_004769165.
asprgillus_chevalieri_gca_001599875.
asprgillus_clavatus_gcf_0000

In [119]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(137)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/eurotiales/extract_probes_from_group/multifastas.sqlite  --base-taxon Talaromyces_marneffei_GCF_000001985 --output results/phyluce/eurotiales/extract_probes_from_group/Talaromyces_marneffei_GCF_000001985+50-back-to-137.conf --specific-counts 137;
Counter({'asprgillus_steynii_gcf_002849105': 58, 'penicillium_vulpinum_gca_002072255': 58, 'penicillium_griseofulvum_gca_001561935': 58, 'asprgillus_fischeri_gcf_000149645': 58, 'asprgillus_pseudoterreus_gca_002927005': 58, 'asprgillus_versicolor_gca_001890125': 58, 'byssochlamys_sp_gca_002914405': 58, 'asprgillus_welwitschiae_gcf_003344945': 58, 'asprgillus_terreus_gcf_000149615': 58, 'penicillium_antarcticum_gca_002072345': 58, 'penicillium_oxalicum_gca_000346795': 58, 'asprgillus_turcosus_gca_002234965': 58, 'asprgillus_sojae_gca_008274985': 58, 'asprgillus_saccharolyticus_gcf_003184585': 58, 'penicillium_capsulatum_gca_000943775': 58, 'asprgillus_sydowii_gca_001890705': 58, 'penici

## Final group specific bait design

In [120]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/eurotiales/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/eurotiales/extract_probes_from_group/Talaromyces_marneffei_GCF_000001985+50-back-to-137.conf --probe-prefix uce_eurotiales_ --designer rnplattii --design eurotiales_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.fasta;
GGGGGGGNNGGGNNNNGGGGGGGGNGGGGGGGGGGGGNGGGGGGNNNN


Conserved locus count = 58
Probe Count = 15986


In [121]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.fasta --query results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Mon Feb 10, 2020  11:20:53
Ended:  Mon Feb 10, 2020  11:24:35
Time for execution:  3.7069364508 minutes


In [122]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.fasta --lastz results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_eurotiales_;
Parsing lastz file...
Screening results...
Screened 15985 fasta sequences.  Filtered 0 duplicates. Kept 15986.


## CDhit to reduce numbers

In [123]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.fasta
         -o
         results/phyluce/eurotiales/final_probe_design/eurotiales_v1-master_probe_list.95P_cdhit

Started: Mon Feb 10 13:00:26 2020
                            Output                              
----------------------------------------------------------------
total seq: 15986
longest and shortest : 80 and 80
Total letters: 1278880
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 49M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3937783
Max number of word counting entries: 88639506

# comparing sequences from          0  to       2664
..---------- new table with      991 representatives
# comparing sequences from     

# Hexamitidae

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [10]:
# Name of the group processed by the cells that follow; it is spliced into every
# data/ and results/ path.
group = "hexamitidae"

In [114]:
# Start this group's results tree empty: remove whatever an earlier run left
# behind, then create the directory again.
if os.path.exists(os.path.join("results", "phyluce", group)):
    shutil.rmtree(os.path.join("results", "phyluce", group))

os.makedirs(os.path.join("results", "phyluce", group))

In [11]:
# Taxa compared against the reference, each named Genus_species_<assembly accession>.
group_taxa = [
    "Giardia_intestinalis_GCA_000182405",
    "Giardia_intestinalis_GCA_000182665",
    "Giardia_intestinalis_GCA_000498715",
    "Giardia_intestinalis_GCA_000498735",
    "Giardia_intestinalis_GCA_001493575",
    "Giardia_intestinalis_GCA_001543975",
    "Giardia_intestinalis_GCA_009192805",
    "Giardia_intestinalis_GCA_009192825",
    "Giardia_intestinalis_GCA_902209425",
    "Giardia_intestinalis_GCA_902221465",
    "Giardia_intestinalis_GCA_902221485",
    "Giardia_intestinalis_GCA_902221515",
    "Giardia_intestinalis_GCA_902221535",
    "Giardia_intestinalis_GCA_902221545",
    # NOTE(review): accession GCA_000182665 is also listed above under
    # Giardia_intestinalis -- confirm the second entry is intentional.
    "Giardia_lamblia_GCA_000182665",
    "Giardia_muris_GCA_006247105",
    "Spironucleus_salmonicida_GCA_000497125",
]

# Genome every other taxon is compared to.
reference_taxon = "Giardia_intestinalis_GCA_000002435"

# Group members followed by the reference (a new list; group_taxa is left as is).
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [116]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000002435.2_UU_WB_2.1_genomic.fna.gz

sent 42 bytes  received 3518754 bytes  2345864.00 bytes/sec
total size is 3517786  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

GCA_902221465.1_cyste2_genomic.fna.gz

sent 42 bytes  received 3494782 bytes  258875.85 bytes/sec
total size is 3493825  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_902221485.1_cyste4_genomic.fna.gz

sent 42 bytes  received 3458534 bytes  628832.00 bytes/sec
total size is 3457585  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network,

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [117]:
# Write a cleaned copy of every genome (group members and reference) and convert
# that cleaned copy to 2bit.

# Rebuild the cleaned_genomes directory from scratch.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11.
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                # Blank name/description before writing; the stated goal of this step
                # is a header that carries only the accession.
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned fasta written above (not from the raw download),
    # so <sample>_formatted.2bit really corresponds to <sample>_formatted.fas.
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    if subprocess.call(f2bit_cmd.split(" ")) != 0:
        # Report a failed conversion here instead of discovering it in a later step.
        print("WARNING: faToTwoBit returned a non-zero exit status for " + sample)

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [118]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485005 ("sim_Giardia_intestinalis_GCA_000182405") has been submitted
Your job 5485006 ("sim_Giardia_intestinalis_GCA_000182665") has been submitted
Your job 5485007 ("sim_Giardia_intestinalis_GCA_000498715") has been submitted
Your job 5485008 ("sim_Giardia_intestinalis_GCA_000498735") has been submitted
Your job 5485009 ("sim_Giardia_intestinalis_GCA_001493575") has been submitted
Your job 5485010 ("sim_Giardia_intestinalis_GCA_001543975") has been submitted
Your job 5485011 ("sim_Giardia_intestinalis_GCA_009192805") has been submitted
Your job 5485012 ("sim_Giardia_intestinalis_GCA_009192825") has been submitted
Your job 5485013 ("sim_Giardia_intestinalis_GCA_902209425") has been submitted
Your job 5485014 ("sim_Giardia_intestinalis_GCA_902221465") has been submitted
Your job 5485015 ("sim_Giardia_intestinalis_GCA_902221485") has been submitted
Your job 5485016 ("sim_Giardia_intestinalis_GCA_902221515") has been submitted
Your job 5485017 ("sim_Giardia_intestinalis_GCA_9022

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [119]:
# Index the reference genome with bbmap.sh (run locally, not through the queue).

# Recreate the map_simulated_reads directory (with its logs/ subdirectory) from scratch.
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# i_ref_genome is the cleaned fasta of the reference taxon; o_dir is passed to bbmap as path=.
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# last expression of the cell, so the exit status is shown as the cell result
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/hexamitidae/cleaned_genomes/Giardia_intestinalis_GCA_000002435_formatted.fas path=results/phyluce/hexamitidae/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [120]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485022 ("map_Giardia_intestinalis_GCA_000182405") has been submitted
Your job 5485023 ("map_Giardia_intestinalis_GCA_000182665") has been submitted
Your job 5485024 ("map_Giardia_intestinalis_GCA_000498715") has been submitted
Your job 5485025 ("map_Giardia_intestinalis_GCA_000498735") has been submitted
Your job 5485026 ("map_Giardia_intestinalis_GCA_001493575") has been submitted
Your job 5485027 ("map_Giardia_intestinalis_GCA_001543975") has been submitted
Your job 5485028 ("map_Giardia_intestinalis_GCA_009192805") has been submitted
Your job 5485029 ("map_Giardia_intestinalis_GCA_009192825") has been submitted
Your job 5485030 ("map_Giardia_intestinalis_GCA_902209425") has been submitted
Your job 5485031 ("map_Giardia_intestinalis_GCA_902221465") has been submitted
Your job 5485032 ("map_Giardia_intestinalis_GCA_902221485") has been submitted
Your job 5485033 ("map_Giardia_intestinalis_GCA_902221515") has been submitted
Your job 5485034 ("map_Giardia_intestinalis_GCA_9022

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [121]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485039 ("merge_Giardia_intestinalis_GCA_000182405") has been submitted
Your job 5485040 ("merge_Giardia_intestinalis_GCA_000182665") has been submitted
Your job 5485041 ("merge_Giardia_intestinalis_GCA_000498715") has been submitted
Your job 5485042 ("merge_Giardia_intestinalis_GCA_000498735") has been submitted
Your job 5485043 ("merge_Giardia_intestinalis_GCA_001493575") has been submitted
Your job 5485044 ("merge_Giardia_intestinalis_GCA_001543975") has been submitted
Your job 5485045 ("merge_Giardia_intestinalis_GCA_009192805") has been submitted
Your job 5485046 ("merge_Giardia_intestinalis_GCA_009192825") has been submitted
Your job 5485047 ("merge_Giardia_intestinalis_GCA_902209425") has been submitted
Your job 5485048 ("merge_Giardia_intestinalis_GCA_902221465") has been submitted
Your job 5485049 ("merge_Giardia_intestinalis_GCA_902221485") has been submitted
Your job 5485050 ("merge_Giardia_intestinalis_GCA_902221515") has been submitted
Your job 5485051 ("merge_Gia

Remove loci that were masked in the original genome (more than 25% masked bases), along with loci that contain N bases or are shorter than 160 bp.

In [127]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 2565 sequences from Giardia_intestinalis_GCA_000182405_merged.bed.  Filtered 1939 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 626.
Screened 21421 sequences from Giardia_intestinalis_GCA_000182665_merged.bed.  Filtered 8700 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 12721.
Screened 3667 sequences from Giardia_intestinalis_GCA_000498715_merged.bed.  Filtered 1763 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1904.
Screened 2632 sequences from Giardia_intestinalis_GCA_000498735_merged.bed.  Filtered 1987 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 645.
Screened 3610 sequences from Giardia_intestinalis_GCA_001493575_merged.bed.  Filtered 1662 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1948.
Screened 2555 sequences from Giardia_intestinalis_GCA_001543975_merged.bed.  Filtered 1967 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 588.
Screened 1631 sequences from Giardia_inte

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [128]:
#write the conf file that tells phyluce where each taxon's stripped bed file is
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#section header first, then one "taxon:bed file" entry per sample
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [129]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/hexamitidae/initial_intervals/Spironucleus_salmonicida_GCA_000497125_merged.bed --twobit results/phyluce/hexamitidae/cleaned_genomes/Giardia_intestinalis_GCA_000002435_formatted.2bit --output results/phyluce/hexamitidae/initial_intervals/Spironucleus_salmonicida_GCA_000497125_stripped.bed;
giardia_intestinalis_gca_000182405.
giardia_intestinalis_gca_000182665.............
giardia_intestinalis_gca_000498715..
giardia_intestinalis_gca_000498735.
giardia_intestinalis_gca_001493575..
giardia_intestinalis_gca_001543975.
giardia_intestinalis_gca_009192805.
giardia_intestinalis_gca_009192825.
giardia_intestinalis_gca_902209425.
giardia_intestinalis_gca_902221465.
giardia_intestinalis_gca_902221485.
giardia_intestinalis_gca_902221515.
giardia_intestinalis_gca_902221535.
giardia_intestinalis_gca_902221545.
giardia_lamblia_gca_000182665.............
giardia_muris_gca_006247105.
spironucleus_salmoni

Quantify probes and the number of targeted taxa for each.

In [130]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/hexamitidae/initial_intervals/hexamitidae-to-Giardia_intestinalis_GCA_000002435.sqlite --base-taxon Giardia_intestinalis_GCA_000002435
Loci shared by Giardia_intestinalis_GCA_000002435 + 0 taxa:	15,148.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 1 taxa:	15,148.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 2 taxa:	14,275.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 3 taxa:	13,949.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 4 taxa:	13,569.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 5 taxa:	12,664.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 6 taxa:	12,069.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 7 taxa:	856.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 8 taxa:	698.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 9 taxa:	552.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 10 taxa:	256.0
Loci shared by Giardia_intestinalis_GCA_000002435 + 11 tax

In [131]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 7
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/hexamitidae/initial_intervals/hexamitidae-to-Giardia_intestinalis_GCA_000002435.sqlite --base-taxon Giardia_intestinalis_GCA_000002435 --output results/phyluce/hexamitidae/initial_intervals/Giardia_intestinalis_GCA_000002435_+7.bed --specific-counts 7;
Counter({'giardia_lamblia_gca_000182665': 848, 'giardia_intestinalis_gca_001493575': 846, 'giardia_intestinalis_gca_000182665': 835, 'giardia_intestinalis_gca_009192825': 832, 'giardia_intestinalis_gca_000498715': 831, 'giardia_intestinalis_gca_009192805': 829, 'giardia_intestinalis_gca_000498735': 600, 'giardia_intestinalis_gca_000182405': 598, 'giardia_intestinalis_gca_001543975': 549, 'giardia_intestinalis_gca_902221545': 324, 'giardia_intestinalis_gca_902221515': 324, 'giardia_intestinalis_gca_902209425': 316, 'giardia_intestinalis_gca_902221535': 127, 'giardia_intestinalis_gca_902221465': 126, 'giardia_intestinalis_gca_902221485': 122, 'giardia_muris_gca_006247105': 18, 'spi

## Design temp set of baits

In [132]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/hexamitidae/initial_intervals/Giardia_intestinalis_GCA_000002435_+7.bed --twobit results/phyluce/hexamitidae/cleaned_genomes/Giardia_intestinalis_GCA_000002435_formatted.2bit --buffer-to 160 --output results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7.fasta;
Screened 856 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 856.


Design the baits.

In [133]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7.fasta --probe-prefix uce_hexamitidae_ --design hexamitidae_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 832
Probe Count = 1652


## Find duplicate baited regions

In [134]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas --query results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  13:10:04
Ended:  Mon Feb 10, 2020  13:10:05
Time for execution:  0.0107145984968 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas                        --lastz results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes_vself.lastz                       --probe-prefix=uce_hexamitidae_;
Parsing lastz file...
Screening results...
Screened 1651 fasta sequences.  Filtered 43 duplicates. Kept 15

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [135]:
#lay the genomes out as <taxon>/<taxon>.2bit inside cleaned_genomes
#(each 2bit file is copied, not linked, into its own sub-directory)
genome_dir = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_dir, sample)
    dst_dir  = "{}{}/".format(genome_dir, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    #rebuild the per-taxon directory from scratch
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [136]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/hexamitidae/validate_intervals/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas --scaffoldlist Giardia_intestinalis_GCA_000182405 Giardia_intestinalis_GCA_000182665 Giardia_intestinalis_GCA_000498715 Giardia_intestinalis_GCA_000498735 Giardia_intestinalis_GCA_001493575 Giardia_intestinalis_GCA_001543975 Giardia_intestinalis_GCA_009192805 Giardia_intestinalis_GCA_009192825 Giardia_intestinalis_GCA_902209425 Giardia_intestinalis_GCA_902221465 Giardia_intestinalis_GCA_902221485 Giardia_intestinalis_GCA_902221515 Giardia_intestinalis_GCA_902221535 Giardia_intestinalis_GCA_902221545 Giardia_lamblia_GCA_000182665 Giardia_muris_GCA_006247105 Spironucleus_salmonicida_GCA_000497125 Giardia_intestinalis_GCA_000002435 --genome-base-path results/phyluce/hexamitidae/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/hexamitidae/validate_intervals/hexamitidae-to-Giardia_intestinalis_GCA_000002435.sqlite --output resu

Creating Giardia_intestinalis_GCA_902221515 table
Inserting data to Giardia_intestinalis_GCA_902221515 table

Running against Giardia_intestinalis_GCA_902221535.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 2 queries...
	/tmp/tmpq8npOX.fasta
	/tmp/tmpRQIbRX.fasta

Writing the results file...
	/tmp/tmpIwMAu4.lastz
	/tmp/tmpaU5v65.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/hexamitidae/validate_intervals/lastz/Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas_v_Giardia_intestinalis_GCA_902221535.lastz
Creating Giardia_intestinalis_GCA_902221535 table
Inserting data to Giardia_intestinalis_GCA_902221535 table

Running against Giardia_intestinalis_GCA_902221545.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 2 queries...
	/tmp/tmp8vQ9lk.fasta
	/tmp/tmpF9rkSC.fasta

Writing the results file...
	/tmp/tmpE2NG4F.lastz
	/tmp/tmpVf0y7K

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [137]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/hexamitidae/extract_probes_from_group/hexamitidae_genome.conf --lastz results/phyluce/hexamitidae/validate_intervals/lastz --probes 120 --probe-prefix uce_hexamitidae_ --name-pattern "Giardia_intestinalis_GCA_000002435_+7_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/hexamitidae/extract_probes_from_group/probe_fasta;
2020-02-10 13:11:43,100 - Phyluce - INFO - ------ Working on Giardia_intestinalis_GCA_000182405 genome -----
2020-02-10 13:11:43,101 - Phyluce - INFO - Reading Giardia_intestinalis_GCA_000182405 genome
2020-02-10 13:11:46,703 - Phyluce - INFO - Giardia_intestinalis_GCA_000182405: 831 uces, 108 dupes, 723 non-dupes, 1 orient drop, 2 length drop, 720 written
2020-02-10 13:11:46,703 - Phyluce - INFO - ------ Working on Giardia_intestinalis_GCA_000182665 genome -----
2020-02-10 13:11:46,704 - Phyluce - INFO - Reading Giardia_intestinalis_GCA_000182665 genome
2020-02-10 13:11:50,054 - Phyluce - INFO -

In [138]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/hexamitidae/extract_probes_from_group/probe_fasta --output results/phyluce/hexamitidae/extract_probes_from_group/multifastas.sqlite --base-taxon Giardia_intestinalis_GCA_000002435;
giardia_intestinalis_gca_000182405.
giardia_intestinalis_gca_000182665.
giardia_intestinalis_gca_000498715.
giardia_intestinalis_gca_000498735.
giardia_intestinalis_gca_001493575.
giardia_intestinalis_gca_001543975.
giardia_intestinalis_gca_009192805.
giardia_intestinalis_gca_009192825.
giardia_intestinalis_gca_902209425.
giardia_intestinalis_gca_902221465.
giardia_intestinalis_gca_902221485.
giardia_intestinalis_gca_902221515.
giardia_intestinalis_gca_902221535.
giardia_intestinalis_gca_902221545.
giardia_lamblia_gca_000182665.
giardia_muris_gca_006247105.
spironucleus_salmonicida_gca_000497125.
giardia_intestinalis_gca_000002435.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/hexamitidae/extract_pro

In [139]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(18)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/hexamitidae/extract_probes_from_group/multifastas.sqlite  --base-taxon Giardia_intestinalis_GCA_000002435 --output results/phyluce/hexamitidae/extract_probes_from_group/Giardia_intestinalis_GCA_000002435+7-back-to-18.conf --specific-counts 18;
Counter({'giardia_intestinalis_gca_902221465': 74, 'giardia_intestinalis_gca_000498715': 74, 'giardia_intestinalis_gca_902221485': 74, 'giardia_intestinalis_gca_000182665': 74, 'giardia_intestinalis_gca_009192825': 74, 'giardia_muris_gca_006247105': 74, 'giardia_intestinalis_gca_902221535': 74, 'giardia_intestinalis_gca_001543975': 74, 'giardia_intestinalis_gca_001493575': 74, 'giardia_intestinalis_gca_902221545': 74, 'giardia_intestinalis_gca_000002435': 74, 'giardia_intestinalis_gca_000182405': 74, 'giardia_intestinalis_gca_902209425': 74, 'giardia_intestinalis_gca_902221515': 74, 'giardia_lamblia_gca_000182665': 74, 'giardia_intestinalis_gca_000498735': 74, 'giardia_intestinalis_gca_00

## Final group specific bait design

In [18]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/hexamitidae/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/hexamitidae/extract_probes_from_group/Giardia_intestinalis_GCA_000002435+7-back-to-18.conf --probe-prefix uce_hexamitidae_ --designer rnplattii --design hexamitidae_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.fasta;
GGNNGNNGGGNNG


Conserved locus count = 74
Probe Count = 2651


In [19]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.fasta --query results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  12:18:12
Ended:  Tue Feb 11, 2020  12:18:17
Time for execution:  0.0834663192431 minutes


In [20]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.fasta --lastz results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_hexamitidae_;
Parsing lastz file...
Screening results...
Screened 2650 fasta sequences.  Filtered 0 duplicates. Kept 2651.


## CDhit to reduce numbers

In [21]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.fasta
         -o
         results/phyluce/hexamitidae/final_probe_design/hexamitidae_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 12:20:08 2020
                            Output                              
----------------------------------------------------------------
total seq: 2651
longest and shortest : 80 and 80
Total letters: 212080
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3960486
Max number of word counting entries: 89150560

# comparing sequences from          0  to        441
---------- new table with      204 representatives
# comparing sequences from     

# Kinetoplastea

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (whether a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [24]:
#name of the group being processed; used to build every path in the cells below
group = "kinetoplastea"

In [127]:
#start this group's phyluce results directory from scratch
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [25]:
#non-reference genomes for this group, one entry per assembly, each named
#Genus_species_<assembly accession without the version suffix>
group_taxa = [ "Angomonas_deanei_GCA_001659865",
               "Angomonas_desouzai_GCA_000482185",
               "Bodo_saltans_GCA_001460835",
               "Crithidia_acanthocephali_GCA_000482105",
               "Crithidia_bombi_GCA_900240985",
               "Crithidia_expoeki_GCA_900240875",
               "Crithidia_fasciculata_GCA_000331325",
               "Crithidia_mellificae_GCA_002216565",
               "Endotrypanum_monterogeii_GCA_000333855",
               "Herpetomonas_muscarum_GCA_000482205",
               "Leishmania_aethiopica_GCA_000444285",
               "Leishmania_amazonensis_GCA_005317125",
               "Leishmania_arabica_GCA_000410695",
               "Leishmania_braziliensis_GCF_000002845",
               "Leishmania_donovani_GCF_000227135",
               "Leishmania_enriettii_GCA_000410755",
               "Leishmania_gerbilli_GCA_000443025",
               "Leishmania_infantum_GCF_000002875",
               "Leishmania_lainsoni_GCA_003664395",
               "Leishmania_mexicana_GCF_000234665",
               "Leishmania_panamensis_GCF_000755165",
               "Leishmania_peruviana_GCA_001403675",
               "Leishmania_sp_GCA_000409445",
               "Leishmania_tarentolae_GCA_009731335",
               "Leishmania_tropica_GCA_000410715",
               "Leishmania_turanica_GCA_000441995",
               "Leptomonas_pyrrhocoris_GCF_001293395",
               "Leptomonas_seymouri_GCA_001299535",
               "Lotmaria_passim_GCA_000635995",
               "Paratrypanosoma_confusum_GCA_002921335",
               "Perkinsela_sp_GCA_001235845",
               "Phytomonas_francai_GCA_001766655",
               "Phytomonas_serpens_GCA_000331125",
               "Phytomonas_sp_GCA_000582765",
               "Strigomonas_culicis_GCA_000482145",
               "Strigomonas_galati_GCA_000482125",
               "Strigomonas_oncopelti_GCA_000482165",
               "Trypanosoma_brucei_GCF_000210295",
               "Trypanosoma_congolense_GCA_002287245",
               "Trypanosoma_conorhini_GCF_003719485",
               "Trypanosoma_cruzi_GCF_000209065",
               "Trypanosoma_equiperdum_GCA_001457755",
               "Trypanosoma_grayi_GCF_000691245",
               "Trypanosoma_rangeli_GCF_003719475",
               "Trypanosoma_theileri_GCF_002087225",
               "Trypanosoma_vivax_GCA_000227375",
               "Trypanosomatidae_sp_GCA_003671325" ]
                    
#genome that all of the other taxa in the group are compared to
reference_taxon = "Leishmania_major_GCF_000002725"

#every genome in the group: the listed taxa followed by the reference
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read the accessions from `data/genomes/*_accessions.csv`, then find the ftp link, download the genome, and rename the file to the standard format `Genus_species_GCA_123456789`.

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [129]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001659865.1_Angomonas_deanei_v1.0_genomic.fna.gz

sent 42 bytes  received 5704401 bytes  2281777.20 bytes/sec
total size is 5702885  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result 

GCA_902369305.1_Lad_genomic.fna.gz

sent 42 bytes  received 6014 bytes  1101.09 bytes/sec
total size is 5900  speedup is 0.97


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000444285.2_Leishmania_aethiopica-L147-2.0.3_genomic.fna.gz

sent 42 bytes  received 9904650 bytes  2829912.00 bytes/sec
total size is 9902098  speedup is 1.00


You are accessing a U.S. Government information system which includes this
co

GCA_000002725.2_ASM272v2_genomic.fna.gz

sent 42 bytes  received 10077710 bytes  4031100.80 bytes/sec
total size is 10075142  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000234665.4_ASM23466v4_genomic.fna.gz

sent 42 bytes  received 10052517 bytes  4021023.60 bytes/sec
total size is 10049955  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_002921335.1_ASM292133v1_genomic.fna.gz

sent 42 bytes  received 8214696 bytes  3285895.20 bytes/sec
total size is 8212582  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001235845.1_ASM123584v1_genomic.fna.gz

sent 42 bytes  received 2861076 bytes  1907412.00 bytes/sec
total size is 2860266  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_000209065.1_ASM20906v1_genomic.fna.gz

sent 42 bytes  received 26710398 bytes  5935653.33 bytes/sec
total size is 26703772  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001457755.2_Trypanosoma_equiperdum_OVI_V2_genomic.fna.gz

sent 42 bytes  received 8486624 bytes  3394666.40 bytes/sec
total size is 8484427  speedup is 1.00


You are accessing a U.S. Government information system which i

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [277]:
# Rebuild the cleaned-genome directory from scratch so the cell can be re-run safely.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # Rewrite every record so its header holds only the sequence id.
    # SeqIO.parse is handed the path and opens the file itself; the previous
    # open(..., "rU") call fails with ValueError on Python >= 3.11, where the
    # "U" mode flag was removed.
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA so the "_formatted" pair always
    # describes the same sequences. The argument list (rather than splitting a
    # command string on spaces) keeps paths containing spaces intact, and
    # check_call raises CalledProcessError if faToTwoBit fails instead of
    # silently leaving a missing or partial .2bit behind.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified by a giant commented block of text. 

In [278]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487555 ("sim_Angomonas_deanei_GCA_001659865") has been submitted
Your job 5487556 ("sim_Angomonas_desouzai_GCA_000482185") has been submitted
Your job 5487557 ("sim_Bodo_saltans_GCA_001460835") has been submitted
Your job 5487558 ("sim_Crithidia_acanthocephali_GCA_000482105") has been submitted
Your job 5487559 ("sim_Crithidia_bombi_GCA_900240985") has been submitted
Your job 5487560 ("sim_Crithidia_expoeki_GCA_900240875") has been submitted
Your job 5487561 ("sim_Crithidia_fasciculata_GCA_000331325") has been submitted
Your job 5487562 ("sim_Crithidia_mellificae_GCA_002216565") has been submitted
Your job 5487563 ("sim_Endotrypanum_monterogeii_GCA_000333855") has been submitted
Your job 5487564 ("sim_Herpetomonas_muscarum_GCA_000482205") has been submitted
Your job 5487565 ("sim_Leishmania_adleri_GCA_902369305") has been submitted
Your job 5487566 ("sim_Leishmania_aethiopica_GCA_000444285") has been submitted
Your job 5487567 ("sim_Leishmania_amazonensis_GCA_005317125") has 

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [279]:
# Recreate the mapping directory (and its logs/ subdirectory) from scratch.
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

# Build the bbmap index of the reference genome inside the mapping directory.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)

# Runs locally (not through qsub); the exit status is shown as the cell output.
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/kinetoplastea/cleaned_genomes/Leishmania_major_GCF_000002725_formatted.fas path=results/phyluce/kinetoplastea/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [280]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487605 ("map_Angomonas_deanei_GCA_001659865") has been submitted
Your job 5487606 ("map_Angomonas_desouzai_GCA_000482185") has been submitted
Your job 5487607 ("map_Bodo_saltans_GCA_001460835") has been submitted
Your job 5487608 ("map_Crithidia_acanthocephali_GCA_000482105") has been submitted
Your job 5487609 ("map_Crithidia_bombi_GCA_900240985") has been submitted
Your job 5487610 ("map_Crithidia_expoeki_GCA_900240875") has been submitted
Your job 5487611 ("map_Crithidia_fasciculata_GCA_000331325") has been submitted
Your job 5487612 ("map_Crithidia_mellificae_GCA_002216565") has been submitted
Your job 5487613 ("map_Endotrypanum_monterogeii_GCA_000333855") has been submitted
Your job 5487614 ("map_Herpetomonas_muscarum_GCA_000482205") has been submitted
Your job 5487615 ("map_Leishmania_adleri_GCA_902369305") has been submitted
Your job 5487616 ("map_Leishmania_aethiopica_GCA_000444285") has been submitted
Your job 5487617 ("map_Leishmania_amazonensis_GCA_005317125") has 

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [281]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487655 ("merge_Angomonas_deanei_GCA_001659865") has been submitted
Your job 5487656 ("merge_Angomonas_desouzai_GCA_000482185") has been submitted
Your job 5487657 ("merge_Bodo_saltans_GCA_001460835") has been submitted
Your job 5487658 ("merge_Crithidia_acanthocephali_GCA_000482105") has been submitted
Your job 5487659 ("merge_Crithidia_bombi_GCA_900240985") has been submitted
Your job 5487660 ("merge_Crithidia_expoeki_GCA_900240875") has been submitted
Your job 5487661 ("merge_Crithidia_fasciculata_GCA_000331325") has been submitted
Your job 5487662 ("merge_Crithidia_mellificae_GCA_002216565") has been submitted
Your job 5487663 ("merge_Endotrypanum_monterogeii_GCA_000333855") has been submitted
Your job 5487664 ("merge_Herpetomonas_muscarum_GCA_000482205") has been submitted
Your job 5487665 ("merge_Leishmania_adleri_GCA_902369305") has been submitted
Your job 5487666 ("merge_Leishmania_aethiopica_GCA_000444285") has been submitted
Your job 5487667 ("merge_Leishmania_amazon

Remove loci that are masked in the reference genome.

In [311]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 210 sequences from Angomonas_deanei_GCA_001659865_merged.bed.  Filtered 170 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 40.
Screened 258 sequences from Angomonas_desouzai_GCA_000482185_merged.bed.  Filtered 206 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 52.
Screened 163 sequences from Bodo_saltans_GCA_001460835_merged.bed.  Filtered 121 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 42.
Screened 3876 sequences from Crithidia_acanthocephali_GCA_000482105_merged.bed.  Filtered 2691 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1185.
Screened 2409 sequences from Crithidia_bombi_GCA_900240985_merged.bed.  Filtered 1791 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 618.
Screened 2022 sequences from Crithidia_expoeki_GCA_900240875_merged.bed.  Filtered 1535 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 487.
Screened 3696 sequences from Crithidia_fasciculata_GCA_000331325_merged.

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [312]:
# Write the conf file that lists the stripped bed file of every sample.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# "[beds]" header followed by one "<sample>:<path to stripped bed>" line per sample
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [313]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/kinetoplastea/initial_intervals/Trypanosomatidae_sp_GCA_003671325_merged.bed --twobit results/phyluce/kinetoplastea/cleaned_genomes/Leishmania_major_GCF_000002725_formatted.2bit --output results/phyluce/kinetoplastea/initial_intervals/Trypanosomatidae_sp_GCA_003671325_stripped.bed;
angomonas_deanei_gca_001659865.
angomonas_desouzai_gca_000482185.
bodo_saltans_gca_001460835.
crithidia_acanthocephali_gca_000482105..
crithidia_bombi_gca_900240985.
crithidia_expoeki_gca_900240875.
crithidia_fasciculata_gca_000331325..
crithidia_mellificae_gca_002216565..
endotrypanum_monterogeii_gca_000333855..
herpetomonas_muscarum_gca_000482205.
leishmania_adleri_gca_902369305
leishmania_aethiopica_gca_000444285................
leishmania_amazonensis_gca_005317125.....................
leishmania_arabica_gca_000410695.............
leishmania_braziliensis_gcf_000002845...........
leishmania_donovani_gcf_00022

Quantify probes and the number of targeted taxa for each.

In [314]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/kinetoplastea/initial_intervals/kinetoplastea-to-Leishmania_major_GCF_000002725.sqlite --base-taxon Leishmania_major_GCF_000002725
Loci shared by Leishmania_major_GCF_000002725 + 0 taxa:	26,983.0
Loci shared by Leishmania_major_GCF_000002725 + 1 taxa:	26,983.0
Loci shared by Leishmania_major_GCF_000002725 + 2 taxa:	25,971.0
Loci shared by Leishmania_major_GCF_000002725 + 3 taxa:	24,871.0
Loci shared by Leishmania_major_GCF_000002725 + 4 taxa:	23,301.0
Loci shared by Leishmania_major_GCF_000002725 + 5 taxa:	21,705.0
Loci shared by Leishmania_major_GCF_000002725 + 6 taxa:	19,962.0
Loci shared by Leishmania_major_GCF_000002725 + 7 taxa:	17,058.0
Loci shared by Leishmania_major_GCF_000002725 + 8 taxa:	15,509.0
Loci shared by Leishmania_major_GCF_000002725 + 9 taxa:	12,350.0
Loci shared by Leishmania_major_GCF_000002725 + 10 taxa:	10,801.0
Loci shared by Leishmania_major_GCF_000002725 + 11 taxa:	8,754.0
Loci shared by Leishmania_maj

In [315]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 17
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/kinetoplastea/initial_intervals/kinetoplastea-to-Leishmania_major_GCF_000002725.sqlite --base-taxon Leishmania_major_GCF_000002725 --output results/phyluce/kinetoplastea/initial_intervals/Leishmania_major_GCF_000002725_+17.bed --specific-counts 17;
Counter({'leishmania_donovani_gcf_000227135': 3332, 'leishmania_infantum_gcf_000002875': 3331, 'leishmania_mexicana_gcf_000234665': 3330, 'leishmania_amazonensis_gca_005317125': 3320, 'leishmania_tarentolae_gca_009731335': 3313, 'leishmania_panamensis_gcf_000755165': 3308, 'leishmania_arabica_gca_000410695': 3304, 'leishmania_turanica_gca_000441995': 3301, 'leishmania_gerbilli_gca_000443025': 3294, 'leishmania_aethiopica_gca_000444285': 3293, 'leishmania_braziliensis_gcf_000002845': 3292, 'leishmania_tropica_gca_000410715': 3286, 'leishmania_lainsoni_gca_003664395': 3278, 'leishmania_peruviana_gca_001403675': 3266, 'leishmania_enriettii_gca_000410755': 3232, 'leishmania_major_gcf_000

## Design temp set of baits

In [316]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/kinetoplastea/initial_intervals/Leishmania_major_GCF_000002725_+17.bed --twobit results/phyluce/kinetoplastea/cleaned_genomes/Leishmania_major_GCF_000002725_formatted.2bit --buffer-to 160 --output results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17.fasta;
Screened 3352 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 3352.


design the baits

In [317]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17.fasta --probe-prefix uce_kinetoplastea_ --design kinetoplastea_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 3326
Probe Count = 6579


## Find duplicate baited regions

In [318]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes.fas --query results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  10:45:14
Ended:  Tue Feb 11, 2020  10:45:18
Time for execution:  0.0593067526817 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes.fas                        --lastz results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes_vself.lastz                       --probe-prefix=uce_kinetoplastea_;
Parsing lastz file...
Screening results...
Screened 6578 fasta sequences.  Filtered 298 duplicates. Kept 5989

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [319]:
# Give every taxon its own sub-directory under cleaned_genomes/ holding a copy of
# its 2bit file named <taxon>.2bit (the files are copied, not linked).
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

In [29]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"
num_taxa = 17
i_probes_fas    = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"

i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/kinetoplastea/validate_intervals/Leishmania_major_GCF_000002725_+17_temp_probes.fas --scaffoldlist Angomonas_deanei_GCA_001659865 Angomonas_desouzai_GCA_000482185 Bodo_saltans_GCA_001460835 Crithidia_acanthocephali_GCA_000482105 Crithidia_bombi_GCA_900240985 Crithidia_expoeki_GCA_900240875 Crithidia_fasciculata_GCA_000331325 Crithidia_mellificae_GCA_002216565 Endotrypanum_monterogeii_GCA_000333855 Herpetomonas_muscarum_GCA_000482205 Leishmania_aethiopica_GCA_000444285 Leishmania_amazonensis_GCA_005317125 Leishmania_arabica_GCA_000410695 Leishmania_braziliensis_GCF_000002845 Leishmania_donovani_GCF_000227135 Leishmania_enriettii_GCA_000410755 Leishmania_gerbilli_GCA_000443025 Leishmania_infantum_GCF_000002875 Leishmania_lainsoni_GCA_003664395 Leishmania_mexicana_GCF_000234665 Leishmania_panamensis_GCF_000755165 Leishmania_peruviana_GCA_001403675 Leishmania_sp_GCA_000409445 Leishmania_tarentolae_GCA_009731335 Leishmania


Running against Herpetomonas_muscarum_GCA_000482205.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmp3PqTwS.fasta
	/tmp/tmp0vm7qw.fasta
	/tmp/tmp9FEl1w.fasta
	/tmp/tmpgROPbK.fasta

Writing the results file...
	/tmp/tmpKRkokO.lastz
	/tmp/tmpgqIFjI.lastz
	/tmp/tmp_1wwYA.lastz
	/tmp/tmpyQ1jjM.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/kinetoplastea/validate_intervals/lastz/Leishmania_major_GCF_000002725_+17_temp_probes.fas_v_Herpetomonas_muscarum_GCA_000482205.lastz
Creating Herpetomonas_muscarum_GCA_000482205 table
Inserting data to Herpetomonas_muscarum_GCA_000482205 table

Running against Leishmania_aethiopica_GCA_000444285.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpPICDD6.fasta
	/tmp/tmpOu0FaX.fasta
	/tmp/tmpwt5k07.fasta

Writing the results file...
	/tmp/tmpHn9tgc.lastz
	/tmp/tmpz


Running against Leishmania_peruviana_GCA_001403675.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmpnbhNGW.fasta
	/tmp/tmpZi_KRt.fasta
	/tmp/tmpeLxiQn.fasta
	/tmp/tmptwVbAZ.fasta

Writing the results file...
	/tmp/tmpoXCXKe.lastz
	/tmp/tmprkA2Zw.lastz
	/tmp/tmpTo8eSh.lastz
	/tmp/tmpdZchHG.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/kinetoplastea/validate_intervals/lastz/Leishmania_major_GCF_000002725_+17_temp_probes.fas_v_Leishmania_peruviana_GCA_001403675.lastz
Creating Leishmania_peruviana_GCA_001403675 table
Inserting data to Leishmania_peruviana_GCA_001403675 table

Running against Leishmania_sp_GCA_000409445.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmp8WUUlm.fasta
	/tmp/tmpYMyUt9.fasta
	/tmp/tmp0hOJXC.fasta

Writing the results file...
	/tmp/tmpjflZzk.lastz
	/tmp/tmpu_FZ9e.lastz


Creating Phytomonas_sp_GCA_000582765 table
Inserting data to Phytomonas_sp_GCA_000582765 table

Running against Strigomonas_culicis_GCA_000482145.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpCAOV9Z.fasta
	/tmp/tmpFVfihh.fasta
	/tmp/tmpj5C4AM.fasta

Writing the results file...
	/tmp/tmp3GGF7g.lastz
	/tmp/tmplNwQU0.lastz
	/tmp/tmpgnlRI2.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/kinetoplastea/validate_intervals/lastz/Leishmania_major_GCF_000002725_+17_temp_probes.fas_v_Strigomonas_culicis_GCA_000482145.lastz
Creating Strigomonas_culicis_GCA_000482145 table
Inserting data to Strigomonas_culicis_GCA_000482145 table

Running against Strigomonas_galati_GCA_000482125.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpNkftXd.fasta
	/tmp/tmpRu7Kty.fasta
	/tmp/tmpsTKSWM.fasta

Writing the results 

Creating Trypanosoma_vivax_GCA_000227375 table
Inserting data to Trypanosoma_vivax_GCA_000227375 table

Running against Trypanosomatidae_sp_GCA_003671325.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 6 queries...
	/tmp/tmpC1Eta_.fasta
	/tmp/tmpuf9ltq.fasta
	/tmp/tmpQXjF5d.fasta
	/tmp/tmpOvXZw8.fasta
	/tmp/tmpghw5ov.fasta
	/tmp/tmphJygnb.fasta

Writing the results file...
	/tmp/tmp9pzt0z.lastz
	/tmp/tmpwo8iIn.lastz
	/tmp/tmpBVdPWO.lastz
	/tmp/tmpfHW7de.lastz
	/tmp/tmpg4eG_f.lastz
	/tmp/tmp5V3Afx.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/kinetoplastea/validate_intervals/lastz/Leishmania_major_GCF_000002725_+17_temp_probes.fas_v_Trypanosomatidae_sp_GCA_003671325.lastz
Creating Trypanosomatidae_sp_GCA_003671325 table
Inserting data to Trypanosomatidae_sp_GCA_003671325 table

Running against Leishmania_major_GCF_000002725.2bit
Running with the --huge option.  Chunking files into 10000

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [30]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/kinetoplastea/extract_probes_from_group/kinetoplastea_genome.conf --lastz results/phyluce/kinetoplastea/validate_intervals/lastz --probes 120 --probe-prefix uce_kinetoplastea_ --name-pattern "Leishmania_major_GCF_000002725_+17_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/kinetoplastea/extract_probes_from_group/probe_fasta;
2020-02-11 12:42:30,907 - Phyluce - INFO - -------- Working on Angomonas_deanei_GCA_001659865 genome -------
2020-02-11 12:42:31,025 - Phyluce - INFO - Reading Angomonas_deanei_GCA_001659865 genome
2020-02-11 12:42:42,040 - Phyluce - INFO - Angomonas_deanei_GCA_001659865: 2112 uces, 342 dupes, 1770 non-dupes, 0 orient drop, 50 length drop, 1720 written
2020-02-11 12:42:42,041 - Phyluce - INFO - ------- Working on Angomonas_desouzai_GCA_000482185 genome ------
2020-02-11 12:42:42,056 - Phyluce - INFO - Reading Angomonas_desouzai_GCA_000482185 genome
2020-02-11 12:42:51,999 - Phyluce - INFO 

2020-02-11 12:47:21,466 - Phyluce - INFO - Reading Leishmania_peruviana_GCA_001403675 genome
2020-02-11 12:47:34,536 - Phyluce - INFO - Leishmania_peruviana_GCA_001403675: 3117 uces, 420 dupes, 2697 non-dupes, 7 orient drop, 124 length drop, 2566 written
2020-02-11 12:47:34,536 - Phyluce - INFO - --------- Working on Leishmania_sp_GCA_000409445 genome ---------
2020-02-11 12:47:34,618 - Phyluce - INFO - Reading Leishmania_sp_GCA_000409445 genome
2020-02-11 12:47:47,689 - Phyluce - INFO - Leishmania_sp_GCA_000409445: 3128 uces, 449 dupes, 2679 non-dupes, 7 orient drop, 140 length drop, 2532 written
2020-02-11 12:47:47,690 - Phyluce - INFO - ----- Working on Leishmania_tarentolae_GCA_009731335 genome -----
2020-02-11 12:47:47,700 - Phyluce - INFO - Reading Leishmania_tarentolae_GCA_009731335 genome
2020-02-11 12:48:03,961 - Phyluce - INFO - Leishmania_tarentolae_GCA_009731335: 3258 uces, 508 dupes, 2750 non-dupes, 12 orient drop, 185 length drop, 2553 written
2020-02-11 12:48:03,962 - Ph

2020-02-11 12:51:31,815 - Phyluce - INFO - Reading Trypanosoma_theileri_GCF_002087225 genome
2020-02-11 12:51:39,545 - Phyluce - INFO - Trypanosoma_theileri_GCF_002087225: 1831 uces, 270 dupes, 1561 non-dupes, 7 orient drop, 90 length drop, 1464 written
2020-02-11 12:51:39,546 - Phyluce - INFO - ------- Working on Trypanosoma_vivax_GCA_000227375 genome -------
2020-02-11 12:51:40,035 - Phyluce - INFO - Reading Trypanosoma_vivax_GCA_000227375 genome
2020-02-11 12:51:41,035 - Phyluce - INFO - Trypanosoma_vivax_GCA_000227375: 178 uces, 58 dupes, 120 non-dupes, 1 orient drop, 7 length drop, 111 written
2020-02-11 12:51:41,035 - Phyluce - INFO - ------ Working on Trypanosomatidae_sp_GCA_003671325 genome ------
2020-02-11 12:51:41,044 - Phyluce - INFO - Reading Trypanosomatidae_sp_GCA_003671325 genome
2020-02-11 12:52:01,429 - Phyluce - INFO - Trypanosomatidae_sp_GCA_003671325: 2721 uces, 1277 dupes, 1444 non-dupes, 0 orient drop, 38 length drop, 1406 written
2020-02-11 12:52:01,429 - Phyluc

In [31]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/kinetoplastea/extract_probes_from_group/probe_fasta --output results/phyluce/kinetoplastea/extract_probes_from_group/multifastas.sqlite --base-taxon Leishmania_major_GCF_000002725;
angomonas_deanei_gca_001659865..
angomonas_desouzai_gca_000482185..
bodo_saltans_gca_001460835..
crithidia_acanthocephali_gca_000482105...
crithidia_bombi_gca_900240985...
crithidia_expoeki_gca_900240875...
crithidia_fasciculata_gca_000331325..
crithidia_mellificae_gca_002216565..
endotrypanum_monterogeii_gca_000333855...
herpetomonas_muscarum_gca_000482205..
leishmania_aethiopica_gca_000444285...
leishmania_amazonensis_gca_005317125...
leishmania_arabica_gca_000410695...
leishmania_braziliensis_gcf_000002845...
leishmania_donovani_gcf_000227135...
leishmania_enriettii_gca_000410755...
leishmania_gerbilli_gca_000443025...
leishmania_infantum_gcf_000002875...
leishmania_lainsoni_gca_003664395...
leishmania_mexicana_gcf_000234665...
leishmania_paname

In [32]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(45)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/kinetoplastea/extract_probes_from_group/multifastas.sqlite  --base-taxon Leishmania_major_GCF_000002725 --output results/phyluce/kinetoplastea/extract_probes_from_group/Leishmania_major_GCF_000002725+17-back-to-45.conf --specific-counts 45;
Counter({'trypanosoma_rangeli_gcf_003719475': 169, 'trypanosoma_brucei_gcf_000210295': 169, 'leishmania_mexicana_gcf_000234665': 169, 'endotrypanum_monterogeii_gca_000333855': 169, 'leptomonas_seymouri_gca_001299535': 169, 'leptomonas_pyrrhocoris_gcf_001293395': 169, 'leishmania_peruviana_gca_001403675': 169, 'leishmania_turanica_gca_000441995': 169, 'leishmania_gerbilli_gca_000443025': 169, 'leishmania_sp_gca_000409445': 169, 'leishmania_aethiopica_gca_000444285': 169, 'strigomonas_oncopelti_gca_000482165': 169, 'angomonas_desouzai_gca_000482185': 169, 'trypanosoma_grayi_gcf_000691245': 169, 'leishmania_enriettii_gca_000410755': 169, 'crithidia_acanthocephali_gca_000482105': 169, 'leishmani

## Final group specific bait design

In [33]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/kinetoplastea/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/kinetoplastea/extract_probes_from_group/Leishmania_major_GCF_000002725+17-back-to-45.conf --probe-prefix uce_kinetoplastea_ --designer rnplattii --design kinetoplastea_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGG


Conserved locus count = 169
Probe Count = 15149


In [34]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.fasta --query results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  12:52:56
Ended:  Tue Feb 11, 2020  12:53:57
Time for execution:  1.01459121704 minutes


In [35]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.fasta --lastz results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_kinetoplastea_;
Parsing lastz file...
Screening results...
Screened 15148 fasta sequences.  Filtered 0 duplicates. Kept 15149.


## CDhit to reduce numbers

In [36]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.fasta
         -o
         results/phyluce/kinetoplastea/final_probe_design/kinetoplastea_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 13:17:09 2020
                            Output                              
----------------------------------------------------------------
total seq: 15149
longest and shortest : 80 and 80
Total letters: 1211920
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 48M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3939168
Max number of word counting entries: 88670684

# comparing sequences from          0  to       2524
..---------- new table with     1780 representatives
# comparing sequenc

# Onygenales

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high-quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [170]:
# name of the group processed below; used to build every data/results path
group = "onygenales"

In [161]:
# Wipe any previous phyluce results for this group and recreate the directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [171]:
# Taxa in the group, one "Genus_species_ACCESSION" name per line.
# The names are kept exactly as written: they double as file-name stems.
group_taxa = """
    Amauroascus_mutatus_GCA_001430935
    Amauroascus_niger_GCA_001430945
    Ascospaera_apis_GCA_001636715
    Blastomyces_dermatitidis_GCA_000003525
    Blastomyces_gilchristii_GCF_000003855
    Blastomyces_parvus_GCA_002572885
    Blastomyces_percursus_GCA_003206225
    Blastomyces_silverae_GCA_001014755
    Blastomyces_sp_GCA_003206725
    Byssoonygena_ceratinophila_GCA_001430925
    Chrysosprium_queenslandicum_GCA_001430955
    Coccidioides_immitis_GCF_000149335
    Coccidioides_posadasii_GCF_000151335
    Emergomyces_orientalis_GCA_002110485
    Emergomyces_pasteurianus_GCA_001883825
    Emmonsia_crescens_GCA_002572855
    Emmonsia_sp_GCA_001660665
    Helicocarpus_griseus_GCA_002573585
    Malbranchea_cinnamomea_GCA_900128795
    Microsprum_canis_GCF_000151145
    Nannizzia_gypsea_GCF_000150975
    Onygena_corvina_GCA_000812245
    Ophidiomyces_ophiodiicola_GCA_002167195
    Paracoccidioides_brasiliensis_GCF_000150735
    Paracoccidioides_lutzii_GCF_000150705
    Polytolypa_hystricis_GCA_002573605
    Trichophyton_benhamiae_GCF_000151125
    Trichophyton_equinum_GCA_000151175
    Trichophyton_interdigitale_GCA_000622975
    Trichophyton_mentagrophytes_GCA_003664465
    Trichophyton_rubrum_GCF_000151425
    Trichophyton_soudanense_GCA_000616865
    Trichophyton_tonsurans_GCA_000151455
    Trichophyton_verrucosum_GCF_000151505
    Trichophyton_violaceum_GCA_001651435
    Uncinocarpus_reesii_GCF_000003515
""".split()

# genome that every other taxon is compared against
reference_taxon = "Histoplasma_capsulatum_GCF_000149585"

# group members followed by the reference
all_taxa = group_taxa + [reference_taxon]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [163]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001430935.1_ASM143093v1_genomic.fna.gz

sent 42 bytes  received 8869677 bytes  3547887.60 bytes/sec
total size is 8867402  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipl

GCA_001430955.1_ASM143095v1_genomic.fna.gz

sent 42 bytes  received 9716061 bytes  6477402.00 bytes/sec
total size is 9713578  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000149335.2_ASM14933v2_genomic.fna.gz

sent 42 bytes  received 9036510 bytes  6024368.00 bytes/sec
total size is 9034196  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_000150975.2_MS_CBS118893_genomic.fna.gz

sent 42 bytes  received 7334051 bytes  4889395.33 bytes/sec
total size is 7332152  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000812245.1_ASM81224v1_genomic.fna.gz

sent 42 bytes  received 6790128 bytes  2716068.00 bytes/sec
total size is 6788359  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_000616865.1_Tric_soud_CBS_452_61_V1_genomic.fna.gz

sent 42 bytes  received 7241144 bytes  499392.14 bytes/sec
total size is 7239258  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000151455.1_ASM15145v1_genomic.fna.gz

sent 42 bytes  received 7165270 bytes  4776874.67 bytes/sec
total size is 7163413  speedup is 1.00


You are accessing a U.S. Government information system which includes t

## Clean up and format genome

Further process the genomes so that each FASTA header contains only the accession, and then convert each genome to 2bit format.  Both are requirements of the `phyluce` pipeline.

In [167]:
# Rebuild the cleaned-genome directory from scratch so files left over from a
# previous run cannot be picked up by later steps.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # Blank the name and description so that only the record id is written
    # to each FASTA header.  Plain "r" is used because the "rU" mode is
    # deprecated and was removed in Python 3.11.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the argv list directly (rather than splitting a string on spaces)
    # so that paths containing spaces are passed through intact.
    f2bit_cmd = ["faToTwoBit", i_genome_file, o_2bit_file]
    f2bit_status = subprocess.call(f2bit_cmd)

    # Report failures instead of silently discarding the exit status; the
    # loop keeps going so one bad genome does not block the others.
    if f2bit_status != 0:
        print("WARNING: faToTwoBit exited with status {} for {}".format(f2bit_status, sample))

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified by giant commented block text. 

In [168]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5485968 ("sim_Amauroascus_mutatus_GCA_001430935") has been submitted
Your job 5485969 ("sim_Amauroascus_niger_GCA_001430945") has been submitted
Your job 5485970 ("sim_Ascospaera_apis_GCA_001636715") has been submitted
Your job 5485971 ("sim_Blastomyces_dermatitidis_GCA_000003525") has been submitted
Your job 5485972 ("sim_Blastomyces_gilchristii_GCF_000003855") has been submitted
Your job 5485973 ("sim_Blastomyces_parvus_GCA_002572885") has been submitted
Your job 5485974 ("sim_Blastomyces_percursus_GCA_003206225") has been submitted
Your job 5485975 ("sim_Blastomyces_silverae_GCA_001014755") has been submitted
Your job 5485976 ("sim_Blastomyces_sp_GCA_003206725") has been submitted
Your job 5485977 ("sim_Byssoonygena_ceratinophila_GCA_001430925") has been submitted
Your job 5485978 ("sim_Chrysosprium_queenslandicum_GCA_001430955") has been submitted
Your job 5485979 ("sim_Coccidioides_immitis_GCF_000149335") has been submitted
Your job 5485980 ("sim_Coccidioides_posadasii_GC

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [169]:
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

# wipe any earlier mapping results, then recreate the tree (including logs/)
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

# bbmap writes its index of the reference genome under `path`
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# left as the cell's final expression so the exit status is displayed
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/onygenales/cleaned_genomes/Histoplasma_capsulatum_GCF_000149585_formatted.fas path=results/phyluce/onygenales/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to ~10% divergence (`minid=0.90`) between the reference genome and each simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [170]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486004 ("map_Amauroascus_mutatus_GCA_001430935") has been submitted
Your job 5486005 ("map_Amauroascus_niger_GCA_001430945") has been submitted
Your job 5486006 ("map_Ascospaera_apis_GCA_001636715") has been submitted
Your job 5486007 ("map_Blastomyces_dermatitidis_GCA_000003525") has been submitted
Your job 5486008 ("map_Blastomyces_gilchristii_GCF_000003855") has been submitted
Your job 5486009 ("map_Blastomyces_parvus_GCA_002572885") has been submitted
Your job 5486010 ("map_Blastomyces_percursus_GCA_003206225") has been submitted
Your job 5486011 ("map_Blastomyces_silverae_GCA_001014755") has been submitted
Your job 5486012 ("map_Blastomyces_sp_GCA_003206725") has been submitted
Your job 5486013 ("map_Byssoonygena_ceratinophila_GCA_001430925") has been submitted
Your job 5486014 ("map_Chrysosprium_queenslandicum_GCA_001430955") has been submitted
Your job 5486015 ("map_Coccidioides_immitis_GCF_000149335") has been submitted
Your job 5486016 ("map_Coccidioides_posadasii_GC

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [171]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486040 ("merge_Amauroascus_mutatus_GCA_001430935") has been submitted
Your job 5486041 ("merge_Amauroascus_niger_GCA_001430945") has been submitted
Your job 5486042 ("merge_Ascospaera_apis_GCA_001636715") has been submitted
Your job 5486043 ("merge_Blastomyces_dermatitidis_GCA_000003525") has been submitted
Your job 5486044 ("merge_Blastomyces_gilchristii_GCF_000003855") has been submitted
Your job 5486045 ("merge_Blastomyces_parvus_GCA_002572885") has been submitted
Your job 5486046 ("merge_Blastomyces_percursus_GCA_003206225") has been submitted
Your job 5486047 ("merge_Blastomyces_silverae_GCA_001014755") has been submitted
Your job 5486048 ("merge_Blastomyces_sp_GCA_003206725") has been submitted
Your job 5486049 ("merge_Byssoonygena_ceratinophila_GCA_001430925") has been submitted
Your job 5486050 ("merge_Chrysosprium_queenslandicum_GCA_001430955") has been submitted
Your job 5486051 ("merge_Coccidioides_immitis_GCF_000149335") has been submitted
Your job 5486052 ("merge

Remove intervals that are more than 25% masked, contain N bases, or are shorter than 160 bp in the reference genome.

In [173]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 350 sequences from Amauroascus_mutatus_GCA_001430935_merged.bed.  Filtered 258 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 92.
Traceback (most recent call last):
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 122, in <module>
    main()
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 111, in main
    cnt + 1,
UnboundLocalError: local variable 'cnt' referenced before assignment
Traceback (most recent call last):
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 122, in <module>
    main()
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 111, in main
    cnt + 1,
UnboundLocalError: local variable 'cnt' referenced before assignment
Screened 20819 sequences from Blastomyces_dermatitidis_GCA_000003525_merg

Traceback (most recent call last):
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 122, in <module>
    main()
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 111, in main
    cnt + 1,
UnboundLocalError: local variable 'cnt' referenced before assignment
Traceback (most recent call last):
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 122, in <module>
    main()
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 111, in main
    cnt + 1,
UnboundLocalError: local variable 'cnt' referenced before assignment
Traceback (most recent call last):
  File "/master/nplatt/anaconda3/envs/pathogen_probes-phyluce/bin/phyluce_probe_strip_masked_loci_from_set", line 122, in <module>
    main()
  File "/master/nplatt/anaconda3/envs/pathogen_prob

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [174]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# conf layout: a "[beds]" header followed by one "taxon:stripped_bed_path"
# line per taxon in the group
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [175]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/onygenales/initial_intervals/Uncinocarpus_reesii_GCF_000003515_merged.bed --twobit results/phyluce/onygenales/cleaned_genomes/Histoplasma_capsulatum_GCF_000149585_formatted.2bit --output results/phyluce/onygenales/initial_intervals/Uncinocarpus_reesii_GCF_000003515_stripped.bed;
amauroascus_mutatus_gca_001430935.
amauroascus_niger_gca_001430945
ascospaera_apis_gca_001636715
blastomyces_dermatitidis_gca_000003525.........
blastomyces_gilchristii_gcf_000003855.........
blastomyces_parvus_gca_002572885........
blastomyces_percursus_gca_003206225.........
blastomyces_silverae_gca_001014755.........
blastomyces_sp_gca_003206725
byssoonygena_ceratinophila_gca_001430925
chrysosprium_queenslandicum_gca_001430955.
coccidioides_immitis_gcf_000149335.
coccidioides_posadasii_gcf_000151335.
emergomyces_orientalis_gca_002110485.........
emergomyces_pasteurianus_gca_001883825......
emmonsia_crescens_gca

Quantify probes and the number of targeted taxa for each.

In [176]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/onygenales/initial_intervals/onygenales-to-Histoplasma_capsulatum_GCF_000149585.sqlite --base-taxon Histoplasma_capsulatum_GCF_000149585
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 0 taxa:	16,588.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 1 taxa:	16,588.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 2 taxa:	12,521.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 3 taxa:	9,887.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 4 taxa:	7,898.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 5 taxa:	6,143.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 6 taxa:	4,434.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 7 taxa:	3,004.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 8 taxa:	1,812.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 9 taxa:	618.0
Loci shared by Histoplasma_capsulatum_GCF_000149585 + 10 taxa:	329.0
Loci shared by Histoplasma_capsula

In [177]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 9
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/onygenales/initial_intervals/onygenales-to-Histoplasma_capsulatum_GCF_000149585.sqlite --base-taxon Histoplasma_capsulatum_GCF_000149585 --output results/phyluce/onygenales/initial_intervals/Histoplasma_capsulatum_GCF_000149585_+9.bed --specific-counts 9;
Counter({'blastomyces_gilchristii_gcf_000003855': 614, 'blastomyces_dermatitidis_gca_000003525': 612, 'blastomyces_percursus_gca_003206225': 611, 'emergomyces_orientalis_gca_002110485': 605, 'blastomyces_parvus_gca_002572885': 591, 'emergomyces_pasteurianus_gca_001883825': 581, 'emmonsia_sp_gca_001660665': 572, 'blastomyces_silverae_gca_001014755': 572, 'paracoccidioides_brasiliensis_gcf_000150735': 438, 'paracoccidioides_lutzii_gcf_000150705': 409, 'helicocarpus_griseus_gca_002573585': 326, 'coccidioides_posadasii_gcf_000151335': 45, 'coccidioides_immitis_gcf_000149335': 42, 'trichophyton_mentagrophytes_gca_003664465': 32, 'chrysosprium_queenslandicum_gca_001430955': 31, 'ama

## Design temp set of baits

In [178]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/onygenales/initial_intervals/Histoplasma_capsulatum_GCF_000149585_+9.bed --twobit results/phyluce/onygenales/cleaned_genomes/Histoplasma_capsulatum_GCF_000149585_formatted.2bit --buffer-to 160 --output results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9.fasta;
Screened 618 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 618.


Design the baits.

In [179]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9.fasta --probe-prefix uce_onygenales_ --design onygenales_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
G


Conserved locus count = 618
Probe Count = 1235


## Find duplicate baited regions

In [180]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas --query results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  13:46:56
Ended:  Mon Feb 10, 2020  13:46:56
Time for execution:  0.00786991516749 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas                        --lastz results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes_vself.lastz                       --probe-prefix=uce_onygenales_;
Parsing lastz file...
Screening results...
Screened 1234 fasta sequences.  Filtered 8 duplicates. Kep

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [181]:
# Give every taxon its own sub-directory: cleaned_genomes/<taxon>/<taxon>.2bit
# NOTE: the 2bit file is copied into place (not soft-linked).
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)
    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [182]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/onygenales/validate_intervals/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas --scaffoldlist Amauroascus_mutatus_GCA_001430935 Amauroascus_niger_GCA_001430945 Ascospaera_apis_GCA_001636715 Blastomyces_dermatitidis_GCA_000003525 Blastomyces_gilchristii_GCF_000003855 Blastomyces_parvus_GCA_002572885 Blastomyces_percursus_GCA_003206225 Blastomyces_silverae_GCA_001014755 Blastomyces_sp_GCA_003206725 Byssoonygena_ceratinophila_GCA_001430925 Chrysosprium_queenslandicum_GCA_001430955 Coccidioides_immitis_GCF_000149335 Coccidioides_posadasii_GCF_000151335 Emergomyces_orientalis_GCA_002110485 Emergomyces_pasteurianus_GCA_001883825 Emmonsia_crescens_GCA_002572855 Emmonsia_sp_GCA_001660665 Helicocarpus_griseus_GCA_002573585 Malbranchea_cinnamomea_GCA_900128795 Microsprum_canis_GCF_000151145 Nannizzia_gypsea_GCF_000150975 Onygena_corvina_GCA_000812245 Ophidiomyces_ophiodiicola_GCA_002167195 Paracoccidioides_brasiliensis_G

Running the targets against 3 queries...
	/tmp/tmpUNunqp.fasta
	/tmp/tmpwBJQIA.fasta
	/tmp/tmpYWYdgP.fasta

Writing the results file...
	/tmp/tmpb659Bz.lastz
	/tmp/tmpbO2SI2.lastz
	/tmp/tmpZUzx6p.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/onygenales/validate_intervals/lastz/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas_v_Byssoonygena_ceratinophila_GCA_001430925.lastz
Creating Byssoonygena_ceratinophila_GCA_001430925 table
Inserting data to Byssoonygena_ceratinophila_GCA_001430925 table

Running against Chrysosprium_queenslandicum_GCA_001430955.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 4 queries...
	/tmp/tmp_jKUd0.fasta
	/tmp/tmpCzN5Of.fasta
	/tmp/tmpUdWzd9.fasta
	/tmp/tmpQdaeZU.fasta

Writing the results file...
	/tmp/tmp9K0lUI.lastz
	/tmp/tmptmJW5D.lastz
	/tmp/tmptrqzUT.lastz
	/tmp/tmpJeJpab.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_pro

Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/onygenales/validate_intervals/lastz/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas_v_Onygena_corvina_GCA_000812245.lastz
Creating Onygena_corvina_GCA_000812245 table
Inserting data to Onygena_corvina_GCA_000812245 table

Running against Ophidiomyces_ophiodiicola_GCA_002167195.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpp6yWQ5.fasta
	/tmp/tmp1P9uVk.fasta
	/tmp/tmpwfzTwI.fasta

Writing the results file...
	/tmp/tmpksKEFm.lastz
	/tmp/tmpo9KwCy.lastz
	/tmp/tmpnxur2A.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/onygenales/validate_intervals/lastz/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas_v_Ophidiomyces_ophiodiicola_GCA_002167195.lastz
Creating Ophidiomyces_ophiodiicola_GCA_002167195 table
Inserting data to Ophidiomyces_ophiodiicola_GCA_002167195 table

R

Creating Trichophyton_verrucosum_GCF_000151505 table
Inserting data to Trichophyton_verrucosum_GCF_000151505 table

Running against Trichophyton_violaceum_GCA_001651435.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpGj6VkP.fasta
	/tmp/tmp6TShDG.fasta
	/tmp/tmpKaQb0r.fasta

Writing the results file...
	/tmp/tmpCYpw03.lastz
	/tmp/tmpKavwzi.lastz
	/tmp/tmpQ_dWMn.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/onygenales/validate_intervals/lastz/Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas_v_Trichophyton_violaceum_GCA_001651435.lastz
Creating Trichophyton_violaceum_GCA_001651435 table
Inserting data to Trichophyton_violaceum_GCA_001651435 table

Running against Uncinocarpus_reesii_GCF_000003515.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmp9kHFj5.fasta
	/tmp/tmpklxJO1.fasta
	/tmp/tm

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [183]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/onygenales/extract_probes_from_group/onygenales_genome.conf --lastz results/phyluce/onygenales/validate_intervals/lastz --probes 120 --probe-prefix uce_onygenales_ --name-pattern "Histoplasma_capsulatum_GCF_000149585_+9_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/onygenales/extract_probes_from_group/probe_fasta;
2020-02-10 13:49:49,236 - Phyluce - INFO - ------ Working on Amauroascus_mutatus_GCA_001430935 genome ------
2020-02-10 13:49:49,247 - Phyluce - INFO - Reading Amauroascus_mutatus_GCA_001430935 genome
2020-02-10 13:49:51,440 - Phyluce - INFO - Amauroascus_mutatus_GCA_001430935: 584 uces, 77 dupes, 507 non-dupes, 0 orient drop, 1 length drop, 506 written
2020-02-10 13:49:51,441 - Phyluce - INFO - ------- Working on Amauroascus_niger_GCA_001430945 genome -------
2020-02-10 13:49:51,441 - Phyluce - INFO - Reading Amauroascus_niger_GCA_001430945 genome
2020-02-10 13:49:53,645 - Phyluce - INFO - Amauroas

2020-02-10 13:50:39,196 - Phyluce - INFO - Onygena_corvina_GCA_000812245: 567 uces, 66 dupes, 501 non-dupes, 0 orient drop, 0 length drop, 501 written
2020-02-10 13:50:39,196 - Phyluce - INFO - --- Working on Ophidiomyces_ophiodiicola_GCA_002167195 genome ---
2020-02-10 13:50:39,197 - Phyluce - INFO - Reading Ophidiomyces_ophiodiicola_GCA_002167195 genome
2020-02-10 13:50:41,266 - Phyluce - INFO - Ophidiomyces_ophiodiicola_GCA_002167195: 584 uces, 61 dupes, 523 non-dupes, 2 orient drop, 4 length drop, 517 written
2020-02-10 13:50:41,267 - Phyluce - INFO - - Working on Paracoccidioides_brasiliensis_GCF_000150735 genome -
2020-02-10 13:50:41,280 - Phyluce - INFO - Reading Paracoccidioides_brasiliensis_GCF_000150735 genome
2020-02-10 13:50:43,924 - Phyluce - INFO - Paracoccidioides_brasiliensis_GCF_000150735: 610 uces, 72 dupes, 538 non-dupes, 5 orient drop, 5 length drop, 528 written
2020-02-10 13:50:43,924 - Phyluce - INFO - ---- Working on Paracoccidioides_lutzii_GCF_000150705 genome -

In [184]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/onygenales/extract_probes_from_group/probe_fasta --output results/phyluce/onygenales/extract_probes_from_group/multifastas.sqlite --base-taxon Histoplasma_capsulatum_GCF_000149585;
amauroascus_mutatus_gca_001430935.
amauroascus_niger_gca_001430945.
ascospaera_apis_gca_001636715.
blastomyces_dermatitidis_gca_000003525.
blastomyces_gilchristii_gcf_000003855.
blastomyces_parvus_gca_002572885.
blastomyces_percursus_gca_003206225.
blastomyces_silverae_gca_001014755.
blastomyces_sp_gca_003206725.
byssoonygena_ceratinophila_gca_001430925.
chrysosprium_queenslandicum_gca_001430955.
coccidioides_immitis_gcf_000149335.
coccidioides_posadasii_gcf_000151335.
emergomyces_orientalis_gca_002110485.
emergomyces_pasteurianus_gca_001883825.
emmonsia_crescens_gca_002572855.
emmonsia_sp_gca_001660665.
helicocarpus_griseus_gca_002573585.
malbranchea_cinnamomea_gca_900128795.
microsprum_canis_gcf_000151145.
nannizzia_gypsea_gcf_000150975.
onygena_

In [185]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(37)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/onygenales/extract_probes_from_group/multifastas.sqlite  --base-taxon Histoplasma_capsulatum_GCF_000149585 --output results/phyluce/onygenales/extract_probes_from_group/Histoplasma_capsulatum_GCF_000149585+9-back-to-37.conf --specific-counts 37;
Counter({'trichophyton_verrucosum_gcf_000151505': 250, 'coccidioides_immitis_gcf_000149335': 250, 'amauroascus_niger_gca_001430945': 250, 'blastomyces_percursus_gca_003206225': 250, 'microsprum_canis_gcf_000151145': 250, 'blastomyces_parvus_gca_002572885': 250, 'blastomyces_sp_gca_003206725': 250, 'histoplasma_capsulatum_gcf_000149585': 250, 'blastomyces_gilchristii_gcf_000003855': 250, 'emergomyces_orientalis_gca_002110485': 250, 'trichophyton_soudanense_gca_000616865': 250, 'malbranchea_cinnamomea_gca_900128795': 250, 'trichophyton_violaceum_gca_001651435': 250, 'onygena_corvina_gca_000812245': 250, 'amauroascus_mutatus_gca_001430935': 250, 'polytolypa_hystricis_gca_002573605': 250, '

## Final group specific bait design

In [186]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/onygenales/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/onygenales/extract_probes_from_group/Histoplasma_capsulatum_GCF_000149585+9-back-to-37.conf --probe-prefix uce_onygenales_ --designer rnplattii --design onygenales_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.fasta;
GGGGGGGGGGGGNNGGGG


Conserved locus count = 250
Probe Count = 18482


In [187]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.fasta --query results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Mon Feb 10, 2020  13:54:04
Ended:  Mon Feb 10, 2020  13:55:09
Time for execution:  1.08550671736 minutes


In [188]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.fasta --lastz results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_onygenales_;
Parsing lastz file...
Screening results...
Screened 18481 fasta sequences.  Filtered 0 duplicates. Kept 18482.


## CDhit to reduce numbers

In [189]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.fasta
         -o
         results/phyluce/onygenales/final_probe_design/onygenales_v1-master_probe_list.95P_cdhit

Started: Mon Feb 10 14:20:39 2020
                            Output                              
----------------------------------------------------------------
total seq: 18482
longest and shortest : 80 and 80
Total letters: 1478560
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 49M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 91M

Table limit with the given memory limit:
Max number of representatives: 3933501
Max number of word counting entries: 88543113

# comparing sequences from          0  to       3080
...---------- new table with     1970 representatives
# comparing sequences from    

# Trematodes

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [190]:
# Group handled by the cells below; it is spliced into the results/ and data/ paths.
group = "trematoda"

In [173]:
# Wipe any previous results for this group, then recreate an empty directory.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [191]:
# Non-reference taxa for this group, named Genus_species_<assembly accession>.
group_taxa = [
    "Clonorchis_sinensis_GCA_003604175",
    "Dicrocoelium_dendriticum_GCA_000950715",
    "Echinostoma_caproni_GCA_900618425",
    "Fasciola_gigantica_GCA_006461475",
    "Fasciola_hepatica_GCA_900302435",
    "Fasciolopsis_buski_GCA_008360955",
    "Opisthorchis_felineus_GCA_004794785",
    "Opisthorchis_viverrini_GCF_000715545",
    "Paragonimus_westermani_GCA_008508345",
    "Schistosoma_bovis_GCA_003958945",
    "Schistosoma_curassoni_GCA_900618015",
    "Schistosoma_haematobium_GCF_000699445",
    "Schistosoma_japonicum_GCA_006368765",
    "Schistosoma_margrebowiei_GCA_900618395",
    "Schistosoma_mattheei_GCA_900617995",
    "Trichobilharzia_regenti_GCA_900618515",
]

# Genome that all the other taxa are compared against.
reference_taxon = "Schistosoma_mansoni_GCA_000237925"

# Every taxon in the group, with the reference taxon last.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [6]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003604175.1_ASM360417v1_genomic.fna.gz

sent 42 bytes  received 175932734 bytes  7180929.63 bytes/sec
total size is 175889683  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
dis


GCA_900618015.1_S_curassoni_Dakar_0011_upd_genomic.fna.gz

sent 42 bytes  received 110903031 bytes  7155036.97 bytes/sec
total size is 110875837  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000699445.1_SchHae_1.0_genomic.fna.gz

sent 42 bytes  received 115390522 bytes  6993367.52 bytes/sec
total size is 115362248  speedup is 1.00


You are accessing a U.S. Government information system whi

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [7]:
# Rebuild cleaned_genomes/ from scratch for this group.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite the FASTA with each record's name and description blanked so the
    # written header carries only the record id.
    # Plain "r" replaces the deprecated "rU" mode, which Python >= 3.11 rejects.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA written above (previously the raw
    # download was converted even though the output is named *_formatted.2bit).
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)
    subprocess.call(f2bit_cmd.split(" "))

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [8]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486076 ("sim_Clonorchis_sinensis_GCA_003604175") has been submitted
Your job 5486077 ("sim_Dicrocoelium_dendriticum_GCA_000950715") has been submitted
Your job 5486078 ("sim_Echinostoma_caproni_GCA_900618425") has been submitted
Your job 5486079 ("sim_Fasciola_gigantica_GCA_006461475") has been submitted
Your job 5486080 ("sim_Fasciola_hepatica_GCA_900302435") has been submitted
Your job 5486081 ("sim_Fasciolopsis_buski_GCA_008360955") has been submitted
Your job 5486082 ("sim_Opisthorchis_felineus_GCA_004794785") has been submitted
Your job 5486083 ("sim_Opisthorchis_viverrini_GCF_000715545") has been submitted
Your job 5486084 ("sim_Paragonimus_westermani_GCA_008508345") has been submitted
Your job 5486085 ("sim_Schistosoma_bovis_GCA_003958945") has been submitted
Your job 5486086 ("sim_Schistosoma_curassoni_GCA_900618015") has been submitted
Your job 5486087 ("sim_Schistosoma_haematobium_GCF_000699445") has been submitted
Your job 5486088 ("sim_Schistosoma_japonicum_GCA_00

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [9]:
# Recreate the mapping directory (with its logs/ sub-directory) from scratch.
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# Run bbmap on the reference taxon's cleaned genome, with path= pointing at o_dir.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# runs locally (not through qsub); the exit status is the cell's displayed value
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/trematoda/cleaned_genomes/Schistosoma_mansoni_GCA_000237925_formatted.fas path=results/phyluce/trematoda/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 15% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [10]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486092 ("map_Clonorchis_sinensis_GCA_003604175") has been submitted
Your job 5486093 ("map_Dicrocoelium_dendriticum_GCA_000950715") has been submitted
Your job 5486094 ("map_Echinostoma_caproni_GCA_900618425") has been submitted
Your job 5486095 ("map_Fasciola_gigantica_GCA_006461475") has been submitted
Your job 5486096 ("map_Fasciola_hepatica_GCA_900302435") has been submitted
Your job 5486097 ("map_Fasciolopsis_buski_GCA_008360955") has been submitted
Your job 5486098 ("map_Opisthorchis_felineus_GCA_004794785") has been submitted
Your job 5486099 ("map_Opisthorchis_viverrini_GCF_000715545") has been submitted
Your job 5486100 ("map_Paragonimus_westermani_GCA_008508345") has been submitted
Your job 5486101 ("map_Schistosoma_bovis_GCA_003958945") has been submitted
Your job 5486102 ("map_Schistosoma_curassoni_GCA_900618015") has been submitted
Your job 5486103 ("map_Schistosoma_haematobium_GCF_000699445") has been submitted
Your job 5486104 ("map_Schistosoma_japonicum_GCA_00

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [192]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486832 ("merge_Clonorchis_sinensis_GCA_003604175") has been submitted
Your job 5486833 ("merge_Dicrocoelium_dendriticum_GCA_000950715") has been submitted
Your job 5486834 ("merge_Echinostoma_caproni_GCA_900618425") has been submitted
Your job 5486835 ("merge_Fasciola_gigantica_GCA_006461475") has been submitted
Your job 5486836 ("merge_Fasciola_hepatica_GCA_900302435") has been submitted
Your job 5486837 ("merge_Fasciolopsis_buski_GCA_008360955") has been submitted
Your job 5486838 ("merge_Opisthorchis_felineus_GCA_004794785") has been submitted
Your job 5486839 ("merge_Opisthorchis_viverrini_GCF_000715545") has been submitted
Your job 5486840 ("merge_Paragonimus_westermani_GCA_008508345") has been submitted
Your job 5486841 ("merge_Schistosoma_bovis_GCA_003958945") has been submitted
Your job 5486842 ("merge_Schistosoma_curassoni_GCA_900618015") has been submitted
Your job 5486843 ("merge_Schistosoma_haematobium_GCF_000699445") has been submitted
Your job 5486844 ("merge_Sc

Remove loci that are heavily masked in the reference genome (more than 25% masked bases) or that are shorter than 160 bp.

In [193]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 831 sequences from Clonorchis_sinensis_GCA_003604175_merged.bed.  Filtered 805 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 26.
Screened 217 sequences from Dicrocoelium_dendriticum_GCA_000950715_merged.bed.  Filtered 213 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 926 sequences from Echinostoma_caproni_GCA_900618425_merged.bed.  Filtered 909 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 17.
Screened 1179 sequences from Fasciola_gigantica_GCA_006461475_merged.bed.  Filtered 1138 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 41.
Screened 1337 sequences from Fasciola_hepatica_GCA_900302435_merged.bed.  Filtered 1297 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 40.
Screened 1077 sequences from Fasciolopsis_buski_GCA_008360955_merged.bed.  Filtered 1042 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 35.
Screened 620 sequences from Opisthorchis_felineus_GCA_004794785_

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [194]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# Assemble the conf text first: a [beds] section header followed by one
# "<taxon>:<path to its stripped bed file>" entry per taxon.
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

# ...then write it out in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [195]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/trematoda/initial_intervals/Trichobilharzia_regenti_GCA_900618515_merged.bed --twobit results/phyluce/trematoda/cleaned_genomes/Schistosoma_mansoni_GCA_000237925_formatted.2bit --output results/phyluce/trematoda/initial_intervals/Trichobilharzia_regenti_GCA_900618515_stripped.bed;
clonorchis_sinensis_gca_003604175.
dicrocoelium_dendriticum_gca_000950715.
echinostoma_caproni_gca_900618425.
fasciola_gigantica_gca_006461475.
fasciola_hepatica_gca_900302435.
fasciolopsis_buski_gca_008360955.
opisthorchis_felineus_gca_004794785.
opisthorchis_viverrini_gcf_000715545.
paragonimus_westermani_gca_008508345.
schistosoma_bovis_gca_003958945......................................................................................................
schistosoma_curassoni_gca_900618015.......................................................................................................
schistosoma_haematobiu

Quantify probes and the number of targeted taxa for each.

In [196]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/trematoda/initial_intervals/trematoda-to-Schistosoma_mansoni_GCA_000237925.sqlite --base-taxon Schistosoma_mansoni_GCA_000237925
Loci shared by Schistosoma_mansoni_GCA_000237925 + 0 taxa:	150,106.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 1 taxa:	150,106.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 2 taxa:	120,696.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 3 taxa:	101,035.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 4 taxa:	82,688.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 5 taxa:	59,269.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 6 taxa:	6,107.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 7 taxa:	695.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 8 taxa:	38.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 9 taxa:	28.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 10 taxa:	23.0
Loci shared by Schistosoma_mansoni_GCA_000237925 + 11 taxa:	20.0
Loci share

In [197]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 6
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/trematoda/initial_intervals/trematoda-to-Schistosoma_mansoni_GCA_000237925.sqlite --base-taxon Schistosoma_mansoni_GCA_000237925 --output results/phyluce/trematoda/initial_intervals/Schistosoma_mansoni_GCA_000237925_+6.bed --specific-counts 6;
Counter({'schistosoma_curassoni_gca_900618015': 6098, 'schistosoma_bovis_gca_003958945': 6094, 'schistosoma_margrebowiei_gca_900618395': 6093, 'schistosoma_haematobium_gcf_000699445': 6089, 'schistosoma_mattheei_gca_900617995': 6082, 'schistosoma_japonicum_gca_006368765': 5974, 'trichobilharzia_regenti_gca_900618515': 860, 'fasciola_gigantica_gca_006461475': 33, 'fasciolopsis_buski_gca_008360955': 33, 'fasciola_hepatica_gca_900302435': 29, 'paragonimus_westermani_gca_008508345': 27, 'clonorchis_sinensis_gca_003604175': 25, 'opisthorchis_felineus_gca_004794785': 22, 'opisthorchis_viverrini_gcf_000715545': 19, 'echinostoma_caproni_gca_900618425': 17, 'dicrocoelium_dendriticum_gca_000950715'

## Design temp set of baits

In [198]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/trematoda/initial_intervals/Schistosoma_mansoni_GCA_000237925_+6.bed --twobit results/phyluce/trematoda/cleaned_genomes/Schistosoma_mansoni_GCA_000237925_formatted.2bit --buffer-to 160 --output results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6.fasta;
Screened 6107 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 6107.


design the baits

In [199]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6.fasta --probe-prefix uce_trematoda_ --design trematoda_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

## Find duplicate baited regions

In [200]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas --query results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  14:41:58
Ended:  Mon Feb 10, 2020  14:42:05
Time for execution:  0.104172933102 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas                        --lastz results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes_vself.lastz                       --probe-prefix=uce_trematoda_;
Parsing lastz file...
Screening results...
Screened 10162 fasta sequences.  Filtered 372 duplicates. Kept 9461.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [201]:
# Give every taxon its own sub-directory under cleaned_genomes/ containing a
# copy of its formatted 2bit file, i.e. <cleaned_genomes>/<taxon>/<taxon>.2bit.
# (The files are copied, not linked.)
genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # recreate the per-taxon directory so a stale copy is never reused
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [202]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/trematoda/validate_intervals/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas --scaffoldlist Clonorchis_sinensis_GCA_003604175 Dicrocoelium_dendriticum_GCA_000950715 Echinostoma_caproni_GCA_900618425 Fasciola_gigantica_GCA_006461475 Fasciola_hepatica_GCA_900302435 Fasciolopsis_buski_GCA_008360955 Opisthorchis_felineus_GCA_004794785 Opisthorchis_viverrini_GCF_000715545 Paragonimus_westermani_GCA_008508345 Schistosoma_bovis_GCA_003958945 Schistosoma_curassoni_GCA_900618015 Schistosoma_haematobium_GCF_000699445 Schistosoma_japonicum_GCA_006368765 Schistosoma_margrebowiei_GCA_900618395 Schistosoma_mattheei_GCA_900617995 Trichobilharzia_regenti_GCA_900618515 Schistosoma_mansoni_GCA_000237925 --genome-base-path results/phyluce/trematoda/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/trematoda/validate_intervals/trematoda-to-Schistosoma_mansoni_GCA_000237925.sqlite --output results/phyluce/trematoda/validat

	/tmp/tmphtGoHb.fasta
	/tmp/tmpaNQPtm.fasta
	/tmp/tmprdqv48.fasta
	/tmp/tmpLSPKuW.fasta
	/tmp/tmpEevwvq.fasta
	/tmp/tmp3bOcBU.fasta
	/tmp/tmps7028B.fasta
	/tmp/tmpoRdEiP.fasta
	/tmp/tmpMRuoWu.fasta
	/tmp/tmpsJAnDi.fasta
	/tmp/tmpxwzcsT.fasta
	/tmp/tmpnYZh_j.fasta
	/tmp/tmpuBRM4o.fasta
	/tmp/tmp0HeQIx.fasta
	/tmp/tmpbklzqZ.fasta
	/tmp/tmp5_26ZD.fasta
	/tmp/tmp5p7i6F.fasta
	/tmp/tmpaPMPZO.fasta
	/tmp/tmpF0apuw.fasta
	/tmp/tmpvRA510.fasta
	/tmp/tmpoYy_F2.fasta
	/tmp/tmpY2XeKQ.fasta
	/tmp/tmphflLNO.fasta
	/tmp/tmpLXKKXE.fasta
	/tmp/tmp1Sdykm.fasta

Writing the results file...
	/tmp/tmpC_5D8M.lastz
	/tmp/tmp0D0Vvt.lastz
	/tmp/tmpwY9Crd.lastz
	/tmp/tmpi6l40o.lastz
	/tmp/tmpsPWKEn.lastz
	/tmp/tmpAhJb1O.lastz
	/tmp/tmpOHocZk.lastz
	/tmp/tmp6svkQu.lastz
	/tmp/tmpG4aFE3.lastz
	/tmp/tmpy6oLzY.lastz
	/tmp/tmpjzA41W.lastz
	/tmp/tmpq2U6HU.lastz
	/tmp/tmpX1oJLf.lastz
	/tmp/tmptRnhaW.lastz
	/tmp/tmp91_j2c.lastz
	/tmp/tmpCHzMDi.lastz
	/tmp/tmp4jghab.lastz
	/tmp/tmpikm2hK.lastz
	/tmp/tmpf3UWDQ.lastz
	/t


Running against Fasciola_hepatica_GCA_900302435.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 109 queries...
	/tmp/tmpnVQOOh.fasta
	/tmp/tmpdaJXrM.fasta
	/tmp/tmple6b5z.fasta
	/tmp/tmpxNA6GW.fasta
	/tmp/tmpGaZdFs.fasta
	/tmp/tmpzQIQqL.fasta
	/tmp/tmpwEiolZ.fasta
	/tmp/tmpfsC0t1.fasta
	/tmp/tmpgpk1Y1.fasta
	/tmp/tmpSnf4Go.fasta
	/tmp/tmpAe0i_p.fasta
	/tmp/tmp4FbAqs.fasta
	/tmp/tmpVgG21_.fasta
	/tmp/tmpTd2QtC.fasta
	/tmp/tmpIuSUGN.fasta
	/tmp/tmpALiS38.fasta
	/tmp/tmpXFygqv.fasta
	/tmp/tmpAt7qNF.fasta
	/tmp/tmplqkmZM.fasta
	/tmp/tmp01WsNt.fasta
	/tmp/tmpPcPDeB.fasta
	/tmp/tmpREhYZT.fasta
	/tmp/tmpDcM0P1.fasta
	/tmp/tmpByiKte.fasta
	/tmp/tmpUuKiEk.fasta
	/tmp/tmphEP0eY.fasta
	/tmp/tmpYWhEEt.fasta
	/tmp/tmpx9rAE1.fasta
	/tmp/tmp2akLwZ.fasta
	/tmp/tmpA7TgXN.fasta
	/tmp/tmpF029_J.fasta
	/tmp/tmpRgodXx.fasta
	/tmp/tmpQlBGCT.fasta
	/tmp/tmpU1qP0T.fasta
	/tmp/tmpa4eZnQ.fasta
	/tmp/tmpqOznJS.fasta
	/tmp/tmpoGxP8K.fasta
	/tmp/tmpmFPyy7.fasta

Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/trematoda/validate_intervals/lastz/Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas_v_Fasciolopsis_buski_GCA_008360955.lastz
Creating Fasciolopsis_buski_GCA_008360955 table
Inserting data to Fasciolopsis_buski_GCA_008360955 table

Running against Opisthorchis_felineus_GCA_004794785.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 66 queries...
	/tmp/tmp9qtCIs.fasta
	/tmp/tmptJ683S.fasta
	/tmp/tmpgrQGc_.fasta
	/tmp/tmpajFItH.fasta
	/tmp/tmpKWMhrL.fasta
	/tmp/tmp5VvR57.fasta
	/tmp/tmpcxsbzj.fasta
	/tmp/tmpSSjxFt.fasta
	/tmp/tmpD8Q3Rs.fasta
	/tmp/tmpevU8bg.fasta
	/tmp/tmpnghmof.fasta
	/tmp/tmpbEqyXV.fasta
	/tmp/tmpQcmkw5.fasta
	/tmp/tmpRITyco.fasta
	/tmp/tmpDJ6how.fasta
	/tmp/tmpiFpa1v.fasta
	/tmp/tmpQgWaKj.fasta
	/tmp/tmpFjDEaS.fasta
	/tmp/tmpJ09B5K.fasta
	/tmp/tmpKG_QLO.fasta
	/tmp/tmps7yh82.fasta
	/tmp/tmpj7Nbge.fasta
	/tmp/tmpdlWnjK.fasta
	/t

	/tmp/tmpKWM4EM.fasta
	/tmp/tmp29XTlF.fasta
	/tmp/tmp89qasT.fasta
	/tmp/tmpuUxLzL.fasta
	/tmp/tmp45VrIg.fasta
	/tmp/tmp8zqKVh.fasta
	/tmp/tmpYFfGbR.fasta
	/tmp/tmpVghc5I.fasta
	/tmp/tmp5I72x8.fasta
	/tmp/tmpWIwRqu.fasta
	/tmp/tmpucNX5a.fasta
	/tmp/tmp5eOATR.fasta
	/tmp/tmpJBo2zG.fasta
	/tmp/tmpxTqgZN.fasta
	/tmp/tmpwMqbUT.fasta
	/tmp/tmpk2icEr.fasta
	/tmp/tmpwAW3wW.fasta
	/tmp/tmp2Gi_k6.fasta
	/tmp/tmphu1fx_.fasta
	/tmp/tmpj9aH14.fasta
	/tmp/tmpSy0OF9.fasta
	/tmp/tmp3BNdBi.fasta
	/tmp/tmp9YwPmT.fasta
	/tmp/tmpoG8TqP.fasta
	/tmp/tmpriugKG.fasta
	/tmp/tmplyJqHv.fasta
	/tmp/tmpJdpnVX.fasta
	/tmp/tmpMaEJcQ.fasta
	/tmp/tmp3VpdIY.fasta
	/tmp/tmp8rXwY0.fasta
	/tmp/tmpaq4Bt7.fasta
	/tmp/tmpQgU4WE.fasta
	/tmp/tmpJ1vUaz.fasta
	/tmp/tmpcol2ZG.fasta
	/tmp/tmpkSJGSS.fasta
	/tmp/tmpYSIUHY.fasta
	/tmp/tmpfnO36o.fasta
	/tmp/tmpIGz6MB.fasta

Writing the results file...
	/tmp/tmpNySt7P.lastz
	/tmp/tmp0Fh0b0.lastz
	/tmp/tmpukSfFk.lastz
	/tmp/tmpNrI9Z9.lastz
	/tmp/tmpt3Z7hz.lastz
	/tmp/tmpWfjCgK.lastz
	/t

	/tmp/tmpv0zHAK.fasta
	/tmp/tmpeJJyec.fasta
	/tmp/tmp0CO5c1.fasta
	/tmp/tmp6yjOQP.fasta
	/tmp/tmpQ1dX1u.fasta
	/tmp/tmp6s_Ei_.fasta
	/tmp/tmpTptiYr.fasta
	/tmp/tmpxX_jMl.fasta
	/tmp/tmpUGwnMI.fasta
	/tmp/tmpVUg5FY.fasta

Writing the results file...
	/tmp/tmpN7wLx6.lastz
	/tmp/tmpFYE5ID.lastz
	/tmp/tmp2mYVGE.lastz
	/tmp/tmpi7ES4p.lastz
	/tmp/tmpHxaOXS.lastz
	/tmp/tmp2CugYf.lastz
	/tmp/tmpPwgX1F.lastz
	/tmp/tmpOYDUea.lastz
	/tmp/tmpyR5Hf2.lastz
	/tmp/tmp6cq4cB.lastz
	/tmp/tmp8Ir_43.lastz
	/tmp/tmpyQ2FmE.lastz
	/tmp/tmpbGkthy.lastz
	/tmp/tmpWFXS6x.lastz
	/tmp/tmpCG4zCU.lastz
	/tmp/tmpaxQ9GY.lastz
	/tmp/tmp6jDvg5.lastz
	/tmp/tmpw2hi4Z.lastz
	/tmp/tmpzpQEXJ.lastz
	/tmp/tmp1nSo2b.lastz
	/tmp/tmpuip0_f.lastz
	/tmp/tmpGSnFPo.lastz
	/tmp/tmpaYg9Uj.lastz
	/tmp/tmptzUg7I.lastz
	/tmp/tmp4PUn9U.lastz
	/tmp/tmphQxRdO.lastz
	/tmp/tmpLpudh2.lastz
	/tmp/tmp466TzR.lastz
	/tmp/tmpUurls9.lastz
	/tmp/tmpQDu_H7.lastz
	/tmp/tmpOqrtSm.lastz
	/tmp/tmpjHIHD3.lastz
	/tmp/tmpRS3wRZ.lastz
	/tmp/tmpds9pFs.lastz
	/t

	/tmp/tmpIkrkyX.fasta
	/tmp/tmpdz7mM9.fasta
	/tmp/tmpKxdDe8.fasta
	/tmp/tmpumRl5X.fasta
	/tmp/tmpYCK7cz.fasta
	/tmp/tmpOA_BXF.fasta
	/tmp/tmpUCvkLm.fasta
	/tmp/tmpVHi0Xd.fasta
	/tmp/tmp51K4_B.fasta
	/tmp/tmpQkMY7P.fasta
	/tmp/tmpyPlxxZ.fasta
	/tmp/tmpxfTIJe.fasta
	/tmp/tmplHj17T.fasta
	/tmp/tmp7zNOTJ.fasta
	/tmp/tmp_AwAb2.fasta
	/tmp/tmpevh75M.fasta
	/tmp/tmp7N3TYT.fasta
	/tmp/tmppQh6c6.fasta
	/tmp/tmpsyUI_Z.fasta
	/tmp/tmpwQXQLo.fasta
	/tmp/tmpRmIDeZ.fasta
	/tmp/tmp3Z7lGy.fasta
	/tmp/tmp5r8ACQ.fasta
	/tmp/tmpdOLYd3.fasta
	/tmp/tmpPzXfY9.fasta
	/tmp/tmpQ4A5xP.fasta
	/tmp/tmpUWWU4S.fasta
	/tmp/tmpv7dpZx.fasta
	/tmp/tmpPeC1d1.fasta
	/tmp/tmpXK0ulu.fasta
	/tmp/tmpK1LIMZ.fasta
	/tmp/tmpSuyKjs.fasta
	/tmp/tmpu0uM57.fasta
	/tmp/tmpUy_8Ab.fasta
	/tmp/tmpU38ovy.fasta
	/tmp/tmpL9yqXR.fasta
	/tmp/tmpoOOXQg.fasta
	/tmp/tmpa_urvT.fasta
	/tmp/tmp63wYhK.fasta
	/tmp/tmpmatTGF.fasta
	/tmp/tmpO0k6g7.fasta
	/tmp/tmp3IEPTr.fasta
	/tmp/tmpCivJMB.fasta
	/tmp/tmpiOFp1W.fasta
	/tmp/tmpdSRnWW.fasta
	/tmp/tmp0

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [203]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/trematoda/extract_probes_from_group/trematoda_genome.conf --lastz results/phyluce/trematoda/validate_intervals/lastz --probes 120 --probe-prefix uce_trematoda_ --name-pattern "Schistosoma_mansoni_GCA_000237925_+6_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/trematoda/extract_probes_from_group/probe_fasta;
2020-02-10 15:57:46,449 - Phyluce - INFO - ------ Working on Clonorchis_sinensis_GCA_003604175 genome ------
2020-02-10 15:57:46,450 - Phyluce - INFO - Reading Clonorchis_sinensis_GCA_003604175 genome
2020-02-10 15:58:01,408 - Phyluce - INFO - Clonorchis_sinensis_GCA_003604175: 3454 uces, 572 dupes, 2882 non-dupes, 15 orient drop, 53 length drop, 2814 written
2020-02-10 15:58:01,408 - Phyluce - INFO - ---- Working on Dicrocoelium_dendriticum_GCA_000950715 genome ---
2020-02-10 15:58:01,409 - Phyluce - INFO - Reading Dicrocoelium_dendriticum_GCA_000950715 genome
2020-02-10 15:58:18,432 - Phyluce - INFO - Dic

In [204]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/trematoda/extract_probes_from_group/probe_fasta --output results/phyluce/trematoda/extract_probes_from_group/multifastas.sqlite --base-taxon Schistosoma_mansoni_GCA_000237925;
clonorchis_sinensis_gca_003604175...
dicrocoelium_dendriticum_gca_000950715.
echinostoma_caproni_gca_900618425...
fasciola_gigantica_gca_006461475...
fasciola_hepatica_gca_900302435...
fasciolopsis_buski_gca_008360955...
opisthorchis_felineus_gca_004794785...
opisthorchis_viverrini_gcf_000715545...
paragonimus_westermani_gca_008508345...
schistosoma_bovis_gca_003958945....
schistosoma_curassoni_gca_900618015.....
schistosoma_haematobium_gcf_000699445.....
schistosoma_japonicum_gca_006368765.....
schistosoma_margrebowiei_gca_900618395.....
schistosoma_mattheei_gca_900617995.....
trichobilharzia_regenti_gca_900618515.....
schistosoma_mansoni_gca_000237925.....
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/t

In [205]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(17)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/trematoda/extract_probes_from_group/multifastas.sqlite  --base-taxon Schistosoma_mansoni_GCA_000237925 --output results/phyluce/trematoda/extract_probes_from_group/Schistosoma_mansoni_GCA_000237925+6-back-to-17.conf --specific-counts 17;
Counter({'schistosoma_mansoni_gca_000237925': 146, 'trichobilharzia_regenti_gca_900618515': 146, 'schistosoma_mattheei_gca_900617995': 146, 'fasciola_hepatica_gca_900302435': 146, 'schistosoma_japonicum_gca_006368765': 146, 'fasciola_gigantica_gca_006461475': 146, 'schistosoma_margrebowiei_gca_900618395': 146, 'dicrocoelium_dendriticum_gca_000950715': 146, 'opisthorchis_viverrini_gcf_000715545': 146, 'fasciolopsis_buski_gca_008360955': 146, 'paragonimus_westermani_gca_008508345': 146, 'echinostoma_caproni_gca_900618425': 146, 'clonorchis_sinensis_gca_003604175': 146, 'schistosoma_bovis_gca_003958945': 146, 'schistosoma_curassoni_gca_900618015': 146, 'schistosoma_haematobium_gcf_000699445': 146,

## Final group specific bait design

In [206]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/trematoda/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/trematoda/extract_probes_from_group/Schistosoma_mansoni_GCA_000237925+6-back-to-17.conf --probe-prefix uce_trematoda_ --designer rnplattii --design trematoda_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGGGGGGGNNNNGGGGGGGGGGGNNGGGGGGGGGGGGGGNGNNGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGNNGGGGGGG


Conserved locus count = 146
Probe Count = 4821


In [207]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.fasta --query results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Mon Feb 10, 2020  16:04:48
Ended:  Mon Feb 10, 2020  16:04:56
Time for execution:  0.118803016345 minutes


In [208]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.fasta --lastz results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_trematoda_;
Parsing lastz file...
Screening results...
Screened 4820 fasta sequences.  Filtered 0 duplicates. Kept 4821.


## CDhit to reduce numbers

In [209]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.fasta
         -o
         results/phyluce/trematoda/final_probe_design/trematoda_v1-master_probe_list.95P_cdhit

Started: Mon Feb 10 16:07:30 2020
                            Output                              
----------------------------------------------------------------
total seq: 4821
longest and shortest : 80 and 80
Total letters: 385680
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3956764
Max number of word counting entries: 89066764

# comparing sequences from          0  to        803
---------- new table with      518 representatives
# comparing sequences from        803  

# Tremellales

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa names should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [210]:
# name of the group processed below; used to build every input/output path
group = "tremellales"

In [13]:
# Start from an empty results directory for this group: delete whatever a
# previous run left behind, then recreate the directory.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [211]:
# genome that every other taxon in the group is compared against
reference_taxon = "Cryptococcus_neoformans_GCF_000091045"

# all non-reference taxa in the group, named Genus_species_<assembly accession>
group_taxa = [
    "Bullera_alba_GCA_001600095",
    "Cryptococcus_amylolentus_GCF_001720205",
    "Cryptococcus_depauperatus_GCA_001720245",
    "Cryptococcus_floricola_GCA_006352305",
    "Cryptococcus_gattii_GCF_000185945",
    "Cryptococcus_sp_GCA_001600855",
    "Cryptococcus_wingfieldii_GCF_001720155",
    "Dioszegia_aurantiaca_GCA_001600655",
    "Dioszegia_crocea_GCA_001600615",
    "Kockovaella_imperatae_GCF_002102565",
    "Kwoniella_bestiolae_GCF_000512585",
    "Kwoniella_dejecticola_GCF_000512565",
    "Kwoniella_heveanensis_GCA_000507405",
    "Kwoniella_mangrovensis_GCF_000507465",
    "Kwoniella_pini_GCF_000512605",
    "Kwoniella_shandongensis_GCF_008629635",
    "Naematelia_encephala_GCA_002105065",
    "Papiliotrema_flavescens_GCA_000442785",
    "Papiliotrema_laurentii_GCA_000738825",
    "Phaeotremella_fagi_GCA_001599715",
    "Phaeotremella_skinneri_GCA_001599695",
    "Saitozyma_podzolica_GCA_003942215",
    "Tremella_fuciformis_GCA_000987905",
    "Tremella_mesenterica_GCF_000271645",
]

# every genome in the analysis, with the reference taxon listed last
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [15]:
# Download the genome assembly (fasta) for every accession listed for this group,
# rename each file to the standard Genus_species_<accession> form, and record the
# transfer details in data/genomes/<group>/ftp_info.csv.
# NOTE: the accession table is read from data/<group>_accessions.csv.

#start from an empty download directory for this group
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession (NCBI Entrez Direct: esearch | esummary | xtract)
    #NOTE: FtpPath_GenBank is requested for every accession, including GCF_ ones
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    #keep the first line of the captured output and drop the scheme so it can be
    #re-prefixed with rsync:// below
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to the genomic fasta and download it
    #the file name is the last component of the ftp path + "_genomic.fna.gz"
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_GCA_12345678 (the accession version suffix, e.g. ".1", is dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001600095.1_JCM_2954_assembly_v001_genomic.fna.gz

sent 42 bytes  received 6113863 bytes  4075936.67 bytes/sec
total size is 6112250  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result

GCA_002102565.1_Kocim1_genomic.fna.gz

sent 42 bytes  received 5472348 bytes  2188956.00 bytes/sec
total size is 5470911  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000512585.2_Cryp_best_CBS10118_V1_genomic.fna.gz

sent 42 bytes  received 7754417 bytes  3101783.60 bytes/sec
total size is 7752405  speedup is 1.00


You are accessing a U.S. Government information system which includes this
c

GCA_001599695.1_JCM_9039_assembly_v001_genomic.fna.gz

sent 42 bytes  received 6495883 bytes  4330616.67 bytes/sec
total size is 6494174  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003942215.1_ASM394221v1_genomic.fna.gz

sent 42 bytes  received 9470394 bytes  3788174.40 bytes/sec
total size is 9467975  speedup is 1.00


You are accessing a U.S. Government information system which includes 

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [19]:
# Write a cleaned copy of every genome (fasta headers reduced to the record id)
# and convert that cleaned copy to 2bit.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

#start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # Blank the name/description so only the record id is written to each header.
    # SeqIO opens the input itself, which avoids the "rU" file mode (removed in
    # Python 3.11).
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, "fasta"):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # Build the 2bit from the cleaned fasta (the same file the later steps read)
    # rather than from the raw download, and stop if faToTwoBit fails instead of
    # silently carrying on without a 2bit file.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [20]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486124 ("sim_Bullera_alba_GCA_001600095") has been submitted
Your job 5486125 ("sim_Cryptococcus_amylolentus_GCF_001720205") has been submitted
Your job 5486126 ("sim_Cryptococcus_depauperatus_GCA_001720245") has been submitted
Your job 5486127 ("sim_Cryptococcus_floricola_GCA_006352305") has been submitted
Your job 5486128 ("sim_Cryptococcus_gattii_GCF_000185945") has been submitted
Your job 5486129 ("sim_Cryptococcus_sp_GCA_001600855") has been submitted
Your job 5486130 ("sim_Cryptococcus_wingfieldii_GCF_001720155") has been submitted
Your job 5486131 ("sim_Dioszegia_aurantiaca_GCA_001600655") has been submitted
Your job 5486132 ("sim_Dioszegia_crocea_GCA_001600615") has been submitted
Your job 5486133 ("sim_Kockovaella_imperatae_GCF_002102565") has been submitted
Your job 5486134 ("sim_Kwoniella_bestiolae_GCF_000512585") has been submitted
Your job 5486135 ("sim_Kwoniella_dejecticola_GCF_000512565") has been submitted
Your job 5486136 ("sim_Kwoniella_heveanensis_GCA_00050

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [21]:
# Build the bbmap index of the reference genome inside a fresh
# map_simulated_reads directory.
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference fasta to index, and the directory bbmap is pointed at via path=
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
#left as the last expression so the exit status is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/tremellales/cleaned_genomes/Cryptococcus_neoformans_GCF_000091045_formatted.fas path=results/phyluce/tremellales/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated reads.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [22]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486148 ("map_Bullera_alba_GCA_001600095") has been submitted
Your job 5486149 ("map_Cryptococcus_amylolentus_GCF_001720205") has been submitted
Your job 5486150 ("map_Cryptococcus_depauperatus_GCA_001720245") has been submitted
Your job 5486151 ("map_Cryptococcus_floricola_GCA_006352305") has been submitted
Your job 5486152 ("map_Cryptococcus_gattii_GCF_000185945") has been submitted
Your job 5486153 ("map_Cryptococcus_sp_GCA_001600855") has been submitted
Your job 5486154 ("map_Cryptococcus_wingfieldii_GCF_001720155") has been submitted
Your job 5486155 ("map_Dioszegia_aurantiaca_GCA_001600655") has been submitted
Your job 5486156 ("map_Dioszegia_crocea_GCA_001600615") has been submitted
Your job 5486157 ("map_Kockovaella_imperatae_GCF_002102565") has been submitted
Your job 5486158 ("map_Kwoniella_bestiolae_GCF_000512585") has been submitted
Your job 5486159 ("map_Kwoniella_dejecticola_GCF_000512565") has been submitted
Your job 5486160 ("map_Kwoniella_heveanensis_GCA_00050

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [23]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486172 ("merge_Bullera_alba_GCA_001600095") has been submitted
Your job 5486173 ("merge_Cryptococcus_amylolentus_GCF_001720205") has been submitted
Your job 5486174 ("merge_Cryptococcus_depauperatus_GCA_001720245") has been submitted
Your job 5486175 ("merge_Cryptococcus_floricola_GCA_006352305") has been submitted
Your job 5486176 ("merge_Cryptococcus_gattii_GCF_000185945") has been submitted
Your job 5486177 ("merge_Cryptococcus_sp_GCA_001600855") has been submitted
Your job 5486178 ("merge_Cryptococcus_wingfieldii_GCF_001720155") has been submitted
Your job 5486179 ("merge_Dioszegia_aurantiaca_GCA_001600655") has been submitted
Your job 5486180 ("merge_Dioszegia_crocea_GCA_001600615") has been submitted
Your job 5486181 ("merge_Kockovaella_imperatae_GCF_002102565") has been submitted
Your job 5486182 ("merge_Kwoniella_bestiolae_GCF_000512585") has been submitted
Your job 5486183 ("merge_Kwoniella_dejecticola_GCF_000512565") has been submitted
Your job 5486184 ("merge_Kwoni

Remove intervals that are too short (< 160 bp) or that overlap masked sequence (> 25% masked bases) in the reference genome.

In [213]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 176 sequences from Bullera_alba_GCA_001600095_merged.bed.  Filtered 129 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 47.
Screened 940 sequences from Cryptococcus_amylolentus_GCF_001720205_merged.bed.  Filtered 679 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 261.
Screened 290 sequences from Cryptococcus_depauperatus_GCA_001720245_merged.bed.  Filtered 236 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 54.
Screened 938 sequences from Cryptococcus_floricola_GCA_006352305_merged.bed.  Filtered 666 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 272.
Screened 33813 sequences from Cryptococcus_gattii_GCF_000185945_merged.bed.  Filtered 15232 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 18581.
Screened 225 sequences from Cryptococcus_sp_GCA_001600855_merged.bed.  Filtered 173 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 52.
Screened 953 sequences from Cryptococcus_wingfieldii_GCF_0

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [214]:
# Write the phyluce bed conf file: a "[beds]" header followed by one
# "taxon:path/to/taxon_stripped.bed" line per group taxon.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#collect the header line and then one entry per taxon
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [215]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/tremellales/initial_intervals/Tremella_mesenterica_GCF_000271645_merged.bed --twobit results/phyluce/tremellales/cleaned_genomes/Cryptococcus_neoformans_GCF_000091045_formatted.2bit --output results/phyluce/tremellales/initial_intervals/Tremella_mesenterica_GCF_000271645_stripped.bed;
bullera_alba_gca_001600095.
cryptococcus_amylolentus_gcf_001720205.
cryptococcus_depauperatus_gca_001720245.
cryptococcus_floricola_gca_006352305.
cryptococcus_gattii_gcf_000185945...................
cryptococcus_sp_gca_001600855.
cryptococcus_wingfieldii_gcf_001720155.
dioszegia_aurantiaca_gca_001600655.
dioszegia_crocea_gca_001600615.
kockovaella_imperatae_gcf_002102565.
kwoniella_bestiolae_gcf_000512585.
kwoniella_dejecticola_gcf_000512565.
kwoniella_heveanensis_gca_000507405.
kwoniella_mangrovensis_gcf_000507465.
kwoniella_pini_gcf_000512605.
kwoniella_shandongensis_gcf_008629635.
naematelia_encephala_gc

Quantify probes and the number of targeted taxa for each.

In [216]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/tremellales/initial_intervals/tremellales-to-Cryptococcus_neoformans_GCF_000091045.sqlite --base-taxon Cryptococcus_neoformans_GCF_000091045
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 0 taxa:	18,750.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 1 taxa:	18,750.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 2 taxa:	398.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 3 taxa:	306.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 4 taxa:	254.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 5 taxa:	169.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 6 taxa:	126.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 7 taxa:	98.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 8 taxa:	76.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 9 taxa:	62.0
Loci shared by Cryptococcus_neoformans_GCF_000091045 + 10 taxa:	49.0
Loci shared by Cryptococcus_neoformans

In [217]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 2
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/tremellales/initial_intervals/tremellales-to-Cryptococcus_neoformans_GCF_000091045.sqlite --base-taxon Cryptococcus_neoformans_GCF_000091045 --output results/phyluce/tremellales/initial_intervals/Cryptococcus_neoformans_GCF_000091045_+2.bed --specific-counts 2;
Counter({'cryptococcus_gattii_gcf_000185945': 374, 'cryptococcus_floricola_gca_006352305': 280, 'cryptococcus_wingfieldii_gcf_001720155': 269, 'cryptococcus_amylolentus_gcf_001720205': 261, 'kwoniella_shandongensis_gcf_008629635': 148, 'kwoniella_heveanensis_gca_000507405': 123, 'papiliotrema_laurentii_gca_000738825': 72, 'cryptococcus_depauperatus_gca_001720245': 55, 'saitozyma_podzolica_gca_003942215': 55, 'cryptococcus_sp_gca_001600855': 54, 'papiliotrema_flavescens_gca_000442785': 53, 'bullera_alba_gca_001600095': 47, 'tremella_fuciformis_gca_000987905': 45, 'dioszegia_aurantiaca_gca_001600655': 37, 'dioszegia_crocea_gca_001600615': 35, 'naematelia_encephala_gca_0021

## Design temp set of baits

In [218]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/tremellales/initial_intervals/Cryptococcus_neoformans_GCF_000091045_+2.bed --twobit results/phyluce/tremellales/cleaned_genomes/Cryptococcus_neoformans_GCF_000091045_formatted.2bit --buffer-to 160 --output results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2.fasta;
Screened 398 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 398.


design the baits

In [219]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2.fasta --probe-prefix uce_tremellales_ --design tremellales_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):



Conserved locus count = 398
Probe Count = 796


## Find duplicate baited regions

In [220]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas --query results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes_vself.lastz;
Started:  Mon Feb 10, 2020  16:10:30
Ended:  Mon Feb 10, 2020  16:10:30
Time for execution:  0.0063698331515 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas                        --lastz results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes_vself.lastz                       --probe-prefix=uce_tremellales_;
Parsing lastz file...
Screening results...
Screened 795 fasta sequences.  Filtered 15 dupli

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [221]:
# Lay the genomes out as cleaned_genomes/<taxon>/<taxon>.2bit, the per-taxon
# layout the following phyluce steps are pointed at. Each formatted 2bit file
# is copied (not linked) into a freshly created directory.
genome_dir = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_dir, sample)
    dst_dir  = "{}{}/".format(genome_dir, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    #drop any directory left by a previous run before recreating it
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [222]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/tremellales/validate_intervals/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas --scaffoldlist Bullera_alba_GCA_001600095 Cryptococcus_amylolentus_GCF_001720205 Cryptococcus_depauperatus_GCA_001720245 Cryptococcus_floricola_GCA_006352305 Cryptococcus_gattii_GCF_000185945 Cryptococcus_sp_GCA_001600855 Cryptococcus_wingfieldii_GCF_001720155 Dioszegia_aurantiaca_GCA_001600655 Dioszegia_crocea_GCA_001600615 Kockovaella_imperatae_GCF_002102565 Kwoniella_bestiolae_GCF_000512585 Kwoniella_dejecticola_GCF_000512565 Kwoniella_heveanensis_GCA_000507405 Kwoniella_mangrovensis_GCF_000507465 Kwoniella_pini_GCF_000512605 Kwoniella_shandongensis_GCF_008629635 Naematelia_encephala_GCA_002105065 Papiliotrema_flavescens_GCA_000442785 Papiliotrema_laurentii_GCA_000738825 Phaeotremella_fagi_GCA_001599715 Phaeotremella_skinneri_GCA_001599695 Saitozyma_podzolica_GCA_003942215 Tremella_fuciformis_GCA_000987905 Tremella_mesenterica_G

Running the targets against 3 queries...
	/tmp/tmp8R_9i9.fasta
	/tmp/tmpASRt7w.fasta
	/tmp/tmpsuo8SU.fasta

Writing the results file...
	/tmp/tmpa9oErW.lastz
	/tmp/tmpIFZ91q.lastz
	/tmp/tmpl5qyDa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/tremellales/validate_intervals/lastz/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas_v_Kwoniella_dejecticola_GCF_000512565.lastz
Creating Kwoniella_dejecticola_GCF_000512565 table
Inserting data to Kwoniella_dejecticola_GCF_000512565 table

Running against Kwoniella_heveanensis_GCA_000507405.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpqbW5kd.fasta
	/tmp/tmpef2tTu.fasta
	/tmp/tmppguC2H.fasta

Writing the results file...
	/tmp/tmp8I6GpV.lastz
	/tmp/tmp9iyK06.lastz
	/tmp/tmpFPQGDu.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/tremellales/validate_intervals/lastz/Crypto

Creating Tremella_mesenterica_GCF_000271645 table
Inserting data to Tremella_mesenterica_GCF_000271645 table

Running against Cryptococcus_neoformans_GCF_000091045.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 2 queries...
	/tmp/tmpFNEfOI.fasta
	/tmp/tmpuHIYpE.fasta

Writing the results file...
	/tmp/tmp5lrPZE.lastz
	/tmp/tmpGhl1WS.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/tremellales/validate_intervals/lastz/Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas_v_Cryptococcus_neoformans_GCF_000091045.lastz
Creating Cryptococcus_neoformans_GCF_000091045 table
Inserting data to Cryptococcus_neoformans_GCF_000091045 table


## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [223]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/tremellales/extract_probes_from_group/tremellales_genome.conf --lastz results/phyluce/tremellales/validate_intervals/lastz --probes 120 --probe-prefix uce_tremellales_ --name-pattern "Cryptococcus_neoformans_GCF_000091045_+2_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/tremellales/extract_probes_from_group/probe_fasta;
2020-02-10 16:12:14,887 - Phyluce - INFO - ---------- Working on Bullera_alba_GCA_001600095 genome ---------
2020-02-10 16:12:14,888 - Phyluce - INFO - Reading Bullera_alba_GCA_001600095 genome
2020-02-10 16:12:16,228 - Phyluce - INFO - Bullera_alba_GCA_001600095: 362 uces, 38 dupes, 324 non-dupes, 6 orient drop, 2 length drop, 316 written
2020-02-10 16:12:16,228 - Phyluce - INFO - ---- Working on Cryptococcus_amylolentus_GCF_001720205 genome ---
2020-02-10 16:12:16,235 - Phyluce - INFO - Reading Cryptococcus_amylolentus_GCF_001720205 genome
2020-02-10 16:12:17,768 - Phyluce - INFO - Cryptococ

2020-02-10 16:12:46,113 - Phyluce - INFO - Saitozyma_podzolica_GCA_003942215: 366 uces, 53 dupes, 313 non-dupes, 0 orient drop, 2 length drop, 311 written
2020-02-10 16:12:46,114 - Phyluce - INFO - ------ Working on Tremella_fuciformis_GCA_000987905 genome ------
2020-02-10 16:12:46,115 - Phyluce - INFO - Reading Tremella_fuciformis_GCA_000987905 genome
2020-02-10 16:12:47,561 - Phyluce - INFO - Tremella_fuciformis_GCA_000987905: 369 uces, 57 dupes, 312 non-dupes, 0 orient drop, 0 length drop, 312 written
2020-02-10 16:12:47,561 - Phyluce - INFO - ------ Working on Tremella_mesenterica_GCF_000271645 genome -----
2020-02-10 16:12:47,571 - Phyluce - INFO - Reading Tremella_mesenterica_GCF_000271645 genome
2020-02-10 16:12:48,943 - Phyluce - INFO - Tremella_mesenterica_GCF_000271645: 368 uces, 43 dupes, 325 non-dupes, 2 orient drop, 11 length drop, 312 written
2020-02-10 16:12:48,943 - Phyluce - INFO - ---- Working on Cryptococcus_neoformans_GCF_000091045 genome ----
2020-02-10 16:12:48,9

In [224]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/tremellales/extract_probes_from_group/probe_fasta --output results/phyluce/tremellales/extract_probes_from_group/multifastas.sqlite --base-taxon Cryptococcus_neoformans_GCF_000091045;
bullera_alba_gca_001600095.
cryptococcus_amylolentus_gcf_001720205.
cryptococcus_depauperatus_gca_001720245.
cryptococcus_floricola_gca_006352305.
cryptococcus_gattii_gcf_000185945.
cryptococcus_sp_gca_001600855.
cryptococcus_wingfieldii_gcf_001720155.
dioszegia_aurantiaca_gca_001600655.
dioszegia_crocea_gca_001600615.
kockovaella_imperatae_gcf_002102565.
kwoniella_bestiolae_gcf_000512585.
kwoniella_dejecticola_gcf_000512565.
kwoniella_heveanensis_gca_000507405.
kwoniella_mangrovensis_gcf_000507465.
kwoniella_pini_gcf_000512605.
kwoniella_shandongensis_gcf_008629635.
naematelia_encephala_gca_002105065.
papiliotrema_flavescens_gca_000442785.
papiliotrema_laurentii_gca_000738825.
phaeotremella_fagi_gca_001599715.
phaeotremella_skinneri_gca_0015996

In [225]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(25)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/tremellales/extract_probes_from_group/multifastas.sqlite  --base-taxon Cryptococcus_neoformans_GCF_000091045 --output results/phyluce/tremellales/extract_probes_from_group/Cryptococcus_neoformans_GCF_000091045+2-back-to-25.conf --specific-counts 25;
Counter({'kwoniella_dejecticola_gcf_000512565': 159, 'cryptococcus_wingfieldii_gcf_001720155': 159, 'dioszegia_crocea_gca_001600615': 159, 'dioszegia_aurantiaca_gca_001600655': 159, 'cryptococcus_neoformans_gcf_000091045': 159, 'papiliotrema_flavescens_gca_000442785': 159, 'kwoniella_shandongensis_gcf_008629635': 159, 'kockovaella_imperatae_gcf_002102565': 159, 'phaeotremella_skinneri_gca_001599695': 159, 'kwoniella_bestiolae_gcf_000512585': 159, 'naematelia_encephala_gca_002105065': 159, 'cryptococcus_floricola_gca_006352305': 159, 'kwoniella_mangrovensis_gcf_000507465': 159, 'papiliotrema_laurentii_gca_000738825': 159, 'cryptococcus_amylolentus_gcf_001720205': 159, 'phaeotremella_

## Final group specific bait design

In [226]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/tremellales/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/tremellales/extract_probes_from_group/Cryptococcus_neoformans_GCF_000091045+2-back-to-25.conf --probe-prefix uce_tremellales_ --designer rnplattii --design tremellales_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 159
Probe Count = 7923


In [227]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.fasta --query results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Mon Feb 10, 2020  16:13:35
Ended:  Mon Feb 10, 2020  16:13:54
Time for execution:  0.322025386492 minutes


In [228]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.fasta --lastz results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_tremellales_;
Parsing lastz file...
Screening results...
Screened 7922 fasta sequences.  Filtered 0 duplicates. Kept 7923.


## CDhit to reduce numbers

In [229]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.fasta
         -o
         results/phyluce/tremellales/final_probe_design/tremellales_v1-master_probe_list.95P_cdhit

Started: Mon Feb 10 16:21:01 2020
                            Output                              
----------------------------------------------------------------
total seq: 7923
longest and shortest : 80 and 80
Total letters: 633840
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3951519
Max number of word counting entries: 88948698

# comparing sequences from          0  to       1320
.---------- new table with     1158 representatives
# comparing sequences from    

# Trypanosoma (Supplement)

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [37]:
# name of the group being processed; used to build the data/ and results/ paths
group = "trypanosoma"

In [297]:
# Start from an empty results directory for this group: remove whatever a
# previous run left behind, then recreate the directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [38]:
# Taxa are named Genus_species_<assembly accession without version>.

# the genome every other taxon is compared against
reference_taxon = "Trypanosoma_cruzi_GCF_000209065"

# every other taxon in the group
group_taxa = [
    "Trypanosoma_brucei_GCF_000210295",
    "Trypanosoma_congolense_GCA_002287245",
    "Trypanosoma_conorhini_GCF_003719485",
    "Trypanosoma_equiperdum_GCA_001457755",
    "Trypanosoma_grayi_GCF_000691245",
    "Trypanosoma_rangeli_GCF_003719475",
    "Trypanosoma_theileri_GCF_002087225",
    "Trypanosoma_vivax_GCA_000227375",
]

# group taxa followed by the reference, as a separate list
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [299]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000210295.1_ASM21029v1_genomic.fna.gz

sent 42 bytes  received 6763209 bytes  2705300.40 bytes/sec
total size is 6761448  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipli

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [300]:
# Rewrite every downloaded genome with its name/description fields cleared and
# build a 2bit copy of the rewritten genome. Both outputs are written to
# results/phyluce/<group>/cleaned_genomes/ as <sample>_formatted.{fas,2bit}.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # plain "r": the old "U" flag is deprecated and rejected by Python >= 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, "fasta"):
            # clear name and description before writing the record back out
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # Build the 2bit from the cleaned fasta so the two "_formatted" files are
    # derived from the same sequence records. Passing an argument list avoids
    # splitting paths on spaces, and check_call raises if faToTwoBit fails
    # instead of silently continuing without a usable 2bit file.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [301]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487732 ("sim_Trypanosoma_brucei_GCF_000210295") has been submitted
Your job 5487733 ("sim_Trypanosoma_congolense_GCA_002287245") has been submitted
Your job 5487734 ("sim_Trypanosoma_conorhini_GCF_003719485") has been submitted
Your job 5487735 ("sim_Trypanosoma_equiperdum_GCA_001457755") has been submitted
Your job 5487736 ("sim_Trypanosoma_grayi_GCF_000691245") has been submitted
Your job 5487737 ("sim_Trypanosoma_rangeli_GCF_003719475") has been submitted
Your job 5487738 ("sim_Trypanosoma_theileri_GCF_002087225") has been submitted
Your job 5487739 ("sim_Trypanosoma_vivax_GCA_000227375") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [302]:
# Build the bbmap index of the reference genome inside a fresh mapping directory.
o_dir        = "results/phyluce/" + group + "/map_simulated_reads/"
i_ref_genome = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.fas"

# start from an empty mapping directory (with a logs/ sub-directory)
if os.path.exists("results/phyluce/" + group + "/map_simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/map_simulated_reads")

os.makedirs(o_dir + "logs")

#index command: reference fasta in, index written under o_dir
bbmap_cmd = " ".join(["bbmap.sh",
                      "ref=" + i_ref_genome,
                      "path=" + o_dir])

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/trypanosoma/cleaned_genomes/Trypanosoma_cruzi_GCF_000209065_formatted.fas path=results/phyluce/trypanosoma/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [303]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487740 ("map_Trypanosoma_brucei_GCF_000210295") has been submitted
Your job 5487741 ("map_Trypanosoma_congolense_GCA_002287245") has been submitted
Your job 5487742 ("map_Trypanosoma_conorhini_GCF_003719485") has been submitted
Your job 5487743 ("map_Trypanosoma_equiperdum_GCA_001457755") has been submitted
Your job 5487744 ("map_Trypanosoma_grayi_GCF_000691245") has been submitted
Your job 5487745 ("map_Trypanosoma_rangeli_GCF_003719475") has been submitted
Your job 5487746 ("map_Trypanosoma_theileri_GCF_002087225") has been submitted
Your job 5487747 ("map_Trypanosoma_vivax_GCA_000227375") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [304]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487748 ("merge_Trypanosoma_brucei_GCF_000210295") has been submitted
Your job 5487749 ("merge_Trypanosoma_congolense_GCA_002287245") has been submitted
Your job 5487750 ("merge_Trypanosoma_conorhini_GCF_003719485") has been submitted
Your job 5487751 ("merge_Trypanosoma_equiperdum_GCA_001457755") has been submitted
Your job 5487752 ("merge_Trypanosoma_grayi_GCF_000691245") has been submitted
Your job 5487753 ("merge_Trypanosoma_rangeli_GCF_003719475") has been submitted
Your job 5487754 ("merge_Trypanosoma_theileri_GCF_002087225") has been submitted
Your job 5487755 ("merge_Trypanosoma_vivax_GCA_000227375") has been submitted


remove loci that were masked in the original genome

In [39]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 248 sequences from Trypanosoma_brucei_GCF_000210295_merged.bed.  Filtered 205 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 43.
Screened 394 sequences from Trypanosoma_congolense_GCA_002287245_merged.bed.  Filtered 312 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 82.
Screened 10198 sequences from Trypanosoma_conorhini_GCF_003719485_merged.bed.  Filtered 7875 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2323.
Screened 172 sequences from Trypanosoma_equiperdum_GCA_001457755_merged.bed.  Filtered 146 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 26.
Screened 1157 sequences from Trypanosoma_grayi_GCF_000691245_merged.bed.  Filtered 904 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 253.
Screened 9640 sequences from Trypanosoma_rangeli_GCF_003719475_merged.bed.  Filtered 7490 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2150.
Screened 762 sequences from Trypanosoma_theileri_GCF_0

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [40]:
# Write the conf file that lists the stripped bed file of every taxon.
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

#header line first, then one "<sample>:<stripped bed>" line per taxon
conf_lines = ["[beds]\n"]
conf_lines.extend("{}:{}{}_stripped.bed\n".format(taxon, o_dir, taxon)
                  for taxon in group_taxa)

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [41]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/trypanosoma/initial_intervals/Trypanosoma_vivax_GCA_000227375_merged.bed --twobit results/phyluce/trypanosoma/cleaned_genomes/Trypanosoma_cruzi_GCF_000209065_formatted.2bit --output results/phyluce/trypanosoma/initial_intervals/Trypanosoma_vivax_GCA_000227375_stripped.bed;
trypanosoma_brucei_gcf_000210295.
trypanosoma_congolense_gca_002287245.
trypanosoma_conorhini_gcf_003719485...
trypanosoma_equiperdum_gca_001457755.
trypanosoma_grayi_gcf_000691245.
trypanosoma_rangeli_gcf_003719475...
trypanosoma_theileri_gcf_002087225.
trypanosoma_vivax_gca_000227375.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [42]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/trypanosoma/initial_intervals/trypanosoma-to-Trypanosoma_cruzi_GCF_000209065.sqlite --base-taxon Trypanosoma_cruzi_GCF_000209065
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 0 taxa:	3,458.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 1 taxa:	3,458.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 2 taxa:	1,182.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 3 taxa:	224.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 4 taxa:	107.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 5 taxa:	56.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 6 taxa:	40.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 7 taxa:	21.0
Loci shared by Trypanosoma_cruzi_GCF_000209065 + 8 taxa:	5.0


In [43]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 1
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/trypanosoma/initial_intervals/trypanosoma-to-Trypanosoma_cruzi_GCF_000209065.sqlite --base-taxon Trypanosoma_cruzi_GCF_000209065 --output results/phyluce/trypanosoma/initial_intervals/Trypanosoma_cruzi_GCF_000209065_+1.bed --specific-counts 1;
Counter({'trypanosoma_conorhini_gcf_003719485': 2325, 'trypanosoma_rangeli_gcf_003719475': 2154, 'trypanosoma_grayi_gcf_000691245': 265, 'trypanosoma_theileri_gcf_002087225': 164, 'trypanosoma_congolense_gca_002287245': 83, 'trypanosoma_brucei_gcf_000210295': 42, 'trypanosoma_vivax_gca_000227375': 34, 'trypanosoma_equiperdum_gca_001457755': 26})


## Design temp set of baits

In [44]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/trypanosoma/initial_intervals/Trypanosoma_cruzi_GCF_000209065_+1.bed --twobit results/phyluce/trypanosoma/cleaned_genomes/Trypanosoma_cruzi_GCF_000209065_formatted.2bit --buffer-to 160 --output results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1.fasta;
Screened 3458 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 3458.


design the baits

In [45]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1.fasta --probe-prefix uce_trypanosoma_ --design trypanosoma_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 3453
Probe Count = 6887


## Find duplicate baited regions

In [46]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas --query results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  13:29:45
Ended:  Tue Feb 11, 2020  13:29:49
Time for execution:  0.0633439660072 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas                        --lastz results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes_vself.lastz                       --probe-prefix=uce_trypanosoma_;
Parsing lastz file...
Screening results...
Screened 6886 fasta sequences.  Filtered 1053 duplicates. Kept 4793.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [47]:
# Build the per-genome layout used in the following steps:
#   cleaned_genomes/<sample>/<sample>.2bit
# NOTE: the 2bit files are copied (shutil.copy), not soft-linked.
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:
    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    # always start from an empty per-sample directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group

In [48]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/trypanosoma/validate_intervals/Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas --scaffoldlist Trypanosoma_brucei_GCF_000210295 Trypanosoma_congolense_GCA_002287245 Trypanosoma_conorhini_GCF_003719485 Trypanosoma_equiperdum_GCA_001457755 Trypanosoma_grayi_GCF_000691245 Trypanosoma_rangeli_GCF_003719475 Trypanosoma_theileri_GCF_002087225 Trypanosoma_vivax_GCA_000227375 Trypanosoma_cruzi_GCF_000209065 --genome-base-path results/phyluce/trypanosoma/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/trypanosoma/validate_intervals/trypanosoma-to-Trypanosoma_cruzi_GCF_000209065.sqlite --output results/phyluce/trypanosoma/validate_intervals/lastz/;

Running against Trypanosoma_brucei_GCF_000210295.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 3 queries...
	/tmp/tmpzGse5k.fasta
	/tmp/tmpBOm4AV.fasta
	/tmp/tmpg6KY1j.fasta

Writing the results file...
	/tmp/tmpR

## Extract sequences from group genomes

Make a conf file that maps each genome name to its 2bit file, in the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [49]:
# Start from a clean output directory for this step.
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

# Write the conf file: a "[scaffolds]" header followed by one
# "<sample>:<path to 2bit>" line per taxon.
# (The path is computed inside the loop only; the previous stray assignment
# before the loop relied on a `sample` value leaked from an earlier cell and
# failed with NameError on a fresh kernel.)
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print the name -> 2bit path mapping for each genome in the group
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/trypanosoma/extract_probes_from_group/trypanosoma_genome.conf --lastz results/phyluce/trypanosoma/validate_intervals/lastz --probes 120 --probe-prefix uce_trypanosoma_ --name-pattern "Trypanosoma_cruzi_GCF_000209065_+1_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/trypanosoma/extract_probes_from_group/probe_fasta;
2020-02-11 13:32:33,674 - Phyluce - INFO - ------- Working on Trypanosoma_brucei_GCF_000210295 genome ------
2020-02-11 13:32:33,693 - Phyluce - INFO - Reading Trypanosoma_brucei_GCF_000210295 genome
2020-02-11 13:32:49,814 - Phyluce - INFO - Trypanosoma_brucei_GCF_000210295: 3251 uces, 408 dupes, 2843 non-dupes, 21 orient drop, 309 length drop, 2513 written
2020-02-11 13:32:49,815 - Phyluce - INFO - ----- Working on Trypanosoma_congolense_GCA_002287245 genome ----
2020-02-11 13:32:49,838 - Phyluce - INFO - Reading Trypanosoma_congolense_GCA_002287245 genome
2020-02-11 13:33:16,747 - Phyluce - INFO 

In [50]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/trypanosoma/extract_probes_from_group/probe_fasta --output results/phyluce/trypanosoma/extract_probes_from_group/multifastas.sqlite --base-taxon Trypanosoma_cruzi_GCF_000209065;
trypanosoma_brucei_gcf_000210295...
trypanosoma_congolense_gca_002287245...
trypanosoma_conorhini_gcf_003719485...
trypanosoma_equiperdum_gca_001457755...
trypanosoma_grayi_gcf_000691245...
trypanosoma_rangeli_gcf_003719475...
trypanosoma_theileri_gcf_002087225...
trypanosoma_vivax_gca_000227375.
trypanosoma_cruzi_gcf_000209065.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/trypanosoma/extract_probes_from_group/multifastas.sqlite --base-taxon Trypanosoma_cruzi_GCF_000209065;
Loci shared by 0 taxa:	3,251.0
Loci shared by 1 taxa:	3,251.0
Loci shared by 2 taxa:	3,115.0
Loci shared by 3 taxa:	2,970.0
Loci shared by 4 taxa:	2,815.0
Loci shared by 5 taxa:	2,594.0
Loci shared by 6 taxa:	2,370.0
Loci shared by 

In [51]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(8)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/trypanosoma/extract_probes_from_group/multifastas.sqlite  --base-taxon Trypanosoma_cruzi_GCF_000209065 --output results/phyluce/trypanosoma/extract_probes_from_group/Trypanosoma_cruzi_GCF_000209065+1-back-to-8.conf --specific-counts 8;
Counter({'trypanosoma_grayi_gcf_000691245': 389, 'trypanosoma_congolense_gca_002287245': 389, 'trypanosoma_rangeli_gcf_003719475': 389, 'trypanosoma_equiperdum_gca_001457755': 389, 'trypanosoma_conorhini_gcf_003719485': 389, 'trypanosoma_theileri_gcf_002087225': 388, 'trypanosoma_brucei_gcf_000210295': 388, 'trypanosoma_cruzi_gcf_000209065': 321, 'trypanosoma_vivax_gca_000227375': 77})
Total loci = 389


## Final group specific bait design

In [52]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/trypanosoma/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/trypanosoma/extract_probes_from_group/Trypanosoma_cruzi_GCF_000209065+1-back-to-8.conf --probe-prefix uce_trypanosoma_ --designer rnplattii --design trypanosoma_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGG


Conserved locus count = 389
Probe Count = 6188


In [53]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.fasta --query results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  13:35:56
Ended:  Tue Feb 11, 2020  13:36:01
Time for execution:  0.0883343497912 minutes


In [54]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.fasta --lastz results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_trypanosoma_;
Parsing lastz file...
Screening results...
Screened 6187 fasta sequences.  Filtered 12 duplicates. Kept 6000.


## CD-HIT clustering to reduce the number of probes

In [55]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.fasta
         -o
         results/phyluce/trypanosoma/final_probe_design/trypanosoma_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 13:37:46 2020
                            Output                              
----------------------------------------------------------------
total seq: 6188
longest and shortest : 80 and 80
Total letters: 495040
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3954451
Max number of word counting entries: 89014700

# comparing sequences from          0  to       1031
.---------- new table with      912 representatives
# comparing sequences from    