## Prep python

In [1]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

os.chdir("/master/nplatt/pathogen_probes/")


def wait_on_running_jobs():
    """Block until `qstat` no longer lists any jobs.

    Runs `qstat` through the shell, and while it still reports jobs prints a
    "." and sleeps for a minute before polling again.  Returns None as soon as
    the queue listing is empty.

    Raises:
        subprocess.CalledProcessError: if `qstat` exits with a non-zero status.
    """
    poll_interval = 60  # seconds between two qstat polls

    while True:
        # universal_newlines=True makes check_output return text (not bytes) on
        # Python 3, so the output can be split into lines on any Python version.
        qstat_output = subprocess.check_output('qstat', shell=True, universal_newlines=True)

        # Two lines are discounted as in the original count (presumably the
        # qstat table header); an empty listing must not yield a negative count.
        num_jobs = max(len(qstat_output.splitlines()) - 2, 0)
        if num_jobs == 0:
            break

        # Only wait when there is actually something left to wait for.
        print(".")
        time.sleep(poll_interval)

# Mycobacterium

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [141]:
# Name of the group processed in this section; it is spliced into the data/ and results/ paths used by the cells below.
group = "mycobacterium"

In [142]:
# Start from an empty results directory for this group: remove whatever a
# previous run left behind, then recreate the directory.
phyluce_group_dir = "results/phyluce/{}".format(group)

if os.path.exists(phyluce_group_dir):
    shutil.rmtree(phyluce_group_dir)

os.makedirs(phyluce_group_dir)

In [143]:
# Genome every other taxon is compared against.
reference_taxon = "Mycobacterium_tuberculosis_GCF_000195955"

# Non-reference taxa, named Genus_species_<assembly accession without version>.
group_taxa = [
    "Mycobacterium_ahvazicum_GCF_900176255",
    "Mycobacterium_alsense_GCF_002086635",
    "Mycobacterium_angelicum_GCF_002086155",
    "Mycobacterium_aquaticum_GCF_002086485",
    "Mycobacterium_arosiense_GCF_002086125",
    "Mycobacterium_asiaticum_GCF_000613245",
    "Mycobacterium_attenuatum_GCF_900566085",
    "Mycobacterium_avium_GCF_000007865",
    "Mycobacterium_basiliense_GCF_900292015",
    "Mycobacterium_bohemicum_GCF_001053185",
    "Mycobacterium_bouchedurhonense_GCF_002086165",
    "Mycobacterium_branderi_GCF_002086575",
    "Mycobacterium_canettii_GCF_000253375",
    "Mycobacterium_celatum_GCF_002101595",
    "Mycobacterium_chimaera_GCF_002219285",
    "Mycobacterium_colombiense_GCF_002105755",
    "Mycobacterium_conspcuum_GCF_002102095",
    "Mycobacterium_decipiens_GCF_002104675",
    "Mycobacterium_dioxanotrophicus_GCF_002157835",
    "Mycobacterium_eburneum_GCF_004354905",
    "Mycobacterium_europaeum_GCF_001373515",
    "Mycobacterium_florentinum_GCF_002101635",
    "Mycobacterium_fragae_GCF_002102185",
    "Mycobacterium_gastri_GCF_002102175",
    "Mycobacterium_genavense_GCF_000526915",
    "Mycobacterium_gordonae_GCF_002101675",
    "Mycobacterium_grossiae_GCF_008329645",
    "Mycobacterium_haemophilum_GCF_000340435",
    "Mycobacterium_heckeshornense_GCF_001881585",
    "Mycobacterium_heidelbergense_GCF_002086215",
    "Mycobacterium_innocens_GCF_900566055",
    "Mycobacterium_interjectum_GCF_900078675",
    "Mycobacterium_intermedium_GCF_002086275",
    "Mycobacterium_intracellulare_GCF_000277125",
    "Mycobacterium_kansasii_GCF_000157895",
    "Mycobacterium_kubicae_GCF_002101745",
    "Mycobacterium_kyorinense_GCF_000759695",
    "Mycobacterium_lacus_GCF_002102215",
    "Mycobacterium_lehmannii_GCF_002245535",
    "Mycobacterium_lentiflavum_GCF_001373395",
    "Mycobacterium_leprae_GCF_000195855",
    "Mycobacterium_lepraemurium_GCF_002291465",
    "Mycobacterium_lepromatosis_GCF_000966355",
    "Mycobacterium_liflandii_GCF_000026445",
    "Mycobacterium_malmoense_GCF_002086305",
    "Mycobacterium_mantenii_GCF_002086335",
    "Mycobacterium_marinum_GCF_000723425",
    "Mycobacterium_marseillense_GCF_002285715",
    "Mycobacterium_montefiorense_GCF_003112775",
    "Mycobacterium_mungi_GCF_001652545",
    "Mycobacterium_nebraskense_GCF_002102255",
    "Mycobacterium_neumannii_GCF_002245615",
    "Mycobacterium_noviomagense_GCF_002086415",
    "Mycobacterium_orygis_GCF_006385035",
    "Mycobacterium_palustre_GCF_002101785",
    "Mycobacterium_paraense_GCF_002101815",
    "Mycobacterium_paraffinicum_GCF_001907675",
    "Mycobacterium_paragordonae_GCF_003614435",
    "Mycobacterium_paraintracellulare_GCF_000276825",
    "Mycobacterium_parascrofulaceum_GCF_000164135",
    "Mycobacterium_paraseoulense_GCF_002086475",
    "Mycobacterium_parmense_GCF_002102335",
    "Mycobacterium_persicum_GCF_002086675",
    "Mycobacterium_pseudokansasii_GCF_900566075",
    "Mycobacterium_pseudoshottsii_GCF_003584745",
    "Mycobacterium_riyadhense_GCF_002101845",
    "Mycobacterium_saskatchewanense_GCF_002101875",
    "Mycobacterium_scrofulaceum_GCF_002086735",
    "Mycobacterium_sherrisii_GCF_002102355",
    "Mycobacterium_shigaense_GCF_002356315",
    "Mycobacterium_shimoidei_GCF_002101905",
    "Mycobacterium_shinjukuense_GCF_002086755",
    "Mycobacterium_simiae_GCF_002093075",
    "Mycobacterium_sp_GCF_000328565",
    "Mycobacterium_syngnathidarum_GCF_001942625",
    "Mycobacterium_szulgai_GCF_002116635",
    "Mycobacterium_talmoniae_GCF_002967005",
    "Mycobacterium_tilburgii_GCF_902168065",
    "Mycobacterium_timonense_GCF_002086775",
    "Mycobacterium_triplex_GCF_000689255",
    "Mycobacterium_uberis_GCF_003408705",
    "Mycobacterium_ulcerans_GCF_000013925",
    "Mycobacterium_vulneris_GCF_002104765",
]

# Every taxon, with the reference genome appended last.
all_taxa = group_taxa + [reference_taxon]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [144]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900176255.2_PRJEB20293_genomic.fna.gz

sent 42 bytes  received 1770222 bytes  1180176.00 bytes/sec
total size is 1769677  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
discipli

GCA_002086165.1_ASM208616v1_genomic.fna.gz

sent 42 bytes  received 1697891 bytes  1131955.33 bytes/sec
total size is 1697369  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002086575.1_ASM208657v1_genomic.fna.gz

sent 42 bytes  received 1709243 bytes  1139523.33 bytes/sec
total size is 1708713  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_002101635.1_ASM210163v1_genomic.fna.gz

sent 42 bytes  received 1788132 bytes  1192116.00 bytes/sec
total size is 1787586  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002102185.1_ASM210218v1_genomic.fna.gz

sent 42 bytes  received 1372234 bytes  914850.67 bytes/sec
total size is 1371792  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_002086275.1_ASM208627v1_genomic.fna.gz

sent 42 bytes  received 2000766 bytes  1333872.00 bytes/sec
total size is 2000164  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000277125.1_ASM27712v1_genomic.fna.gz

sent 42 bytes  received 1545607 bytes  1030432.67 bytes/sec
total size is 1545118  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_000026445.2_ASM2644v2_genomic.fna.gz

sent 42 bytes  received 1811799 bytes  1207894.00 bytes/sec
total size is 1811247  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002086305.1_ASM208630v1_genomic.fna.gz

sent 42 bytes  received 1537825 bytes  1025244.67 bytes/sec
total size is 1537343  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_002101785.1_ASM210178v1_genomic.fna.gz

sent 42 bytes  received 1738558 bytes  1159066.67 bytes/sec
total size is 1738020  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002101815.1_ASM210181v1_genomic.fna.gz

sent 42 bytes  received 1604399 bytes  641776.40 bytes/sec
total size is 1603901  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_002101845.1_ASM210184v1_genomic.fna.gz

sent 42 bytes  received 1836158 bytes  1224133.33 bytes/sec
total size is 1835596  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002101875.1_ASM210187v1_genomic.fna.gz

sent 42 bytes  received 1711662 bytes  1141136.00 bytes/sec
total size is 1711132  speedup is 1.00


You are accessing a U.S. Government information system which includes this
comput

GCA_002967005.1_ASM296700v1_genomic.fna.gz

sent 42 bytes  received 1481308 bytes  987566.67 bytes/sec
total size is 1480834  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_902168065.1_PRJEB33515_genomic.fna.gz

sent 42 bytes  received 945889 bytes  630620.67 bytes/sec
total size is 945552  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, n

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [145]:
# Rebuild the cleaned-genome directory from scratch so stale files from an
# earlier run cannot leak into this one.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Plain "r": the legacy "rU" mode is deprecated and rejected by Python 3.11+.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            # Blank the name/description so that only the record id is written to the header.
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Convert the cleaned FASTA (the original cell passed the raw download here),
    # so the .2bit is built from the same file the rest of the pipeline reads.
    # Arguments are passed as a list so paths are never split on spaces, and
    # check_call raises if faToTwoBit fails instead of silently continuing.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` below) because, with 100 bp reads, it should provide roughly 10 bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text.

In [146]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480936 ("sim_Mycobacterium_ahvazicum_GCF_900176255") has been submitted
Your job 5480937 ("sim_Mycobacterium_alsense_GCF_002086635") has been submitted
Your job 5480938 ("sim_Mycobacterium_angelicum_GCF_002086155") has been submitted
Your job 5480939 ("sim_Mycobacterium_aquaticum_GCF_002086485") has been submitted
Your job 5480940 ("sim_Mycobacterium_arosiense_GCF_002086125") has been submitted
Your job 5480941 ("sim_Mycobacterium_asiaticum_GCF_000613245") has been submitted
Your job 5480942 ("sim_Mycobacterium_attenuatum_GCF_900566085") has been submitted
Your job 5480943 ("sim_Mycobacterium_avium_GCF_000007865") has been submitted
Your job 5480944 ("sim_Mycobacterium_basiliense_GCF_900292015") has been submitted
Your job 5480945 ("sim_Mycobacterium_bohemicum_GCF_001053185") has been submitted
Your job 5480946 ("sim_Mycobacterium_bouchedurhonense_GCF_002086165") has been submitted
Your job 5480947 ("sim_Mycobacterium_branderi_GCF_002086575") has been submitted
Your job 54809

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [147]:
# Fresh output directory (with a logs/ sub-directory) for the read mapping step.
map_reads_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_reads_dir):
    shutil.rmtree(map_reads_dir)

os.makedirs(map_reads_dir + "/logs")

#generate a bbmap index of the reference genome
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_reads_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

# Echo the command, then run it locally; the exit status is the cell's output.
print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/mycobacterium/cleaned_genomes/Mycobacterium_tuberculosis_GCF_000195955_formatted.fas path=results/phyluce/mycobacterium/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and each simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [148]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481019 ("map_Mycobacterium_ahvazicum_GCF_900176255") has been submitted
Your job 5481020 ("map_Mycobacterium_alsense_GCF_002086635") has been submitted
Your job 5481021 ("map_Mycobacterium_angelicum_GCF_002086155") has been submitted
Your job 5481022 ("map_Mycobacterium_aquaticum_GCF_002086485") has been submitted
Your job 5481023 ("map_Mycobacterium_arosiense_GCF_002086125") has been submitted
Your job 5481024 ("map_Mycobacterium_asiaticum_GCF_000613245") has been submitted
Your job 5481025 ("map_Mycobacterium_attenuatum_GCF_900566085") has been submitted
Your job 5481026 ("map_Mycobacterium_avium_GCF_000007865") has been submitted
Your job 5481027 ("map_Mycobacterium_basiliense_GCF_900292015") has been submitted
Your job 5481028 ("map_Mycobacterium_bohemicum_GCF_001053185") has been submitted
Your job 5481029 ("map_Mycobacterium_bouchedurhonense_GCF_002086165") has been submitted
Your job 5481030 ("map_Mycobacterium_branderi_GCF_002086575") has been submitted
Your job 54810

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [149]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481102 ("merge_Mycobacterium_ahvazicum_GCF_900176255") has been submitted
Your job 5481103 ("merge_Mycobacterium_alsense_GCF_002086635") has been submitted
Your job 5481104 ("merge_Mycobacterium_angelicum_GCF_002086155") has been submitted
Your job 5481105 ("merge_Mycobacterium_aquaticum_GCF_002086485") has been submitted
Your job 5481106 ("merge_Mycobacterium_arosiense_GCF_002086125") has been submitted
Your job 5481107 ("merge_Mycobacterium_asiaticum_GCF_000613245") has been submitted
Your job 5481108 ("merge_Mycobacterium_attenuatum_GCF_900566085") has been submitted
Your job 5481109 ("merge_Mycobacterium_avium_GCF_000007865") has been submitted
Your job 5481110 ("merge_Mycobacterium_basiliense_GCF_900292015") has been submitted
Your job 5481111 ("merge_Mycobacterium_bohemicum_GCF_001053185") has been submitted
Your job 5481112 ("merge_Mycobacterium_bouchedurhonense_GCF_002086165") has been submitted
Your job 5481113 ("merge_Mycobacterium_branderi_GCF_002086575") has been 

In [150]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 2386 sequences from Mycobacterium_ahvazicum_GCF_900176255_merged.bed.  Filtered 1589 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 797.
Screened 3113 sequences from Mycobacterium_alsense_GCF_002086635_merged.bed.  Filtered 2002 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1111.
Screened 3568 sequences from Mycobacterium_angelicum_GCF_002086155_merged.bed.  Filtered 2226 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1342.
Screened 850 sequences from Mycobacterium_aquaticum_GCF_002086485_merged.bed.  Filtered 623 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 227.
Screened 2374 sequences from Mycobacterium_arosiense_GCF_002086125_merged.bed.  Filtered 1603 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 771.
Screened 2180 sequences from Mycobacterium_asiaticum_GCF_000613245_merged.bed.  Filtered 1524 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 656.
Screened 3164 sequences from M

Screened 767 sequences from Mycobacterium_neumannii_GCF_002245615_merged.bed.  Filtered 566 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 201.
Screened 1687 sequences from Mycobacterium_noviomagense_GCF_002086415_merged.bed.  Filtered 1184 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 503.
Screened 215 sequences from Mycobacterium_orygis_GCF_006385035_merged.bed.  Filtered 17 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 198.
Screened 3052 sequences from Mycobacterium_palustre_GCF_002101785_merged.bed.  Filtered 1958 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1094.
Screened 3154 sequences from Mycobacterium_paraense_GCF_002101815_merged.bed.  Filtered 1972 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1182.
Screened 2543 sequences from Mycobacterium_paraffinicum_GCF_001907675_merged.bed.  Filtered 1659 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 884.
Screened 2423 sequences from M

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that lists where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [151]:
# Write the phyluce "[beds]" conf file: one "<taxon>:<stripped bed path>" line per taxon.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    # section header goes first
    outfile.write("[beds]\n")

    # then one entry for every taxon in the group
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [152]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/mycobacterium/initial_intervals/Mycobacterium_vulneris_GCF_002104765_merged.bed --twobit results/phyluce/mycobacterium/cleaned_genomes/Mycobacterium_tuberculosis_GCF_000195955_formatted.2bit --output results/phyluce/mycobacterium/initial_intervals/Mycobacterium_vulneris_GCF_002104765_stripped.bed;
mycobacterium_ahvazicum_gcf_900176255.
mycobacterium_alsense_gcf_002086635..
mycobacterium_angelicum_gcf_002086155..
mycobacterium_aquaticum_gcf_002086485.
mycobacterium_arosiense_gcf_002086125.
mycobacterium_asiaticum_gcf_000613245.
mycobacterium_attenuatum_gcf_900566085..
mycobacterium_avium_gcf_000007865.
mycobacterium_basiliense_gcf_900292015.
mycobacterium_bohemicum_gcf_001053185.
mycobacterium_bouchedurhonense_gcf_002086165..
mycobacterium_branderi_gcf_002086575.
mycobacterium_canettii_gcf_000253375.
mycobacterium_celatum_gcf_002101595.
mycobacterium_chimaera_gcf_002219285.
mycobacterium_c

Quantify probes and the number of targeted taxa for each.

In [153]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/mycobacterium/initial_intervals/Mycobacterium_vulneris_GCF_002104765_merged.bed --twobit results/phyluce/mycobacterium/cleaned_genomes/Mycobacterium_tuberculosis_GCF_000195955_formatted.2bit --output results/phyluce/mycobacterium/initial_intervals/Mycobacterium_vulneris_GCF_002104765_stripped.bed;
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 0 taxa:	6,832.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 1 taxa:	6,832.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 2 taxa:	6,370.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 3 taxa:	6,243.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 4 taxa:	4,460.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 5 taxa:	3,759.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 6 taxa:	3,326.0
Loci shared by Mycobacterium_tuberculosis_GCF_000195955 + 7 taxa:	3,066.0
Loci share

In [154]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 70
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/mycobacterium/initial_intervals/mycobacterium-to-Mycobacterium_tuberculosis_GCF_000195955.sqlite --base-taxon Mycobacterium_tuberculosis_GCF_000195955 --output results/phyluce/mycobacterium/initial_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70.bed --specific-counts 70;
Counter({'mycobacterium_vulneris_gcf_002104765': 182, 'mycobacterium_paraffinicum_gcf_001907675': 182, 'mycobacterium_colombiense_gcf_002105755': 182, 'mycobacterium_avium_gcf_000007865': 182, 'mycobacterium_orygis_gcf_006385035': 182, 'mycobacterium_decipiens_gcf_002104675': 182, 'mycobacterium_intracellulare_gcf_000277125': 182, 'mycobacterium_lacus_gcf_002102215': 182, 'mycobacterium_malmoense_gcf_002086305': 182, 'mycobacterium_mungi_gcf_001652545': 182, 'mycobacterium_canettii_gcf_000253375': 182, 'mycobacterium_paraense_gcf_002101815': 182, 'mycobacterium_pseudokansasii_gcf_900566075': 181, 'mycobacterium_palustre_gcf_002101785': 181, 'mycobacteriu

## Design temp set of baits

In [155]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/mycobacterium/initial_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70.bed --twobit results/phyluce/mycobacterium/cleaned_genomes/Mycobacterium_tuberculosis_GCF_000195955_formatted.2bit --buffer-to 160 --output results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70.fasta;
Screened 182 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 182.


Design the baits.

In [156]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70.fasta --probe-prefix uce_mycobacterium_ --design mycobacterium_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 172
Probe Count = 333


## Find duplicate baited regions

In [157]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas --query results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  14:20:49
Ended:  Thu Feb 06, 2020  14:20:49
Time for execution:  0.00333928267161 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas                        --lastz results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes_vself.lastz                       --probe-prefix=uce_mycobacterium_;
Parsing lastz file...
Screening results...
Screened 332 fa

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [158]:
# Give every taxon its own sub-directory holding "<taxon>.2bit".
# The formatted 2bit file is copied (not linked) into that directory.
for sample in all_taxa:

    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)

    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [159]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/mycobacterium/validate_intervals/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas --scaffoldlist Mycobacterium_ahvazicum_GCF_900176255 Mycobacterium_alsense_GCF_002086635 Mycobacterium_angelicum_GCF_002086155 Mycobacterium_aquaticum_GCF_002086485 Mycobacterium_arosiense_GCF_002086125 Mycobacterium_asiaticum_GCF_000613245 Mycobacterium_attenuatum_GCF_900566085 Mycobacterium_avium_GCF_000007865 Mycobacterium_basiliense_GCF_900292015 Mycobacterium_bohemicum_GCF_001053185 Mycobacterium_bouchedurhonense_GCF_002086165 Mycobacterium_branderi_GCF_002086575 Mycobacterium_canettii_GCF_000253375 Mycobacterium_celatum_GCF_002101595 Mycobacterium_chimaera_GCF_002219285 Mycobacterium_colombiense_GCF_002105755 Mycobacterium_conspcuum_GCF_002102095 Mycobacterium_decipiens_GCF_002104675 Mycobacterium_dioxanotrophicus_GCF_002157835 Mycobacterium_eburneum_GCF_004354905 Mycobacterium_europaeum_GCF_001373515 Mycobacterium_flor


Running against Mycobacterium_basiliense_GCF_900292015.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpbAscj9.fasta

Writing the results file...
	/tmp/tmp2QMCbd.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_basiliense_GCF_900292015.lastz
Creating Mycobacterium_basiliense_GCF_900292015 table
Inserting data to Mycobacterium_basiliense_GCF_900292015 table

Running against Mycobacterium_bohemicum_GCF_001053185.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp7YGlnG.fasta

Writing the results file...
	/tmp/tmpwGiqYL.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000

Inserting data to Mycobacterium_florentinum_GCF_002101635 table

Running against Mycobacterium_fragae_GCF_002102185.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpXpkzrq.fasta

Writing the results file...
	/tmp/tmpTziuKf.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_fragae_GCF_002102185.lastz
Creating Mycobacterium_fragae_GCF_002102185 table
Inserting data to Mycobacterium_fragae_GCF_002102185 table

Running against Mycobacterium_gastri_GCF_002102175.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp3pe4kc.fasta

Writing the results file...
	/tmp/tmpZSuRd3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_inter

Inserting data to Mycobacterium_kubicae_GCF_002101745 table

Running against Mycobacterium_kyorinense_GCF_000759695.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpxFyehe.fasta

Writing the results file...
	/tmp/tmppJ6dJN.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_kyorinense_GCF_000759695.lastz
Creating Mycobacterium_kyorinense_GCF_000759695 table
Inserting data to Mycobacterium_kyorinense_GCF_000759695 table

Running against Mycobacterium_lacus_GCF_002102215.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp8CTHoA.fasta

Writing the results file...
	/tmp/tmpoGzyfl.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/val

Inserting data to Mycobacterium_mungi_GCF_001652545 table

Running against Mycobacterium_nebraskense_GCF_002102255.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp7nh_1z.fasta

Writing the results file...
	/tmp/tmpOycUsL.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_nebraskense_GCF_002102255.lastz
Creating Mycobacterium_nebraskense_GCF_002102255 table
Inserting data to Mycobacterium_nebraskense_GCF_002102255 table

Running against Mycobacterium_neumannii_GCF_002245615.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpLpPRuf.fasta

Writing the results file...
	/tmp/tmpNAynNn.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacteri

Inserting data to Mycobacterium_pseudokansasii_GCF_900566075 table

Running against Mycobacterium_pseudoshottsii_GCF_003584745.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvGkav_.fasta

Writing the results file...
	/tmp/tmp9PPyK9.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_pseudoshottsii_GCF_003584745.lastz
Creating Mycobacterium_pseudoshottsii_GCF_003584745 table
Inserting data to Mycobacterium_pseudoshottsii_GCF_003584745 table

Running against Mycobacterium_riyadhense_GCF_002101845.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpI3WgTu.fasta

Writing the results file...
	/tmp/tmpC0aCnW.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/resul

Creating Mycobacterium_tilburgii_GCF_902168065 table
Inserting data to Mycobacterium_tilburgii_GCF_902168065 table

Running against Mycobacterium_timonense_GCF_002086775.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpDDz2uc.fasta

Writing the results file...
	/tmp/tmpvKiEqx.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/mycobacterium/validate_intervals/lastz/Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_Mycobacterium_timonense_GCF_002086775.lastz
Creating Mycobacterium_timonense_GCF_002086775 table
Inserting data to Mycobacterium_timonense_GCF_002086775 table

Running against Mycobacterium_triplex_GCF_000689255.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp3oGfQR.fasta

Writing the results file...
	/tmp/tmpNNxzCS.lastz
Cleaning up the chunked files...
Cleaning /master/npl

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [161]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/mycobacterium/extract_probes_from_group/mycobacterium_genome.conf --lastz results/phyluce/mycobacterium/validate_intervals/lastz --probes 120 --probe-prefix uce_mycobacterium_ --name-pattern "Mycobacterium_tuberculosis_GCF_000195955_+70_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/mycobacterium/extract_probes_from_group/probe_fasta;
2020-02-06 14:25:42,923 - Phyluce - INFO - ---- Working on Mycobacterium_ahvazicum_GCF_900176255 genome ----
2020-02-06 14:25:42,931 - Phyluce - INFO - Reading Mycobacterium_ahvazicum_GCF_900176255 genome
2020-02-06 14:25:43,677 - Phyluce - INFO - Mycobacterium_ahvazicum_GCF_900176255: 172 uces, 18 dupes, 154 non-dupes, 4 orient drop, 16 length drop, 134 written
2020-02-06 14:25:43,678 - Phyluce - INFO - ----- Working on Mycobacterium_alsense_GCF_002086635 genome -----
2020-02-06 14:25:43,686 - Phyluce - INFO - Reading Mycobacterium_alsense_GCF_002086635 genome
2020-02-06 14:25:4

2020-02-06 14:25:58,009 - Phyluce - INFO - Mycobacterium_florentinum_GCF_002101635: 172 uces, 26 dupes, 146 non-dupes, 2 orient drop, 5 length drop, 139 written
2020-02-06 14:25:58,009 - Phyluce - INFO - ------ Working on Mycobacterium_fragae_GCF_002102185 genome -----
2020-02-06 14:25:58,023 - Phyluce - INFO - Reading Mycobacterium_fragae_GCF_002102185 genome
2020-02-06 14:25:58,684 - Phyluce - INFO - Mycobacterium_fragae_GCF_002102185: 172 uces, 25 dupes, 147 non-dupes, 0 orient drop, 4 length drop, 143 written
2020-02-06 14:25:58,684 - Phyluce - INFO - ------ Working on Mycobacterium_gastri_GCF_002102175 genome -----
2020-02-06 14:25:58,685 - Phyluce - INFO - Reading Mycobacterium_gastri_GCF_002102175 genome
2020-02-06 14:25:59,341 - Phyluce - INFO - Mycobacterium_gastri_GCF_002102175: 172 uces, 26 dupes, 146 non-dupes, 0 orient drop, 4 length drop, 142 written
2020-02-06 14:25:59,342 - Phyluce - INFO - ---- Working on Mycobacterium_genavense_GCF_000526915 genome ----
2020-02-06 14:

2020-02-06 14:26:13,206 - Phyluce - INFO - Mycobacterium_malmoense_GCF_002086305: 172 uces, 29 dupes, 143 non-dupes, 0 orient drop, 7 length drop, 136 written
2020-02-06 14:26:13,206 - Phyluce - INFO - ----- Working on Mycobacterium_mantenii_GCF_002086335 genome ----
2020-02-06 14:26:13,207 - Phyluce - INFO - Reading Mycobacterium_mantenii_GCF_002086335 genome
2020-02-06 14:26:13,867 - Phyluce - INFO - Mycobacterium_mantenii_GCF_002086335: 172 uces, 27 dupes, 145 non-dupes, 0 orient drop, 4 length drop, 141 written
2020-02-06 14:26:13,868 - Phyluce - INFO - ----- Working on Mycobacterium_marinum_GCF_000723425 genome -----
2020-02-06 14:26:13,884 - Phyluce - INFO - Reading Mycobacterium_marinum_GCF_000723425 genome
2020-02-06 14:26:14,559 - Phyluce - INFO - Mycobacterium_marinum_GCF_000723425: 172 uces, 0 dupes, 172 non-dupes, 14 orient drop, 20 length drop, 138 written
2020-02-06 14:26:14,559 - Phyluce - INFO - --- Working on Mycobacterium_marseillense_GCF_002285715 genome --
2020-02-0

2020-02-06 14:26:28,727 - Phyluce - INFO - Mycobacterium_scrofulaceum_GCF_002086735: 172 uces, 26 dupes, 146 non-dupes, 0 orient drop, 4 length drop, 142 written
2020-02-06 14:26:28,727 - Phyluce - INFO - ---- Working on Mycobacterium_sherrisii_GCF_002102355 genome ----
2020-02-06 14:26:28,736 - Phyluce - INFO - Reading Mycobacterium_sherrisii_GCF_002102355 genome
2020-02-06 14:26:29,409 - Phyluce - INFO - Mycobacterium_sherrisii_GCF_002102355: 172 uces, 26 dupes, 146 non-dupes, 1 orient drop, 4 length drop, 141 written
2020-02-06 14:26:29,409 - Phyluce - INFO - ---- Working on Mycobacterium_shigaense_GCF_002356315 genome ----
2020-02-06 14:26:29,430 - Phyluce - INFO - Reading Mycobacterium_shigaense_GCF_002356315 genome
2020-02-06 14:26:30,095 - Phyluce - INFO - Mycobacterium_shigaense_GCF_002356315: 172 uces, 0 dupes, 172 non-dupes, 16 orient drop, 16 length drop, 140 written
2020-02-06 14:26:30,096 - Phyluce - INFO - ---- Working on Mycobacterium_shimoidei_GCF_002101905 genome ----


In [162]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/mycobacterium/extract_probes_from_group/probe_fasta --output results/phyluce/mycobacterium/extract_probes_from_group/multifastas.sqlite --base-taxon Mycobacterium_tuberculosis_GCF_000195955;
mycobacterium_ahvazicum_gcf_900176255.
mycobacterium_alsense_gcf_002086635.
mycobacterium_angelicum_gcf_002086155.
mycobacterium_aquaticum_gcf_002086485.
mycobacterium_arosiense_gcf_002086125.
mycobacterium_asiaticum_gcf_000613245.
mycobacterium_attenuatum_gcf_900566085.
mycobacterium_avium_gcf_000007865.
mycobacterium_basiliense_gcf_900292015.
mycobacterium_bohemicum_gcf_001053185.
mycobacterium_bouchedurhonense_gcf_002086165.
mycobacterium_branderi_gcf_002086575.
mycobacterium_canettii_gcf_000253375.
mycobacterium_celatum_gcf_002101595.
mycobacterium_chimaera_gcf_002219285.
mycobacterium_colombiense_gcf_002105755.
mycobacterium_conspcuum_gcf_002102095.
mycobacterium_decipiens_gcf_002104675.
mycobacterium_dioxanotrophicus_gcf_002157835.


In [163]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(80)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/mycobacterium/extract_probes_from_group/multifastas.sqlite  --base-taxon Mycobacterium_tuberculosis_GCF_000195955 --output results/phyluce/mycobacterium/extract_probes_from_group/Mycobacterium_tuberculosis_GCF_000195955+70-back-to-80.conf --specific-counts 80;
Counter({'mycobacterium_basiliense_gcf_900292015': 104, 'mycobacterium_attenuatum_gcf_900566085': 104, 'mycobacterium_shinjukuense_gcf_002086755': 104, 'mycobacterium_innocens_gcf_900566055': 104, 'mycobacterium_europaeum_gcf_001373515': 104, 'mycobacterium_kubicae_gcf_002101745': 104, 'mycobacterium_kansasii_gcf_000157895': 104, 'mycobacterium_orygis_gcf_006385035': 104, 'mycobacterium_decipiens_gcf_002104675': 104, 'mycobacterium_genavense_gcf_000526915': 104, 'mycobacterium_celatum_gcf_002101595': 104, 'mycobacterium_montefiorense_gcf_003112775': 104, 'mycobacterium_shimoidei_gcf_002101905': 104, 'mycobacterium_noviomagense_gcf_002086415': 104, 'mycobacterium_tuberculo

## Final group specific bait design

In [164]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/mycobacterium/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/mycobacterium/extract_probes_from_group/Mycobacterium_tuberculosis_GCF_000195955+70-back-to-80.conf --probe-prefix uce_mycobacterium_ --designer rnplattii --design mycobacterium_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [165]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.fasta --query results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  14:29:39
Ended:  Thu Feb 06, 2020  14:31:44
Time for execution:  2.08234694799 minutes


In [166]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.fasta --lastz results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_mycobacterium_;
Parsing lastz file...
Screening results...
Screened 15606 fasta sequences.  Filtered 2 duplicates. Kept 15291.


## CDhit to reduce numbers

In [167]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.fasta
         -o
         results/phyluce/mycobacterium/final_probe_design/mycobacterium_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 15:25:41 2020
                            Output                              
----------------------------------------------------------------
total seq: 15607
longest and shortest : 80 and 80
Total letters: 1248560
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 48M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3938410
Max number of word counting entries: 88653624

# comparing sequences from          0  to       2601
..---------- new table with     1427 representatives
# comparing sequenc

# Pasteurella

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa names should include genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (albeit genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [2]:
#name of the group processed by the cells below; it is used to build the data/ and results/ paths
group = 'pasteurella'

In [3]:
#start the group from an empty results directory (anything from an earlier run is deleted)
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [4]:
#non-reference taxa for this group; the cells below loop over this list to
# simulate reads, map them, and build the per-taxon interval files
group_taxa = [ "Pasteurella_bettyae_GCF_000262245",
               "Pasteurella_caecimuris_GCF_004793515",
               "Pasteurella_canis_GCF_900454865",
               "Pasteurella_dagmatis_GCF_900186835",
               "Pasteurella_langaaensis_GCF_003096995",
               "Pasteurella_oralis_GCF_002850605",
               "Pasteurella_skyensis_GCF_900109845",
               "Pasteurella_sp_GCF_004570945",
               "Pasteurella_testudinis_GCF_900454705" ]
                    
#taxon whose genome is indexed and used as the mapping target / --base-taxon below
reference_taxon =  "Pasteurella_multocida_GCF_000754275"

#group taxa plus the reference; used when cleaning/formatting the genomes
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [7]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000262245.1_ASM26224v1_genomic.fna.gz

sent 42 bytes  received 666562 bytes  444402.67 bytes/sec
total size is 666289  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinar

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [8]:
#rebuild the cleaned_genomes directory from scratch
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    #rewrite every record with its name and description blanked so that only
    # the record id is left in the header. The file name is passed straight to
    # SeqIO.parse so Biopython opens it itself (the "rU" open mode is no longer
    # accepted by newer Python versions).
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    #convert the *formatted* fasta (not the raw download) so the 2bit matches its
    # "_formatted" name; check_call raises if faToTwoBit fails instead of the
    # return code being silently dropped
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage (`--fcov 10` in the command below) of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. Higher coverage (e.g. 25x, which should provide 4bp resolution on conserved orthologous regions) is affordable because my genomes are relatively small, so I can trade compute time for resolution. Note that the command below is currently set to 10x.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [9]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480620 ("sim_Pasteurella_bettyae_GCF_000262245") has been submitted
Your job 5480621 ("sim_Pasteurella_caecimuris_GCF_004793515") has been submitted
Your job 5480622 ("sim_Pasteurella_canis_GCF_900454865") has been submitted
Your job 5480623 ("sim_Pasteurella_dagmatis_GCF_900186835") has been submitted
Your job 5480624 ("sim_Pasteurella_langaaensis_GCF_003096995") has been submitted
Your job 5480625 ("sim_Pasteurella_oralis_GCF_002850605") has been submitted
Your job 5480626 ("sim_Pasteurella_skyensis_GCF_900109845") has been submitted
Your job 5480627 ("sim_Pasteurella_sp_GCF_004570945") has been submitted
Your job 5480628 ("sim_Pasteurella_testudinis_GCF_900454705") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [10]:
#rebuild the map_simulated_reads directory (with its logs/ sub-directory)
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#run bbmap.sh with ref= (formatted reference genome) and path= (this directory)
i_ref_genome = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.fas"
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/pasteurella/cleaned_genomes/Pasteurella_multocida_GCF_000754275_formatted.fas path=results/phyluce/pasteurella/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90` in the command below) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [11]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480629 ("map_Pasteurella_bettyae_GCF_000262245") has been submitted
Your job 5480630 ("map_Pasteurella_caecimuris_GCF_004793515") has been submitted
Your job 5480631 ("map_Pasteurella_canis_GCF_900454865") has been submitted
Your job 5480632 ("map_Pasteurella_dagmatis_GCF_900186835") has been submitted
Your job 5480633 ("map_Pasteurella_langaaensis_GCF_003096995") has been submitted
Your job 5480634 ("map_Pasteurella_oralis_GCF_002850605") has been submitted
Your job 5480635 ("map_Pasteurella_skyensis_GCF_900109845") has been submitted
Your job 5480636 ("map_Pasteurella_sp_GCF_004570945") has been submitted
Your job 5480637 ("map_Pasteurella_testudinis_GCF_900454705") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [12]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480638 ("merge_Pasteurella_bettyae_GCF_000262245") has been submitted
Your job 5480639 ("merge_Pasteurella_caecimuris_GCF_004793515") has been submitted
Your job 5480640 ("merge_Pasteurella_canis_GCF_900454865") has been submitted
Your job 5480641 ("merge_Pasteurella_dagmatis_GCF_900186835") has been submitted
Your job 5480642 ("merge_Pasteurella_langaaensis_GCF_003096995") has been submitted
Your job 5480643 ("merge_Pasteurella_oralis_GCF_002850605") has been submitted
Your job 5480644 ("merge_Pasteurella_skyensis_GCF_900109845") has been submitted
Your job 5480645 ("merge_Pasteurella_sp_GCF_004570945") has been submitted
Your job 5480646 ("merge_Pasteurella_testudinis_GCF_900454705") has been submitted


In [13]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 263 sequences from Pasteurella_bettyae_GCF_000262245_merged.bed.  Filtered 189 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 74.
Screened 263 sequences from Pasteurella_caecimuris_GCF_004793515_merged.bed.  Filtered 186 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 77.
Screened 1116 sequences from Pasteurella_canis_GCF_900454865_merged.bed.  Filtered 702 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 414.
Screened 1066 sequences from Pasteurella_dagmatis_GCF_900186835_merged.bed.  Filtered 682 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 384.
Screened 268 sequences from Pasteurella_langaaensis_GCF_003096995_merged.bed.  Filtered 191 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 77.
Screened 1203 sequences from Pasteurella_oralis_GCF_002850605_merged.bed.  Filtered 764 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 439.
Screened 183 sequences from Pasteurella_skyensis_GCF_900109

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [14]:
#write the phyluce conf file that points to the stripped bed file of every taxon
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

#header line first, then one "taxon:path" entry per taxon
conf_lines = ["[beds]\n"]
conf_lines.extend("{}:{}{}_stripped.bed\n".format(sample, o_dir, sample) for sample in group_taxa)

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [15]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/pasteurella/initial_intervals/Pasteurella_testudinis_GCF_900454705_merged.bed --twobit results/phyluce/pasteurella/cleaned_genomes/Pasteurella_multocida_GCF_000754275_formatted.2bit --output results/phyluce/pasteurella/initial_intervals/Pasteurella_testudinis_GCF_900454705_stripped.bed;
pasteurella_bettyae_gcf_000262245.
pasteurella_caecimuris_gcf_004793515.
pasteurella_canis_gcf_900454865.
pasteurella_dagmatis_gcf_900186835.
pasteurella_langaaensis_gcf_003096995.
pasteurella_oralis_gcf_002850605.
pasteurella_skyensis_gcf_900109845.
pasteurella_sp_gcf_004570945.
pasteurella_testudinis_gcf_900454705.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [16]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/pasteurella/initial_intervals/Pasteurella_testudinis_GCF_900454705_merged.bed --twobit results/phyluce/pasteurella/cleaned_genomes/Pasteurella_multocida_GCF_000754275_formatted.2bit --output results/phyluce/pasteurella/initial_intervals/Pasteurella_testudinis_GCF_900454705_stripped.bed;
Loci shared by Pasteurella_multocida_GCF_000754275 + 0 taxa:	751.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 1 taxa:	751.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 2 taxa:	404.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 3 taxa:	228.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 4 taxa:	95.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 5 taxa:	70.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 6 taxa:	59.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 7 taxa:	49.0
Loci shared by Pasteurella_multocida_GCF_000754275 + 8 taxa:	36.0
Loci shared by 

In [18]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 3
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/pasteurella/initial_intervals/pasteurella-to-Pasteurella_multocida_GCF_000754275.sqlite --base-taxon Pasteurella_multocida_GCF_000754275 --output results/phyluce/pasteurella/initial_intervals/Pasteurella_multocida_GCF_000754275_+3.bed --specific-counts 3;
Counter({'pasteurella_dagmatis_gcf_900186835': 221, 'pasteurella_canis_gcf_900454865': 219, 'pasteurella_oralis_gcf_002850605': 217, 'pasteurella_langaaensis_gcf_003096995': 70, 'pasteurella_caecimuris_gcf_004793515': 68, 'pasteurella_bettyae_gcf_000262245': 67, 'pasteurella_sp_gcf_004570945': 63, 'pasteurella_testudinis_gcf_900454705': 45, 'pasteurella_skyensis_gcf_900109845': 43})


## Design temp set of baits

In [19]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/pasteurella/initial_intervals/Pasteurella_multocida_GCF_000754275_+3.bed --twobit results/phyluce/pasteurella/cleaned_genomes/Pasteurella_multocida_GCF_000754275_formatted.2bit --buffer-to 160 --output results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3.fasta;
Screened 228 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 228.


design the baits

In [20]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3.fasta --probe-prefix uce_pasteurella_ --design pasteurella_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGG


Conserved locus count = 227
Probe Count = 446


## Find duplicate baited regions

In [21]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas --query results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  11:50:51
Ended:  Thu Feb 06, 2020  11:50:51
Time for execution:  0.00381955305735 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas                        --lastz results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes_vself.lastz                       --probe-prefix=uce_pasteurella_;
Parsing lastz file...
Screening results...
Screened 445 fasta sequences.  Filtered 10 duplicates. Ke

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [22]:
# Give every genome its own directory, <cleaned_genomes>/<sample>/<sample>.2bit.
# The formatted 2bit is copied (not soft-linked) into that directory.
genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # start from an empty per-sample directory on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [23]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/pasteurella/validate_intervals/Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas --scaffoldlist Pasteurella_bettyae_GCF_000262245 Pasteurella_caecimuris_GCF_004793515 Pasteurella_canis_GCF_900454865 Pasteurella_dagmatis_GCF_900186835 Pasteurella_langaaensis_GCF_003096995 Pasteurella_oralis_GCF_002850605 Pasteurella_skyensis_GCF_900109845 Pasteurella_sp_GCF_004570945 Pasteurella_testudinis_GCF_900454705 Pasteurella_multocida_GCF_000754275 --genome-base-path results/phyluce/pasteurella/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/pasteurella/validate_intervals/pasteurella-to-Pasteurella_multocida_GCF_000754275.sqlite --output results/phyluce/pasteurella/validate_intervals/lastz/;

Running against Pasteurella_bettyae_GCF_000262245.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpv4tqUR.fasta

Writing the results file...
	/tmp/t

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [31]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/pasteurella/extract_probes_from_group/pasteurella_genome.conf --lastz results/phyluce/pasteurella/validate_intervals/lastz --probes 120 --probe-prefix uce_pasteurella_ --name-pattern "Pasteurella_multocida_GCF_000754275_+3_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/pasteurella/extract_probes_from_group/probe_fasta;
2020-02-06 11:53:01,078 - Phyluce - INFO - ------ Working on Pasteurella_bettyae_GCF_000262245 genome ------
2020-02-06 11:53:01,092 - Phyluce - INFO - Reading Pasteurella_bettyae_GCF_000262245 genome
2020-02-06 11:53:01,856 - Phyluce - INFO - Pasteurella_bettyae_GCF_000262245: 213 uces, 10 dupes, 203 non-dupes, 0 orient drop, 0 length drop, 203 written
2020-02-06 11:53:01,856 - Phyluce - INFO - ----- Working on Pasteurella_caecimuris_GCF_004793515 genome ----
2020-02-06 11:53:01,857 - Phyluce - INFO - Reading Pasteurella_caecimuris_GCF_004793515 genome
2020-02-06 11:53:02,541 - Phyluce - INFO -

In [32]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/pasteurella/extract_probes_from_group/probe_fasta --output results/phyluce/pasteurella/extract_probes_from_group/multifastas.sqlite --base-taxon Pasteurella_multocida_GCF_000754275;
pasteurella_bettyae_gcf_000262245.
pasteurella_caecimuris_gcf_004793515.
pasteurella_canis_gcf_900454865.
pasteurella_dagmatis_gcf_900186835.
pasteurella_langaaensis_gcf_003096995.
pasteurella_oralis_gcf_002850605.
pasteurella_skyensis_gcf_900109845.
pasteurella_sp_gcf_004570945.
pasteurella_testudinis_gcf_900454705.
pasteurella_multocida_gcf_000754275.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/pasteurella/extract_probes_from_group/multifastas.sqlite --base-taxon Pasteurella_multocida_GCF_000754275;
Loci shared by 0 taxa:	226.0
Loci shared by 1 taxa:	226.0
Loci shared by 2 taxa:	224.0
Loci shared by 3 taxa:	223.0
Loci shared by 4 taxa:	221.0
Loci shared by 5 taxa:	212.0
Loci shared by 6 taxa:	20

In [33]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(10)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/pasteurella/extract_probes_from_group/multifastas.sqlite  --base-taxon Pasteurella_multocida_GCF_000754275 --output results/phyluce/pasteurella/extract_probes_from_group/Pasteurella_multocida_GCF_000754275+3-back-to-10.conf --specific-counts 10;
Counter({'pasteurella_bettyae_gcf_000262245': 149, 'pasteurella_langaaensis_gcf_003096995': 149, 'pasteurella_dagmatis_gcf_900186835': 149, 'pasteurella_oralis_gcf_002850605': 149, 'pasteurella_canis_gcf_900454865': 149, 'pasteurella_caecimuris_gcf_004793515': 149, 'pasteurella_multocida_gcf_000754275': 149, 'pasteurella_sp_gcf_004570945': 149, 'pasteurella_skyensis_gcf_900109845': 149, 'pasteurella_testudinis_gcf_900454705': 149})
Total loci = 149


## Final group specific bait design

In [34]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/pasteurella/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/pasteurella/extract_probes_from_group/Pasteurella_multocida_GCF_000754275+3-back-to-10.conf --probe-prefix uce_pasteurella_ --designer rnplattii --design pasteurella_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 149
Probe Count = 2932


In [35]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.fasta --query results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  11:53:18
Ended:  Thu Feb 06, 2020  11:53:21
Time for execution:  0.0533509174983 minutes


In [36]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.fasta --lastz results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_pasteurella_;
Parsing lastz file...
Screening results...
Screened 2931 fasta sequences.  Filtered 2 duplicates. Kept 2892.


## CDhit to reduce numbers

In [37]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.fasta
         -o
         results/phyluce/pasteurella/final_probe_design/pasteurella_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 11:54:31 2020
                            Output                              
----------------------------------------------------------------
total seq: 2932
longest and shortest : 80 and 80
Total letters: 234560
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3959997
Max number of word counting entries: 89139541

# comparing sequences from          0  to        488
---------- new table with      378 representatives
# comparing sequences from     

# Rickettsia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [38]:
# Name of the group processed in this section; later cells use it to build the
# data/ and results/phyluce/ paths and the probe prefix.
group = 'rickettsia'

In [39]:
# Start this group from an empty results directory: delete any previous run's
# output, then recreate the directory.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)
os.makedirs(group_results_dir)

In [40]:
# Non-reference genomes for this group, one "Genus_species_<accession>" string
# per assembly (the accession carries no version suffix).
group_taxa = [ "Rickettsia_aeschlimannii_GCF_001051325",
               "Rickettsia_africae_GCF_000023005",
               "Rickettsia_akari_GCF_000018205",
               "Rickettsia_amblyommatis_GCF_001273795",
               "Rickettsia_argasii_GCF_000965185",
               "Rickettsia_asembonensis_GCF_000828125",
               "Rickettsia_asiatica_GCF_007989425",
               "Rickettsia_australis_GCF_000284155",
               "Rickettsia_bellii_GCF_000012385",
               "Rickettsia_buchneri_GCF_000696365",
               "Rickettsia_canadensis_GCF_000014345",
               "Rickettsia_conorii_GCF_000007025",
               "Rickettsia_endosymbiont_GCF_002285905",
               "Rickettsia_felis_GCF_000012145",
               "Rickettsia_fournieri_GCF_900243065",
               "Rickettsia_gravesii_GCF_000485845",
               "Rickettsia_heilongjiangensis_GCF_000221205",
               "Rickettsia_helvetica_GCF_000255355",
               "Rickettsia_honei_GCF_000263055",
               "Rickettsia_hoogstraalii_GCF_000825685",
               "Rickettsia_japonica_GCF_000283595",
               "Rickettsia_massiliae_GCF_000016625",
               "Rickettsia_monacensis_GCF_000499665",
               "Rickettsia_montanensis_GCF_000284175",
               "Rickettsia_parkeri_GCF_005549115",
               "Rickettsia_peacockii_GCF_000021525",
               "Rickettsia_philipii_GCF_000283995",
               "Rickettsia_prowazekii_GCF_000195735",
               "Rickettsia_raoultii_GCF_001975185",
               "Rickettsia_rhipicephali_GCF_000284075",
               "Rickettsia_sibirica_GCF_000166935",
               "Rickettsia_slovaca_GCF_000237845",
               "Rickettsia_sp_GCF_001653015",
               "Rickettsia_tamurae_GCF_000751075",
               "Rickettsia_typhi_GCF_000008045"  ]
                    
# Genome that all other taxa are compared to; kept out of group_taxa.
reference_taxon = "Rickettsia_rickettsii_GCF_001951015"


# Every genome in the group, with the reference taxon last.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [41]:
# Download one genome FASTA per row of the group's accession table, rename each
# file to Genus_species_<accession>.fas.gz, record what was fetched, and gunzip.
# The table is read from data/<group>_accessions.csv and must provide the
# columns "genus", "species" and "accession".
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #look up the assembly's FtpPath_GenBank with NCBI edirect (esearch | esummary | xtract)
    #NOTE(review): only the first output line is used; presumably ftp[0] raises
    #IndexError when the lookup returns nothing - confirm and guard if needed
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to <assembly dir>/<assembly dir name>_genomic.fna.gz and rsync it into data/genomes/<group>
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_<accession with the ".version" suffix dropped>.fas.gz
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001051325.1_Rickettsia_aeschlimannii_genomic.fna.gz

sent 42 bytes  received 383879 bytes  255947.33 bytes/sec
total size is 383664  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result 

GCA_000014345.1_ASM1434v1_genomic.fna.gz

sent 42 bytes  received 337308 bytes  224900.00 bytes/sec
total size is 337116  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000007025.1_ASM702v1_genomic.fna.gz

sent 42 bytes  received 369605 bytes  246431.33 bytes/sec
total size is 369406  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network

GCA_000016625.1_ASM1662v1_genomic.fna.gz

sent 42 bytes  received 400457 bytes  800998.00 bytes/sec
total size is 400249  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000499665.2_RMONA_1_genomic.fna.gz

sent 42 bytes  received 377742 bytes  755568.00 bytes/sec
total size is 377544  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network,

GCA_000237845.1_ASM23784v1_genomic.fna.gz

sent 42 bytes  received 371207 bytes  247499.33 bytes/sec
total size is 371006  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001653015.1_ASM165301v1_genomic.fna.gz

sent 42 bytes  received 368825 bytes  245911.33 bytes/sec
total size is 368623  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, net

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [42]:
# Rewrite every genome so each FASTA header holds only the sequence id, then
# convert the cleaned FASTA to 2bit.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# start from an empty output directory
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # SeqIO.parse opens the file itself when given a path, which avoids the
    # explicit "rU" open mode (removed in Python 3.11).
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            # blank the name/description so only the record id is written
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Convert the cleaned FASTA (not the raw download) so the 2bit matches the
    # headers written above. Arguments are passed as a list so a path that
    # contains a space is not split into several arguments.
    f2bit_status = subprocess.call(["faToTwoBit", o_genome_file, o_2bit_file])
    if f2bit_status != 0:
        # report the failure instead of silently discarding the exit status
        print("WARNING: faToTwoBit exited with status {} for {}".format(f2bit_status, sample))

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage (`--fcov 10`) of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage because, with 100bp reads, it should provide roughly 10bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [43]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480647 ("sim_Rickettsia_aeschlimannii_GCF_001051325") has been submitted
Your job 5480648 ("sim_Rickettsia_africae_GCF_000023005") has been submitted
Your job 5480649 ("sim_Rickettsia_akari_GCF_000018205") has been submitted
Your job 5480650 ("sim_Rickettsia_amblyommatis_GCF_001273795") has been submitted
Your job 5480651 ("sim_Rickettsia_argasii_GCF_000965185") has been submitted
Your job 5480652 ("sim_Rickettsia_asembonensis_GCF_000828125") has been submitted
Your job 5480653 ("sim_Rickettsia_asiatica_GCF_007989425") has been submitted
Your job 5480654 ("sim_Rickettsia_australis_GCF_000284155") has been submitted
Your job 5480655 ("sim_Rickettsia_bellii_GCF_000012385") has been submitted
Your job 5480656 ("sim_Rickettsia_buchneri_GCF_000696365") has been submitted
Your job 5480657 ("sim_Rickettsia_canadensis_GCF_000014345") has been submitted
Your job 5480658 ("sim_Rickettsia_conorii_GCF_000007025") has been submitted
Your job 5480659 ("sim_Rickettsia_endosymbiont_GCF_00228

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [44]:
# Build the bbmap index of the reference genome inside a fresh
# map_simulated_reads directory (the logs directory is created too).
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)
os.makedirs(map_dir + "/logs")

#reference genome FASTA and the directory that will hold the index
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# last expression of the cell, so the exit status is displayed as the output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/rickettsia/cleaned_genomes/Rickettsia_rickettsii_GCF_001951015_formatted.fas path=results/phyluce/rickettsia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [45]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480682 ("map_Rickettsia_aeschlimannii_GCF_001051325") has been submitted
Your job 5480683 ("map_Rickettsia_africae_GCF_000023005") has been submitted
Your job 5480684 ("map_Rickettsia_akari_GCF_000018205") has been submitted
Your job 5480685 ("map_Rickettsia_amblyommatis_GCF_001273795") has been submitted
Your job 5480686 ("map_Rickettsia_argasii_GCF_000965185") has been submitted
Your job 5480687 ("map_Rickettsia_asembonensis_GCF_000828125") has been submitted
Your job 5480688 ("map_Rickettsia_asiatica_GCF_007989425") has been submitted
Your job 5480689 ("map_Rickettsia_australis_GCF_000284155") has been submitted
Your job 5480690 ("map_Rickettsia_bellii_GCF_000012385") has been submitted
Your job 5480691 ("map_Rickettsia_buchneri_GCF_000696365") has been submitted
Your job 5480692 ("map_Rickettsia_canadensis_GCF_000014345") has been submitted
Your job 5480693 ("map_Rickettsia_conorii_GCF_000007025") has been submitted
Your job 5480694 ("map_Rickettsia_endosymbiont_GCF_00228

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [46]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480717 ("merge_Rickettsia_aeschlimannii_GCF_001051325") has been submitted
Your job 5480718 ("merge_Rickettsia_africae_GCF_000023005") has been submitted
Your job 5480719 ("merge_Rickettsia_akari_GCF_000018205") has been submitted
Your job 5480720 ("merge_Rickettsia_amblyommatis_GCF_001273795") has been submitted
Your job 5480721 ("merge_Rickettsia_argasii_GCF_000965185") has been submitted
Your job 5480722 ("merge_Rickettsia_asembonensis_GCF_000828125") has been submitted
Your job 5480723 ("merge_Rickettsia_asiatica_GCF_007989425") has been submitted
Your job 5480724 ("merge_Rickettsia_australis_GCF_000284155") has been submitted
Your job 5480725 ("merge_Rickettsia_bellii_GCF_000012385") has been submitted
Your job 5480726 ("merge_Rickettsia_buchneri_GCF_000696365") has been submitted
Your job 5480727 ("merge_Rickettsia_canadensis_GCF_000014345") has been submitted
Your job 5480728 ("merge_Rickettsia_conorii_GCF_000007025") has been submitted
Your job 5480729 ("merge_Rickett

In [47]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 578 sequences from Rickettsia_aeschlimannii_GCF_001051325_merged.bed.  Filtered 43 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 535.
Screened 209 sequences from Rickettsia_africae_GCF_000023005_merged.bed.  Filtered 11 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 198.
Screened 1575 sequences from Rickettsia_akari_GCF_000018205_merged.bed.  Filtered 362 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1213.
Screened 542 sequences from Rickettsia_amblyommatis_GCF_001273795_merged.bed.  Filtered 64 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 478.
Screened 389 sequences from Rickettsia_argasii_GCF_000965185_merged.bed.  Filtered 34 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 355.
Screened 1387 sequences from Rickettsia_asembonensis_GCF_000828125_merged.bed.  Filtered 253 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1134.
Screened 1392 sequences from Rickettsia_asiatica_GCF_007

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [48]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#collect the conf contents first: a [beds] header followed by one
#  "<taxon>:<stripped bed path>" line per taxon
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

#then write everything in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [49]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/rickettsia/initial_intervals/Rickettsia_typhi_GCF_000008045_merged.bed --twobit results/phyluce/rickettsia/cleaned_genomes/Rickettsia_rickettsii_GCF_001951015_formatted.2bit --output results/phyluce/rickettsia/initial_intervals/Rickettsia_typhi_GCF_000008045_stripped.bed;
rickettsia_aeschlimannii_gcf_001051325.
rickettsia_africae_gcf_000023005.
rickettsia_akari_gcf_000018205..
rickettsia_amblyommatis_gcf_001273795.
rickettsia_argasii_gcf_000965185.
rickettsia_asembonensis_gcf_000828125..
rickettsia_asiatica_gcf_007989425..
rickettsia_australis_gcf_000284155..
rickettsia_bellii_gcf_000012385.
rickettsia_buchneri_gcf_000696365.
rickettsia_canadensis_gcf_000014345..
rickettsia_conorii_gcf_000007025.
rickettsia_endosymbiont_gcf_002285905.
rickettsia_felis_gcf_000012145..
rickettsia_fournieri_gcf_900243065.
rickettsia_gravesii_gcf_000485845.
rickettsia_heilongjiangensis_gcf_000221205.
ricketts

Quantify probes and the number of targeted taxa for each.

In [50]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/rickettsia/initial_intervals/Rickettsia_typhi_GCF_000008045_merged.bed --twobit results/phyluce/rickettsia/cleaned_genomes/Rickettsia_rickettsii_GCF_001951015_formatted.2bit --output results/phyluce/rickettsia/initial_intervals/Rickettsia_typhi_GCF_000008045_stripped.bed;
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 0 taxa:	648.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 1 taxa:	648.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 2 taxa:	645.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 3 taxa:	644.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 4 taxa:	643.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 5 taxa:	643.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 6 taxa:	642.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 7 taxa:	641.0
Loci shared by Rickettsia_rickettsii_GCF_001951015 + 8 taxa:	640.0
Loci shared by Rickettsia

In [51]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 33
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/rickettsia/initial_intervals/rickettsia-to-Rickettsia_rickettsii_GCF_001951015.sqlite --base-taxon Rickettsia_rickettsii_GCF_001951015 --output results/phyluce/rickettsia/initial_intervals/Rickettsia_rickettsii_GCF_001951015_+33.bed --specific-counts 33;
Counter({'rickettsia_japonica_gcf_000283595': 254, 'rickettsia_gravesii_gcf_000485845': 254, 'rickettsia_argasii_gcf_000965185': 254, 'rickettsia_sibirica_gcf_000166935': 254, 'rickettsia_tamurae_gcf_000751075': 254, 'rickettsia_conorii_gcf_000007025': 254, 'rickettsia_honei_gcf_000263055': 254, 'rickettsia_heilongjiangensis_gcf_000221205': 254, 'rickettsia_monacensis_gcf_000499665': 254, 'rickettsia_amblyommatis_gcf_001273795': 254, 'rickettsia_fournieri_gcf_900243065': 254, 'rickettsia_philipii_gcf_000283995': 254, 'rickettsia_asembonensis_gcf_000828125': 254, 'rickettsia_sp_gcf_001653015': 254, 'rickettsia_buchneri_gcf_000696365': 254, 'rickettsia_rhipicephali_gcf_000284075'

## Design temp set of baits

In [52]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/rickettsia/initial_intervals/Rickettsia_rickettsii_GCF_001951015_+33.bed --twobit results/phyluce/rickettsia/cleaned_genomes/Rickettsia_rickettsii_GCF_001951015_formatted.2bit --buffer-to 160 --output results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33.fasta;
Screened 254 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 254.


Design the baits.

In [53]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33.fasta --probe-prefix uce_rickettsia_ --design rickettsia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 197
Probe Count = 354


## Find duplicate baited regions

In [54]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas --query results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  12:00:43
Ended:  Thu Feb 06, 2020  12:00:43
Time for execution:  0.00349261760712 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas                        --lastz results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes_vself.lastz                       --probe-prefix=uce_rickettsia_;
Parsing lastz file...
Screening results...
Screened 353 fasta sequences.  Filtered 2 duplicates. Kept

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [55]:
#give every genome its own <sample>/<sample>.2bit directory
#  (the formatted 2bit file is copied into it, not soft-linked)
for sample in all_taxa:

    genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    #rebuild the per-sample directory from scratch before copying
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)

    os.mkdir(dst_dir)
    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [56]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/rickettsia/validate_intervals/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas --scaffoldlist Rickettsia_aeschlimannii_GCF_001051325 Rickettsia_africae_GCF_000023005 Rickettsia_akari_GCF_000018205 Rickettsia_amblyommatis_GCF_001273795 Rickettsia_argasii_GCF_000965185 Rickettsia_asembonensis_GCF_000828125 Rickettsia_asiatica_GCF_007989425 Rickettsia_australis_GCF_000284155 Rickettsia_bellii_GCF_000012385 Rickettsia_buchneri_GCF_000696365 Rickettsia_canadensis_GCF_000014345 Rickettsia_conorii_GCF_000007025 Rickettsia_endosymbiont_GCF_002285905 Rickettsia_felis_GCF_000012145 Rickettsia_fournieri_GCF_900243065 Rickettsia_gravesii_GCF_000485845 Rickettsia_heilongjiangensis_GCF_000221205 Rickettsia_helvetica_GCF_000255355 Rickettsia_honei_GCF_000263055 Rickettsia_hoogstraalii_GCF_000825685 Rickettsia_japonica_GCF_000283595 Rickettsia_massiliae_GCF_000016625 Rickettsia_monacensis_GCF_000499665 Rickettsia_montanensis_G


Running against Rickettsia_endosymbiont_GCF_002285905.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpAzTzWg.fasta

Writing the results file...
	/tmp/tmpFQmGVK.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/rickettsia/validate_intervals/lastz/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas_v_Rickettsia_endosymbiont_GCF_002285905.lastz
Creating Rickettsia_endosymbiont_GCF_002285905 table
Inserting data to Rickettsia_endosymbiont_GCF_002285905 table

Running against Rickettsia_felis_GCF_000012145.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvfRvlO.fasta

Writing the results file...
	/tmp/tmpP8kqDw.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/rickettsia/validate_intervals/lastz/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas_

	/tmp/tmptWLdbG.fasta

Writing the results file...
	/tmp/tmpXE2Qeu.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/rickettsia/validate_intervals/lastz/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas_v_Rickettsia_philipii_GCF_000283995.lastz
Creating Rickettsia_philipii_GCF_000283995 table
Inserting data to Rickettsia_philipii_GCF_000283995 table

Running against Rickettsia_prowazekii_GCF_000195735.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpIxEith.fasta

Writing the results file...
	/tmp/tmpshMUDx.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/rickettsia/validate_intervals/lastz/Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas_v_Rickettsia_prowazekii_GCF_000195735.lastz
Creating Rickettsia_prowazekii_GCF_000195735 table
Inserting data to Rickettsia_prowazekii_GCF_000195735 table

Running against Rick

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [57]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/rickettsia/extract_probes_from_group/rickettsia_genome.conf --lastz results/phyluce/rickettsia/validate_intervals/lastz --probes 120 --probe-prefix uce_rickettsia_ --name-pattern "Rickettsia_rickettsii_GCF_001951015_+33_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/rickettsia/extract_probes_from_group/probe_fasta;
2020-02-06 12:01:49,369 - Phyluce - INFO - ---- Working on Rickettsia_aeschlimannii_GCF_001051325 genome ---
2020-02-06 12:01:49,370 - Phyluce - INFO - Reading Rickettsia_aeschlimannii_GCF_001051325 genome
2020-02-06 12:01:50,072 - Phyluce - INFO - Rickettsia_aeschlimannii_GCF_001051325: 197 uces, 4 dupes, 193 non-dupes, 0 orient drop, 1 length drop, 192 written
2020-02-06 12:01:50,072 - Phyluce - INFO - ------- Working on Rickettsia_africae_GCF_000023005 genome ------
2020-02-06 12:01:50,073 - Phyluce - INFO - Reading Rickettsia_africae_GCF_000023005 genome
2020-02-06 12:01:50,686 - Phyluce - INFO 

2020-02-06 12:02:03,095 - Phyluce - INFO - Rickettsia_massiliae_GCF_000016625: 196 uces, 0 dupes, 196 non-dupes, 3 orient drop, 3 length drop, 190 written
2020-02-06 12:02:03,095 - Phyluce - INFO - ----- Working on Rickettsia_monacensis_GCF_000499665 genome -----
2020-02-06 12:02:03,096 - Phyluce - INFO - Reading Rickettsia_monacensis_GCF_000499665 genome
2020-02-06 12:02:03,662 - Phyluce - INFO - Rickettsia_monacensis_GCF_000499665: 188 uces, 0 dupes, 188 non-dupes, 4 orient drop, 0 length drop, 184 written
2020-02-06 12:02:03,662 - Phyluce - INFO - ----- Working on Rickettsia_montanensis_GCF_000284175 genome ----
2020-02-06 12:02:03,662 - Phyluce - INFO - Reading Rickettsia_montanensis_GCF_000284175 genome
2020-02-06 12:02:04,288 - Phyluce - INFO - Rickettsia_montanensis_GCF_000284175: 197 uces, 0 dupes, 197 non-dupes, 3 orient drop, 5 length drop, 189 written
2020-02-06 12:02:04,289 - Phyluce - INFO - ------- Working on Rickettsia_parkeri_GCF_005549115 genome ------
2020-02-06 12:02

In [58]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/rickettsia/extract_probes_from_group/probe_fasta --output results/phyluce/rickettsia/extract_probes_from_group/multifastas.sqlite --base-taxon Rickettsia_rickettsii_GCF_001951015;
rickettsia_aeschlimannii_gcf_001051325.
rickettsia_africae_gcf_000023005.
rickettsia_akari_gcf_000018205.
rickettsia_amblyommatis_gcf_001273795.
rickettsia_argasii_gcf_000965185.
rickettsia_asembonensis_gcf_000828125.
rickettsia_asiatica_gcf_007989425.
rickettsia_australis_gcf_000284155.
rickettsia_bellii_gcf_000012385.
rickettsia_buchneri_gcf_000696365.
rickettsia_canadensis_gcf_000014345.
rickettsia_conorii_gcf_000007025.
rickettsia_endosymbiont_gcf_002285905.
rickettsia_felis_gcf_000012145.
rickettsia_fournieri_gcf_900243065.
rickettsia_gravesii_gcf_000485845.
rickettsia_heilongjiangensis_gcf_000221205.
rickettsia_helvetica_gcf_000255355.
rickettsia_honei_gcf_000263055.
rickettsia_hoogstraalii_gcf_000825685.
rickettsia_japonica_gcf_000283595.
ric

In [59]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(35)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/rickettsia/extract_probes_from_group/multifastas.sqlite  --base-taxon Rickettsia_rickettsii_GCF_001951015 --output results/phyluce/rickettsia/extract_probes_from_group/Rickettsia_rickettsii_GCF_001951015+33-back-to-35.conf --specific-counts 35;
Counter({'rickettsia_japonica_gcf_000283595': 168, 'rickettsia_gravesii_gcf_000485845': 168, 'rickettsia_argasii_gcf_000965185': 168, 'rickettsia_sibirica_gcf_000166935': 168, 'rickettsia_tamurae_gcf_000751075': 168, 'rickettsia_conorii_gcf_000007025': 168, 'rickettsia_honei_gcf_000263055': 168, 'rickettsia_heilongjiangensis_gcf_000221205': 168, 'rickettsia_monacensis_gcf_000499665': 168, 'rickettsia_fournieri_gcf_900243065': 168, 'rickettsia_philipii_gcf_000283995': 168, 'rickettsia_rhipicephali_gcf_000284075': 168, 'rickettsia_aeschlimannii_gcf_001051325': 168, 'rickettsia_asiatica_gcf_007989425': 168, 'rickettsia_prowazekii_gcf_000195735': 168, 'rickettsia_akari_gcf_000018205': 168, '

## Final group specific bait design

In [60]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/rickettsia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/rickettsia/extract_probes_from_group/Rickettsia_rickettsii_GCF_001951015+33-back-to-35.conf --probe-prefix uce_rickettsia_ --designer rnplattii --design rickettsia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [61]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.fasta --query results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  12:02:45
Ended:  Thu Feb 06, 2020  12:03:22
Time for execution:  0.628028416634 minutes


In [62]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.fasta --lastz results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_rickettsia_;
Parsing lastz file...
Screening results...
Screened 10562 fasta sequences.  Filtered 0 duplicates. Kept 10563.


## CDhit to reduce numbers

In [67]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.fasta
         -o
         results/phyluce/rickettsia/final_probe_design/rickettsia_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 12:20:34 2020
                            Output                              
----------------------------------------------------------------
total seq: 10563
longest and shortest : 80 and 80
Total letters: 845040
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 89M

Table limit with the given memory limit:
Max number of representatives: 3946989
Max number of word counting entries: 88846733

# comparing sequences from          0  to       1760
.---------- new table with      383 representatives
# comparing sequences from       

# Salmonella 

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [204]:
# Name of the group processed in this section; used to build every data/results path below.
group = "salmonella"

In [195]:
# Wipe any previous results for this group and start from an empty directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [205]:
# Assemblies to include for this group, named Genus_species_accession.
group_taxa = [
    "Salmonella_bongori_GCF_000252995",
    "Salmonella_bongori_GCF_002035475",
    "Salmonella_bongori_GCA_003497195",
    "Salmonella_bongori_GCF_902387725",
    "Salmonella_bongori_GCA_003452365",
    "Salmonella_bongori_GCF_900635485",
    "Salmonella_bongori_GCF_002211925",
    "Salmonella_bongori_GCF_000709535",
    "Salmonella_bongori_GCA_003522735",
    "Salmonella_bongori_GCF_002035285",
    "Salmonella_arizonae_GCF_001951595",
    "Salmonella_arizonae_GCF_001951445",
    "Salmonella_arizonae_GCF_000018625",
    "Salmonella_arizonae_GCF_001970905",
    "Salmonella_diarizonae_GCF_002241435",
    "Salmonella_diarizonae_GCF_001832255",
    "Salmonella_diarizonae_GCF_002063885",
    "Salmonella_diarizonae_GCF_002035715",
    "Salmonella_enterica_GCF_009208155",
    "Salmonella_enterica_GCF_005938475",
    "Salmonella_enterica_GCF_002064535",
    "Salmonella_houtenae_GCF_002047425",
    "Salmonella_houtenae_GCF_002045905",
    "Salmonella_houtenae_GCF_002127815",
    "Salmonella_houtenae_GCF_002106115",
    "Salmonella_indica_GCF_002066855",
    "Salmonella_indica_GCF_002035225",
    "Salmonella_indica_GCF_002066875",
    "Salmonella_indica_GCF_002266165",
    "Salmonella_salamae_GCF_002036105",
    "Salmonella_salamae_GCF_900478195",
    "Salmonella_salamae_GCF_003994675",
    "Salmonella_salamae_GCF_002036155",
]

# Genome that all the other taxa are compared against.
reference_taxon = "Salmonella_enterica_GCF_001159405"

# Every genome to process: the group members followed by the reference.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [197]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000252995.1_ASM25299v1_genomic.fna.gz

sent 42 bytes  received 1327500 bytes  885028.00 bytes/sec
total size is 1327067  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

GCA_001951595.1_ASM195159v1_genomic.fna.gz

sent 42 bytes  received 1450258 bytes  966866.67 bytes/sec
total size is 1449792  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001951445.1_ASM195144v1_genomic.fna.gz

sent 42 bytes  received 1504894 bytes  1003290.67 bytes/sec
total size is 1504420  speedup is 1.00


You are accessing a U.S. Government information system which includes this
compute

GCA_002064535.1_ASM206453v1_genomic.fna.gz

sent 42 bytes  received 1374955 bytes  916664.67 bytes/sec
total size is 1374513  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002047425.1_ASM204742v1_genomic.fna.gz

sent 42 bytes  received 1377820 bytes  918574.67 bytes/sec
total size is 1377370  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

GCA_003994675.1_ASM399467v1_genomic.fna.gz

sent 42 bytes  received 1445111 bytes  963435.33 bytes/sec
total size is 1444645  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_002036155.1_ASM203615v1_genomic.fna.gz

sent 42 bytes  received 1532138 bytes  1021453.33 bytes/sec
total size is 1531656  speedup is 1.00


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [206]:
#start from an empty cleaned_genomes directory
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    #blank out each record's name and description, then write it back out as fasta
    #  (plain "r": the "U" flag of the old "rU" mode is deprecated and is rejected
    #  by Python 3.11+)
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    #convert to 2bit; check_call raises CalledProcessError if faToTwoBit exits
    #  non-zero instead of silently leaving a missing/partial .2bit behind
    # NOTE(review): the 2bit is built from the raw input fasta (i_genome_file),
    #   not the reformatted copy written above - confirm this is intended
    f2bit_cmd = ["faToTwoBit", i_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10` with 100 bp reads) because it should provide roughly 10bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [207]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482531 ("sim_Salmonella_bongori_GCF_000252995") has been submitted
Your job 5482532 ("sim_Salmonella_bongori_GCF_002035475") has been submitted
Your job 5482533 ("sim_Salmonella_bongori_GCA_003497195") has been submitted
Your job 5482534 ("sim_Salmonella_bongori_GCF_902387725") has been submitted
Your job 5482535 ("sim_Salmonella_bongori_GCA_003452365") has been submitted
Your job 5482536 ("sim_Salmonella_bongori_GCF_900635485") has been submitted
Your job 5482537 ("sim_Salmonella_bongori_GCF_002211925") has been submitted
Your job 5482538 ("sim_Salmonella_bongori_GCF_000709535") has been submitted
Your job 5482539 ("sim_Salmonella_bongori_GCA_003522735") has been submitted
Your job 5482540 ("sim_Salmonella_bongori_GCF_002035285") has been submitted
Your job 5482541 ("sim_Salmonella_arizonae_GCF_001951595") has been submitted
Your job 5482542 ("sim_Salmonella_arizonae_GCF_001951445") has been submitted
Your job 5482543 ("sim_Salmonella_arizonae_GCF_000018625") has been submit

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [208]:
# Recreate the map_simulated_reads directory (with a logs/ subfolder).
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# Index the reference genome with bbmap; the index is written under o_dir (path=).
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/salmonella/cleaned_genomes/Salmonella_enterica_GCF_001159405_formatted.fas path=results/phyluce/salmonella/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [209]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482564 ("map_Salmonella_bongori_GCF_000252995") has been submitted
Your job 5482565 ("map_Salmonella_bongori_GCF_002035475") has been submitted
Your job 5482566 ("map_Salmonella_bongori_GCA_003497195") has been submitted
Your job 5482567 ("map_Salmonella_bongori_GCF_902387725") has been submitted
Your job 5482568 ("map_Salmonella_bongori_GCA_003452365") has been submitted
Your job 5482569 ("map_Salmonella_bongori_GCF_900635485") has been submitted
Your job 5482570 ("map_Salmonella_bongori_GCF_002211925") has been submitted
Your job 5482571 ("map_Salmonella_bongori_GCF_000709535") has been submitted
Your job 5482572 ("map_Salmonella_bongori_GCA_003522735") has been submitted
Your job 5482573 ("map_Salmonella_bongori_GCF_002035285") has been submitted
Your job 5482574 ("map_Salmonella_arizonae_GCF_001951595") has been submitted
Your job 5482575 ("map_Salmonella_arizonae_GCF_001951445") has been submitted
Your job 5482576 ("map_Salmonella_arizonae_GCF_000018625") has been submit

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [210]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5482597 ("merge_Salmonella_bongori_GCF_000252995") has been submitted
Your job 5482598 ("merge_Salmonella_bongori_GCF_002035475") has been submitted
Your job 5482599 ("merge_Salmonella_bongori_GCA_003497195") has been submitted
Your job 5482600 ("merge_Salmonella_bongori_GCF_902387725") has been submitted
Your job 5482601 ("merge_Salmonella_bongori_GCA_003452365") has been submitted
Your job 5482602 ("merge_Salmonella_bongori_GCF_900635485") has been submitted
Your job 5482603 ("merge_Salmonella_bongori_GCF_002211925") has been submitted
Your job 5482604 ("merge_Salmonella_bongori_GCF_000709535") has been submitted
Your job 5482605 ("merge_Salmonella_bongori_GCA_003522735") has been submitted
Your job 5482606 ("merge_Salmonella_bongori_GCF_002035285") has been submitted
Your job 5482607 ("merge_Salmonella_arizonae_GCF_001951595") has been submitted
Your job 5482608 ("merge_Salmonella_arizonae_GCF_001951445") has been submitted
Your job 5482609 ("merge_Salmonella_arizonae_GCF_0

In [211]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 6426 sequences from Salmonella_bongori_GCF_000252995_merged.bed.  Filtered 1734 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4692.
Screened 6343 sequences from Salmonella_bongori_GCF_002035475_merged.bed.  Filtered 1645 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4698.
Screened 6297 sequences from Salmonella_bongori_GCA_003497195_merged.bed.  Filtered 1677 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4620.
Screened 6400 sequences from Salmonella_bongori_GCF_902387725_merged.bed.  Filtered 1682 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4718.
Screened 3850 sequences from Salmonella_bongori_GCA_003452365_merged.bed.  Filtered 1017 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2833.
Screened 6516 sequences from Salmonella_bongori_GCF_900635485_merged.bed.  Filtered 1765 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4751.
Screened 6377 sequences from Salmonella_bongori_GCF_

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [212]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# Build the conf in memory: a [beds] header followed by one
# "<sample>:<stripped bed path>" line per taxon.
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [213]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/salmonella/initial_intervals/Salmonella_salamae_GCF_002036155_merged.bed --twobit results/phyluce/salmonella/cleaned_genomes/Salmonella_enterica_GCF_001159405_formatted.2bit --output results/phyluce/salmonella/initial_intervals/Salmonella_salamae_GCF_002036155_stripped.bed;
salmonella_bongori_gcf_000252995.....
salmonella_bongori_gcf_002035475.....
salmonella_bongori_gca_003497195.....
salmonella_bongori_gcf_902387725.....
salmonella_bongori_gca_003452365...
salmonella_bongori_gcf_900635485.....
salmonella_bongori_gcf_002211925.....
salmonella_bongori_gcf_000709535.....
salmonella_bongori_gca_003522735.....
salmonella_bongori_gcf_002035285.....
salmonella_arizonae_gcf_001951595.
salmonella_arizonae_gcf_001951445.
salmonella_arizonae_gcf_000018625...
salmonella_arizonae_gcf_001970905...
salmonella_diarizonae_gcf_002241435..
salmonella_diarizonae_gcf_001832255..
salmonella_diarizonae_gcf_00

Quantify probes and the number of targeted taxa for each.

In [214]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/salmonella/initial_intervals/Salmonella_salamae_GCF_002036155_merged.bed --twobit results/phyluce/salmonella/cleaned_genomes/Salmonella_enterica_GCF_001159405_formatted.2bit --output results/phyluce/salmonella/initial_intervals/Salmonella_salamae_GCF_002036155_stripped.bed;
Loci shared by Salmonella_enterica_GCF_001159405 + 0 taxa:	6,437.0
Loci shared by Salmonella_enterica_GCF_001159405 + 1 taxa:	6,437.0
Loci shared by Salmonella_enterica_GCF_001159405 + 2 taxa:	6,337.0
Loci shared by Salmonella_enterica_GCF_001159405 + 3 taxa:	6,271.0
Loci shared by Salmonella_enterica_GCF_001159405 + 4 taxa:	6,202.0
Loci shared by Salmonella_enterica_GCF_001159405 + 5 taxa:	6,152.0
Loci shared by Salmonella_enterica_GCF_001159405 + 6 taxa:	6,058.0
Loci shared by Salmonella_enterica_GCF_001159405 + 7 taxa:	6,005.0
Loci shared by Salmonella_enterica_GCF_001159405 + 8 taxa:	5,951.0
Loci shared by Salmonel

In [215]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 33
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/salmonella/initial_intervals/salmonella-to-Salmonella_enterica_GCF_001159405.sqlite --base-taxon Salmonella_enterica_GCF_001159405 --output results/phyluce/salmonella/initial_intervals/Salmonella_enterica_GCF_001159405_+33.bed --specific-counts 33;
Counter({'salmonella_arizonae_gcf_001970905': 1864, 'salmonella_diarizonae_gcf_002241435': 1864, 'salmonella_arizonae_gcf_001951445': 1864, 'salmonella_bongori_gcf_002035475': 1864, 'salmonella_indica_gcf_002035225': 1864, 'salmonella_salamae_gcf_900478195': 1864, 'salmonella_diarizonae_gcf_001832255': 1864, 'salmonella_enterica_gcf_009208155': 1864, 'salmonella_enterica_gcf_002064535': 1864, 'salmonella_houtenae_gcf_002047425': 1864, 'salmonella_bongori_gca_003522735': 1864, 'salmonella_arizonae_gcf_001951595': 1864, 'salmonella_salamae_gcf_002036105': 1864, 'salmonella_bongori_gca_003497195': 1864, 'salmonella_bongori_gcf_000252995': 1864, 'salmonella_houtenae_gcf_002045905': 1864,

## Design temp set of baits

In [216]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/salmonella/initial_intervals/Salmonella_enterica_GCF_001159405_+33.bed --twobit results/phyluce/salmonella/cleaned_genomes/Salmonella_enterica_GCF_001159405_formatted.2bit --buffer-to 160 --output results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33.fasta;
Screened 1864 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 1864.


Design the baits.

In [217]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33.fasta --probe-prefix uce_salmonella_ --design salmonella_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGG


Conserved locus count = 1864
Probe Count = 3723


## Find duplicate baited regions

In [218]:

# Find duplicated probes within the temporary probe set with a self-vs-self lastz search.
# Inputs: the temp probe fasta from the previous cell, plus the identity/coverage
# values (both 50) passed to lastz; they are stringified for the command line.
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

# Output: lastz table of the probe set aligned against itself.
o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

# The same fasta is passed as both --target and --query.
phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


# Remove duplicated probes, using the lastz table written above.
# The backslash continuations sit inside the string literal, so the indentation
# spaces become part of the printed/executed command (harmless to the shell).
# NOTE(review): no output path is given here and later cells keep passing the
# un-screened `_temp_probes.fas` as --probefile; confirm whether the screened
# file produced by this step was meant to be used downstream instead.
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas --query results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes_vself.lastz;
Started:  Fri Feb 07, 2020  08:59:35
Ended:  Fri Feb 07, 2020  08:59:36
Time for execution:  0.0295583804448 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas                        --lastz results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes_vself.lastz                       --probe-prefix=uce_salmonella_;
Parsing lastz file...
Screening results...
Screened 3722 fasta sequences.  Filtered 20 duplicates. Kept 3683.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [219]:
# Give every taxon its own sub-directory, <cleaned_genomes>/<taxon>/<taxon>.2bit,
# holding a copy (not a link) of its formatted 2bit genome.
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:
    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    # always rebuild the taxon directory from scratch
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [220]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/salmonella/validate_intervals/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas --scaffoldlist Salmonella_bongori_GCF_000252995 Salmonella_bongori_GCF_002035475 Salmonella_bongori_GCA_003497195 Salmonella_bongori_GCF_902387725 Salmonella_bongori_GCA_003452365 Salmonella_bongori_GCF_900635485 Salmonella_bongori_GCF_002211925 Salmonella_bongori_GCF_000709535 Salmonella_bongori_GCA_003522735 Salmonella_bongori_GCF_002035285 Salmonella_arizonae_GCF_001951595 Salmonella_arizonae_GCF_001951445 Salmonella_arizonae_GCF_000018625 Salmonella_arizonae_GCF_001970905 Salmonella_diarizonae_GCF_002241435 Salmonella_diarizonae_GCF_001832255 Salmonella_diarizonae_GCF_002063885 Salmonella_diarizonae_GCF_002035715 Salmonella_enterica_GCF_009208155 Salmonella_enterica_GCF_005938475 Salmonella_enterica_GCF_002064535 Salmonella_houtenae_GCF_002047425 Salmonella_houtenae_GCF_002045905 Salmonella_houtenae_GCF_002127815 Salmonella_houtena

Creating Salmonella_arizonae_GCF_001951445 table
Inserting data to Salmonella_arizonae_GCF_001951445 table

Running against Salmonella_arizonae_GCF_000018625.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvLABVB.fasta

Writing the results file...
	/tmp/tmpKRn9oN.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/salmonella/validate_intervals/lastz/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas_v_Salmonella_arizonae_GCF_000018625.lastz
Creating Salmonella_arizonae_GCF_000018625 table
Inserting data to Salmonella_arizonae_GCF_000018625 table

Running against Salmonella_arizonae_GCF_001970905.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpv2lUlg.fasta

Writing the results file...
	/tmp/tmphl4NuV.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/

	/tmp/tmpVKHyyd.fasta

Writing the results file...
	/tmp/tmpHbMOZa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/salmonella/validate_intervals/lastz/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas_v_Salmonella_indica_GCF_002035225.lastz
Creating Salmonella_indica_GCF_002035225 table
Inserting data to Salmonella_indica_GCF_002035225 table

Running against Salmonella_indica_GCF_002066875.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpjl6xpV.fasta

Writing the results file...
	/tmp/tmpNzpmIZ.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/salmonella/validate_intervals/lastz/Salmonella_enterica_GCF_001159405_+33_temp_probes.fas_v_Salmonella_indica_GCF_002066875.lastz
Creating Salmonella_indica_GCF_002066875 table
Inserting data to Salmonella_indica_GCF_002066875 table

Running against Salmonella_indica_GCF_00226616

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [221]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/salmonella/extract_probes_from_group/salmonella_genome.conf --lastz results/phyluce/salmonella/validate_intervals/lastz --probes 120 --probe-prefix uce_salmonella_ --name-pattern "Salmonella_enterica_GCF_001159405_+33_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/salmonella/extract_probes_from_group/probe_fasta;
2020-02-07 09:03:33,423 - Phyluce - INFO - ------- Working on Salmonella_bongori_GCF_000252995 genome ------
2020-02-07 09:03:33,424 - Phyluce - INFO - Reading Salmonella_bongori_GCF_000252995 genome
2020-02-07 09:03:39,996 - Phyluce - INFO - Salmonella_bongori_GCF_000252995: 1864 uces, 0 dupes, 1864 non-dupes, 79 orient drop, 38 length drop, 1747 written
2020-02-07 09:03:39,996 - Phyluce - INFO - ------- Working on Salmonella_bongori_GCF_002035475 genome ------
2020-02-07 09:03:39,997 - Phyluce - INFO - Reading Salmonella_bongori_GCF_002035475 genome
2020-02-07 09:03:46,485 - Phyluce - INFO - Salmone

2020-02-07 09:05:58,968 - Phyluce - INFO - Salmonella_houtenae_GCF_002047425: 1858 uces, 116 dupes, 1742 non-dupes, 3 orient drop, 7 length drop, 1732 written
2020-02-07 09:05:58,968 - Phyluce - INFO - ------ Working on Salmonella_houtenae_GCF_002045905 genome ------
2020-02-07 09:05:58,969 - Phyluce - INFO - Reading Salmonella_houtenae_GCF_002045905 genome
2020-02-07 09:06:05,655 - Phyluce - INFO - Salmonella_houtenae_GCF_002045905: 1858 uces, 124 dupes, 1734 non-dupes, 2 orient drop, 6 length drop, 1726 written
2020-02-07 09:06:05,655 - Phyluce - INFO - ------ Working on Salmonella_houtenae_GCF_002127815 genome ------
2020-02-07 09:06:05,656 - Phyluce - INFO - Reading Salmonella_houtenae_GCF_002127815 genome
2020-02-07 09:06:12,268 - Phyluce - INFO - Salmonella_houtenae_GCF_002127815: 1857 uces, 113 dupes, 1744 non-dupes, 2 orient drop, 8 length drop, 1734 written
2020-02-07 09:06:12,268 - Phyluce - INFO - ------ Working on Salmonella_houtenae_GCF_002106115 genome ------
2020-02-07 0

In [222]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/salmonella/extract_probes_from_group/probe_fasta --output results/phyluce/salmonella/extract_probes_from_group/multifastas.sqlite --base-taxon Salmonella_enterica_GCF_001159405;
salmonella_bongori_gcf_000252995..
salmonella_bongori_gcf_002035475..
salmonella_bongori_gca_003497195..
salmonella_bongori_gcf_902387725..
salmonella_bongori_gca_003452365..
salmonella_bongori_gcf_900635485..
salmonella_bongori_gcf_002211925..
salmonella_bongori_gcf_000709535..
salmonella_bongori_gca_003522735..
salmonella_bongori_gcf_002035285..
salmonella_arizonae_gcf_001951595..
salmonella_arizonae_gcf_001951445..
salmonella_arizonae_gcf_000018625..
salmonella_arizonae_gcf_001970905..
salmonella_diarizonae_gcf_002241435..
salmonella_diarizonae_gcf_001832255..
salmonella_diarizonae_gcf_002063885..
salmonella_diarizonae_gcf_002035715..
salmonella_enterica_gcf_009208155..
salmonella_enterica_gcf_005938475..
salmonella_enterica_gcf_002064535..
salmone

In [223]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(34)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/salmonella/extract_probes_from_group/multifastas.sqlite  --base-taxon Salmonella_enterica_GCF_001159405 --output results/phyluce/salmonella/extract_probes_from_group/Salmonella_enterica_GCF_001159405+33-back-to-34.conf --specific-counts 34;
Counter({'salmonella_arizonae_gcf_001970905': 1535, 'salmonella_diarizonae_gcf_002241435': 1535, 'salmonella_arizonae_gcf_001951445': 1535, 'salmonella_bongori_gcf_002035475': 1535, 'salmonella_indica_gcf_002035225': 1535, 'salmonella_houtenae_gcf_002045905': 1535, 'salmonella_diarizonae_gcf_001832255': 1535, 'salmonella_enterica_gcf_009208155': 1535, 'salmonella_enterica_gcf_002064535': 1535, 'salmonella_houtenae_gcf_002047425': 1535, 'salmonella_bongori_gca_003522735': 1535, 'salmonella_arizonae_gcf_001951595': 1535, 'salmonella_salamae_gcf_002036105': 1535, 'salmonella_bongori_gca_003497195': 1535, 'salmonella_bongori_gcf_000252995': 1535, 'salmonella_salamae_gcf_900478195': 1535, 'salmon

## Final group specific bait design

In [224]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/salmonella/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/salmonella/extract_probes_from_group/Salmonella_enterica_GCF_001159405+33-back-to-34.conf --probe-prefix uce_salmonella_ --designer rnplattii --design salmonella_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Con

In [225]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.fasta --query results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Fri Feb 07, 2020  09:09:20
Ended:  Fri Feb 07, 2020  09:18:57
Time for execution:  9.62158638239 minutes


In [None]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.fasta --lastz results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_salmonella_;
Parsing lastz file...


## CDhit to reduce numbers

In [228]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.fasta
         -o
         results/phyluce/salmonella/final_probe_design/salmonella_v1-master_probe_list.95P_cdhit

Started: Fri Feb  7 13:11:16 2020
                            Output                              
----------------------------------------------------------------
total seq: 103901
longest and shortest : 80 and 80
Total letters: 8312080
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 22M
Buffer          : 4 X 13M = 53M
Table           : 2 X 18M = 36M
Miscellaneous   : 5M
Total           : 117M

Table limit with the given memory limit:
Max number of representatives: 3788359
Max number of word counting entries: 85275978

# comparing sequences from          0  to      17316
..........    10000  finished       1400  clusters
.......---------- new table wi

# Streptobacillus

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (albeit genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [84]:
# Name of the group processed in this section; it is used to build the data/ and
# results/ paths in the cells below.
group = 'streptobacillus'

In [85]:
# Remove any earlier results for this group, then recreate an empty directory.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [86]:
# Genome that every other assembly in the group is compared against.
reference_taxon = "Streptobacillus_moniliformis_GCF_000024565"

# Remaining assemblies, each named Genus_species_<accession>.
group_taxa = [
    "Streptobacillus_felis_GCF_001559775",
    "Streptobacillus_hongkongensis_GCF_001559795",
    "Streptobacillus_notomytis_GCF_001902575",
    "Streptobacillus_ratti_GCF_001891165",
    "Streptobacillus_sp_GCF_009733925",
]

# Every genome in the group, with the reference appended last.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [89]:
# Download the genome fasta for every accession listed for this group, rename each
# file to the standard Genus_species_<accession> form, and record what was fetched.
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
# NOTE(review): the CSV is read from data/<group>_accessions.csv, while the
# surrounding text mentions data/genomes/*_accessions.csv — confirm the location.
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
# rows of [accession, ftp path, downloaded file name, renamed file name]
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #look up the ftp path for this accession with NCBI EDirect (esearch/esummary/xtract)
    #NOTE(review): this reads the FtpPath_GenBank field — confirm that is intended
    #for GCF_ accessions; ftp[0] below assumes the lookup returned at least one line
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to <assembly>_genomic.fna.gz and download it with rsync
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fasta (accession version suffix dropped)
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    #name the file has on the server, i.e. what rsync just wrote locally
    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001559775.1_ASM155977v1_genomic.fna.gz

sent 42 bytes  received 451685 bytes  903454.00 bytes/sec
total size is 451467  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplina

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [90]:
# Rebuild the cleaned-genome directory from scratch, then for every taxon write
# a FASTA whose headers carry only the record ID and build a 2bit file.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Plain "r" instead of "rU": the "U" flag was removed in Python 3.11, and
    # text mode already applies universal newlines on Python 3.
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            # blank name/description so only the record ID is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # NOTE(review): the 2bit is built from the raw input FASTA, not the cleaned
    # one written above; presumably faToTwoBit keeps only the first word of each
    # header so sequence names still match -- confirm.
    f2bit_cmd = "faToTwoBit {} {}".format(i_genome_file, o_2bit_file)

    # report a failed conversion instead of silently discarding the exit status
    if subprocess.call(f2bit_cmd.split(" ")) != 0:
        print("WARNING: faToTwoBit failed for " + sample)

## Simulate and map reads at 10x coverage

Use `art` to simulate error-free reads for each species/genome. This text describes 25x coverage, but note that the cell below currently passes `--fcov 10` (10x), which matches the section title. In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified by giant commented block text.

In [91]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480861 ("sim_Streptobacillus_felis_GCF_001559775") has been submitted
Your job 5480862 ("sim_Streptobacillus_hongkongensis_GCF_001559795") has been submitted
Your job 5480863 ("sim_Streptobacillus_notomytis_GCF_001902575") has been submitted
Your job 5480864 ("sim_Streptobacillus_ratti_GCF_001891165") has been submitted
Your job 5480865 ("sim_Streptobacillus_sp_GCF_009733925") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [92]:
# Recreate the mapping directory (plus logs/) and build a bbmap index of the
# reference taxon's cleaned genome inside it.
map_dir = "results/phyluce/" + group + "/map_simulated_reads"
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

# reference genome to index, and the directory bbmap writes its index to
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/streptobacillus/cleaned_genomes/Streptobacillus_moniliformis_GCF_000024565_formatted.fas path=results/phyluce/streptobacillus/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to roughly 10% divergence between the reference genome and each simulated read (`minid=0.90` in the cell below).  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [93]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480866 ("map_Streptobacillus_felis_GCF_001559775") has been submitted
Your job 5480867 ("map_Streptobacillus_hongkongensis_GCF_001559795") has been submitted
Your job 5480868 ("map_Streptobacillus_notomytis_GCF_001902575") has been submitted
Your job 5480869 ("map_Streptobacillus_ratti_GCF_001891165") has been submitted
Your job 5480870 ("map_Streptobacillus_sp_GCF_009733925") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [94]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480871 ("merge_Streptobacillus_felis_GCF_001559775") has been submitted
Your job 5480872 ("merge_Streptobacillus_hongkongensis_GCF_001559795") has been submitted
Your job 5480873 ("merge_Streptobacillus_notomytis_GCF_001902575") has been submitted
Your job 5480874 ("merge_Streptobacillus_ratti_GCF_001891165") has been submitted
Your job 5480875 ("merge_Streptobacillus_sp_GCF_009733925") has been submitted


In [95]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 1554 sequences from Streptobacillus_felis_GCF_001559775_merged.bed.  Filtered 871 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 683.
Screened 514 sequences from Streptobacillus_hongkongensis_GCF_001559795_merged.bed.  Filtered 318 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 196.
Screened 2204 sequences from Streptobacillus_notomytis_GCF_001902575_merged.bed.  Filtered 728 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1476.
Screened 2041 sequences from Streptobacillus_ratti_GCF_001891165_merged.bed.  Filtered 569 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1472.
Screened 1656 sequences from Streptobacillus_sp_GCF_009733925_merged.bed.  Filtered 905 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 751.


## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [96]:
# Write the phyluce conf file: a "[beds]" header followed by one
# "taxon:path/to/taxon_stripped.bed" line per taxon in the group.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [97]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptobacillus/initial_intervals/Streptobacillus_sp_GCF_009733925_merged.bed --twobit results/phyluce/streptobacillus/cleaned_genomes/Streptobacillus_moniliformis_GCF_000024565_formatted.2bit --output results/phyluce/streptobacillus/initial_intervals/Streptobacillus_sp_GCF_009733925_stripped.bed;
streptobacillus_felis_gcf_001559775.
streptobacillus_hongkongensis_gcf_001559795.
streptobacillus_notomytis_gcf_001902575..
streptobacillus_ratti_gcf_001891165..
streptobacillus_sp_gcf_009733925.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [98]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptobacillus/initial_intervals/Streptobacillus_sp_GCF_009733925_merged.bed --twobit results/phyluce/streptobacillus/cleaned_genomes/Streptobacillus_moniliformis_GCF_000024565_formatted.2bit --output results/phyluce/streptobacillus/initial_intervals/Streptobacillus_sp_GCF_009733925_stripped.bed;
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 0 taxa:	2,153.0
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 1 taxa:	2,153.0
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 2 taxa:	1,598.0
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 3 taxa:	811.0
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 4 taxa:	480.0
Loci shared by Streptobacillus_moniliformis_GCF_000024565 + 5 taxa:	130.0


In [99]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 4
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/streptobacillus/initial_intervals/streptobacillus-to-Streptobacillus_moniliformis_GCF_000024565.sqlite --base-taxon Streptobacillus_moniliformis_GCF_000024565 --output results/phyluce/streptobacillus/initial_intervals/Streptobacillus_moniliformis_GCF_000024565_+4.bed --specific-counts 4;
Counter({'streptobacillus_notomytis_gcf_001902575': 480, 'streptobacillus_ratti_gcf_001891165': 480, 'streptobacillus_sp_gcf_009733925': 473, 'streptobacillus_felis_gcf_001559775': 467, 'streptobacillus_hongkongensis_gcf_001559795': 150})


## Design temp set of baits

In [100]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/streptobacillus/initial_intervals/Streptobacillus_moniliformis_GCF_000024565_+4.bed --twobit results/phyluce/streptobacillus/cleaned_genomes/Streptobacillus_moniliformis_GCF_000024565_formatted.2bit --buffer-to 160 --output results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4.fasta;
Screened 480 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 480.


design the baits

In [101]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4.fasta --probe-prefix uce_streptobacillus_ --design streptobacillus_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

## Find duplicate baited regions

In [102]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas --query results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  12:34:01
Ended:  Thu Feb 06, 2020  12:34:02
Time for execution:  0.0038468003273 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas                        --lastz results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes_vself.lastz                       --probe-prefix=uce_streptobacillus_;
Parsing lastz file...
Screening results...

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [103]:
# Give every taxon its own <sample>/<sample>.2bit directory under
# cleaned_genomes. The 2bit files are copied (not linked) into fresh
# directories; any existing per-taxon directory is removed first.
genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [104]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/streptobacillus/validate_intervals/Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas --scaffoldlist Streptobacillus_felis_GCF_001559775 Streptobacillus_hongkongensis_GCF_001559795 Streptobacillus_notomytis_GCF_001902575 Streptobacillus_ratti_GCF_001891165 Streptobacillus_sp_GCF_009733925 Streptobacillus_moniliformis_GCF_000024565 --genome-base-path results/phyluce/streptobacillus/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/streptobacillus/validate_intervals/streptobacillus-to-Streptobacillus_moniliformis_GCF_000024565.sqlite --output results/phyluce/streptobacillus/validate_intervals/lastz/;

Running against Streptobacillus_felis_GCF_001559775.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpga3iKI.fasta

Writing the results file...
	/tmp/tmpjhJyyM.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathog

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [105]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/streptobacillus/extract_probes_from_group/streptobacillus_genome.conf --lastz results/phyluce/streptobacillus/validate_intervals/lastz --probes 120 --probe-prefix uce_streptobacillus_ --name-pattern "Streptobacillus_moniliformis_GCF_000024565_+4_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/streptobacillus/extract_probes_from_group/probe_fasta;
2020-02-06 12:34:16,987 - Phyluce - INFO - ----- Working on Streptobacillus_felis_GCF_001559775 genome -----
2020-02-06 12:34:16,988 - Phyluce - INFO - Reading Streptobacillus_felis_GCF_001559775 genome
2020-02-06 12:34:17,899 - Phyluce - INFO - Streptobacillus_felis_GCF_001559775: 267 uces, 30 dupes, 237 non-dupes, 0 orient drop, 0 length drop, 237 written
2020-02-06 12:34:17,899 - Phyluce - INFO - - Working on Streptobacillus_hongkongensis_GCF_001559795 genome -
2020-02-06 12:34:17,900 - Phyluce - INFO - Reading Streptobacillus_hongkongensis_GCF_001559795 genome
2020

In [106]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/streptobacillus/extract_probes_from_group/probe_fasta --output results/phyluce/streptobacillus/extract_probes_from_group/multifastas.sqlite --base-taxon Streptobacillus_moniliformis_GCF_000024565;
streptobacillus_felis_gcf_001559775.
streptobacillus_hongkongensis_gcf_001559795.
streptobacillus_notomytis_gcf_001902575.
streptobacillus_ratti_gcf_001891165.
streptobacillus_sp_gcf_009733925.
streptobacillus_moniliformis_gcf_000024565.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/streptobacillus/extract_probes_from_group/multifastas.sqlite --base-taxon Streptobacillus_moniliformis_GCF_000024565;
Loci shared by 0 taxa:	257.0
Loci shared by 1 taxa:	257.0
Loci shared by 2 taxa:	250.0
Loci shared by 3 taxa:	247.0
Loci shared by 4 taxa:	242.0
Loci shared by 5 taxa:	234.0
Loci shared by 6 taxa:	190.0


In [107]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(6)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/streptobacillus/extract_probes_from_group/multifastas.sqlite  --base-taxon Streptobacillus_moniliformis_GCF_000024565 --output results/phyluce/streptobacillus/extract_probes_from_group/Streptobacillus_moniliformis_GCF_000024565+4-back-to-6.conf --specific-counts 6;
Counter({'streptobacillus_hongkongensis_gcf_001559795': 190, 'streptobacillus_ratti_gcf_001891165': 190, 'streptobacillus_moniliformis_gcf_000024565': 190, 'streptobacillus_felis_gcf_001559775': 190, 'streptobacillus_notomytis_gcf_001902575': 190, 'streptobacillus_sp_gcf_009733925': 190})
Total loci = 190


## Final group specific bait design

In [108]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/streptobacillus/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/streptobacillus/extract_probes_from_group/Streptobacillus_moniliformis_GCF_000024565+4-back-to-6.conf --probe-prefix uce_streptobacillus_ --designer rnplattii --design streptobacillus_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus c

In [109]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.fasta --query results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  12:34:33
Ended:  Thu Feb 06, 2020  12:34:35
Time for execution:  0.022558816274 minutes


In [110]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.fasta --lastz results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_streptobacillus_;
Parsing lastz file...
Screening results...
Screened 1851 fasta sequences.  Filtered 0 duplicates. Kept 1852.


## CDhit to reduce numbers

In [111]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.fasta
         -o
         results/phyluce/streptobacillus/final_probe_design/streptobacillus_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 12:35:02 2020
                            Output                              
----------------------------------------------------------------
total seq: 1852
longest and shortest : 80 and 80
Total letters: 148160
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3961846
Max number of word counting entries: 89181166

# comparing sequences from          0  to        308
---------- new table with      261 representatives
# comparing seq

# Streptococcus

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (whether a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [168]:
# name of the group processed below; used to build every data/ and results/ path
group = "streptococcus"

In [169]:
# wipe any previous run so this group starts from an empty results directory
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

os.makedirs("results/phyluce/{}".format(group))

In [170]:
# Every non-reference taxon in the group, one per line, named
# Genus_species_<assembly accession without version>.
group_taxa = [
    "Streptococcus_acidominimus_GCF_900187245",
    "Streptococcus_agalactiae_GCF_000007265",
    "Streptococcus_alactolyticus_GCF_009695625",
    "Streptococcus_anginosus_GCF_000463505",
    "Streptococcus_australis_GCF_900476055",
    "Streptococcus_azizii_GCF_001984715",
    "Streptococcus_bovimastitidis_GCF_001885095",
    "Streptococcus_caballi_GCF_000379985",
    "Streptococcus_canis_GCF_900636575",
    "Streptococcus_castoreus_GCF_000425025",
    "Streptococcus_constellatus_GCF_000463425",
    "Streptococcus_criceti_GCF_000187975",
    "Streptococcus_cristatus_GCF_000385925",
    "Streptococcus_cuniculi_GCF_001921845",
    "Streptococcus_danieliae_GCF_009767945",
    "Streptococcus_devriesei_GCF_000423725",
    "Streptococcus_didelphis_GCF_000380005",
    "Streptococcus_downei_GCF_900459175",
    "Streptococcus_dysgalactiae_GCF_000317855",
    "Streptococcus_entericus_GCF_000380025",
    "Streptococcus_equi_GCF_000026605",
    "Streptococcus_equinus_GCF_000964315",
    "Streptococcus_ferus_GCF_000372425",
    "Streptococcus_gallolyticus_GCF_002000985",
    "Streptococcus_gordonii_GCF_000017005",
    "Streptococcus_halichoeri_GCF_009870755",
    "Streptococcus_halitosis_GCF_003143695",
    "Streptococcus_halotolerans_GCF_001598035",
    "Streptococcus_henryi_GCF_000376985",
    "Streptococcus_himalayensis_GCF_001708305",
    "Streptococcus_hongkongensis_GCF_000785845",
    "Streptococcus_hyointestinalis_GCF_900459405",
    "Streptococcus_hyovaginalis_GCF_000420785",
    "Streptococcus_ictaluri_GCF_000188015",
    "Streptococcus_infantarius_GCF_000246835",
    "Streptococcus_infantis_GCF_000187465",
    "Streptococcus_iniae_GCF_000831485",
    "Streptococcus_intermedius_GCF_000463355",
    "Streptococcus_lutetiensis_GCF_900475675",
    "Streptococcus_macacae_GCF_000187995",
    "Streptococcus_macedonicus_GCF_000283635",
    "Streptococcus_marimammalium_GCF_000380045",
    "Streptococcus_marmotae_GCF_001623565",
    "Streptococcus_massiliensis_GCF_000380065",
    "Streptococcus_merionis_GCF_000380085",
    "Streptococcus_milleri_GCF_900636715",
    "Streptococcus_minor_GCF_000377005",
    "Streptococcus_mitis_GCF_000027165",
    "Streptococcus_mutans_GCF_000007465",
    "Streptococcus_oralis_GCF_900637025",
    "Streptococcus_orisasini_GCF_001431045",
    "Streptococcus_orisratti_GCF_000380105",
    "Streptococcus_ovis_GCF_000380125",
    "Streptococcus_pantholopis_GCF_001642085",
    "Streptococcus_parasanguinis_GCF_000164675",
    "Streptococcus_parasuis_GCF_004283785",
    "Streptococcus_parauberis_GCF_000213825",
    "Streptococcus_pasteurianus_GCF_900478025",
    "Streptococcus_penaeicida_GCF_002887775",
    "Streptococcus_peroris_GCF_000187585",
    "Streptococcus_pharyngis_GCF_007859195",
    "Streptococcus_phocae_GCF_000772915",
    "Streptococcus_pluranimalium_GCF_003352995",
    "Streptococcus_plurextorum_GCF_000423745",
    "Streptococcus_porci_GCF_000423765",
    "Streptococcus_porcinus_GCF_900475415",
    "Streptococcus_pseudopneumoniae_GCF_000221985",
    "Streptococcus_pseudoporcinus_GCF_000188035",
    "Streptococcus_pyogenes_GCF_000006785",
    "Streptococcus_ratti_GCF_000286075",
    "Streptococcus_respraculi_GCF_003595525",
    "Streptococcus_rubneri_GCF_004785935",
    "Streptococcus_ruminantium_GCF_003609975",
    "Streptococcus_salivarius_GCF_000785515",
    "Streptococcus_sanguinis_GCF_000014205",
    "Streptococcus_sinensis_GCF_000767835",
    "Streptococcus_sobrinus_GCF_000686605",
    "Streptococcus_sp_GCF_003086355",
    "Streptococcus_suis_GCF_000026745",
    "Streptococcus_thermophilus_GCF_000253395",
    "Streptococcus_thoraltensis_GCF_000380145",
    "Streptococcus_timonensis_GCF_900095845",
    "Streptococcus_troglodytae_GCF_002355215",
    "Streptococcus_uberis_GCF_000009545",
    "Streptococcus_urinalis_GCF_000188055",
    "Streptococcus_varani_GCF_001375655",
    "Streptococcus_vestibularis_GCF_900636445",
    "Streptococcus_viridans_GCF_900636365",
]

# the genome every other taxon is compared against; kept out of group_taxa
reference_taxon = "Streptococcus_pneumoniae_GCF_000007045"

# reference goes last
all_taxa = group_taxa + [reference_taxon]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [171]:
# Download the genome FASTA for every accession listed in the group's
# accessions csv, rename each file to Genus_species_<accession>.fas.gz, record
# what was fetched in ftp_info.csv, and finally gunzip everything.
# (Only FASTA files are handled here; no gff files are downloaded.)
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
# the csv is read with columns "genus", "species" and "accession"
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #look up the assembly's FtpPath_GenBank with NCBI EDirect; the shell
    #output is captured as a list of lines
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    #NOTE(review): assumes at least one line came back -- ftp[0] fails on an
    #empty result; strip the scheme so the path can be reused with rsync://
    ftp=ftp[0].replace("ftp://", "")
    
    #the fasta is <ftp dir>/<last path component>_genomic.fna.gz; fetch it
    #into data/genomes/<group>
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create the "clean/standardized" name for the fasta:
    #Genus_species_<accession with the ".version" suffix dropped>.fas.gz
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900187245.1_51470_D02_genomic.fna.gz

sent 42 bytes  received 687125 bytes  196333.43 bytes/sec
total size is 686853  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary

GCA_000463425.1_ASM46342v1_genomic.fna.gz

sent 42 bytes  received 588864 bytes  392604.00 bytes/sec
total size is 588615  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000187975.3_ASM18797v3_genomic.fna.gz

sent 42 bytes  received 718440 bytes  478988.00 bytes/sec
total size is 718159  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000964315.1_ASM96431v1_genomic.fna.gz

sent 42 bytes  received 565160 bytes  376801.33 bytes/sec
total size is 564911  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000372425.1_ASM37242v1_genomic.fna.gz

sent 42 bytes  received 551834 bytes  367917.33 bytes/sec
total size is 551593  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000420785.1_ASM42078v1_genomic.fna.gz

sent 42 bytes  received 621677 bytes  414479.33 bytes/sec
total size is 621420  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000188015.3_ASM18801v3_genomic.fna.gz

sent 42 bytes  received 657139 bytes  438120.67 bytes/sec
total size is 656866  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000380065.1_ASM38006v1_genomic.fna.gz

sent 42 bytes  received 557536 bytes  371718.67 bytes/sec
total size is 557287  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000380085.1_ASM38008v1_genomic.fna.gz

sent 42 bytes  received 705605 bytes  470431.33 bytes/sec
total size is 705324  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000164675.2_ASM16467v2_genomic.fna.gz

sent 42 bytes  received 637082 bytes  424749.33 bytes/sec
total size is 636817  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_004283785.1_ASM428378v1_genomic.fna.gz

sent 42 bytes  received 562323 bytes  374910.00 bytes/sec
total size is 562073  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, net

GCA_000423765.1_ASM42376v1_genomic.fna.gz

sent 42 bytes  received 684563 bytes  456403.33 bytes/sec
total size is 684290  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900475415.1_42206_H01_genomic.fna.gz

sent 42 bytes  received 600001 bytes  400028.67 bytes/sec
total size is 599745  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netwo

GCA_000767835.1_ASM76783v1_genomic.fna.gz

sent 42 bytes  received 618228 bytes  412180.00 bytes/sec
total size is 617971  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000686605.1_ASM68660v1_genomic.fna.gz

sent 42 bytes  received 641205 bytes  427498.00 bytes/sec
total size is 640940  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw


GCA_900636445.1_41965_G01_genomic.fna.gz

sent 42 bytes  received 575065 bytes  383404.67 bytes/sec
total size is 574817  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900636365.1_41559_G01_genomic.fna.gz

sent 42 bytes  received 592864 bytes  395270.67 bytes/sec
total size is 592608  speedup is 1.00


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [172]:
# Rebuild cleaned_genomes/ from scratch. For every taxon, write a copy of its
# FASTA whose headers contain only the sequence id, then convert that cleaned
# FASTA to 2bit.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # plain "r": the "U" flag is deprecated and was removed in Python 3.11
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                # blank name/description so only the record id is written
                # to the header line
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    # Build the 2bit from the *formatted* FASTA so the "_formatted.2bit" file
    # really matches the "_formatted.fas" next to it. The command is passed as
    # an argument list (safe for paths containing spaces), and check_call
    # raises CalledProcessError if faToTwoBit fails instead of silently
    # leaving a missing/partial 2bit for later cells to trip over.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage here (`--fcov 10` with 100 bp reads) because it should provide roughly 10bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [173]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481185 ("sim_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481186 ("sim_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481187 ("sim_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481188 ("sim_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481189 ("sim_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481190 ("sim_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481191 ("sim_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481192 ("sim_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481193 ("sim_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481194 ("sim_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481195 ("sim_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481196 ("sim_Streptococcus_criceti_GCF_000187975") has been submitted
Your job 54

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [174]:
# Start from an empty map_simulated_reads/ directory (including its logs/
# sub-directory) so nothing from an earlier run is left behind.
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)
os.makedirs(map_dir + "/logs")

# Index the reference taxon genome with bbmap; `path=` points bbmap at o_dir.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# Run in the notebook process (not through qsub); the exit status returned by
# subprocess.call is what the cell displays as its result.
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.fas path=results/phyluce/streptococcus/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we require at least 90% identity (`minid=0.90`), i.e. we allow up to 10% divergence between the reference genome and the simulated reads.  Generally the allowed divergence is smaller, and it may be adjusted for other groups; however, given the diversity of some groups, a larger value may identify more orthologous regions.

In [175]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481273 ("map_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481274 ("map_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481275 ("map_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481276 ("map_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481277 ("map_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481278 ("map_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481279 ("map_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481280 ("map_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481281 ("map_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481282 ("map_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481283 ("map_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481284 ("map_Streptococcus_criceti_GCF_000187975") has been submitted
Your job 54

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [176]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481361 ("merge_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481362 ("merge_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481363 ("merge_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481364 ("merge_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481365 ("merge_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481366 ("merge_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481367 ("merge_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481368 ("merge_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481369 ("merge_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481370 ("merge_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481371 ("merge_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481372 ("merge_Streptococcus_criceti_GCF_000187975") has be

In [177]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 205 sequences from Streptococcus_acidominimus_GCF_900187245_merged.bed.  Filtered 117 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 88.
Screened 203 sequences from Streptococcus_agalactiae_GCF_000007265_merged.bed.  Filtered 114 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 89.
Screened 211 sequences from Streptococcus_alactolyticus_GCF_009695625_merged.bed.  Filtered 113 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 98.
Screened 275 sequences from Streptococcus_anginosus_GCF_000463505_merged.bed.  Filtered 158 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 117.
Screened 578 sequences from Streptococcus_australis_GCF_900476055_merged.bed.  Filtered 298 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 280.
Screened 162 sequences from Streptococcus_azizii_GCF_001984715_merged.bed.  Filtered 88 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 74.
Screened 174 sequences from Streptococcu

Screened 233 sequences from Streptococcus_ovis_GCF_000380125_merged.bed.  Filtered 127 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 106.
Screened 70 sequences from Streptococcus_pantholopis_GCF_001642085_merged.bed.  Filtered 39 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 31.
Screened 505 sequences from Streptococcus_parasanguinis_GCF_000164675_merged.bed.  Filtered 260 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 245.
Screened 245 sequences from Streptococcus_parasuis_GCF_004283785_merged.bed.  Filtered 138 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 107.
Screened 157 sequences from Streptococcus_parauberis_GCF_000213825_merged.bed.  Filtered 88 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 69.
Screened 245 sequences from Streptococcus_pasteurianus_GCF_900478025_merged.bed.  Filtered 130 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 115.
Screened 186 sequences from Streptococcus

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [178]:
# Write the phyluce conf file listing every taxon's stripped bed file.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# header line first, then one "<sample>:<stripped bed path>" line per taxon
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    conf_lines.append("{}:{}{}_stripped.bed\n".format(sample, o_dir, sample))

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [179]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_merged.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --output results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_stripped.bed;
streptococcus_acidominimus_gcf_900187245.
streptococcus_agalactiae_gcf_000007265.
streptococcus_alactolyticus_gcf_009695625.
streptococcus_anginosus_gcf_000463505.
streptococcus_australis_gcf_900476055.
streptococcus_azizii_gcf_001984715.
streptococcus_bovimastitidis_gcf_001885095.
streptococcus_caballi_gcf_000379985.
streptococcus_canis_gcf_900636575.
streptococcus_castoreus_gcf_000425025.
streptococcus_constellatus_gcf_000463425.
streptococcus_criceti_gcf_000187975.
streptococcus_cristatus_gcf_000385925.
streptococcus_cuniculi_gcf_001921845.
streptococcus_danieliae_gcf_009767945.
streptococcus_d

Quantify probes and the number of targeted taxa for each.

In [180]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_merged.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --output results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_stripped.bed;
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 0 taxa:	2,656.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 1 taxa:	2,656.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 2 taxa:	2,359.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 3 taxa:	1,812.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 4 taxa:	1,506.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 5 taxa:	908.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 6 taxa:	723.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 7 taxa:	583.0
Loci shared by Streptococcus_pneum

In [181]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 41
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/streptococcus/initial_intervals/streptococcus-to-Streptococcus_pneumoniae_GCF_000007045.sqlite --base-taxon Streptococcus_pneumoniae_GCF_000007045 --output results/phyluce/streptococcus/initial_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.bed --specific-counts 41;
Counter({'streptococcus_oralis_gcf_900637025': 100, 'streptococcus_pseudopneumoniae_gcf_000221985': 100, 'streptococcus_mitis_gcf_000027165': 100, 'streptococcus_timonensis_gcf_900095845': 99, 'streptococcus_halitosis_gcf_003143695': 99, 'streptococcus_parasanguinis_gcf_000164675': 99, 'streptococcus_infantis_gcf_000187465': 98, 'streptococcus_gordonii_gcf_000017005': 98, 'streptococcus_viridans_gcf_900636365': 97, 'streptococcus_cristatus_gcf_000385925': 96, 'streptococcus_salivarius_gcf_000785515': 95, 'streptococcus_peroris_gcf_000187585': 95, 'streptococcus_gallolyticus_gcf_002000985': 94, 'streptococcus_infantarius_gcf_000246835': 94, 'streptococcus_sangu

## Design temp set of baits

In [182]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --buffer-to 160 --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.fasta;
Screened 100 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 100.


Design the baits.

In [183]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.fasta --probe-prefix uce_streptococcus_ --design streptococcus_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):



Conserved locus count = 100
Probe Count = 200


## Find duplicate baited regions

In [184]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --query results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  15:38:46
Ended:  Thu Feb 06, 2020  15:38:46
Time for execution:  0.00281981627146 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas                        --lastz results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes_vself.lastz                       --probe-prefix=uce_streptococcus_;
Parsing lastz file...
Screening results...
Screened 199 fasta sequen

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [185]:
# Give every genome its own sub-directory containing "<sample>.2bit".
# Note that the 2bit files are copied into place, not soft-linked.
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the per-sample directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group.

In [186]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --scaffoldlist Streptococcus_acidominimus_GCF_900187245 Streptococcus_agalactiae_GCF_000007265 Streptococcus_alactolyticus_GCF_009695625 Streptococcus_anginosus_GCF_000463505 Streptococcus_australis_GCF_900476055 Streptococcus_azizii_GCF_001984715 Streptococcus_bovimastitidis_GCF_001885095 Streptococcus_caballi_GCF_000379985 Streptococcus_canis_GCF_900636575 Streptococcus_castoreus_GCF_000425025 Streptococcus_constellatus_GCF_000463425 Streptococcus_criceti_GCF_000187975 Streptococcus_cristatus_GCF_000385925 Streptococcus_cuniculi_GCF_001921845 Streptococcus_danieliae_GCF_009767945 Streptococcus_devriesei_GCF_000423725 Streptococcus_didelphis_GCF_000380005 Streptococcus_downei_GCF_900459175 Streptococcus_dysgalactiae_GCF_000317855 Streptococcus_entericus_GCF_000380025 Streptococcus_equi_GCF_000026605 Streptococcus_equinus_GCF_0

Creating Streptococcus_caballi_GCF_000379985 table
Inserting data to Streptococcus_caballi_GCF_000379985 table

Running against Streptococcus_canis_GCF_900636575.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpz6O8Vl.fasta

Writing the results file...
	/tmp/tmppVZ5hF.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_canis_GCF_900636575.lastz
Creating Streptococcus_canis_GCF_900636575 table
Inserting data to Streptococcus_canis_GCF_900636575 table

Running against Streptococcus_castoreus_GCF_000425025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpZxyIro.fasta

Writing the results file...
	/tmp/tmp8ERmAr.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/

Inserting data to Streptococcus_equinus_GCF_000964315 table

Running against Streptococcus_ferus_GCF_000372425.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp9s3guk.fasta

Writing the results file...
	/tmp/tmprpK1wS.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_ferus_GCF_000372425.lastz
Creating Streptococcus_ferus_GCF_000372425 table
Inserting data to Streptococcus_ferus_GCF_000372425 table

Running against Streptococcus_gallolyticus_GCF_002000985.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpN6PSFq.fasta

Writing the results file...
	/tmp/tmp_tCpfa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals

Inserting data to Streptococcus_infantis_GCF_000187465 table

Running against Streptococcus_iniae_GCF_000831485.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpJLsAGi.fasta

Writing the results file...
	/tmp/tmpJb2aaC.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_iniae_GCF_000831485.lastz
Creating Streptococcus_iniae_GCF_000831485 table
Inserting data to Streptococcus_iniae_GCF_000831485 table

Running against Streptococcus_intermedius_GCF_000463355.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp81yovK.fasta

Writing the results file...
	/tmp/tmpzE3xC3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals


Running against Streptococcus_orisasini_GCF_001431045.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpCxQQss.fasta

Writing the results file...
	/tmp/tmpxcLn4K.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_orisasini_GCF_001431045.lastz
Creating Streptococcus_orisasini_GCF_001431045 table
Inserting data to Streptococcus_orisasini_GCF_001431045 table

Running against Streptococcus_orisratti_GCF_000380105.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp09d6mv.fasta

Writing the results file...
	/tmp/tmpAcPUrC.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+


Running against Streptococcus_porci_GCF_000423765.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpZPMOyW.fasta

Writing the results file...
	/tmp/tmp6Getaa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_porci_GCF_000423765.lastz
Creating Streptococcus_porci_GCF_000423765 table
Inserting data to Streptococcus_porci_GCF_000423765 table

Running against Streptococcus_porcinus_GCF_900475415.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpwRfBqW.fasta

Writing the results file...
	/tmp/tmp2bBfpT.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fa


Running against Streptococcus_suis_GCF_000026745.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvukvvc.fasta

Writing the results file...
	/tmp/tmpbRqZo3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_suis_GCF_000026745.lastz
Creating Streptococcus_suis_GCF_000026745 table
Inserting data to Streptococcus_suis_GCF_000026745 table

Running against Streptococcus_thermophilus_GCF_000253395.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpakGhf3.fasta

Writing the results file...
	/tmp/tmpmRYu_r.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fa

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [187]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/streptococcus/extract_probes_from_group/streptococcus_genome.conf --lastz results/phyluce/streptococcus/validate_intervals/lastz --probes 120 --probe-prefix uce_streptococcus_ --name-pattern "Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/streptococcus/extract_probes_from_group/probe_fasta;
2020-02-06 15:40:38,174 - Phyluce - INFO - --- Working on Streptococcus_acidominimus_GCF_900187245 genome --
2020-02-06 15:40:38,174 - Phyluce - INFO - Reading Streptococcus_acidominimus_GCF_900187245 genome
2020-02-06 15:40:39,054 - Phyluce - INFO - Streptococcus_acidominimus_GCF_900187245: 100 uces, 0 dupes, 100 non-dupes, 18 orient drop, 1 length drop, 81 written
2020-02-06 15:40:39,054 - Phyluce - INFO - ---- Working on Streptococcus_agalactiae_GCF_000007265 genome ---
2020-02-06 15:40:39,055 - Phyluce - INFO - Reading Streptococcus_agalactiae_GCF_000007265 genome
2020-02-06 15

2020-02-06 15:40:48,061 - Phyluce - INFO - Streptococcus_equinus_GCF_000964315: 100 uces, 0 dupes, 100 non-dupes, 0 orient drop, 18 length drop, 82 written
2020-02-06 15:40:48,061 - Phyluce - INFO - ------ Working on Streptococcus_ferus_GCF_000372425 genome ------
2020-02-06 15:40:48,061 - Phyluce - INFO - Reading Streptococcus_ferus_GCF_000372425 genome
2020-02-06 15:40:48,393 - Phyluce - INFO - Streptococcus_ferus_GCF_000372425: 99 uces, 3 dupes, 96 non-dupes, 0 orient drop, 0 length drop, 96 written
2020-02-06 15:40:48,394 - Phyluce - INFO - --- Working on Streptococcus_gallolyticus_GCF_002000985 genome --
2020-02-06 15:40:48,394 - Phyluce - INFO - Reading Streptococcus_gallolyticus_GCF_002000985 genome
2020-02-06 15:40:48,942 - Phyluce - INFO - Streptococcus_gallolyticus_GCF_002000985: 100 uces, 0 dupes, 100 non-dupes, 0 orient drop, 18 length drop, 82 written
2020-02-06 15:40:48,943 - Phyluce - INFO - ----- Working on Streptococcus_gordonii_GCF_000017005 genome ----
2020-02-06 15:

2020-02-06 15:40:58,080 - Phyluce - INFO - Streptococcus_merionis_GCF_000380085: 99 uces, 5 dupes, 94 non-dupes, 0 orient drop, 0 length drop, 94 written
2020-02-06 15:40:58,080 - Phyluce - INFO - ----- Working on Streptococcus_milleri_GCF_900636715 genome -----
2020-02-06 15:40:58,081 - Phyluce - INFO - Reading Streptococcus_milleri_GCF_900636715 genome
2020-02-06 15:40:58,537 - Phyluce - INFO - Streptococcus_milleri_GCF_900636715: 100 uces, 0 dupes, 100 non-dupes, 17 orient drop, 1 length drop, 82 written
2020-02-06 15:40:58,537 - Phyluce - INFO - ------ Working on Streptococcus_minor_GCF_000377005 genome ------
2020-02-06 15:40:58,538 - Phyluce - INFO - Reading Streptococcus_minor_GCF_000377005 genome
2020-02-06 15:40:58,868 - Phyluce - INFO - Streptococcus_minor_GCF_000377005: 99 uces, 5 dupes, 94 non-dupes, 0 orient drop, 0 length drop, 94 written
2020-02-06 15:40:58,868 - Phyluce - INFO - ------ Working on Streptococcus_mitis_GCF_000027165 genome ------
2020-02-06 15:40:58,868 - 

2020-02-06 15:41:07,723 - Phyluce - INFO - Streptococcus_pseudoporcinus_GCF_000188035: 100 uces, 0 dupes, 100 non-dupes, 12 orient drop, 5 length drop, 83 written
2020-02-06 15:41:07,723 - Phyluce - INFO - ----- Working on Streptococcus_pyogenes_GCF_000006785 genome ----
2020-02-06 15:41:07,724 - Phyluce - INFO - Reading Streptococcus_pyogenes_GCF_000006785 genome
2020-02-06 15:41:08,250 - Phyluce - INFO - Streptococcus_pyogenes_GCF_000006785: 100 uces, 0 dupes, 100 non-dupes, 14 orient drop, 4 length drop, 82 written
2020-02-06 15:41:08,250 - Phyluce - INFO - ------ Working on Streptococcus_ratti_GCF_000286075 genome ------
2020-02-06 15:41:08,251 - Phyluce - INFO - Reading Streptococcus_ratti_GCF_000286075 genome
2020-02-06 15:41:08,565 - Phyluce - INFO - Streptococcus_ratti_GCF_000286075: 95 uces, 0 dupes, 95 non-dupes, 0 orient drop, 0 length drop, 95 written
2020-02-06 15:41:08,565 - Phyluce - INFO - ---- Working on Streptococcus_respraculi_GCF_003595525 genome ---
2020-02-06 15:4

In [188]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/streptococcus/extract_probes_from_group/probe_fasta --output results/phyluce/streptococcus/extract_probes_from_group/multifastas.sqlite --base-taxon Streptococcus_pneumoniae_GCF_000007045;
streptococcus_acidominimus_gcf_900187245.
streptococcus_agalactiae_gcf_000007265.
streptococcus_alactolyticus_gcf_009695625.
streptococcus_anginosus_gcf_000463505.
streptococcus_australis_gcf_900476055.
streptococcus_azizii_gcf_001984715.
streptococcus_bovimastitidis_gcf_001885095.
streptococcus_caballi_gcf_000379985.
streptococcus_canis_gcf_900636575.
streptococcus_castoreus_gcf_000425025.
streptococcus_constellatus_gcf_000463425.
streptococcus_criceti_gcf_000187975.
streptococcus_cristatus_gcf_000385925.
streptococcus_cuniculi_gcf_001921845.
streptococcus_danieliae_gcf_009767945.
streptococcus_devriesei_gcf_000423725.
streptococcus_didelphis_gcf_000380005.
streptococcus_downei_gcf_900459175.
streptococcus_dysgalactiae_gcf_000317855.
strep

In [189]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(77)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/streptococcus/extract_probes_from_group/multifastas.sqlite  --base-taxon Streptococcus_pneumoniae_GCF_000007045 --output results/phyluce/streptococcus/extract_probes_from_group/Streptococcus_pneumoniae_GCF_000007045+41-back-to-77.conf --specific-counts 77;
Counter({'streptococcus_varani_gcf_001375655': 81, 'streptococcus_australis_gcf_900476055': 81, 'streptococcus_anginosus_gcf_000463505': 81, 'streptococcus_timonensis_gcf_900095845': 81, 'streptococcus_gallolyticus_gcf_002000985': 81, 'streptococcus_mutans_gcf_000007465': 81, 'streptococcus_intermedius_gcf_000463355': 81, 'streptococcus_infantarius_gcf_000246835': 81, 'streptococcus_equi_gcf_000026605': 81, 'streptococcus_respraculi_gcf_003595525': 81, 'streptococcus_pluranimalium_gcf_003352995': 81, 'streptococcus_uberis_gcf_000009545': 81, 'streptococcus_danieliae_gcf_009767945': 81, 'streptococcus_constellatus_gcf_000463425': 81, 'streptococcus_parauberis_gcf_000213825': 8

## Final group specific bait design

In [190]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/streptococcus/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/streptococcus/extract_probes_from_group/Streptococcus_pneumoniae_GCF_000007045+41-back-to-77.conf --probe-prefix uce_streptococcus_ --designer rnplattii --design streptococcus_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 81
Probe Count = 14298


In [191]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --query results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  15:42:14
Ended:  Thu Feb 06, 2020  15:44:20
Time for execution:  2.11134323279 minutes


In [192]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --lastz results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_streptococcus_;
Parsing lastz file...
Screening results...
Screened 14297 fasta sequences.  Filtered 0 duplicates. Kept 14298.


## CDhit to reduce numbers

In [193]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta
         -o
         results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 16:37:39 2020
                            Output                              
----------------------------------------------------------------
total seq: 14298
longest and shortest : 80 and 80
Total letters: 1143840
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 48M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3940576
Max number of word counting entries: 88702384

# comparing sequences from          0  to       2383
..---------- new table with     1206 representatives
# comparing sequenc

# Yersinia

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so: 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [112]:
# Group label; the cells below use it to build the results/phyluce/<group> and data/genomes/<group> paths.
group = "yersinia"

In [113]:
# Throw away any earlier results for this group, then recreate an empty directory.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [118]:
# Non-reference taxa, each named Genus_species_<assembly accession without version>.
group_taxa = [
    "Yersinia_aldovae_GCF_000834395",
    "Yersinia_aleksiciae_GCF_001047675",
    "Yersinia_bercovieri_GCF_000167975",
    "Yersinia_canariae_GCF_009831415",
    "Yersinia_enterocolitica_GCF_000009345",
    "Yersinia_entomophaga_GCF_001656035",
    "Yersinia_frederiksenii_GCF_002591095",
    "Yersinia_hibernica_GCF_004124235",
    "Yersinia_intermedia_GCF_900635455",
    "Yersinia_kristensenii_GCF_000834865",
    "Yersinia_massiliensis_GCF_003048255",
    "Yersinia_mollaretii_GCF_000167995",
    "Yersinia_nurmii_GCF_001112925",
    "Yersinia_pekkanenii_GCF_001244635",
    "Yersinia_pseudotuberculosis_GCF_900637475",
    "Yersinia_rohdei_GCF_000834455",
    "Yersinia_ruckeri_GCF_000964565",
    "Yersinia_similis_GCF_000582515",
    "Yersinia_sp_GCF_002073315",
    "Yersinia_wautersii_GCF_000493415",
]

# Genome that the other taxa are compared against.
reference_taxon = "Yersinia_pestis_GCF_000009065"

# Every taxon in the group, with the reference appended last.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [115]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000834395.1_ASM83439v1_genomic.fna.gz

sent 42 bytes  received 1331888 bytes  887953.33 bytes/sec
total size is 1331455  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

GCA_003048255.1_ASM304825v1_genomic.fna.gz

sent 42 bytes  received 1543284 bytes  1028884.00 bytes/sec
total size is 1542794  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000167995.1_ASM16799v1_genomic.fna.gz

sent 42 bytes  received 1367469 bytes  911674.00 bytes/sec
total size is 1367028  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [119]:
# Recreate the cleaned_genomes directory from scratch.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite every record with its name and description blanked, so the FASTA header is
    # reduced to the record id. Plain "r" replaces the deprecated "rU" mode.
    with open(i_genome_file, "r") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    # Convert the *cleaned* FASTA (not the raw download) to 2bit, as the section text
    # describes. check_call raises CalledProcessError on a non-zero exit, so a failed
    # conversion stops here instead of surfacing in a later pipeline step.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 10x coverage (`--fcov 10`) because, with 100bp reads, it should provide roughly 10bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [120]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480876 ("sim_Yersinia_aldovae_GCF_000834395") has been submitted
Your job 5480877 ("sim_Yersinia_aleksiciae_GCF_001047675") has been submitted
Your job 5480878 ("sim_Yersinia_bercovieri_GCF_000167975") has been submitted
Your job 5480879 ("sim_Yersinia_canariae_GCF_009831415") has been submitted
Your job 5480880 ("sim_Yersinia_enterocolitica_GCF_000009345") has been submitted
Your job 5480881 ("sim_Yersinia_entomophaga_GCF_001656035") has been submitted
Your job 5480882 ("sim_Yersinia_frederiksenii_GCF_002591095") has been submitted
Your job 5480883 ("sim_Yersinia_hibernica_GCF_004124235") has been submitted
Your job 5480884 ("sim_Yersinia_intermedia_GCF_900635455") has been submitted
Your job 5480885 ("sim_Yersinia_kristensenii_GCF_000834865") has been submitted
Your job 5480886 ("sim_Yersinia_massiliensis_GCF_003048255") has been submitted
Your job 5480887 ("sim_Yersinia_mollaretii_GCF_000167995") has been submitted
Your job 5480888 ("sim_Yersinia_nurmii_GCF_001112925") has

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [121]:
# Fresh output directory (plus a logs/ subdirectory) for the mapping step.
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#generate a bbmap index of the reference genome
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# run locally (not via qsub); the exit status is the cell's displayed value
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/yersinia/cleaned_genomes/Yersinia_pestis_GCF_000009065_formatted.fas path=results/phyluce/yersinia/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [122]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480896 ("map_Yersinia_aldovae_GCF_000834395") has been submitted
Your job 5480897 ("map_Yersinia_aleksiciae_GCF_001047675") has been submitted
Your job 5480898 ("map_Yersinia_bercovieri_GCF_000167975") has been submitted
Your job 5480899 ("map_Yersinia_canariae_GCF_009831415") has been submitted
Your job 5480900 ("map_Yersinia_enterocolitica_GCF_000009345") has been submitted
Your job 5480901 ("map_Yersinia_entomophaga_GCF_001656035") has been submitted
Your job 5480902 ("map_Yersinia_frederiksenii_GCF_002591095") has been submitted
Your job 5480903 ("map_Yersinia_hibernica_GCF_004124235") has been submitted
Your job 5480904 ("map_Yersinia_intermedia_GCF_900635455") has been submitted
Your job 5480905 ("map_Yersinia_kristensenii_GCF_000834865") has been submitted
Your job 5480906 ("map_Yersinia_massiliensis_GCF_003048255") has been submitted
Your job 5480907 ("map_Yersinia_mollaretii_GCF_000167995") has been submitted
Your job 5480908 ("map_Yersinia_nurmii_GCF_001112925") has

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [123]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5480916 ("merge_Yersinia_aldovae_GCF_000834395") has been submitted
Your job 5480917 ("merge_Yersinia_aleksiciae_GCF_001047675") has been submitted
Your job 5480918 ("merge_Yersinia_bercovieri_GCF_000167975") has been submitted
Your job 5480919 ("merge_Yersinia_canariae_GCF_009831415") has been submitted
Your job 5480920 ("merge_Yersinia_enterocolitica_GCF_000009345") has been submitted
Your job 5480921 ("merge_Yersinia_entomophaga_GCF_001656035") has been submitted
Your job 5480922 ("merge_Yersinia_frederiksenii_GCF_002591095") has been submitted
Your job 5480923 ("merge_Yersinia_hibernica_GCF_004124235") has been submitted
Your job 5480924 ("merge_Yersinia_intermedia_GCF_900635455") has been submitted
Your job 5480925 ("merge_Yersinia_kristensenii_GCF_000834865") has been submitted
Your job 5480926 ("merge_Yersinia_massiliensis_GCF_003048255") has been submitted
Your job 5480927 ("merge_Yersinia_mollaretii_GCF_000167995") has been submitted
Your job 5480928 ("merge_Yersinia_

In [124]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 3082 sequences from Yersinia_aldovae_GCF_000834395_merged.bed.  Filtered 1707 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1375.
Screened 3023 sequences from Yersinia_aleksiciae_GCF_001047675_merged.bed.  Filtered 1661 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1362.
Screened 3237 sequences from Yersinia_bercovieri_GCF_000167975_merged.bed.  Filtered 1796 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1441.
Screened 3066 sequences from Yersinia_canariae_GCF_009831415_merged.bed.  Filtered 1694 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1372.
Screened 3162 sequences from Yersinia_enterocolitica_GCF_000009345_merged.bed.  Filtered 1747 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 1415.
Screened 1445 sequences from Yersinia_entomophaga_GCF_001656035_merged.bed.  Filtered 829 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 616.
Screened 2720 sequences from Yersinia_frederikse

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored.

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [125]:
# Write the phyluce bed conf: a "[beds]" header followed by one
# "<taxon>:<stripped bed path>" line per taxon in the group.
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
o_bed_conf = o_dir + group + "_bed-files.conf"

conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [126]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/yersinia/initial_intervals/Yersinia_wautersii_GCF_000493415_merged.bed --twobit results/phyluce/yersinia/cleaned_genomes/Yersinia_pestis_GCF_000009065_formatted.2bit --output results/phyluce/yersinia/initial_intervals/Yersinia_wautersii_GCF_000493415_stripped.bed;
yersinia_aldovae_gcf_000834395..
yersinia_aleksiciae_gcf_001047675..
yersinia_bercovieri_gcf_000167975..
yersinia_canariae_gcf_009831415..
yersinia_enterocolitica_gcf_000009345..
yersinia_entomophaga_gcf_001656035.
yersinia_frederiksenii_gcf_002591095..
yersinia_hibernica_gcf_004124235..
yersinia_intermedia_gcf_900635455..
yersinia_kristensenii_gcf_000834865..
yersinia_massiliensis_gcf_003048255..
yersinia_mollaretii_gcf_000167995..
yersinia_nurmii_gcf_001112925.
yersinia_pekkanenii_gcf_001244635..
yersinia_pseudotuberculosis_gcf_900637475.
yersinia_rohdei_gcf_000834455..
yersinia_ruckeri_gcf_000964565.
yersinia_similis_gcf_0005

Quantify probes and the number of targeted taxa for each.

In [127]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/yersinia/initial_intervals/Yersinia_wautersii_GCF_000493415_merged.bed --twobit results/phyluce/yersinia/cleaned_genomes/Yersinia_pestis_GCF_000009065_formatted.2bit --output results/phyluce/yersinia/initial_intervals/Yersinia_wautersii_GCF_000493415_stripped.bed;
Loci shared by Yersinia_pestis_GCF_000009065 + 0 taxa:	4,353.0
Loci shared by Yersinia_pestis_GCF_000009065 + 1 taxa:	4,353.0
Loci shared by Yersinia_pestis_GCF_000009065 + 2 taxa:	4,127.0
Loci shared by Yersinia_pestis_GCF_000009065 + 3 taxa:	3,103.0
Loci shared by Yersinia_pestis_GCF_000009065 + 4 taxa:	2,913.0
Loci shared by Yersinia_pestis_GCF_000009065 + 5 taxa:	2,327.0
Loci shared by Yersinia_pestis_GCF_000009065 + 6 taxa:	2,009.0
Loci shared by Yersinia_pestis_GCF_000009065 + 7 taxa:	1,770.0
Loci shared by Yersinia_pestis_GCF_000009065 + 8 taxa:	1,585.0
Loci shared by Yersinia_pestis_GCF_000009065 + 9 taxa:	1,450.0
Loci s

In [128]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 20
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/yersinia/initial_intervals/yersinia-to-Yersinia_pestis_GCF_000009065.sqlite --base-taxon Yersinia_pestis_GCF_000009065 --output results/phyluce/yersinia/initial_intervals/Yersinia_pestis_GCF_000009065_+20.bed --specific-counts 20;
Counter({'yersinia_kristensenii_gcf_000834865': 293, 'yersinia_intermedia_gcf_900635455': 293, 'yersinia_pekkanenii_gcf_001244635': 293, 'yersinia_similis_gcf_000582515': 293, 'yersinia_bercovieri_gcf_000167975': 293, 'yersinia_rohdei_gcf_000834455': 293, 'yersinia_sp_gcf_002073315': 293, 'yersinia_hibernica_gcf_004124235': 293, 'yersinia_enterocolitica_gcf_000009345': 293, 'yersinia_aleksiciae_gcf_001047675': 293, 'yersinia_massiliensis_gcf_003048255': 293, 'yersinia_wautersii_gcf_000493415': 293, 'yersinia_aldovae_gcf_000834395': 293, 'yersinia_frederiksenii_gcf_002591095': 293, 'yersinia_pseudotuberculosis_gcf_900637475': 293, 'yersinia_mollaretii_gcf_000167995': 293, 'yersinia_entomophaga_gcf_0016

## Design temp set of baits

In [129]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/yersinia/initial_intervals/Yersinia_pestis_GCF_000009065_+20.bed --twobit results/phyluce/yersinia/cleaned_genomes/Yersinia_pestis_GCF_000009065_formatted.2bit --buffer-to 160 --output results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20.fasta;
Screened 293 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 293.


Design the temporary baits from the buffered reference intervals.

In [130]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20.fasta --probe-prefix uce_yersinia_ --design yersinia_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GG


Conserved locus count = 292
Probe Count = 584


## Find duplicate baited regions

In [131]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas --query results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  12:42:09
Ended:  Thu Feb 06, 2020  12:42:09
Time for execution:  0.00489437977473 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas                        --lastz results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes_vself.lastz                       --probe-prefix=uce_yersinia_;
Parsing lastz file...
Screening results...
Screened 583 fasta sequences.  Filtered 8 duplicates. Kept 568.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [132]:
# Give every taxon its own sub-directory under cleaned_genomes/ that holds
# "<taxon>.2bit", a copy of that taxon's "<taxon>_formatted.2bit".
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz with the temporary probes against each genome in the group.

In [133]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/yersinia/validate_intervals/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas --scaffoldlist Yersinia_aldovae_GCF_000834395 Yersinia_aleksiciae_GCF_001047675 Yersinia_bercovieri_GCF_000167975 Yersinia_canariae_GCF_009831415 Yersinia_enterocolitica_GCF_000009345 Yersinia_entomophaga_GCF_001656035 Yersinia_frederiksenii_GCF_002591095 Yersinia_hibernica_GCF_004124235 Yersinia_intermedia_GCF_900635455 Yersinia_kristensenii_GCF_000834865 Yersinia_massiliensis_GCF_003048255 Yersinia_mollaretii_GCF_000167995 Yersinia_nurmii_GCF_001112925 Yersinia_pekkanenii_GCF_001244635 Yersinia_pseudotuberculosis_GCF_900637475 Yersinia_rohdei_GCF_000834455 Yersinia_ruckeri_GCF_000964565 Yersinia_similis_GCF_000582515 Yersinia_sp_GCF_002073315 Yersinia_wautersii_GCF_000493415 Yersinia_pestis_GCF_000009065 --genome-base-path results/phyluce/yersinia/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/yersinia/validate_intervals/yers

Inserting data to Yersinia_nurmii_GCF_001112925 table

Running against Yersinia_pekkanenii_GCF_001244635.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpwAgoaA.fasta

Writing the results file...
	/tmp/tmpI6vDoo.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/yersinia/validate_intervals/lastz/Yersinia_pestis_GCF_000009065_+20_temp_probes.fas_v_Yersinia_pekkanenii_GCF_001244635.lastz
Creating Yersinia_pekkanenii_GCF_001244635 table
Inserting data to Yersinia_pekkanenii_GCF_001244635 table

Running against Yersinia_pseudotuberculosis_GCF_900637475.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp82dvin.fasta

Writing the results file...
	/tmp/tmpeaRd2t.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/yersinia/validate_intervals/lastz/Yersinia_pestis_G

## Extract sequences from group genomes

Make a conf file with the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [134]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/yersinia/extract_probes_from_group/yersinia_genome.conf --lastz results/phyluce/yersinia/validate_intervals/lastz --probes 120 --probe-prefix uce_yersinia_ --name-pattern "Yersinia_pestis_GCF_000009065_+20_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/yersinia/extract_probes_from_group/probe_fasta;
2020-02-06 12:43:00,633 - Phyluce - INFO - -------- Working on Yersinia_aldovae_GCF_000834395 genome -------
2020-02-06 12:43:00,634 - Phyluce - INFO - Reading Yersinia_aldovae_GCF_000834395 genome
2020-02-06 12:43:01,712 - Phyluce - INFO - Yersinia_aldovae_GCF_000834395: 292 uces, 0 dupes, 292 non-dupes, 12 orient drop, 5 length drop, 275 written
2020-02-06 12:43:01,712 - Phyluce - INFO - ------ Working on Yersinia_aleksiciae_GCF_001047675 genome ------
2020-02-06 12:43:01,713 - Phyluce - INFO - Reading Yersinia_aleksiciae_GCF_001047675 genome
2020-02-06 12:43:02,737 - Phyluce - INFO - Yersinia_aleksiciae_GCF_0010

In [135]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/yersinia/extract_probes_from_group/probe_fasta --output results/phyluce/yersinia/extract_probes_from_group/multifastas.sqlite --base-taxon Yersinia_pestis_GCF_000009065;
yersinia_aldovae_gcf_000834395.
yersinia_aleksiciae_gcf_001047675.
yersinia_bercovieri_gcf_000167975.
yersinia_canariae_gcf_009831415.
yersinia_enterocolitica_gcf_000009345.
yersinia_entomophaga_gcf_001656035.
yersinia_frederiksenii_gcf_002591095.
yersinia_hibernica_gcf_004124235.
yersinia_intermedia_gcf_900635455.
yersinia_kristensenii_gcf_000834865.
yersinia_massiliensis_gcf_003048255.
yersinia_mollaretii_gcf_000167995.
yersinia_nurmii_gcf_001112925.
yersinia_pekkanenii_gcf_001244635.
yersinia_pseudotuberculosis_gcf_900637475.
yersinia_rohdei_gcf_000834455.
yersinia_ruckeri_gcf_000964565.
yersinia_similis_gcf_000582515.
yersinia_sp_gcf_002073315.
yersinia_wautersii_gcf_000493415.
yersinia_pestis_gcf_000009065.
Creating database
Inserting results
phyluce_pro

In [136]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(21)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/yersinia/extract_probes_from_group/multifastas.sqlite  --base-taxon Yersinia_pestis_GCF_000009065 --output results/phyluce/yersinia/extract_probes_from_group/Yersinia_pestis_GCF_000009065+20-back-to-21.conf --specific-counts 21;
Counter({'yersinia_kristensenii_gcf_000834865': 253, 'yersinia_intermedia_gcf_900635455': 253, 'yersinia_pekkanenii_gcf_001244635': 253, 'yersinia_similis_gcf_000582515': 253, 'yersinia_pestis_gcf_000009065': 253, 'yersinia_bercovieri_gcf_000167975': 253, 'yersinia_rohdei_gcf_000834455': 253, 'yersinia_sp_gcf_002073315': 253, 'yersinia_hibernica_gcf_004124235': 253, 'yersinia_enterocolitica_gcf_000009345': 253, 'yersinia_aleksiciae_gcf_001047675': 253, 'yersinia_massiliensis_gcf_003048255': 253, 'yersinia_wautersii_gcf_000493415': 253, 'yersinia_aldovae_gcf_000834395': 253, 'yersinia_frederiksenii_gcf_002591095': 253, 'yersinia_pseudotuberculosis_gcf_900637475': 253, 'yersinia_mollaretii_gcf_000167995':

## Final group specific bait design

In [137]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/yersinia/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/yersinia/extract_probes_from_group/Yersinia_pestis_GCF_000009065+20-back-to-21.conf --probe-prefix uce_yersinia_ --designer rnplattii --design yersinia_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.fasta;
GGGG


Conserved locus count = 253
Probe Count = 10622


In [138]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.fasta --query results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  12:43:38
Ended:  Thu Feb 06, 2020  12:44:03
Time for execution:  0.407919581731 minutes


In [139]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.fasta --lastz results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_yersinia_;
Parsing lastz file...
Screening results...
Screened 10621 fasta sequences.  Filtered 0 duplicates. Kept 10622.


## CDhit to reduce numbers

In [140]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.fasta
         -o
         results/phyluce/yersinia/final_probe_design/yersinia_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 14:07:22 2020
                            Output                              
----------------------------------------------------------------
total seq: 10622
longest and shortest : 80 and 80
Total letters: 849760
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 2M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 89M

Table limit with the given memory limit:
Max number of representatives: 3946898
Max number of word counting entries: 88844689

# comparing sequences from          0  to       1770
.---------- new table with      633 representatives
# comparing sequences from       1770  to