## Prep python

In [1]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

# Work from the project root: every data/... and results/... path used below is relative to it.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
os.chdir("/master/nplatt/pathogen_probes/")


def wait_on_running_jobs(poll_seconds=60):
    """Block until `qstat` no longer lists any jobs.

    Polls the scheduler by running `qstat` through the shell, printing a "."
    after every wait so progress is visible in the notebook.

    Parameters
    ----------
    poll_seconds : int or float, optional
        Seconds to sleep between two `qstat` polls (default 60).

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If the `qstat` command exits with a non-zero status.
    """
    while True:
        # universal_newlines=True returns text rather than bytes, so the
        # split below works on Python 3 as well as Python 2.
        qstat_out = subprocess.check_output("qstat", shell=True, universal_newlines=True)

        # The original count subtracted 2 from the line count, presumably for
        # the two header lines qstat prints above the job table; blank lines
        # are dropped so a trailing newline is not counted as a job.
        listed_lines = [line for line in qstat_out.split("\n") if line.strip()]
        num_jobs = max(len(listed_lines) - 2, 0)

        # Return immediately once the queue is empty instead of sleeping first.
        if num_jobs == 0:
            break

        time.sleep(poll_seconds)
        print(".")

# Streptococcus

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [168]:
# Name of the group processed in this section; it is spliced into the data/ and results/ paths below.
group = "streptococcus"

In [169]:
# Start this group from an empty results folder: drop whatever an earlier run
# left behind, then create the folder again.
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [170]:
# Taxa that will be compared against the reference genome.  Each entry follows the
# Genus_species_<assembly accession> naming described above and matches the file
# names (minus extension) used for the genomes under data/genomes/<group>/.
group_taxa = [ "Streptococcus_acidominimus_GCF_900187245",
               "Streptococcus_agalactiae_GCF_000007265",
               "Streptococcus_alactolyticus_GCF_009695625",
               "Streptococcus_anginosus_GCF_000463505",
               "Streptococcus_australis_GCF_900476055",
               "Streptococcus_azizii_GCF_001984715",
               "Streptococcus_bovimastitidis_GCF_001885095",
               "Streptococcus_caballi_GCF_000379985",
               "Streptococcus_canis_GCF_900636575",
               "Streptococcus_castoreus_GCF_000425025",
               "Streptococcus_constellatus_GCF_000463425",
               "Streptococcus_criceti_GCF_000187975",
               "Streptococcus_cristatus_GCF_000385925",
               "Streptococcus_cuniculi_GCF_001921845",
               "Streptococcus_danieliae_GCF_009767945",
               "Streptococcus_devriesei_GCF_000423725",
               "Streptococcus_didelphis_GCF_000380005",
               "Streptococcus_downei_GCF_900459175",
               "Streptococcus_dysgalactiae_GCF_000317855",
               "Streptococcus_entericus_GCF_000380025",
               "Streptococcus_equi_GCF_000026605",
               "Streptococcus_equinus_GCF_000964315",
               "Streptococcus_ferus_GCF_000372425",
               "Streptococcus_gallolyticus_GCF_002000985",
               "Streptococcus_gordonii_GCF_000017005",
               "Streptococcus_halichoeri_GCF_009870755",
               "Streptococcus_halitosis_GCF_003143695",
               "Streptococcus_halotolerans_GCF_001598035",
               "Streptococcus_henryi_GCF_000376985",
               "Streptococcus_himalayensis_GCF_001708305",
               "Streptococcus_hongkongensis_GCF_000785845",
               "Streptococcus_hyointestinalis_GCF_900459405",
               "Streptococcus_hyovaginalis_GCF_000420785",
               "Streptococcus_ictaluri_GCF_000188015",
               "Streptococcus_infantarius_GCF_000246835",
               "Streptococcus_infantis_GCF_000187465",
               "Streptococcus_iniae_GCF_000831485",
               "Streptococcus_intermedius_GCF_000463355",
               "Streptococcus_lutetiensis_GCF_900475675",
               "Streptococcus_macacae_GCF_000187995",
               "Streptococcus_macedonicus_GCF_000283635",
               "Streptococcus_marimammalium_GCF_000380045",
               "Streptococcus_marmotae_GCF_001623565",
               "Streptococcus_massiliensis_GCF_000380065",
               "Streptococcus_merionis_GCF_000380085",
               "Streptococcus_milleri_GCF_900636715",
               "Streptococcus_minor_GCF_000377005",
               "Streptococcus_mitis_GCF_000027165",
               "Streptococcus_mutans_GCF_000007465",
               "Streptococcus_oralis_GCF_900637025",
               "Streptococcus_orisasini_GCF_001431045",
               "Streptococcus_orisratti_GCF_000380105",
               "Streptococcus_ovis_GCF_000380125",
               "Streptococcus_pantholopis_GCF_001642085",
               "Streptococcus_parasanguinis_GCF_000164675",
               "Streptococcus_parasuis_GCF_004283785",
               "Streptococcus_parauberis_GCF_000213825",
               "Streptococcus_pasteurianus_GCF_900478025",
               "Streptococcus_penaeicida_GCF_002887775",
               "Streptococcus_peroris_GCF_000187585",
               "Streptococcus_pharyngis_GCF_007859195",
               "Streptococcus_phocae_GCF_000772915",
               "Streptococcus_pluranimalium_GCF_003352995",
               "Streptococcus_plurextorum_GCF_000423745",              
               "Streptococcus_porci_GCF_000423765",
               "Streptococcus_porcinus_GCF_900475415",
               "Streptococcus_pseudopneumoniae_GCF_000221985",
               "Streptococcus_pseudoporcinus_GCF_000188035",
               "Streptococcus_pyogenes_GCF_000006785",
               "Streptococcus_ratti_GCF_000286075",
               "Streptococcus_respraculi_GCF_003595525",
               "Streptococcus_rubneri_GCF_004785935",
               "Streptococcus_ruminantium_GCF_003609975",
               "Streptococcus_salivarius_GCF_000785515",
               "Streptococcus_sanguinis_GCF_000014205",
               "Streptococcus_sinensis_GCF_000767835",
               "Streptococcus_sobrinus_GCF_000686605",
               "Streptococcus_sp_GCF_003086355",
               "Streptococcus_suis_GCF_000026745",
               "Streptococcus_thermophilus_GCF_000253395",
               "Streptococcus_thoraltensis_GCF_000380145",
               "Streptococcus_timonensis_GCF_900095845",
               "Streptococcus_troglodytae_GCF_002355215",
               "Streptococcus_uberis_GCF_000009545",
               "Streptococcus_urinalis_GCF_000188055",
               "Streptococcus_varani_GCF_001375655",
               "Streptococcus_vestibularis_GCF_900636445",
               "Streptococcus_viridans_GCF_900636365" ]
                    
# Genome that the simulated reads of every taxon in group_taxa are mapped against.
reference_taxon = "Streptococcus_pneumoniae_GCF_000007045"

# Reference goes last.  all_taxa drives the genome clean-up step, while the
# read simulation / mapping loops iterate over group_taxa only.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [171]:
#get all of the genomes and gffs for a single representative
if not os.path.exists("data/genomes/" + group):
    os.makedirs("data/genomes/" + group)
    
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900187245.1_51470_D02_genomic.fna.gz

sent 42 bytes  received 687125 bytes  196333.43 bytes/sec
total size is 686853  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary

GCA_000463425.1_ASM46342v1_genomic.fna.gz

sent 42 bytes  received 588864 bytes  392604.00 bytes/sec
total size is 588615  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000187975.3_ASM18797v3_genomic.fna.gz

sent 42 bytes  received 718440 bytes  478988.00 bytes/sec
total size is 718159  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000964315.1_ASM96431v1_genomic.fna.gz

sent 42 bytes  received 565160 bytes  376801.33 bytes/sec
total size is 564911  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000372425.1_ASM37242v1_genomic.fna.gz

sent 42 bytes  received 551834 bytes  367917.33 bytes/sec
total size is 551593  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000420785.1_ASM42078v1_genomic.fna.gz

sent 42 bytes  received 621677 bytes  414479.33 bytes/sec
total size is 621420  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000188015.3_ASM18801v3_genomic.fna.gz

sent 42 bytes  received 657139 bytes  438120.67 bytes/sec
total size is 656866  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000380065.1_ASM38006v1_genomic.fna.gz

sent 42 bytes  received 557536 bytes  371718.67 bytes/sec
total size is 557287  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000380085.1_ASM38008v1_genomic.fna.gz

sent 42 bytes  received 705605 bytes  470431.33 bytes/sec
total size is 705324  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw

GCA_000164675.2_ASM16467v2_genomic.fna.gz

sent 42 bytes  received 637082 bytes  424749.33 bytes/sec
total size is 636817  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_004283785.1_ASM428378v1_genomic.fna.gz

sent 42 bytes  received 562323 bytes  374910.00 bytes/sec
total size is 562073  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, net

GCA_000423765.1_ASM42376v1_genomic.fna.gz

sent 42 bytes  received 684563 bytes  456403.33 bytes/sec
total size is 684290  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900475415.1_42206_H01_genomic.fna.gz

sent 42 bytes  received 600001 bytes  400028.67 bytes/sec
total size is 599745  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netwo

GCA_000767835.1_ASM76783v1_genomic.fna.gz

sent 42 bytes  received 618228 bytes  412180.00 bytes/sec
total size is 617971  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000686605.1_ASM68660v1_genomic.fna.gz

sent 42 bytes  received 641205 bytes  427498.00 bytes/sec
total size is 640940  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, netw


GCA_900636445.1_41965_G01_genomic.fna.gz

sent 42 bytes  received 575065 bytes  383404.67 bytes/sec
total size is 574817  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900636365.1_41559_G01_genomic.fna.gz

sent 42 bytes  received 592864 bytes  395270.67 bytes/sec
total size is 592608  speedup is 1.00


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [172]:
# Rewrite every genome with minimal FASTA headers and build a matching 2bit file.
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

# always start from an empty output folder
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    # plain "r": the "U" flag is deprecated and was removed in Python 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            # blank out name/description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA (the original call used the raw
    # download) so both files carry the same sequence names.  check_call
    # raises if faToTwoBit fails instead of silently continuing.
    subprocess.check_call(["faToTwoBit", o_genome_file, o_2bit_file])

## Simulate and map reads at 10x coverage

Use `art` to simulate 10x coverage (`--fcov 10` in the command below) of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: In most cases 4-6x is acceptable. The command below uses 10x, which with 100 bp reads corresponds to roughly one read start every 10 bp on conserved orthologous regions. Since my genomes are relatively small, compute time can be traded for resolution by raising `--fcov` (for example, 25x coverage should provide about 4bp resolution).

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text.

In [173]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 10 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481185 ("sim_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481186 ("sim_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481187 ("sim_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481188 ("sim_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481189 ("sim_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481190 ("sim_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481191 ("sim_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481192 ("sim_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481193 ("sim_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481194 ("sim_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481195 ("sim_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481196 ("sim_Streptococcus_criceti_GCF_000187975") has been submitted
Your job 54

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [174]:
# Recreate the mapping folder (and its logs/ subfolder) so nothing from an
# earlier run is left in it.
map_dir = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#build the bbmap index of the reference genome inside the mapping folder
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)
# runs locally (not through qsub); the return code is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.fas path=results/phyluce/streptococcus/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90` in the command below) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [175]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481273 ("map_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481274 ("map_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481275 ("map_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481276 ("map_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481277 ("map_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481278 ("map_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481279 ("map_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481280 ("map_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481281 ("map_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481282 ("map_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481283 ("map_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481284 ("map_Streptococcus_criceti_GCF_000187975") has been submitted
Your job 54

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [176]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5481361 ("merge_Streptococcus_acidominimus_GCF_900187245") has been submitted
Your job 5481362 ("merge_Streptococcus_agalactiae_GCF_000007265") has been submitted
Your job 5481363 ("merge_Streptococcus_alactolyticus_GCF_009695625") has been submitted
Your job 5481364 ("merge_Streptococcus_anginosus_GCF_000463505") has been submitted
Your job 5481365 ("merge_Streptococcus_australis_GCF_900476055") has been submitted
Your job 5481366 ("merge_Streptococcus_azizii_GCF_001984715") has been submitted
Your job 5481367 ("merge_Streptococcus_bovimastitidis_GCF_001885095") has been submitted
Your job 5481368 ("merge_Streptococcus_caballi_GCF_000379985") has been submitted
Your job 5481369 ("merge_Streptococcus_canis_GCF_900636575") has been submitted
Your job 5481370 ("merge_Streptococcus_castoreus_GCF_000425025") has been submitted
Your job 5481371 ("merge_Streptococcus_constellatus_GCF_000463425") has been submitted
Your job 5481372 ("merge_Streptococcus_criceti_GCF_000187975") has be

In [177]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 205 sequences from Streptococcus_acidominimus_GCF_900187245_merged.bed.  Filtered 117 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 88.
Screened 203 sequences from Streptococcus_agalactiae_GCF_000007265_merged.bed.  Filtered 114 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 89.
Screened 211 sequences from Streptococcus_alactolyticus_GCF_009695625_merged.bed.  Filtered 113 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 98.
Screened 275 sequences from Streptococcus_anginosus_GCF_000463505_merged.bed.  Filtered 158 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 117.
Screened 578 sequences from Streptococcus_australis_GCF_900476055_merged.bed.  Filtered 298 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 280.
Screened 162 sequences from Streptococcus_azizii_GCF_001984715_merged.bed.  Filtered 88 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 74.
Screened 174 sequences from Streptococcu

Screened 233 sequences from Streptococcus_ovis_GCF_000380125_merged.bed.  Filtered 127 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 106.
Screened 70 sequences from Streptococcus_pantholopis_GCF_001642085_merged.bed.  Filtered 39 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 31.
Screened 505 sequences from Streptococcus_parasanguinis_GCF_000164675_merged.bed.  Filtered 260 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 245.
Screened 245 sequences from Streptococcus_parasuis_GCF_004283785_merged.bed.  Filtered 138 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 107.
Screened 157 sequences from Streptococcus_parauberis_GCF_000213825_merged.bed.  Filtered 88 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 69.
Screened 245 sequences from Streptococcus_pasteurianus_GCF_900478025_merged.bed.  Filtered 130 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 115.
Screened 186 sequences from Streptococcus

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that lists where the stripped bed files (from above) are stored for each taxon. It should look like this:

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [178]:
# Write the conf file that points phyluce at each taxon's stripped bed file.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

# a "[beds]" header line followed by one "taxon:path" line per sample
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [179]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_merged.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --output results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_stripped.bed;
streptococcus_acidominimus_gcf_900187245.
streptococcus_agalactiae_gcf_000007265.
streptococcus_alactolyticus_gcf_009695625.
streptococcus_anginosus_gcf_000463505.
streptococcus_australis_gcf_900476055.
streptococcus_azizii_gcf_001984715.
streptococcus_bovimastitidis_gcf_001885095.
streptococcus_caballi_gcf_000379985.
streptococcus_canis_gcf_900636575.
streptococcus_castoreus_gcf_000425025.
streptococcus_constellatus_gcf_000463425.
streptococcus_criceti_gcf_000187975.
streptococcus_cristatus_gcf_000385925.
streptococcus_cuniculi_gcf_001921845.
streptococcus_danieliae_gcf_009767945.
streptococcus_d

Quantify probes and the number of targeted taxa for each.

In [180]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_cmd)
!{phyluce_query_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_merged.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --output results/phyluce/streptococcus/initial_intervals/Streptococcus_viridans_GCF_900636365_stripped.bed;
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 0 taxa:	2,656.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 1 taxa:	2,656.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 2 taxa:	2,359.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 3 taxa:	1,812.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 4 taxa:	1,506.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 5 taxa:	908.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 6 taxa:	723.0
Loci shared by Streptococcus_pneumoniae_GCF_000007045 + 7 taxa:	583.0
Loci shared by Streptococcus_pneum

In [181]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 41
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/streptococcus/initial_intervals/streptococcus-to-Streptococcus_pneumoniae_GCF_000007045.sqlite --base-taxon Streptococcus_pneumoniae_GCF_000007045 --output results/phyluce/streptococcus/initial_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.bed --specific-counts 41;
Counter({'streptococcus_oralis_gcf_900637025': 100, 'streptococcus_pseudopneumoniae_gcf_000221985': 100, 'streptococcus_mitis_gcf_000027165': 100, 'streptococcus_timonensis_gcf_900095845': 99, 'streptococcus_halitosis_gcf_003143695': 99, 'streptococcus_parasanguinis_gcf_000164675': 99, 'streptococcus_infantis_gcf_000187465': 98, 'streptococcus_gordonii_gcf_000017005': 98, 'streptococcus_viridans_gcf_900636365': 97, 'streptococcus_cristatus_gcf_000385925': 96, 'streptococcus_salivarius_gcf_000785515': 95, 'streptococcus_peroris_gcf_000187585': 95, 'streptococcus_gallolyticus_gcf_002000985': 94, 'streptococcus_infantarius_gcf_000246835': 94, 'streptococcus_sangu

## Design temp set of baits

In [182]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/streptococcus/initial_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.bed --twobit results/phyluce/streptococcus/cleaned_genomes/Streptococcus_pneumoniae_GCF_000007045_formatted.2bit --buffer-to 160 --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.fasta;
Screened 100 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 100.


Design the temporary baits from the extracted reference sequences.

In [183]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41.fasta --probe-prefix uce_streptococcus_ --design streptococcus_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):



Conserved locus count = 100
Probe Count = 200


## Find duplicate baited regions

In [184]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --query results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes_vself.lastz;
Started:  Thu Feb 06, 2020  15:38:46
Ended:  Thu Feb 06, 2020  15:38:46
Time for execution:  0.00281981627146 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas                        --lastz results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes_vself.lastz                       --probe-prefix=uce_streptococcus_;
Parsing lastz file...
Screening results...
Screened 199 fasta sequen

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [185]:
# Give every taxon its own sub-directory, <taxon>/<taxon>.2bit, inside the
# cleaned_genomes directory. The 2bit files are copied (not soft-linked).
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # rebuild the per-taxon directory from scratch on every run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to align the temporary probes against each genome in the group with lastz.

In [186]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"
#triCas1+5+menMol1.sqlite

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/streptococcus/validate_intervals/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas --scaffoldlist Streptococcus_acidominimus_GCF_900187245 Streptococcus_agalactiae_GCF_000007265 Streptococcus_alactolyticus_GCF_009695625 Streptococcus_anginosus_GCF_000463505 Streptococcus_australis_GCF_900476055 Streptococcus_azizii_GCF_001984715 Streptococcus_bovimastitidis_GCF_001885095 Streptococcus_caballi_GCF_000379985 Streptococcus_canis_GCF_900636575 Streptococcus_castoreus_GCF_000425025 Streptococcus_constellatus_GCF_000463425 Streptococcus_criceti_GCF_000187975 Streptococcus_cristatus_GCF_000385925 Streptococcus_cuniculi_GCF_001921845 Streptococcus_danieliae_GCF_009767945 Streptococcus_devriesei_GCF_000423725 Streptococcus_didelphis_GCF_000380005 Streptococcus_downei_GCF_900459175 Streptococcus_dysgalactiae_GCF_000317855 Streptococcus_entericus_GCF_000380025 Streptococcus_equi_GCF_000026605 Streptococcus_equinus_GCF_0

Creating Streptococcus_caballi_GCF_000379985 table
Inserting data to Streptococcus_caballi_GCF_000379985 table

Running against Streptococcus_canis_GCF_900636575.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpz6O8Vl.fasta

Writing the results file...
	/tmp/tmppVZ5hF.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_canis_GCF_900636575.lastz
Creating Streptococcus_canis_GCF_900636575 table
Inserting data to Streptococcus_canis_GCF_900636575 table

Running against Streptococcus_castoreus_GCF_000425025.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpZxyIro.fasta

Writing the results file...
	/tmp/tmp8ERmAr.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/

Inserting data to Streptococcus_equinus_GCF_000964315 table

Running against Streptococcus_ferus_GCF_000372425.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp9s3guk.fasta

Writing the results file...
	/tmp/tmprpK1wS.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_ferus_GCF_000372425.lastz
Creating Streptococcus_ferus_GCF_000372425 table
Inserting data to Streptococcus_ferus_GCF_000372425 table

Running against Streptococcus_gallolyticus_GCF_002000985.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpN6PSFq.fasta

Writing the results file...
	/tmp/tmp_tCpfa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals

Inserting data to Streptococcus_infantis_GCF_000187465 table

Running against Streptococcus_iniae_GCF_000831485.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpJLsAGi.fasta

Writing the results file...
	/tmp/tmpJb2aaC.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_iniae_GCF_000831485.lastz
Creating Streptococcus_iniae_GCF_000831485 table
Inserting data to Streptococcus_iniae_GCF_000831485 table

Running against Streptococcus_intermedius_GCF_000463355.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp81yovK.fasta

Writing the results file...
	/tmp/tmpzE3xC3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals


Running against Streptococcus_orisasini_GCF_001431045.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpCxQQss.fasta

Writing the results file...
	/tmp/tmpxcLn4K.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_orisasini_GCF_001431045.lastz
Creating Streptococcus_orisasini_GCF_001431045 table
Inserting data to Streptococcus_orisasini_GCF_001431045 table

Running against Streptococcus_orisratti_GCF_000380105.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmp09d6mv.fasta

Writing the results file...
	/tmp/tmpAcPUrC.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+


Running against Streptococcus_porci_GCF_000423765.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpZPMOyW.fasta

Writing the results file...
	/tmp/tmp6Getaa.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_porci_GCF_000423765.lastz
Creating Streptococcus_porci_GCF_000423765 table
Inserting data to Streptococcus_porci_GCF_000423765 table

Running against Streptococcus_porcinus_GCF_900475415.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpwRfBqW.fasta

Writing the results file...
	/tmp/tmp2bBfpT.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fa


Running against Streptococcus_suis_GCF_000026745.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpvukvvc.fasta

Writing the results file...
	/tmp/tmpbRqZo3.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_Streptococcus_suis_GCF_000026745.lastz
Creating Streptococcus_suis_GCF_000026745 table
Inserting data to Streptococcus_suis_GCF_000026745 table

Running against Streptococcus_thermophilus_GCF_000253395.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 1 queries...
	/tmp/tmpakGhf3.fasta

Writing the results file...
	/tmp/tmpmRYu_r.lastz
Cleaning up the chunked files...
Cleaning /master/nplatt/pathogen_probes/results/phyluce/streptococcus/validate_intervals/lastz/Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fa

## Extract sequences from group genomes

Make a conf file that maps each taxon to its 2bit genome, in the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [187]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/streptococcus/extract_probes_from_group/streptococcus_genome.conf --lastz results/phyluce/streptococcus/validate_intervals/lastz --probes 120 --probe-prefix uce_streptococcus_ --name-pattern "Streptococcus_pneumoniae_GCF_000007045_+41_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/streptococcus/extract_probes_from_group/probe_fasta;
2020-02-06 15:40:38,174 - Phyluce - INFO - --- Working on Streptococcus_acidominimus_GCF_900187245 genome --
2020-02-06 15:40:38,174 - Phyluce - INFO - Reading Streptococcus_acidominimus_GCF_900187245 genome
2020-02-06 15:40:39,054 - Phyluce - INFO - Streptococcus_acidominimus_GCF_900187245: 100 uces, 0 dupes, 100 non-dupes, 18 orient drop, 1 length drop, 81 written
2020-02-06 15:40:39,054 - Phyluce - INFO - ---- Working on Streptococcus_agalactiae_GCF_000007265 genome ---
2020-02-06 15:40:39,055 - Phyluce - INFO - Reading Streptococcus_agalactiae_GCF_000007265 genome
2020-02-06 15

2020-02-06 15:40:48,061 - Phyluce - INFO - Streptococcus_equinus_GCF_000964315: 100 uces, 0 dupes, 100 non-dupes, 0 orient drop, 18 length drop, 82 written
2020-02-06 15:40:48,061 - Phyluce - INFO - ------ Working on Streptococcus_ferus_GCF_000372425 genome ------
2020-02-06 15:40:48,061 - Phyluce - INFO - Reading Streptococcus_ferus_GCF_000372425 genome
2020-02-06 15:40:48,393 - Phyluce - INFO - Streptococcus_ferus_GCF_000372425: 99 uces, 3 dupes, 96 non-dupes, 0 orient drop, 0 length drop, 96 written
2020-02-06 15:40:48,394 - Phyluce - INFO - --- Working on Streptococcus_gallolyticus_GCF_002000985 genome --
2020-02-06 15:40:48,394 - Phyluce - INFO - Reading Streptococcus_gallolyticus_GCF_002000985 genome
2020-02-06 15:40:48,942 - Phyluce - INFO - Streptococcus_gallolyticus_GCF_002000985: 100 uces, 0 dupes, 100 non-dupes, 0 orient drop, 18 length drop, 82 written
2020-02-06 15:40:48,943 - Phyluce - INFO - ----- Working on Streptococcus_gordonii_GCF_000017005 genome ----
2020-02-06 15:

2020-02-06 15:40:58,080 - Phyluce - INFO - Streptococcus_merionis_GCF_000380085: 99 uces, 5 dupes, 94 non-dupes, 0 orient drop, 0 length drop, 94 written
2020-02-06 15:40:58,080 - Phyluce - INFO - ----- Working on Streptococcus_milleri_GCF_900636715 genome -----
2020-02-06 15:40:58,081 - Phyluce - INFO - Reading Streptococcus_milleri_GCF_900636715 genome
2020-02-06 15:40:58,537 - Phyluce - INFO - Streptococcus_milleri_GCF_900636715: 100 uces, 0 dupes, 100 non-dupes, 17 orient drop, 1 length drop, 82 written
2020-02-06 15:40:58,537 - Phyluce - INFO - ------ Working on Streptococcus_minor_GCF_000377005 genome ------
2020-02-06 15:40:58,538 - Phyluce - INFO - Reading Streptococcus_minor_GCF_000377005 genome
2020-02-06 15:40:58,868 - Phyluce - INFO - Streptococcus_minor_GCF_000377005: 99 uces, 5 dupes, 94 non-dupes, 0 orient drop, 0 length drop, 94 written
2020-02-06 15:40:58,868 - Phyluce - INFO - ------ Working on Streptococcus_mitis_GCF_000027165 genome ------
2020-02-06 15:40:58,868 - 

2020-02-06 15:41:07,723 - Phyluce - INFO - Streptococcus_pseudoporcinus_GCF_000188035: 100 uces, 0 dupes, 100 non-dupes, 12 orient drop, 5 length drop, 83 written
2020-02-06 15:41:07,723 - Phyluce - INFO - ----- Working on Streptococcus_pyogenes_GCF_000006785 genome ----
2020-02-06 15:41:07,724 - Phyluce - INFO - Reading Streptococcus_pyogenes_GCF_000006785 genome
2020-02-06 15:41:08,250 - Phyluce - INFO - Streptococcus_pyogenes_GCF_000006785: 100 uces, 0 dupes, 100 non-dupes, 14 orient drop, 4 length drop, 82 written
2020-02-06 15:41:08,250 - Phyluce - INFO - ------ Working on Streptococcus_ratti_GCF_000286075 genome ------
2020-02-06 15:41:08,251 - Phyluce - INFO - Reading Streptococcus_ratti_GCF_000286075 genome
2020-02-06 15:41:08,565 - Phyluce - INFO - Streptococcus_ratti_GCF_000286075: 95 uces, 0 dupes, 95 non-dupes, 0 orient drop, 0 length drop, 95 written
2020-02-06 15:41:08,565 - Phyluce - INFO - ---- Working on Streptococcus_respraculi_GCF_003595525 genome ---
2020-02-06 15:4

In [188]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/streptococcus/extract_probes_from_group/probe_fasta --output results/phyluce/streptococcus/extract_probes_from_group/multifastas.sqlite --base-taxon Streptococcus_pneumoniae_GCF_000007045;
streptococcus_acidominimus_gcf_900187245.
streptococcus_agalactiae_gcf_000007265.
streptococcus_alactolyticus_gcf_009695625.
streptococcus_anginosus_gcf_000463505.
streptococcus_australis_gcf_900476055.
streptococcus_azizii_gcf_001984715.
streptococcus_bovimastitidis_gcf_001885095.
streptococcus_caballi_gcf_000379985.
streptococcus_canis_gcf_900636575.
streptococcus_castoreus_gcf_000425025.
streptococcus_constellatus_gcf_000463425.
streptococcus_criceti_gcf_000187975.
streptococcus_cristatus_gcf_000385925.
streptococcus_cuniculi_gcf_001921845.
streptococcus_danieliae_gcf_009767945.
streptococcus_devriesei_gcf_000423725.
streptococcus_didelphis_gcf_000380005.
streptococcus_downei_gcf_900459175.
streptococcus_dysgalactiae_gcf_000317855.
strep

In [189]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(77)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/streptococcus/extract_probes_from_group/multifastas.sqlite  --base-taxon Streptococcus_pneumoniae_GCF_000007045 --output results/phyluce/streptococcus/extract_probes_from_group/Streptococcus_pneumoniae_GCF_000007045+41-back-to-77.conf --specific-counts 77;
Counter({'streptococcus_varani_gcf_001375655': 81, 'streptococcus_australis_gcf_900476055': 81, 'streptococcus_anginosus_gcf_000463505': 81, 'streptococcus_timonensis_gcf_900095845': 81, 'streptococcus_gallolyticus_gcf_002000985': 81, 'streptococcus_mutans_gcf_000007465': 81, 'streptococcus_intermedius_gcf_000463355': 81, 'streptococcus_infantarius_gcf_000246835': 81, 'streptococcus_equi_gcf_000026605': 81, 'streptococcus_respraculi_gcf_003595525': 81, 'streptococcus_pluranimalium_gcf_003352995': 81, 'streptococcus_uberis_gcf_000009545': 81, 'streptococcus_danieliae_gcf_009767945': 81, 'streptococcus_constellatus_gcf_000463425': 81, 'streptococcus_parauberis_gcf_000213825': 8

## Final group-specific bait design

In [190]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/streptococcus/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/streptococcus/extract_probes_from_group/Streptococcus_pneumoniae_GCF_000007045+41-back-to-77.conf --probe-prefix uce_streptococcus_ --designer rnplattii --design streptococcus_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 81
Probe Count = 14298


In [191]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --query results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Thu Feb 06, 2020  15:42:14
Ended:  Thu Feb 06, 2020  15:44:20
Time for execution:  2.11134323279 minutes


In [192]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta --lastz results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_streptococcus_;
Parsing lastz file...
Screening results...
Screened 14297 fasta sequences.  Filtered 0 duplicates. Kept 14298.


## CD-HIT clustering to reduce the number of probes

In [193]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.fasta
         -o
         results/phyluce/streptococcus/final_probe_design/streptococcus_v1-master_probe_list.95P_cdhit

Started: Thu Feb  6 16:37:39 2020
                            Output                              
----------------------------------------------------------------
total seq: 14298
longest and shortest : 80 and 80
Total letters: 1143840
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 3M
Buffer          : 4 X 12M = 48M
Table           : 2 X 17M = 34M
Miscellaneous   : 4M
Total           : 90M

Table limit with the given memory limit:
Max number of representatives: 3940576
Max number of word counting entries: 88702384

# comparing sequences from          0  to       2383
..---------- new table with     1206 representatives
# comparing sequenc