## Prep python

In [3]:
import os
import subprocess
import pandas as pd
import shutil
from shutil import copy
import time
from collections import defaultdict
from Bio import SeqIO
import glob

# Work from the project root: every data/ and results/ path in this notebook is relative.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
os.chdir("/master/nplatt/pathogen_probes/")


def wait_on_running_jobs(poll_seconds=60):
    """Block until `qstat` lists no more jobs.

    `qstat` is polled through the shell.  While it still lists jobs a "." is
    printed and the function sleeps before asking again; it returns as soon as
    the listing is empty.

    Parameters
    ----------
    poll_seconds : int or float, optional
        Seconds to sleep between two `qstat` calls (default 60).

    Returns
    -------
    None

    Raises
    ------
    subprocess.CalledProcessError
        If the `qstat` command exits with a non-zero status.
    """
    while True:
        # universal_newlines=True makes check_output return text on Python 2
        # and 3 alike; without it Python 3 returns bytes and splitting on a
        # str newline raises TypeError.
        qstat_out = subprocess.check_output("qstat",
                                            shell=True,
                                            universal_newlines=True)

        # The first two lines are treated as header lines (same accounting as
        # before); splitlines() avoids counting the trailing newline as a job.
        num_jobs = max(0, len(qstat_out.splitlines()) - 2)

        if num_jobs == 0:
            return

        print(".")
        time.sleep(poll_seconds)

# Clade 1

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Each taxon name should include the genus, species, and assembly accession, like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be a high-quality genome.

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [32]:
# Label of the clade processed in this section; the data/ and results/ paths are built from it.
group = "nematoda-clade1"

In [3]:
# Start this group from an empty results directory: drop any earlier run, then recreate it.
group_results_dir = "results/phyluce/{}".format(group)

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [33]:
# Genomes compared against the reference, each named Genus_species_<assembly accession>.
group_taxa = [
    "Romanomermis_culicivorax_GCA_001039655",
    "Trichinella_britovi_GCA_001447585",
    "Trichinella_murrelli_GCA_002221485",
    "Trichinella_nativa_GCA_001447565",
    "Trichinella_nelsoni_GCA_001447455",
    "Trichinella_papuae_GCA_001447755",
    "Trichinella_patagoniensis_GCA_001447655",
    "Trichinella_pseudospralis_GCA_001447645",
    "Trichinella_sp_GCA_001447745",
    "Trichinella_zimbabwensis_GCA_001447665",
    "Soboliphyme_baturini_GCA_900618415",
]

# Genome that all the taxa above are compared to.
reference_taxon = "Trichinella_spiralis_GCA_000181795"

# Every genome handled in this section: the group taxa followed by the reference.
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored at `data/genomes/<group>/ftp_info.csv`.

In [5]:
#download the genome fasta for every accession listed for this group, rename each
#  file to the standard Genus_species_GCA_123456789 form, and record what was
#  fetched in data/genomes/<group>/ftp_info.csv

#start from an empty download directory for this group
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

#table of accessions; the loop below reads the "genus", "species" and "accession" columns
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #look up the GenBank ftp path for the accession (shell output is captured as a list of lines)
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    #keep the first output line and drop the scheme so it can be used in an rsync:// url
    #NOTE(review): ftp[0] raises IndexError if the lookup printed nothing -- confirm every accession resolves
    ftp=ftp[0].replace("ftp://", "")
    
    #build the path to <last ftp path component>_genomic.fna.gz and download it into data/genomes/<group>
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_GCA_12345678 (the ".N" version suffix of the accession is dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    #name of the file as downloaded
    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_001039655.1_nRc.2.0_genomic.fna.gz

sent 42 bytes  received 91333130 bytes  8698397.33 bytes/sec
total size is 91310731  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplin

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [6]:
# Rewrite every genome so that each fasta header holds only the record id, then
# convert the cleaned fasta to 2bit.  Output goes to a freshly created
# results/phyluce/<group>/cleaned_genomes directory.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # plain "r": the "U" flag is deprecated and rejected outright by Python >= 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, "fasta"):
            # blank name and description so only the record id is written to the header
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format("fasta"))

    # build the 2bit from the cleaned fasta (not the raw download) so that the
    # *_formatted.fas / *_formatted.2bit pair comes from the same file
    f2bit_cmd = "faToTwoBit {} {}".format(o_genome_file, o_2bit_file)

    # check_call raises CalledProcessError when faToTwoBit fails, rather than
    # continuing with a missing or partial 2bit file
    subprocess.check_call(f2bit_cmd.split(" "))

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.

A quick comment about coverage: in most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified by giant commented block text.

In [7]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486908 ("sim_Romanomermis_culicivorax_GCA_001039655") has been submitted
Your job 5486909 ("sim_Trichinella_britovi_GCA_001447585") has been submitted
Your job 5486910 ("sim_Trichinella_murrelli_GCA_002221485") has been submitted
Your job 5486911 ("sim_Trichinella_nativa_GCA_001447565") has been submitted
Your job 5486912 ("sim_Trichinella_nelsoni_GCA_001447455") has been submitted
Your job 5486913 ("sim_Trichinella_papuae_GCA_001447755") has been submitted
Your job 5486914 ("sim_Trichinella_patagoniensis_GCA_001447655") has been submitted
Your job 5486915 ("sim_Trichinella_pseudospralis_GCA_001447645") has been submitted
Your job 5486916 ("sim_Trichinella_sp_GCA_001447745") has been submitted
Your job 5486917 ("sim_Trichinella_zimbabwensis_GCA_001447665") has been submitted
Your job 5486918 ("sim_Soboliphyme_baturini_GCA_900618415") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [8]:
# Index the reference genome with bbmap inside a freshly created
# results/phyluce/<group>/map_simulated_reads directory.
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#reference genome to index and the directory bbmap is pointed at via path=
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh",
                      "ref=" + i_ref_genome,
                      "path=" + o_dir])

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/nematoda-clade1/cleaned_genomes/Trichinella_spiralis_GCA_000181795_formatted.fas path=results/phyluce/nematoda-clade1/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [9]:
#map reads to reference genome   

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486919 ("map_Romanomermis_culicivorax_GCA_001039655") has been submitted
Your job 5486920 ("map_Trichinella_britovi_GCA_001447585") has been submitted
Your job 5486921 ("map_Trichinella_murrelli_GCA_002221485") has been submitted
Your job 5486922 ("map_Trichinella_nativa_GCA_001447565") has been submitted
Your job 5486923 ("map_Trichinella_nelsoni_GCA_001447455") has been submitted
Your job 5486924 ("map_Trichinella_papuae_GCA_001447755") has been submitted
Your job 5486925 ("map_Trichinella_patagoniensis_GCA_001447655") has been submitted
Your job 5486926 ("map_Trichinella_pseudospralis_GCA_001447645") has been submitted
Your job 5486927 ("map_Trichinella_sp_GCA_001447745") has been submitted
Your job 5486928 ("map_Trichinella_zimbabwensis_GCA_001447665") has been submitted
Your job 5486929 ("map_Soboliphyme_baturini_GCA_900618415") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [10]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486930 ("merge_Romanomermis_culicivorax_GCA_001039655") has been submitted
Your job 5486931 ("merge_Trichinella_britovi_GCA_001447585") has been submitted
Your job 5486932 ("merge_Trichinella_murrelli_GCA_002221485") has been submitted
Your job 5486933 ("merge_Trichinella_nativa_GCA_001447565") has been submitted
Your job 5486934 ("merge_Trichinella_nelsoni_GCA_001447455") has been submitted
Your job 5486935 ("merge_Trichinella_papuae_GCA_001447755") has been submitted
Your job 5486936 ("merge_Trichinella_patagoniensis_GCA_001447655") has been submitted
Your job 5486937 ("merge_Trichinella_pseudospralis_GCA_001447645") has been submitted
Your job 5486938 ("merge_Trichinella_sp_GCA_001447745") has been submitted
Your job 5486939 ("merge_Trichinella_zimbabwensis_GCA_001447665") has been submitted
Your job 5486940 ("merge_Soboliphyme_baturini_GCA_900618415") has been submitted


Remove loci that overlap masked regions of the reference genome.

In [34]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 43 sequences from Romanomermis_culicivorax_GCA_001039655_merged.bed.  Filtered 35 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 8.
Screened 54380 sequences from Trichinella_britovi_GCA_001447585_merged.bed.  Filtered 43771 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10609.
Screened 54970 sequences from Trichinella_murrelli_GCA_002221485_merged.bed.  Filtered 44133 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10837.
Screened 54561 sequences from Trichinella_nativa_GCA_001447565_merged.bed.  Filtered 43886 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10675.
Screened 53128 sequences from Trichinella_nelsoni_GCA_001447455_merged.bed.  Filtered 42986 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10142.
Screened 95574 sequences from Trichinella_papuae_GCA_001447755_merged.bed.  Filtered 69845 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 25729.
Screened 54336 sequences from Tric

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [35]:
# Write the conf file: a "[beds]" header followed by one "<sample>:<stripped bed path>"
# line per taxon in group_taxa.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    #header line
    outfile.write("[beds]\n")

    #one entry per taxon
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [36]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/nematoda-clade1/initial_intervals/Soboliphyme_baturini_GCA_900618415_merged.bed --twobit results/phyluce/nematoda-clade1/cleaned_genomes/Trichinella_spiralis_GCA_000181795_formatted.2bit --output results/phyluce/nematoda-clade1/initial_intervals/Soboliphyme_baturini_GCA_900618415_stripped.bed;
romanomermis_culicivorax_gca_001039655.
trichinella_britovi_gca_001447585...........
trichinella_murrelli_gca_002221485...........
trichinella_nativa_gca_001447565...........
trichinella_nelsoni_gca_001447455...........
trichinella_papuae_gca_001447755..........................
trichinella_patagoniensis_gca_001447655...........
trichinella_pseudospralis_gca_001447645............................
trichinella_sp_gca_001447745...........
trichinella_zimbabwensis_gca_001447665..........................
soboliphyme_baturini_gca_900618415.
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [37]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade1/initial_intervals/nematoda-clade1-to-Trichinella_spiralis_GCA_000181795.sqlite --base-taxon Trichinella_spiralis_GCA_000181795
Loci shared by Trichinella_spiralis_GCA_000181795 + 0 taxa:	38,179.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 1 taxa:	38,179.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 2 taxa:	28,986.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 3 taxa:	22,754.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 4 taxa:	12,320.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 5 taxa:	9,391.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 6 taxa:	6,992.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 7 taxa:	4,120.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 8 taxa:	3,029.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 9 taxa:	1,845.0
Loci shared by Trichinella_spiralis_GCA_000181795 + 10 taxa:	9.0
Loci shared by Trichinella_spiralis_GCA_00018179

In [38]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 9
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade1/initial_intervals/nematoda-clade1-to-Trichinella_spiralis_GCA_000181795.sqlite --base-taxon Trichinella_spiralis_GCA_000181795 --output results/phyluce/nematoda-clade1/initial_intervals/Trichinella_spiralis_GCA_000181795_+9.bed --specific-counts 9;
Counter({'trichinella_papuae_gca_001447755': 1845, 'trichinella_patagoniensis_gca_001447655': 1845, 'trichinella_sp_gca_001447745': 1845, 'trichinella_murrelli_gca_002221485': 1845, 'trichinella_nelsoni_gca_001447455': 1845, 'trichinella_nativa_gca_001447565': 1845, 'trichinella_zimbabwensis_gca_001447665': 1845, 'trichinella_pseudospralis_gca_001447645': 1845, 'trichinella_britovi_gca_001447585': 1845, 'romanomermis_culicivorax_gca_001039655': 6, 'soboliphyme_baturini_gca_900618415': 3})


## Design temp set of baits

In [39]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/nematoda-clade1/initial_intervals/Trichinella_spiralis_GCA_000181795_+9.bed --twobit results/phyluce/nematoda-clade1/cleaned_genomes/Trichinella_spiralis_GCA_000181795_formatted.2bit --buffer-to 160 --output results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9.fasta;
Screened 1845 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 1845.


design the baits

In [40]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9.fasta --probe-prefix uce_nematoda-clade1_ --design nematoda-clade1_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 1790
Probe Count = 3482


## Find duplicate baited regions

In [41]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas --query results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  08:40:50
Ended:  Tue Feb 11, 2020  08:40:52
Time for execution:  0.0266137997309 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas                        --lastz results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes_vself.lastz                       --probe-prefix=uce_nematoda-clade1_;
Parsing lastz file...
Screening results...
Screened 3481 fasta sequences.  Filtere

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [42]:
# Build the per-taxon genome layout used by the lastz step: every taxon gets
# its own sub-directory holding <taxon>.2bit (presumably the layout phyluce
# expects under --genome-base-path -- verify against the phyluce docs).
# NOTE: the 2bit files are copied here, not soft-linked.
for sample in all_taxa:
    src_file = "results/phyluce/{}/cleaned_genomes/{}_formatted.2bit".format(group, sample)
    dst_dir  = "results/phyluce/{}/cleaned_genomes/{}/".format(group, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to run lastz against each genome in the group

In [43]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/nematoda-clade1/validate_intervals/Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas --scaffoldlist Romanomermis_culicivorax_GCA_001039655 Trichinella_britovi_GCA_001447585 Trichinella_murrelli_GCA_002221485 Trichinella_nativa_GCA_001447565 Trichinella_nelsoni_GCA_001447455 Trichinella_papuae_GCA_001447755 Trichinella_patagoniensis_GCA_001447655 Trichinella_pseudospralis_GCA_001447645 Trichinella_sp_GCA_001447745 Trichinella_zimbabwensis_GCA_001447665 Soboliphyme_baturini_GCA_900618415 Trichinella_spiralis_GCA_000181795 --genome-base-path results/phyluce/nematoda-clade1/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/nematoda-clade1/validate_intervals/nematoda-clade1-to-Trichinella_spiralis_GCA_000181795.sqlite --output results/phyluce/nematoda-clade1/validate_intervals/lastz/;

Running against Romanomermis_culicivorax_GCA_001039655.2bit
Running with the --huge option.  Chunking files into 10000000 bp

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [44]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/nematoda-clade1/extract_probes_from_group/nematoda-clade1_genome.conf --lastz results/phyluce/nematoda-clade1/validate_intervals/lastz --probes 120 --probe-prefix uce_nematoda-clade1_ --name-pattern "Trichinella_spiralis_GCA_000181795_+9_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/nematoda-clade1/extract_probes_from_group/probe_fasta;
2020-02-11 08:45:14,969 - Phyluce - INFO - ---- Working on Romanomermis_culicivorax_GCA_001039655 genome ---
2020-02-11 08:45:14,970 - Phyluce - INFO - Reading Romanomermis_culicivorax_GCA_001039655 genome
2020-02-11 08:45:17,654 - Phyluce - INFO - Romanomermis_culicivorax_GCA_001039655: 185 uces, 68 dupes, 117 non-dupes, 1 orient drop, 2 length drop, 114 written
2020-02-11 08:45:17,654 - Phyluce - INFO - ------ Working on Trichinella_britovi_GCA_001447585 genome ------
2020-02-11 08:45:17,666 - Phyluce - INFO - Reading Trichinella_britovi_GCA_001447585 genome
2020-02-11 08:45

In [45]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/nematoda-clade1/extract_probes_from_group/probe_fasta --output results/phyluce/nematoda-clade1/extract_probes_from_group/multifastas.sqlite --base-taxon Trichinella_spiralis_GCA_000181795;
romanomermis_culicivorax_gca_001039655.
trichinella_britovi_gca_001447585..
trichinella_murrelli_gca_002221485..
trichinella_nativa_gca_001447565..
trichinella_nelsoni_gca_001447455..
trichinella_papuae_gca_001447755..
trichinella_patagoniensis_gca_001447655..
trichinella_pseudospralis_gca_001447645..
trichinella_sp_gca_001447745..
trichinella_zimbabwensis_gca_001447665..
soboliphyme_baturini_gca_900618415.
trichinella_spiralis_gca_000181795.
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade1/extract_probes_from_group/multifastas.sqlite --base-taxon Trichinella_spiralis_GCA_000181795;
Loci shared by 0 taxa:	1,517.0
Loci shared by 1 taxa:	1,517.0
Loci shared by 2 taxa:	1,466.0
Loci 

In [46]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(11)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade1/extract_probes_from_group/multifastas.sqlite  --base-taxon Trichinella_spiralis_GCA_000181795 --output results/phyluce/nematoda-clade1/extract_probes_from_group/Trichinella_spiralis_GCA_000181795+9-back-to-11.conf --specific-counts 11;
Counter({'trichinella_papuae_gca_001447755': 102, 'trichinella_patagoniensis_gca_001447655': 102, 'trichinella_sp_gca_001447745': 102, 'trichinella_murrelli_gca_002221485': 102, 'trichinella_nelsoni_gca_001447455': 102, 'trichinella_nativa_gca_001447565': 102, 'trichinella_zimbabwensis_gca_001447665': 102, 'trichinella_pseudospralis_gca_001447645': 102, 'trichinella_britovi_gca_001447585': 102, 'soboliphyme_baturini_gca_900618415': 97, 'trichinella_spiralis_gca_000181795': 71, 'romanomermis_culicivorax_gca_001039655': 60})
Total loci = 102


## Final group specific bait design

In [47]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/nematoda-clade1/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/nematoda-clade1/extract_probes_from_group/Trichinella_spiralis_GCA_000181795+9-back-to-11.conf --probe-prefix uce_nematoda-clade1_ --designer rnplattii --design nematoda-clade1_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGNN


Conserved locus count = 102
Probe Count = 2267


In [48]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.fasta --query results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  08:53:07
Ended:  Tue Feb 11, 2020  08:53:10
Time for execution:  0.0412437637647 minutes


In [49]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.fasta --lastz results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_nematoda-clade1_;
Parsing lastz file...
Screening results...
Screened 2266 fasta sequences.  Filtered 0 duplicates. Kept 2267.


## CDhit to reduce numbers

In [50]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.fasta
         -o
         results/phyluce/nematoda-clade1/final_probe_design/nematoda-clade1_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 08:54:07 2020
                            Output                              
----------------------------------------------------------------
total seq: 2267
longest and shortest : 80 and 80
Total letters: 181360
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3961142
Max number of word counting entries: 89165324

# comparing sequences from          0  to        377
---------- new table with      158 representatives
# comparing seq

# Clade 3

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so : 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [53]:
# Name of the taxon group processed in this section; the cells below build
# their data/ and results/ paths from it.
group = 'nematoda-clade3'

In [12]:
# Wipe any earlier results for this group, then recreate the empty directory.
if os.path.exists("results/phyluce/{}".format(group)):
    shutil.rmtree("results/phyluce/{}".format(group))

os.makedirs("results/phyluce/{}".format(group))

In [54]:
# Non-reference genomes for this group, one "Genus_species_ACCESSION" label each.
# NOTE(review): Loa_loa carries a GCF_ prefix while every other entry is GCA_ --
# confirm this matches the name of the downloaded genome file.
group_taxa = [ "Acanthocheilonema_viteae_GCA_900537255",
               "Anisakis_simplex_GCA_900617985",
               "Ascaris_suum_GCA_000187025",
               "Baylisascaris_schroederi_GCA_006503575",
               "Brugia_pahangi_GCA_001280985",
               "Brugia_timori_GCA_900618025",
               "Dirofilaria_immitis_GCA_009829315",
               "Dirofilaria_repens_GCA_008729115",
               "Dracunculus_medinensis_GCA_900625125",
               "Elaeophora_elaphi_GCA_000499685",
               "Enterobius_vermicularis_GCA_900576705",
               "Gongylonema_pulchrum_GCA_900617915",
               "Litomosoides_sigmodontis_GCA_900537275",
               "Loa_loa_GCF_000183805",
               "Onchocerca_flexuosa_GCA_002249935",
               "Onchocerca_ochengi_GCA_000950515",
               "Onchocerca_volvulus_GCA_000499405",
               "Parascaris_univalens_GCA_002259215",
               "Plectus_murrayi_GCA_004785735",
               "Plectus_sambesii_GCA_002796945",
               "Thelazia_callipaeda_GCA_900618365",
               "Toxocara_canis_GCA_000803305",
               "Wuchereria_bancrofti_GCA_005281725" ]
                    
# Genome that every taxon in group_taxa is compared against.
reference_taxon = "Brugia_malayi_GCA_000002995"


# Full taxon list: the group members followed by the reference.
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [14]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900537255.1_ASM90053725v1_genomic.fna.gz

sent 42 bytes  received 24974073 bytes  5549803.33 bytes/sec
total size is 24967868  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
dis

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [15]:
# Re-create the cleaned-genome directory from scratch.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    # Rewrite the FASTA with each record's name and description blanked, so
    # the header keeps only the record id.
    # SeqIO.parse is given the file name and opens the file itself, which
    # avoids the deprecated "rU" open mode (rejected by recent Python 3).
    with open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(i_genome_file, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    # Build the 2bit from the cleaned FASTA written above, so that
    # <sample>_formatted.2bit really is the 2bit of <sample>_formatted.fas
    # (it used to be built from the raw download). The command is passed as an
    # argument list rather than a string split on spaces.
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.call(f2bit_cmd)

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s) jobs are not run on the local machine, but instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [16]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486941 ("sim_Acanthocheilonema_viteae_GCA_900537255") has been submitted
Your job 5486942 ("sim_Anisakis_simplex_GCA_900617985") has been submitted
Your job 5486943 ("sim_Ascaris_suum_GCA_000187025") has been submitted
Your job 5486944 ("sim_Baylisascaris_schroederi_GCA_006503575") has been submitted
Your job 5486945 ("sim_Brugia_pahangi_GCA_001280985") has been submitted
Your job 5486946 ("sim_Brugia_timori_GCA_900618025") has been submitted
Your job 5486947 ("sim_Dirofilaria_immitis_GCA_009829315") has been submitted
Your job 5486948 ("sim_Dirofilaria_repens_GCA_008729115") has been submitted
Your job 5486949 ("sim_Dracunculus_medinensis_GCA_900625125") has been submitted
Your job 5486950 ("sim_Elaeophora_elaphi_GCA_000499685") has been submitted
Your job 5486951 ("sim_Enterobius_vermicularis_GCA_900576705") has been submitted
Your job 5486952 ("sim_Gongylonema_pulchrum_GCA_900617915") has been submitted
Your job 5486953 ("sim_Litomosoides_sigmodontis_GCA_900537275") has be

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [17]:
# Re-create the mapping output tree (including its logs/ sub-directory).
if os.path.exists("results/phyluce/{}/map_simulated_reads".format(group)):
    shutil.rmtree("results/phyluce/{}/map_simulated_reads".format(group))

os.makedirs("results/phyluce/{}/map_simulated_reads/logs".format(group))

# Index the reference genome with bbmap; the index is written under o_dir.
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = "results/phyluce/{}/map_simulated_reads/".format(group)

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
# run locally; the return code is the cell's displayed value
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/nematoda-clade3/cleaned_genomes/Brugia_malayi_GCA_000002995_formatted.fas path=results/phyluce/nematoda-clade3/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case reads must be at least 90% identical to the reference (`minid=0.90`), i.e. we are allowing up to 10% divergence between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [18]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486964 ("map_Acanthocheilonema_viteae_GCA_900537255") has been submitted
Your job 5486965 ("map_Anisakis_simplex_GCA_900617985") has been submitted
Your job 5486966 ("map_Ascaris_suum_GCA_000187025") has been submitted
Your job 5486967 ("map_Baylisascaris_schroederi_GCA_006503575") has been submitted
Your job 5486968 ("map_Brugia_pahangi_GCA_001280985") has been submitted
Your job 5486969 ("map_Brugia_timori_GCA_900618025") has been submitted
Your job 5486970 ("map_Dirofilaria_immitis_GCA_009829315") has been submitted
Your job 5486971 ("map_Dirofilaria_repens_GCA_008729115") has been submitted
Your job 5486972 ("map_Dracunculus_medinensis_GCA_900625125") has been submitted
Your job 5486973 ("map_Elaeophora_elaphi_GCA_000499685") has been submitted
Your job 5486974 ("map_Enterobius_vermicularis_GCA_900576705") has been submitted
Your job 5486975 ("map_Gongylonema_pulchrum_GCA_900617915") has been submitted
Your job 5486976 ("map_Litomosoides_sigmodontis_GCA_900537275") has be

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [19]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5486987 ("merge_Acanthocheilonema_viteae_GCA_900537255") has been submitted
Your job 5486988 ("merge_Anisakis_simplex_GCA_900617985") has been submitted
Your job 5486989 ("merge_Ascaris_suum_GCA_000187025") has been submitted
Your job 5486990 ("merge_Baylisascaris_schroederi_GCA_006503575") has been submitted
Your job 5486991 ("merge_Brugia_pahangi_GCA_001280985") has been submitted
Your job 5486992 ("merge_Brugia_timori_GCA_900618025") has been submitted
Your job 5486993 ("merge_Dirofilaria_immitis_GCA_009829315") has been submitted
Your job 5486994 ("merge_Dirofilaria_repens_GCA_008729115") has been submitted
Your job 5486995 ("merge_Dracunculus_medinensis_GCA_900625125") has been submitted
Your job 5486996 ("merge_Elaeophora_elaphi_GCA_000499685") has been submitted
Your job 5486997 ("merge_Enterobius_vermicularis_GCA_900576705") has been submitted
Your job 5486998 ("merge_Gongylonema_pulchrum_GCA_900617915") has been submitted
Your job 5486999 ("merge_Litomosoides_sigmodon

Remove loci that were masked in the original genome.

In [55]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 40159 sequences from Acanthocheilonema_viteae_GCA_900537255_merged.bed.  Filtered 28615 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 11544.
Screened 428 sequences from Anisakis_simplex_GCA_900617985_merged.bed.  Filtered 418 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10.
Screened 918 sequences from Ascaris_suum_GCA_000187025_merged.bed.  Filtered 890 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 28.
Screened 513 sequences from Baylisascaris_schroederi_GCA_006503575_merged.bed.  Filtered 498 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 15.
Screened 56981 sequences from Brugia_pahangi_GCA_001280985_merged.bed.  Filtered 42537 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 14444.
Screened 35463 sequences from Brugia_timori_GCA_900618025_merged.bed.  Filtered 24747 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10716.
Screened 2145 sequences from Dirofilaria_immitis_GCA_0098293

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [56]:
o_dir      = "".join(["results/phyluce/", group, "/initial_intervals/"])
o_bed_conf = "".join([o_dir, group, "_bed-files.conf"])

# conf layout: a "[beds]" header followed by one "taxon:stripped_bed" line
# for every taxon in the group
conf_lines = ["[beds]\n"]
for sample in group_taxa:
    stripped_bed = "".join([o_dir, sample, "_stripped.bed"])
    conf_lines.append("".join([sample, ":", stripped_bed, "\n"]))

# write the whole file in one go
with open(o_bed_conf, "w") as outfile:
    outfile.write("".join(conf_lines))
 

Then merge all of the data into a single ```sqlite``` db.

In [57]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/nematoda-clade3/initial_intervals/Wuchereria_bancrofti_GCA_005281725_merged.bed --twobit results/phyluce/nematoda-clade3/cleaned_genomes/Brugia_malayi_GCA_000002995_formatted.2bit --output results/phyluce/nematoda-clade3/initial_intervals/Wuchereria_bancrofti_GCA_005281725_stripped.bed;
acanthocheilonema_viteae_gca_900537255............
anisakis_simplex_gca_900617985.
ascaris_suum_gca_000187025.
baylisascaris_schroederi_gca_006503575.
brugia_pahangi_gca_001280985...............
brugia_timori_gca_900618025...........
dirofilaria_immitis_gca_009829315.
dirofilaria_repens_gca_008729115.......
dracunculus_medinensis_gca_900625125.
elaeophora_elaphi_gca_000499685
enterobius_vermicularis_gca_900576705
gongylonema_pulchrum_gca_900617915.
litomosoides_sigmodontis_gca_900537275......
loa_loa_gcf_000183805.................
onchocerca_flexuosa_gca_002249935........
onchocerca_ochengi_gca_000950515..

Quantify probes and the number of targeted taxa for each.

In [58]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade3/initial_intervals/nematoda-clade3-to-Brugia_malayi_GCA_000002995.sqlite --base-taxon Brugia_malayi_GCA_000002995
Loci shared by Brugia_malayi_GCA_000002995 + 0 taxa:	55,713.0
Loci shared by Brugia_malayi_GCA_000002995 + 1 taxa:	55,713.0
Loci shared by Brugia_malayi_GCA_000002995 + 2 taxa:	28,748.0
Loci shared by Brugia_malayi_GCA_000002995 + 3 taxa:	20,510.0
Loci shared by Brugia_malayi_GCA_000002995 + 4 taxa:	13,407.0
Loci shared by Brugia_malayi_GCA_000002995 + 5 taxa:	10,136.0
Loci shared by Brugia_malayi_GCA_000002995 + 6 taxa:	7,450.0
Loci shared by Brugia_malayi_GCA_000002995 + 7 taxa:	5,473.0
Loci shared by Brugia_malayi_GCA_000002995 + 8 taxa:	3,888.0
Loci shared by Brugia_malayi_GCA_000002995 + 9 taxa:	2,394.0
Loci shared by Brugia_malayi_GCA_000002995 + 10 taxa:	989.0
Loci shared by Brugia_malayi_GCA_000002995 + 11 taxa:	94.0
Loci shared by Brugia_malayi_GCA_000002995 + 12 taxa:	36.0
Loci shared by Bru

In [59]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 9
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade3/initial_intervals/nematoda-clade3-to-Brugia_malayi_GCA_000002995.sqlite --base-taxon Brugia_malayi_GCA_000002995 --output results/phyluce/nematoda-clade3/initial_intervals/Brugia_malayi_GCA_000002995_+9.bed --specific-counts 9;
Counter({'onchocerca_volvulus_gca_000499405': 2368, 'loa_loa_gcf_000183805': 2356, 'onchocerca_ochengi_gca_000950515': 2351, 'wuchereria_bancrofti_gca_005281725': 2350, 'acanthocheilonema_viteae_gca_900537255': 2348, 'brugia_pahangi_gca_001280985': 2284, 'brugia_timori_gca_900618025': 2192, 'onchocerca_flexuosa_gca_002249935': 2178, 'dirofilaria_repens_gca_008729115': 2085, 'litomosoides_sigmodontis_gca_900537275': 1879, 'thelazia_callipaeda_gca_900618365': 91, 'dirofilaria_immitis_gca_009829315': 87, 'gongylonema_pulchrum_gca_900617915': 39, 'ascaris_suum_gca_000187025': 25, 'toxocara_canis_gca_000803305': 24, 'parascaris_univalens_gca_002259215': 23, 'dracunculus_medinensis_gca_90062512

## Design temp set of baits

In [60]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/nematoda-clade3/initial_intervals/Brugia_malayi_GCA_000002995_+9.bed --twobit results/phyluce/nematoda-clade3/cleaned_genomes/Brugia_malayi_GCA_000002995_formatted.2bit --buffer-to 160 --output results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9.fasta;
Screened 2394 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 2394.


design the baits

In [61]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9.fasta --probe-prefix uce_nematoda-clade3_ --design nematoda-clade3_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 2385
Probe Count = 4755


## Find duplicate baited regions

In [62]:

#find duplicated probes within the probes set by a self v self lastz search
#set up input and output files
i_probes_fas   = o_probes_fas
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes.fas --query results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  08:56:42
Ended:  Tue Feb 11, 2020  08:56:44
Time for execution:  0.0361920833588 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes.fas                        --lastz results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes_vself.lastz                       --probe-prefix=uce_nematoda-clade3_;
Parsing lastz file...
Screening results...
Screened 4754 fasta sequences.  Filtered 72 duplicates. Kept 4612.


## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [63]:
# Give every genome its own <taxon>/<taxon>.2bit directory under
# cleaned_genomes. Note that the 2bit file is copied, not soft-linked.
for sample in all_taxa:

    genome_root = "".join(["results/phyluce/", group, "/cleaned_genomes/"])

    src_file = "".join([genome_root, sample, "_formatted.2bit"])
    dst_dir  = "".join([genome_root, sample, "/"])
    dst_file = "".join([dst_dir, sample, ".2bit"])

    # rebuild the per-taxon directory from scratch
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [64]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/nematoda-clade3/validate_intervals/Brugia_malayi_GCA_000002995_+9_temp_probes.fas --scaffoldlist Acanthocheilonema_viteae_GCA_900537255 Anisakis_simplex_GCA_900617985 Ascaris_suum_GCA_000187025 Baylisascaris_schroederi_GCA_006503575 Brugia_pahangi_GCA_001280985 Brugia_timori_GCA_900618025 Dirofilaria_immitis_GCA_009829315 Dirofilaria_repens_GCA_008729115 Dracunculus_medinensis_GCA_900625125 Elaeophora_elaphi_GCA_000499685 Enterobius_vermicularis_GCA_900576705 Gongylonema_pulchrum_GCA_900617915 Litomosoides_sigmodontis_GCA_900537275 Loa_loa_GCF_000183805 Onchocerca_flexuosa_GCA_002249935 Onchocerca_ochengi_GCA_000950515 Onchocerca_volvulus_GCA_000499405 Parascaris_univalens_GCA_002259215 Plectus_murrayi_GCA_004785735 Plectus_sambesii_GCA_002796945 Thelazia_callipaeda_GCA_900618365 Toxocara_canis_GCA_000803305 Wuchereria_bancrofti_GCA_005281725 Brugia_malayi_GCA_000002995 --genome-base-path results/phyluce/nematoda-clad

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [65]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/nematoda-clade3/extract_probes_from_group/nematoda-clade3_genome.conf --lastz results/phyluce/nematoda-clade3/validate_intervals/lastz --probes 120 --probe-prefix uce_nematoda-clade3_ --name-pattern "Brugia_malayi_GCA_000002995_+9_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/nematoda-clade3/extract_probes_from_group/probe_fasta;
2020-02-11 09:11:20,506 - Phyluce - INFO - ---- Working on Acanthocheilonema_viteae_GCA_900537255 genome ---
2020-02-11 09:11:20,508 - Phyluce - INFO - Reading Acanthocheilonema_viteae_GCA_900537255 genome
2020-02-11 09:11:31,064 - Phyluce - INFO - Acanthocheilonema_viteae_GCA_900537255: 2362 uces, 181 dupes, 2181 non-dupes, 0 orient drop, 11 length drop, 2169 written
2020-02-11 09:11:31,065 - Phyluce - INFO - -------- Working on Anisakis_simplex_GCA_900617985 genome -------
2020-02-11 09:11:31,066 - Phyluce - INFO - Reading Anisakis_simplex_GCA_900617985 genome
2020-02-11 09:11:38,3

In [66]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/nematoda-clade3/extract_probes_from_group/probe_fasta --output results/phyluce/nematoda-clade3/extract_probes_from_group/multifastas.sqlite --base-taxon Brugia_malayi_GCA_000002995;
acanthocheilonema_viteae_gca_900537255...
anisakis_simplex_gca_900617985..
ascaris_suum_gca_000187025..
baylisascaris_schroederi_gca_006503575..
brugia_pahangi_gca_001280985..
brugia_timori_gca_900618025..
dirofilaria_immitis_gca_009829315..
dirofilaria_repens_gca_008729115...
dracunculus_medinensis_gca_900625125..
elaeophora_elaphi_gca_000499685.
enterobius_vermicularis_gca_900576705..
gongylonema_pulchrum_gca_900617915..
litomosoides_sigmodontis_gca_900537275...
loa_loa_gcf_000183805...
onchocerca_flexuosa_gca_002249935...
onchocerca_ochengi_gca_000950515...
onchocerca_volvulus_gca_000499405...
parascaris_univalens_gca_002259215..
plectus_murrayi_gca_004785735.
plectus_sambesii_gca_002796945.
thelazia_callipaeda_gca_900618365...
toxocara_canis_g

In [67]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(22)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade3/extract_probes_from_group/multifastas.sqlite  --base-taxon Brugia_malayi_GCA_000002995 --output results/phyluce/nematoda-clade3/extract_probes_from_group/Brugia_malayi_GCA_000002995+9-back-to-22.conf --specific-counts 22;
Counter({'acanthocheilonema_viteae_gca_900537255': 203, 'brugia_malayi_gca_000002995': 203, 'onchocerca_ochengi_gca_000950515': 203, 'onchocerca_volvulus_gca_000499405': 202, 'loa_loa_gcf_000183805': 202, 'litomosoides_sigmodontis_gca_900537275': 202, 'thelazia_callipaeda_gca_900618365': 202, 'dirofilaria_repens_gca_008729115': 201, 'onchocerca_flexuosa_gca_002249935': 201, 'ascaris_suum_gca_000187025': 200, 'parascaris_univalens_gca_002259215': 200, 'baylisascaris_schroederi_gca_006503575': 200, 'dracunculus_medinensis_gca_900625125': 199, 'enterobius_vermicularis_gca_900576705': 199, 'wuchereria_bancrofti_gca_005281725': 198, 'toxocara_canis_gca_000803305': 198, 'brugia_pahangi_gca_001280985'

## Final group specific bait design

In [68]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/nematoda-clade3/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/nematoda-clade3/extract_probes_from_group/Brugia_malayi_GCA_000002995+9-back-to-22.conf --probe-prefix uce_nematoda-clade3_ --designer rnplattii --design nematoda-clade3_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.fasta;
GGGGGGGGGNNGGGGGNNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGGGGGGGGNNGGGGGGGGGGGGNNNNGG


Conserved locus count = 203
Probe Count = 8938


In [69]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.fasta --query results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  09:15:08
Ended:  Tue Feb 11, 2020  09:15:25
Time for execution:  0.291917149226 minutes


In [70]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.fasta --lastz results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_nematoda-clade3_;
Parsing lastz file...
Screening results...
Screened 8937 fasta sequences.  Filtered 0 duplicates. Kept 8938.


## CDhit to reduce numbers

In [71]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.fasta
         -o
         results/phyluce/nematoda-clade3/final_probe_design/nematoda-clade3_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 09:21:43 2020
                            Output                              
----------------------------------------------------------------
total seq: 8938
longest and shortest : 80 and 80
Total letters: 715040
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3949766
Max number of word counting entries: 88909241

# comparing sequences from          0  to       1489
.---------- new table with     1132 representatives
# comparing se

# Clade 4a

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include Genus, species, and assembly accession like so: 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it genus, phylum, etc...) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [1]:
# label for this clade; it is embedded in the data/ and results/ paths below
group = "nematoda-clade4a"

In [132]:
# wipe any previous results for this group and recreate an empty directory
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [2]:
# non-reference genomes of the group, named Genus_species_<assembly accession>
group_taxa = [
    "Parastrongyloides_trichosuri_GCA_000941615",
    "Rhabditophanes_sp_GCA_000944355",
    "Strongyloides_papillosus_GCA_005656395",
    "Strongyloides_ratti_GCF_001040885",
    "Strongyloides_venezuelensis_GCA_001028725",
]

# genome that all the other taxa are compared to
reference_taxon = "Strongyloides_stercoralis_GCA_000947215"

# every genome in the analysis: the group members followed by the reference
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [134]:
# Download the genome FASTA for every accession listed for this group, rename
# each file to Genus_species_<accession without version>.fas.gz, record what
# was fetched in ftp_info.csv, and finally gunzip everything.
# Only FASTA files are handled here; no gff files are downloaded.

# start from an empty data/genomes/<group>/ directory
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

# accession table with "genus", "species" and "accession" columns
# NOTE(review): this reads data/<group>_accessions.csv, while the notebook text
# refers to data/genomes/*_accessions.csv - confirm the intended location
accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #look up the GenBank ftp path for the accession (IPython captures the
    #shell output as a list of lines; only the first line is used)
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #build the link to <assembly>_genomic.fna.gz and download it with rsync
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for the fas
    #Genus_species_GCA_12345678 (the ".N" version suffix of the accession is dropped)
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, ftp path, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the table to a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage down the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_000941615.1_P_trichosuri_KNP_genomic.fna.gz

sent 42 bytes  received 13271380 bytes  5308568.80 bytes/sec
total size is 13268028  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in


## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [135]:
# Rewrite every genome FASTA with its record name/description blanked out and
# build a .2bit file per genome.  Outputs go to a freshly created
# cleaned_genomes directory for the group.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:
        
        i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
        o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
        o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"
        
        # plain "r": the "U" flag is deprecated in Python 3 and rejected by 3.11+
        with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                # drop name/description before writing the record back out
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))
                    
        # NOTE(review): the 2bit is built from the unformatted input file, not
        # from o_genome_file -- confirm that this is intended.
        f2bit_cmd = "faToTwoBit {} {}".format(i_genome_file, o_2bit_file)
        # pass argv as a list so a path containing a space is not split apart
        subprocess.call(["faToTwoBit", i_genome_file, o_2bit_file])

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and since my genomes are relatively small...I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [136]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487852 ("sim_Parastrongyloides_trichosuri_GCA_000941615") has been submitted
Your job 5487853 ("sim_Rhabditophanes_sp_GCA_000944355") has been submitted
Your job 5487854 ("sim_Strongyloides_papillosus_GCA_005656395") has been submitted
Your job 5487855 ("sim_Strongyloides_ratti_GCF_001040885") has been submitted
Your job 5487856 ("sim_Strongyloides_venezuelensis_GCA_001028725") has been submitted


Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [137]:
# Index the reference genome with bbmap inside a freshly created
# map_simulated_reads directory (a logs/ subdirectory is created as well).
map_reads_dir = "results/phyluce/{}/map_simulated_reads".format(group)

if os.path.exists(map_reads_dir):
    shutil.rmtree(map_reads_dir)

os.makedirs(map_reads_dir + "/logs")

#reference genome to index and the directory bbmap writes its index into
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_reads_dir + "/"

bbmap_cmd = "bbmap.sh ref=" + i_ref_genome + " path=" + o_dir

print(bbmap_cmd)

#last expression of the cell: the exit status is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/nematoda-clade4a/cleaned_genomes/Strongyloides_stercoralis_GCA_000947215_formatted.fas path=results/phyluce/nematoda-clade4a/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence between the reference genome and simulated read (`minid=0.90`).  Generally this number is smaller and may be adjusted for other groups, however, given the diversity of some groups a larger number may identify more orthologous regions.

In [138]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487857 ("map_Parastrongyloides_trichosuri_GCA_000941615") has been submitted
Your job 5487858 ("map_Rhabditophanes_sp_GCA_000944355") has been submitted
Your job 5487859 ("map_Strongyloides_papillosus_GCA_005656395") has been submitted
Your job 5487860 ("map_Strongyloides_ratti_GCF_001040885") has been submitted
Your job 5487861 ("map_Strongyloides_venezuelensis_GCA_001028725") has been submitted


## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [139]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487862 ("merge_Parastrongyloides_trichosuri_GCA_000941615") has been submitted
Your job 5487863 ("merge_Rhabditophanes_sp_GCA_000944355") has been submitted
Your job 5487864 ("merge_Strongyloides_papillosus_GCA_005656395") has been submitted
Your job 5487865 ("merge_Strongyloides_ratti_GCF_001040885") has been submitted
Your job 5487866 ("merge_Strongyloides_venezuelensis_GCA_001028725") has been submitted


remove loci that were masked in the original genome

In [4]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 12706 sequences from Parastrongyloides_trichosuri_GCA_000941615_merged.bed.  Filtered 9665 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3041.
Screened 174 sequences from Rhabditophanes_sp_GCA_000944355_merged.bed.  Filtered 159 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 15.
Screened 28730 sequences from Strongyloides_papillosus_GCA_005656395_merged.bed.  Filtered 21237 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7493.
Screened 48398 sequences from Strongyloides_ratti_GCF_001040885_merged.bed.  Filtered 38225 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 10173.
Screened 27998 sequences from Strongyloides_venezuelensis_GCA_001028725_merged.bed.  Filtered 20567 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 7431.


## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make ```conf``` file that indicates where the stripped bedfiles (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [5]:
# Write the phyluce conf file: a "[beds]" header followed by one
# "sample:path/to/sample_stripped.bed" line per taxon in the group.
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

with open(o_bed_conf, "w") as outfile:

    #section header
    outfile.write("[beds]\n")

    #one entry per sample pointing at its stripped bed file
    for sample in group_taxa:
        stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)

        outfile.write("{}:{}\n".format(sample, stripped_bed))
 

Then merge all of the data into a single ```sqlite``` db.

In [6]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/nematoda-clade4a/initial_intervals/Strongyloides_venezuelensis_GCA_001028725_merged.bed --twobit results/phyluce/nematoda-clade4a/cleaned_genomes/Strongyloides_stercoralis_GCA_000947215_formatted.2bit --output results/phyluce/nematoda-clade4a/initial_intervals/Strongyloides_venezuelensis_GCA_001028725_stripped.bed;
parastrongyloides_trichosuri_gca_000941615....
rhabditophanes_sp_gca_000944355.
strongyloides_papillosus_gca_005656395........
strongyloides_ratti_gcf_001040885...........
strongyloides_venezuelensis_gca_001028725........
Creating database
Inserting results


Quantify probes and the number of targeted taxa for each.

In [7]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade4a/initial_intervals/nematoda-clade4a-to-Strongyloides_stercoralis_GCA_000947215.sqlite --base-taxon Strongyloides_stercoralis_GCA_000947215
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 0 taxa:	15,077.0
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 1 taxa:	15,077.0
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 2 taxa:	8,081.0
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 3 taxa:	5,023.0
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 4 taxa:	1,657.0
Loci shared by Strongyloides_stercoralis_GCA_000947215 + 5 taxa:	9.0


In [8]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 4
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade4a/initial_intervals/nematoda-clade4a-to-Strongyloides_stercoralis_GCA_000947215.sqlite --base-taxon Strongyloides_stercoralis_GCA_000947215 --output results/phyluce/nematoda-clade4a/initial_intervals/Strongyloides_stercoralis_GCA_000947215_+4.bed --specific-counts 4;
Counter({'parastrongyloides_trichosuri_gca_000941615': 1657, 'strongyloides_ratti_gcf_001040885': 1657, 'strongyloides_papillosus_gca_005656395': 1657, 'strongyloides_venezuelensis_gca_001028725': 1653, 'rhabditophanes_sp_gca_000944355': 13})


## Design temp set of baits

In [9]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/nematoda-clade4a/initial_intervals/Strongyloides_stercoralis_GCA_000947215_+4.bed --twobit results/phyluce/nematoda-clade4a/cleaned_genomes/Strongyloides_stercoralis_GCA_000947215_formatted.2bit --buffer-to 160 --output results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4.fasta;
Screened 1657 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 1657.


design the baits

In [10]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4.fasta --probe-prefix uce_nematoda-clade4a_ --design nematoda-clade4a_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

## Find duplicate baited regions

In [11]:

# Two steps: (1) run a self-vs-self lastz search of the temporary probe set,
# (2) hand the lastz output to phyluce to remove duplicate hits.
# Relies on o_probes_fas, o_dir, reference_taxon, num_taxa and i_probe_prefix
# having been set by earlier cells.

#set up input and output files
i_probes_fas   = o_probes_fas
# identity / coverage values passed straight to --identity / --coverage
i_identity_int = str(50)
i_coverage_int = str(50)

o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

# the probe fasta is given as both --target and --query (self v self)
phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes
# NOTE: the backslash continuations sit inside the string literal, so the
# indentation of the following lines becomes part of the command text (hence
# the wide gaps in the printed command).
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas --query results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  14:26:52
Ended:  Tue Feb 11, 2020  14:26:54
Time for execution:  0.0234659830729 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas                        --lastz results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes_vself.lastz                       --probe-prefix=uce_nematoda-clade4a_;
Parsing lastz file...
Screening results...
Screened

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [12]:
#give every taxon its own <taxon>/<taxon>.2bit directory under cleaned_genomes
#(the files are copied, not soft-linked) so the later steps can be pointed at
#cleaned_genomes as a single base path
genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    #always start from an empty per-taxon directory when the cell is re-run
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [13]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/nematoda-clade4a/validate_intervals/Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas --scaffoldlist Parastrongyloides_trichosuri_GCA_000941615 Rhabditophanes_sp_GCA_000944355 Strongyloides_papillosus_GCA_005656395 Strongyloides_ratti_GCF_001040885 Strongyloides_venezuelensis_GCA_001028725 Strongyloides_stercoralis_GCA_000947215 --genome-base-path results/phyluce/nematoda-clade4a/cleaned_genomes --identity 30 --cores 4 --db results/phyluce/nematoda-clade4a/validate_intervals/nematoda-clade4a-to-Strongyloides_stercoralis_GCA_000947215.sqlite --output results/phyluce/nematoda-clade4a/validate_intervals/lastz/;

Running against Parastrongyloides_trichosuri_GCA_000941615.2bit
Running with the --huge option.  Chunking files into 10000000 bp...
Running the targets against 5 queries...
	/tmp/tmpST3thk.fasta
	/tmp/tmpkH08pC.fasta
	/tmp/tmpOODDfg.fasta
	/tmp/tmpgvn99j.fasta
	/tmp/tmpnfAgFi.fasta

Writing the results f

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [14]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/nematoda-clade4a/extract_probes_from_group/nematoda-clade4a_genome.conf --lastz results/phyluce/nematoda-clade4a/validate_intervals/lastz --probes 120 --probe-prefix uce_nematoda-clade4a_ --name-pattern "Strongyloides_stercoralis_GCA_000947215_+4_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/nematoda-clade4a/extract_probes_from_group/probe_fasta;
2020-02-11 14:28:09,930 - Phyluce - INFO - -- Working on Parastrongyloides_trichosuri_GCA_000941615 genome -
2020-02-11 14:28:09,934 - Phyluce - INFO - Reading Parastrongyloides_trichosuri_GCA_000941615 genome
2020-02-11 14:28:18,432 - Phyluce - INFO - Parastrongyloides_trichosuri_GCA_000941615: 1416 uces, 389 dupes, 1027 non-dupes, 9 orient drop, 30 length drop, 988 written
2020-02-11 14:28:18,432 - Phyluce - INFO - ------- Working on Rhabditophanes_sp_GCA_000944355 genome -------
2020-02-11 14:28:18,438 - Phyluce - INFO - Reading Rhabditophanes_sp_GCA_000944355 gen

In [15]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/nematoda-clade4a/extract_probes_from_group/probe_fasta --output results/phyluce/nematoda-clade4a/extract_probes_from_group/multifastas.sqlite --base-taxon Strongyloides_stercoralis_GCA_000947215;
parastrongyloides_trichosuri_gca_000941615.
rhabditophanes_sp_gca_000944355.
strongyloides_papillosus_gca_005656395.
strongyloides_ratti_gcf_001040885.
strongyloides_venezuelensis_gca_001028725..
strongyloides_stercoralis_gca_000947215..
Creating database
Inserting results
phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade4a/extract_probes_from_group/multifastas.sqlite --base-taxon Strongyloides_stercoralis_GCA_000947215;
Loci shared by 0 taxa:	1,175.0
Loci shared by 1 taxa:	1,175.0
Loci shared by 2 taxa:	1,078.0
Loci shared by 3 taxa:	1,018.0
Loci shared by 4 taxa:	980.0
Loci shared by 5 taxa:	928.0
Loci shared by 6 taxa:	626.0


In [16]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(6)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade4a/extract_probes_from_group/multifastas.sqlite  --base-taxon Strongyloides_stercoralis_GCA_000947215 --output results/phyluce/nematoda-clade4a/extract_probes_from_group/Strongyloides_stercoralis_GCA_000947215+4-back-to-6.conf --specific-counts 6;
Counter({'strongyloides_venezuelensis_gca_001028725': 626, 'parastrongyloides_trichosuri_gca_000941615': 626, 'strongyloides_ratti_gcf_001040885': 626, 'strongyloides_stercoralis_gca_000947215': 626, 'strongyloides_papillosus_gca_005656395': 626, 'rhabditophanes_sp_gca_000944355': 626})
Total loci = 626


## Final group specific bait design

In [17]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/nematoda-clade4a/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/nematoda-clade4a/extract_probes_from_group/Strongyloides_stercoralis_GCA_000947215+4-back-to-6.conf --probe-prefix uce_nematoda-clade4a_ --designer rnplattii --design nematoda-clade4a_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.fasta;
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG

In [18]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.fasta --query results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  14:29:21
Ended:  Tue Feb 11, 2020  14:29:26
Time for execution:  0.0882786830266 minutes


In [19]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.fasta --lastz results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_nematoda-clade4a_;
Parsing lastz file...
Screening results...
Screened 6783 fasta sequences.  Filtered 0 duplicates. Kept 6784.


## CDhit to reduce numbers

In [20]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.fasta
         -o
         results/phyluce/nematoda-clade4a/final_probe_design/nematoda-clade4a_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 14:31:09 2020
                            Output                              
----------------------------------------------------------------
total seq: 6784
longest and shortest : 80 and 80
Total letters: 542720
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 88M

Table limit with the given memory limit:
Max number of representatives: 3953465
Max number of word counting entries: 88992499

# comparing sequences from          0  to       1130
.---------- new table with      931 representatives
# comparin

# Clade 4b

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so:

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [59]:
#label for this clade; the results/ and data/ paths in the cells below are built from it
group = "nematoda-clade4b"

In [123]:
#WARNING: wipes every existing result for this group before recreating the directory
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)
os.makedirs(group_results_dir)

In [75]:
#genome that every other taxon in the group is compared to
reference_taxon = "Steinernema_carpocapsae_GCA_000757645"

#one Genus_species_accession label per line; whitespace-split into a list
group_taxa = """
    Acrobeloides_nanus_GCA_900406225
    Bursaphelenchus_xylophilus_GCA_004128815
    Deladenus_siricidicola_GCA_009724625
    Ditylenchus_destructor_GCA_001579705
    Ditylenchus_dipsaci_GCA_004194705
    Globodera_ellingtonae_GCA_001723225
    Globodera_pallida_GCA_000724045
    Globodera_rostochiensis_GCA_900079975
    Halicephalobus_mephisto_GCA_009193035
    Halicephalobus_sp_GCA_009761265
    Heterodera_glycines_GCA_004148225
    Heterorhabditis_bacteriophora_GCA_000223415
    Meloidogyne_arenaria_GCA_003133805
    Meloidogyne_enterolobii_GCA_003693675
    Meloidogyne_floridensis_GCA_003693605
    Meloidogyne_graminicola_GCA_002778205
    Meloidogyne_hapla_GCA_000172435
    Meloidogyne_incognita_GCA_900182535
    Meloidogyne_javanica_GCA_003693625
    Meloidogyne_luci_GCA_902706615
    Panagrellus_redivivus_GCA_000341325
    Panagrolaimus_davidi_GCA_901779475
    Panagrolaimus_sp_GCA_901766855
    Panagrolaimus_superbus_GCA_901766145
    Radopholus_similis_GCA_004764675
    Rotylenchulus_reniformis_GCA_001026735
    Steinernema_feltiae_GCA_007213375
    Steinernema_glaseri_GCA_000757755
    Steinernema_monticolum_GCA_000505645
    Steinernema_scapterisci_GCA_000757745
    Subanguina_moxae_GCA_000981365
""".split()

#the full set is the group taxa followed by the reference
all_taxa = list(group_taxa)
all_taxa.append(reference_taxon)

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [125]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_900406225.1_v1_genomic.fna.gz

sent 42 bytes  received 79686543 bytes  8388061.58 bytes/sec
total size is 79666989  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary a

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [126]:
#Rewrite every genome with simplified fasta headers, then convert the cleaned
#genome to 2bit.
#WARNING: any previous cleaned_genomes directory for this group is wiped first.
if os.path.exists("results/phyluce/" + group + "/cleaned_genomes"):
    shutil.rmtree("results/phyluce/" + group + "/cleaned_genomes")

os.makedirs("results/phyluce/" + group + "/cleaned_genomes")

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_2bit_file   = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.2bit"

    #blank the name/description of each record so that only the record id
    #should remain in the header line of the re-written fasta
    with open(i_genome_file, "rU") as infile:
        with open(o_genome_file, "w") as outfile:
            for seq in SeqIO.parse(infile, 'fasta'):
                seq.name = ""
                seq.description = ""
                outfile.write(seq.format('fasta'))

    #build the 2bit from the *cleaned* fasta written just above (it used to be
    #built from the raw download, bypassing the header clean-up), and raise if
    #faToTwoBit fails instead of silently carrying on without a .2bit file;
    #the argument list is passed directly so paths are never split on spaces
    f2bit_cmd = ["faToTwoBit", o_genome_file, o_2bit_file]
    subprocess.check_call(f2bit_cmd)

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions and, since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [127]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487756 ("sim_Acrobeloides_nanus_GCA_900406225") has been submitted
Your job 5487757 ("sim_Bursaphelenchus_xylophilus_GCA_004128815") has been submitted
Your job 5487758 ("sim_Deladenus_siricidicola_GCA_009724625") has been submitted
Your job 5487759 ("sim_Ditylenchus_destructor_GCA_001579705") has been submitted
Your job 5487760 ("sim_Ditylenchus_dipsaci_GCA_004194705") has been submitted
Your job 5487761 ("sim_Globodera_ellingtonae_GCA_001723225") has been submitted
Your job 5487762 ("sim_Globodera_pallida_GCA_000724045") has been submitted
Your job 5487763 ("sim_Globodera_rostochiensis_GCA_900079975") has been submitted
Your job 5487764 ("sim_Halicephalobus_mephisto_GCA_009193035") has been submitted
Your job 5487765 ("sim_Halicephalobus_sp_GCA_009761265") has been submitted
Your job 5487766 ("sim_Heterodera_glycines_GCA_004148225") has been submitted
Your job 5487767 ("sim_Heterorhabditis_bacteriophora_GCA_000223415") has been submitted
Your job 5487768 ("sim_Meloidogyne_a

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [31]:
#reset the mapping directory, then have bbmap index the reference genome into it
map_root = "results/phyluce/" + group + "/map_simulated_reads"

if os.path.exists(map_root):
    shutil.rmtree(map_root)
os.makedirs(map_root + "/logs")

#reference fasta in, index written under o_dir
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_root + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
#left as the last expression so the exit status is shown as the cell output
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/nematoda-clade4b/cleaned_genomes/Steinernema_carpocapsae_GCA_000757645_formatted.fas path=results/phyluce/nematoda-clade4b/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [32]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5488099 ("map_Acrobeloides_nanus_GCA_900406225") has been submitted
Your job 5488100 ("map_Bursaphelenchus_xylophilus_GCA_004128815") has been submitted
Your job 5488101 ("map_Deladenus_siricidicola_GCA_009724625") has been submitted
Your job 5488102 ("map_Ditylenchus_destructor_GCA_001579705") has been submitted
Your job 5488103 ("map_Ditylenchus_dipsaci_GCA_004194705") has been submitted
Your job 5488104 ("map_Globodera_ellingtonae_GCA_001723225") has been submitted
Your job 5488105 ("map_Globodera_pallida_GCA_000724045") has been submitted
Your job 5488106 ("map_Globodera_rostochiensis_GCA_900079975") has been submitted
Your job 5488107 ("map_Halicephalobus_mephisto_GCA_009193035") has been submitted
Your job 5488108 ("map_Halicephalobus_sp_GCA_009761265") has been submitted
Your job 5488109 ("map_Heterodera_glycines_GCA_004148225") has been submitted
Your job 5488110 ("map_Heterorhabditis_bacteriophora_GCA_000223415") has been submitted
Your job 5488111 ("map_Meloidogyne_a

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [33]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5488131 ("merge_Acrobeloides_nanus_GCA_900406225") has been submitted
Your job 5488132 ("merge_Bursaphelenchus_xylophilus_GCA_004128815") has been submitted
Your job 5488133 ("merge_Deladenus_siricidicola_GCA_009724625") has been submitted
Your job 5488134 ("merge_Ditylenchus_destructor_GCA_001579705") has been submitted
Your job 5488135 ("merge_Ditylenchus_dipsaci_GCA_004194705") has been submitted
Your job 5488136 ("merge_Globodera_ellingtonae_GCA_001723225") has been submitted
Your job 5488137 ("merge_Globodera_pallida_GCA_000724045") has been submitted
Your job 5488138 ("merge_Globodera_rostochiensis_GCA_900079975") has been submitted
Your job 5488139 ("merge_Halicephalobus_mephisto_GCA_009193035") has been submitted
Your job 5488140 ("merge_Halicephalobus_sp_GCA_009761265") has been submitted
Your job 5488141 ("merge_Heterodera_glycines_GCA_004148225") has been submitted
Your job 5488142 ("merge_Heterorhabditis_bacteriophora_GCA_000223415") has been submitted
Your job 548

Remove loci that were masked in the original genome.

In [61]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 161 sequences from Acrobeloides_nanus_GCA_900406225_merged.bed.  Filtered 150 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 11.
Screened 241 sequences from Bursaphelenchus_xylophilus_GCA_004128815_merged.bed.  Filtered 228 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 13.
Screened 220 sequences from Deladenus_siricidicola_GCA_009724625_merged.bed.  Filtered 207 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 13.
Screened 48 sequences from Ditylenchus_destructor_GCA_001579705_merged.bed.  Filtered 46 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 2.
Screened 92 sequences from Ditylenchus_dipsaci_GCA_004194705_merged.bed.  Filtered 88 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 4.
Screened 72 sequences from Globodera_ellingtonae_GCA_001723225_merged.bed.  Filtered 66 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 6.
Screened 77 sequences from Globodera_pallida_GCA_000724045_merged

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a ```conf``` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [62]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#assemble the conf in memory: a [beds] header followed by one "sample:path" line per taxon
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = "{}{}_stripped.bed".format(o_dir, sample)
    conf_lines.append("{}:{}\n".format(sample, stripped_bed))

#write everything in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [63]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/nematoda-clade4b/initial_intervals/Subanguina_moxae_GCA_000981365_merged.bed --twobit results/phyluce/nematoda-clade4b/cleaned_genomes/Steinernema_carpocapsae_GCA_000757645_formatted.2bit --output results/phyluce/nematoda-clade4b/initial_intervals/Subanguina_moxae_GCA_000981365_stripped.bed;
acrobeloides_nanus_gca_900406225.
bursaphelenchus_xylophilus_gca_004128815.
deladenus_siricidicola_gca_009724625.
ditylenchus_destructor_gca_001579705.
ditylenchus_dipsaci_gca_004194705.
globodera_ellingtonae_gca_001723225.
globodera_pallida_gca_000724045.
globodera_rostochiensis_gca_900079975.
halicephalobus_mephisto_gca_009193035.
halicephalobus_sp_gca_009761265.
heterodera_glycines_gca_004148225.
heterorhabditis_bacteriophora_gca_000223415.
meloidogyne_arenaria_gca_003133805.
meloidogyne_enterolobii_gca_003693675.
meloidogyne_floridensis_gca_003693605.
meloidogyne_graminicola_gca_002778205.
meloido

Quantify probes and the number of targeted taxa for each.

In [64]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade4b/initial_intervals/nematoda-clade4b-to-Steinernema_carpocapsae_GCA_000757645.sqlite --base-taxon Steinernema_carpocapsae_GCA_000757645
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 0 taxa:	6,132.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 1 taxa:	6,132.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 2 taxa:	1,243.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 3 taxa:	547.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 4 taxa:	318.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 5 taxa:	145.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 6 taxa:	35.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 7 taxa:	18.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 8 taxa:	14.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 9 taxa:	10.0
Loci shared by Steinernema_carpocapsae_GCA_000757645 + 10 taxa:	10.0
Loci shared by Steinernema_ca

In [66]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 2
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade4b/initial_intervals/nematoda-clade4b-to-Steinernema_carpocapsae_GCA_000757645.sqlite --base-taxon Steinernema_carpocapsae_GCA_000757645 --output results/phyluce/nematoda-clade4b/initial_intervals/Steinernema_carpocapsae_GCA_000757645_+2.bed --specific-counts 2;
Counter({'steinernema_scapterisci_gca_000757745': 1209, 'steinernema_carpocapsae_gca_000757645': 970, 'steinernema_feltiae_gca_007213375': 475, 'steinernema_glaseri_gca_000757755': 434, 'steinernema_monticolum_gca_000505645': 325, 'panagrellus_redivivus_gca_000341325': 43, 'rotylenchulus_reniformis_gca_001026735': 16, 'halicephalobus_sp_gca_009761265': 14, 'deladenus_siricidicola_gca_009724625': 13, 'bursaphelenchus_xylophilus_gca_004128815': 13, 'radopholus_similis_gca_004764675': 13, 'acrobeloides_nanus_gca_900406225': 11, 'heterodera_glycines_gca_004148225': 11, 'halicephalobus_mephisto_gca_009193035': 11, 'meloidogyne_luci_gca_902706615': 9, 'meloidogy

## Design temp set of baits

In [67]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/nematoda-clade4b/initial_intervals/Steinernema_carpocapsae_GCA_000757645_+2.bed --twobit results/phyluce/nematoda-clade4b/cleaned_genomes/Steinernema_carpocapsae_GCA_000757645_formatted.2bit --buffer-to 160 --output results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2.fasta;
Screened 1243 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 1243.


design the baits

In [68]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2.fasta --probe-prefix uce_nematoda-clade4b_ --design nematoda-clade4b_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 1231
Probe Count = 2438


## Find duplicate baited regions

In [69]:

#find duplicated probes within the probe set by a self-vs-self lastz search
#   relies on o_probes_fas, o_dir, reference_taxon, num_taxa and i_probe_prefix
#   having been set by the earlier bait-design cells
#set up input and output files
i_probes_fas   = o_probes_fas
#thresholds handed to --identity / --coverage (kept as strings for the command line)
i_identity_int = str(50)
i_coverage_int = str(50)

#lastz table is written next to the temp probes
o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

#the probe file is both target and query
phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes using the lastz table produced above
#   the command is one string literal with backslash line continuations, so the
#   indentation of the continued lines ends up inside the printed/executed command
#   NOTE(review): this cell records no path for a de-duplicated probe file and the
#   later lastz cell keeps using i_probes_fas -- confirm that is intended
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas --query results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  19:08:06
Ended:  Tue Feb 11, 2020  19:08:07
Time for execution:  0.0178224802017 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas                        --lastz results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes_vself.lastz                       --probe-prefix=uce_nematoda-clade4b_;
Parsing lastz file...
Screening results...
Screened 2437 fast

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [70]:
#give every genome its own <sample>/<sample>.2bit directory (the file is copied, not linked)
genome_root = "results/phyluce/" + group + "/cleaned_genomes/"

for sample in all_taxa:

    src_file = genome_root + sample + "_formatted.2bit"
    dst_dir  = genome_root + sample + "/"
    dst_file = dst_dir + sample + ".2bit"

    #always start from an empty per-sample directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    shutil.copy(src_file, dst_file)

Use phyluce to run lastz runs against each genome in group

In [76]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/nematoda-clade4b/validate_intervals/Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas --scaffoldlist Acrobeloides_nanus_GCA_900406225 Bursaphelenchus_xylophilus_GCA_004128815 Deladenus_siricidicola_GCA_009724625 Ditylenchus_destructor_GCA_001579705 Ditylenchus_dipsaci_GCA_004194705 Globodera_ellingtonae_GCA_001723225 Globodera_pallida_GCA_000724045 Globodera_rostochiensis_GCA_900079975 Halicephalobus_mephisto_GCA_009193035 Halicephalobus_sp_GCA_009761265 Heterodera_glycines_GCA_004148225 Heterorhabditis_bacteriophora_GCA_000223415 Meloidogyne_arenaria_GCA_003133805 Meloidogyne_enterolobii_GCA_003693675 Meloidogyne_floridensis_GCA_003693605 Meloidogyne_graminicola_GCA_002778205 Meloidogyne_hapla_GCA_000172435 Meloidogyne_incognita_GCA_900182535 Meloidogyne_javanica_GCA_003693625 Meloidogyne_luci_GCA_902706615 Panagrellus_redivivus_GCA_000341325 Panagrolaimus_davidi_GCA_901779475 Panagrolaimus_sp_GCA_901766855 Pa

## Extract sequences from group genomes

make a conf file with the following format

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [77]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/nematoda-clade4b/extract_probes_from_group/nematoda-clade4b_genome.conf --lastz results/phyluce/nematoda-clade4b/validate_intervals/lastz --probes 120 --probe-prefix uce_nematoda-clade4b_ --name-pattern "Steinernema_carpocapsae_GCA_000757645_+2_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/nematoda-clade4b/extract_probes_from_group/probe_fasta;
2020-02-11 19:27:32,256 - Phyluce - INFO - ------- Working on Acrobeloides_nanus_GCA_900406225 genome ------
2020-02-11 19:27:32,260 - Phyluce - INFO - Reading Acrobeloides_nanus_GCA_900406225 genome
2020-02-11 19:27:36,433 - Phyluce - INFO - Acrobeloides_nanus_GCA_900406225: 268 uces, 176 dupes, 92 non-dupes, 0 orient drop, 2 length drop, 90 written
2020-02-11 19:27:36,433 - Phyluce - INFO - --- Working on Bursaphelenchus_xylophilus_GCA_004128815 genome --
2020-02-11 19:27:36,434 - Phyluce - INFO - Reading Bursaphelenchus_xylophilus_GCA_004128815 genome
2020-02-11 19:

In [78]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/nematoda-clade4b/extract_probes_from_group/probe_fasta --output results/phyluce/nematoda-clade4b/extract_probes_from_group/multifastas.sqlite --base-taxon Steinernema_carpocapsae_GCA_000757645;
acrobeloides_nanus_gca_900406225.
bursaphelenchus_xylophilus_gca_004128815.
deladenus_siricidicola_gca_009724625.
ditylenchus_destructor_gca_001579705.
ditylenchus_dipsaci_gca_004194705.
globodera_ellingtonae_gca_001723225.
globodera_pallida_gca_000724045.
globodera_rostochiensis_gca_900079975.
halicephalobus_mephisto_gca_009193035.
halicephalobus_sp_gca_009761265.
heterodera_glycines_gca_004148225.
heterorhabditis_bacteriophora_gca_000223415.
meloidogyne_arenaria_gca_003133805.
meloidogyne_enterolobii_gca_003693675.
meloidogyne_floridensis_gca_003693605.
meloidogyne_graminicola_gca_002778205.
meloidogyne_hapla_gca_000172435.
meloidogyne_incognita_gca_900182535.
meloidogyne_javanica_gca_003693625.
meloidogyne_luci_gca_902706615.
panagr

In [79]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(17)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade4b/extract_probes_from_group/multifastas.sqlite  --base-taxon Steinernema_carpocapsae_GCA_000757645 --output results/phyluce/nematoda-clade4b/extract_probes_from_group/Steinernema_carpocapsae_GCA_000757645+2-back-to-17.conf --specific-counts 17;
Counter({'halicephalobus_mephisto_gca_009193035': 54, 'halicephalobus_sp_gca_009761265': 53, 'steinernema_monticolum_gca_000505645': 53, 'deladenus_siricidicola_gca_009724625': 52, 'steinernema_feltiae_gca_007213375': 50, 'bursaphelenchus_xylophilus_gca_004128815': 50, 'steinernema_carpocapsae_gca_000757645': 49, 'steinernema_glaseri_gca_000757755': 49, 'panagrellus_redivivus_gca_000341325': 48, 'steinernema_scapterisci_gca_000757745': 48, 'globodera_rostochiensis_gca_900079975': 47, 'globodera_ellingtonae_gca_001723225': 44, 'ditylenchus_destructor_gca_001579705': 44, 'panagrolaimus_sp_gca_901766855': 42, 'heterorhabditis_bacteriophora_gca_000223415': 41, 'radopholus_simi

## Final group specific bait design

In [80]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/nematoda-clade4b/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/nematoda-clade4b/extract_probes_from_group/Steinernema_carpocapsae_GCA_000757645+2-back-to-17.conf --probe-prefix uce_nematoda-clade4b_ --designer rnplattii --design nematoda-clade4b_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.fasta;
NGGGGGGGGGGGGGNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGNNGGGGGGGGNNGGGNNGGGGGGGGGGGGGGGNGGGGGGGGGGGGG


Conserved locus count = 59
Probe Count = 2237


In [81]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.fasta --query results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  19:30:02
Ended:  Tue Feb 11, 2020  19:30:05
Time for execution:  0.0525559147199 minutes


In [82]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.fasta --lastz results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_nematoda-clade4b_;
Parsing lastz file...
Screening results...
Screened 2236 fasta sequences.  Filtered 0 duplicates. Kept 2237.


## CDhit to reduce numbers

In [83]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.fasta
         -o
         results/phyluce/nematoda-clade4b/final_probe_design/nematoda-clade4b_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 19:31:13 2020
                            Output                              
----------------------------------------------------------------
total seq: 2237
longest and shortest : 80 and 80
Total letters: 178960
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 86M

Table limit with the given memory limit:
Max number of representatives: 3961192
Max number of word counting entries: 89166441

# comparing sequences from          0  to        372
---------- new table with      325 representatives
# comparing

# Clade 5

## Prep directory and group info

Create a clean directory to work from and build a list of all the taxa to include.  Desired taxa should include genus, species, and assembly accession like so: 

```
Genus_species_GCA_123456789
```

In addition, for each "group" (be it a genus, phylum, etc.) we need to designate a `reference_taxon`.  This is the species/genome that all the others will be compared to.  Ideally it will be from a high quality genome.  

I used NCBI Genomes to identify all available __"representative"__ assemblies for each group.  I extracted the accession numbers and generated a csv file that looks like:

```
$> cat data/genomes/trematode_accessions.csv
genus,species,accession
Clonorchis,sinensis,GCA_003604175.1
Dicrocoelium,dendriticum,GCA_000950715.1
Echinostoma,caproni,GCA_900618425.1
Fasciola,gigantica,GCA_006461475.1
Fasciola,hepatica,GCA_900302435.1
Fasciolopsis,buski,GCA_008360955.1
```

In [36]:
#name of the group being processed; used to build every results/ and data/ path below
group = "nematoda-clade5"

In [142]:
#wipe any previous results for this group and start with an empty directory
group_results_dir = "results/phyluce/" + group

if os.path.exists(group_results_dir):
    shutil.rmtree(group_results_dir)

os.makedirs(group_results_dir)

In [37]:
#taxa in this group, one "Genus_species_<assembly accession>" entry per genome
#   (accession without its version suffix); the reference taxon is NOT part of this
#   list -- it is appended separately when all_taxa is built below
#   NOTE(review): these names are used verbatim to build file paths in other cells, so
#   spellings must match the names the genome files were given -- verify before editing
group_taxa = [ "Ancylostoma_caninum_GCA_003336725",
               "Ancylostoma_ceylanicum_GCA_000688135",
               "Ancylostoma_duodenale_GCA_000816745",
               "Angiostrongylus_cantonensis_GCA_009735665",
               "Angiostrongylus_costaricensis_GCA_900624975",
               "Caenorhabditis_angaria_GCA_000165025",
               "Caenorhabditis_becei_GCA_900536315",
               "Caenorhabditis_bovis_GCA_902636285",
               "Caenorhabditis_brenneri_GCA_000143925",
               "Caenorhabditis_briggsae_GCA_000004555",
               "Caenorhabditis_elegans_GCF_000002985",
               "Caenorhabditis_inopinata_GCA_003052745",
               "Caenorhabditis_japonica_GCA_000147155",
               "Caenorhabditis_latens_GCA_002259235",
               "Caenorhabditis_nigoni_GCA_002742825",
               "Caenorhabditis_panamensis_GCA_900536275",
               "Caenorhabditis_remanei_GCF_000149515",
               "Caenorhabditis_sp_GCA_900536305",
               "Caenorhabditis_tropicalis_GCA_000186765",
               "Cylicostephanus_goldi_GCA_900617965",
               "Dictyocaulus_viviparus_GCA_000816705",
               "Diploscapter_coronatus_GCA_002207785",
               "Diploscapter_pachys_GCA_002287525",
               "Haemonchus_placei_GCA_900617895",
               "Heligmosomoides_polygyrus_GCA_900096555",
               "Heterorhabditis_bacteriophora_GCA_000223415",
               "Mesorhabditis_belari_GCA_900631915",
               "Micoletzkya_japonica_GCA_900490955",
               "Necator_americanus_GCF_000507365",
               "Nippostrongylus_brasiliensis_GCA_900200055",
               "Oesophagostomum_dentatum_GCA_000797555",
               "Oscheius_sp_GCA_000934875",
               "Oscheius_tipulae_GCA_900184235",
               "Parapristionchus_giblindavisi_GCA_900491355",
               "Pristionchus_arcanus_GCA_900490705",
               "Pristionchus_entomophagus_GCA_900490825",
               "Pristionchus_exspctatus_GCA_900380275",
               "Pristionchus_fissidentatus_GCA_900490895",
               "Pristionchus_japonicus_GCA_900490845",
               "Pristionchus_maxplancki_GCA_900490775",
               "Pristionchus_mayeri_GCA_900490875",
               "Pristionchus_pacificus_GCA_000180635",
               "Rhabditida_sp_GCA_004026265",
               "Strongylus_vulgaris_GCA_900624965",
               "Teladorsagia_circumcincta_GCA_002352805" ]
                    
#genome that all other taxa in the group are compared to
reference_taxon = "Haemonchus_contortus_GCA_007637855"


#every genome handled for this group: the taxa above plus the reference (reference last)
all_taxa = group_taxa + [ reference_taxon ]

## Download genomes

Read from the accessions `data/genomes/*_accessions.csv` then find the ftp link, download, and format the genome file name in a standard format `Genus_species_GCA_123456789`

For sanity's sake, I have saved all the ftp info in a csv file stored in `data/genomes/*group*`

In [144]:
#get all of the genomes and gffs for a single representative
if os.path.exists("data/genomes/" + group):
    shutil.rmtree("data/genomes/" + group + "/")

os.makedirs("data/genomes/" + group + "/")

accession_df = pd.read_csv("data/" + group +  "_accessions.csv")
accession_w_ftp=[]

for index, row in accession_df.iterrows():
    
    #get the ftp site from the accession
    ftp=!(esearch -db assembly -query {row["accession"]} | esummary | xtract -pattern DocumentSummary -element FtpPath_GenBank)
    ftp=ftp[0].replace("ftp://", "")
    
    #generate a fas and download
    fas_ftp_link = ftp + "/" + ftp.split("/")[-1] + "_genomic.fna.gz"
    fas_rsync_cmd="rsync --copy-links --times --verbose rsync://{} data/genomes/{}".format(fas_ftp_link, group)
   
    !{fas_rsync_cmd}

    #create "clean/standardized" name for gff and fas
    #Genus_species_GCA_12345678
    clean_fas_name = "_".join([ str(row["genus"]), str(row["species"]), row["accession"].split(".")[0]]) + ".fas.gz"

    orig_fas_name = ftp.split("/")[-1] + "_genomic.fna.gz"

    os.rename("data/genomes/" + group + "/" + orig_fas_name, "data/genomes/" + group + "/" + clean_fas_name)
    
    #finally make a table documenting changes
    #accession, fas_ftp, gff_ftp, original file name, updated file name
    xfer_info = [row["accession"], ftp, orig_fas_name, clean_fas_name]
    accession_w_ftp.append(xfer_info)
    
#convert the tableto a df
ftp_df=pd.DataFrame(accession_w_ftp, columns = ["accession", 
                                                "ftp", 
                                                "orig_fas_name", 
                                                "new_fas_name"])

#save to csv
ftp_df.to_csv("data/genomes/" + group + "/ftp_info.csv", index=False)

#finally uncompress all of the gzipped files (will be easier to manage donw the line)
!gunzip -f data/genomes/{group}/*gz



You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may result in
disciplinary action and civil and criminal penalties. System users have no
expectation of privacy regarding any communications or data processed by this
system. At any time, the government may monitor, record, or seize any
communication or data transiting or stored on this information system.

-------------------------------------------------------------------------------

Welcome to the NCBI rsync server.


GCA_003336725.1_A_caninum_9.3.2.ec.cg.pg_genomic.fna.gz

sent 42 bytes  received 134450487 bytes  9959298.44 bytes/sec
total size is 134417543  speedup is 1.00


You are accessing a U.S. Government information system which includes this
computer, network, and all attached devices. This system is for
Government-authorized use only. Unauthorized use of this system may 

## Clean up and format genome

Further process the genomes so that the headers only contain the accession and then the entire genome is converted into 2bit.  These are reqs. for the `phyluce` pipeline.

In [145]:
cleaned_dir = "results/phyluce/" + group + "/cleaned_genomes"

#always rebuild the cleaned genomes from scratch
if os.path.exists(cleaned_dir):
    shutil.rmtree(cleaned_dir)

os.makedirs(cleaned_dir)

for sample in all_taxa:

    i_genome_file = "data/genomes/" + group + "/" + sample + ".fas"
    o_genome_file = cleaned_dir + "/" + sample + "_formatted.fas"
    o_2bit_file   = cleaned_dir + "/" + sample + "_formatted.2bit"

    #rewrite every record with its name and description blanked out
    #plain "r" mode: the "U" flag is deprecated and rejected by Python >= 3.11
    with open(i_genome_file, "r") as infile, open(o_genome_file, "w") as outfile:
        for seq in SeqIO.parse(infile, 'fasta'):
            seq.name = ""
            seq.description = ""
            outfile.write(seq.format('fasta'))

    #convert to 2bit (note: built from the downloaded fasta, not the re-headered copy)
    f2bit_cmd = "faToTwoBit {} {}".format(i_genome_file, o_2bit_file)
    subprocess.call(f2bit_cmd.split(" "))

## Simulate and map reads at 25x coverage

Use `art` to simulate 25x coverage of error-free reads for each species/genome.  In the next step these reads will all be mapped against the reference taxon to find conserved regions where we can map reads from multiple different species.  

A quick comment about coverage: In most cases 4-6x is acceptable. I have chosen 25x coverage because it should provide 4bp resolution on conserved orthologous regions, and since my genomes are relatively small, I can trade compute time for resolution.

__NOTE__: Here and in several other places throughout the notebook(s), jobs are not run on the local machine but are instead sent to our cluster via PBS.  In these cases you cannot proceed to the next step in the pipeline/notebook until all jobs from the current step have completed.  In some cases this means waiting up to a few hours.  These steps are identified in giant/commented block text. 

In [146]:
if os.path.exists("results/phyluce/" + group + "/simulated_reads"):
    shutil.rmtree("results/phyluce/" + group + "/simulated_reads")

os.makedirs("results/phyluce/" + group + "/simulated_reads/logs")

for sample in group_taxa:
    i_genome_file = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "_formatted.fas"
    o_prefix      = "results/phyluce/" + group + "/simulated_reads/" + sample + "_"
        
    art_cmd = str("art_illumina "
                      "--paired "
                      "--len 100 "
                      "--fcov 25 "
                      "--mflen 200 "
                      "--sdev 150 "
                      "-ir 0.0 "
                      "-ir2 0.0 "
                      "-dr 0.0 "
                      "-dr2 0.0 "
                      "-qs 100 "
                      "-qs2 100 "
                      "-na "
                      "--in {} "
                      "--out {};").format(i_genome_file, o_prefix)

    #now set up qsub
    jid= "sim_" + sample
    log= "results/phyluce/" + group + "/simulated_reads/logs/sim_" + sample + ".log"
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -pe smp 12".format(jid, log)

    #conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #and submit to the queue
    combined_cmd ="echo \"" + conda_cmd + art_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}

    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487883 ("sim_Ancylostoma_caninum_GCA_003336725") has been submitted
Your job 5487884 ("sim_Ancylostoma_ceylanicum_GCA_000688135") has been submitted
Your job 5487885 ("sim_Ancylostoma_duodenale_GCA_000816745") has been submitted
Your job 5487886 ("sim_Angiostrongylus_cantonensis_GCA_009735665") has been submitted
Your job 5487887 ("sim_Angiostrongylus_costaricensis_GCA_900624975") has been submitted
Your job 5487888 ("sim_Caenorhabditis_angaria_GCA_000165025") has been submitted
Your job 5487889 ("sim_Caenorhabditis_becei_GCA_900536315") has been submitted
Your job 5487890 ("sim_Caenorhabditis_bovis_GCA_902636285") has been submitted
Your job 5487891 ("sim_Caenorhabditis_brenneri_GCA_000143925") has been submitted
Your job 5487892 ("sim_Caenorhabditis_briggsae_GCA_000004555") has been submitted
Your job 5487893 ("sim_Caenorhabditis_elegans_GCF_000002985") has been submitted
Your job 5487894 ("sim_Caenorhabditis_inopinata_GCA_003052745") has been submitted
Your job 5487895 ("s

Before we can map the reads to the reference taxon genome we need to index it with `bbmap`

In [147]:
map_dir = "results/phyluce/{}/map_simulated_reads".format(group)

#recreate the mapping directory (and its logs/ subdir) from scratch
if os.path.exists(map_dir):
    shutil.rmtree(map_dir)

os.makedirs(map_dir + "/logs")

#generate a bbmap index of the reference genome inside the mapping directory
i_ref_genome = "results/phyluce/{}/cleaned_genomes/{}_formatted.fas".format(group, reference_taxon)
o_dir        = map_dir + "/"

bbmap_cmd = " ".join(["bbmap.sh", "ref=" + i_ref_genome, "path=" + o_dir])

print(bbmap_cmd)
subprocess.call(bbmap_cmd.split(" "))

bbmap.sh ref=results/phyluce/nematoda-clade5/cleaned_genomes/Haemonchus_contortus_GCA_007637855_formatted.fas path=results/phyluce/nematoda-clade5/map_simulated_reads/


0

Now we can use bbmap to align reads from all species to the reference genome.  In this case we are allowing up to 10% divergence (`minid=0.90`) between the reference genome and simulated read.  Generally this number is smaller and may be adjusted for other groups; however, given the diversity of some groups, a larger number may identify more orthologous regions.

In [148]:
#map reads to reference genome   

"results/phyluce/" + group + "/map_simulated_reads/logs"

for sample in group_taxa:
    #
    #input and output files
    i_sim_r1_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_1.fq"
    i_sim_r2_fas  = "results/phyluce/" + group + "/simulated_reads/" + sample + "_2.fq"
    i_ref_genome  = i_ref_genome
    o_sim_r12_fas = "results/phyluce/" + group + "/simulated_reads/" + sample + ".fq"
    o_bbmap_sam   = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"

    #merge r1 and r2 simulated reads
    cat_cmd = "cat {} {} >{}; ".format(i_sim_r1_fas, i_sim_r2_fas, o_sim_r12_fas)

    #map with bbmap (via qsub)
    bbmap_cmd = str("bbmap.sh t=12 "
                        "in={} "
                        "ref={} "
                        "path=results/phyluce/" + group + "/map_simulated_reads/ "
                        "out={} "
                        "minid=0.90 "
                        "mappedonly=t "
                        "ignorebadquality=t; ").format(o_sim_r12_fas, 
                                                       i_ref_genome, 
                                                       o_bbmap_sam)

    #set up qsub
    jid= "map_" + sample
    log= "results/phyluce/" + group + "/map_simulated_reads/logs/map_" + sample + ".log"
    hold = "sim_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)
    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    #submit to the queue
    combined_cmd ='echo \"' + conda_cmd + " " + cat_cmd + "  " + bbmap_cmd + '\" | ' + qsub_cmd
    #combined_cmd ='echo \"' + conda_cmd + " " + bbmap_cmd + '\" | ' + qsub_cmd

    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487928 ("map_Ancylostoma_caninum_GCA_003336725") has been submitted
Your job 5487929 ("map_Ancylostoma_ceylanicum_GCA_000688135") has been submitted
Your job 5487930 ("map_Ancylostoma_duodenale_GCA_000816745") has been submitted
Your job 5487931 ("map_Angiostrongylus_cantonensis_GCA_009735665") has been submitted
Your job 5487932 ("map_Angiostrongylus_costaricensis_GCA_900624975") has been submitted
Your job 5487933 ("map_Caenorhabditis_angaria_GCA_000165025") has been submitted
Your job 5487934 ("map_Caenorhabditis_becei_GCA_900536315") has been submitted
Your job 5487935 ("map_Caenorhabditis_bovis_GCA_902636285") has been submitted
Your job 5487936 ("map_Caenorhabditis_brenneri_GCA_000143925") has been submitted
Your job 5487937 ("map_Caenorhabditis_briggsae_GCA_000004555") has been submitted
Your job 5487938 ("map_Caenorhabditis_elegans_GCF_000002985") has been submitted
Your job 5487939 ("map_Caenorhabditis_inopinata_GCA_003052745") has been submitted
Your job 5487940 ("m

## Use mapped reads to find orthologous loci

`bbmap` aligns all reads and stores the info in a `.sam` file.  Here we use a few filtering steps to remove unmapped reads, and then sort and merge mapped reads for each species/sample.  

In [149]:
if os.path.exists("results/phyluce/" + group + "/initial_intervals"):
    shutil.rmtree("results/phyluce/" + group + "/initial_intervals")

os.makedirs("results/phyluce/" + group + "/initial_intervals/logs")

#map reads to reference genome   
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"
    
    #sam to (filtered) bam
    i_sam = "results/phyluce/" + group + "/map_simulated_reads/" + sample + ".sam"
    o_filt_bam = o_dir + sample + ".bam"
    
    sam_cmd = "samtools view -b -F 4 {} >{};".format(i_sam, o_filt_bam)
  
    #bam to bed
    i_filt_bam = o_filt_bam
    o_bed      = o_dir + sample + ".bed"
    
    bed_to_bam_cmd = "bedtools bamtobed -i {} >{};".format(i_filt_bam, o_bed)
    
    #sort bed
    i_bed        = o_bed
    o_sorted_bed = o_dir + sample +  "_sorted.bed"
    
    sort_bed_cmd = "bedtools sort -i {} >{};".format(i_bed, o_sorted_bed)
    
    #merge bed
    i_sorted_bed   = o_sorted_bed
    o_interval_bed = o_dir + sample +  "_merged.bed"
    
    merge_cmd = "bedtools merge -i {} >{};".format(i_sorted_bed, o_interval_bed)
    
    #set up qsub
    jid= "merge_" + sample
    log= o_dir + "logs/merge_" + sample + ".log"
    hold = "map_" + sample
    qsub_cmd = "qsub -V -cwd -S /bin/bash -q all.q -j y -N {} -o {} -hold_jid {} -pe smp 12".format(jid, 
                                                                                                    log, 
                                                                                                    hold)

    #and conda
    conda_cmd = "conda activate pathogen_probes-phyluce; "

    bed_cmd = sam_cmd + bed_to_bam_cmd + sort_bed_cmd + merge_cmd
    
    combined_cmd ="echo \"" + conda_cmd + bed_cmd + "\" | " + qsub_cmd
    
    #print(combined_cmd)
    !{combined_cmd}
    
#################################################################################################
#  __          __     _____ _______   ______ ____  _____     ____  _    _ ______ _    _ ______  #
#  \ \        / /\   |_   _|__   __| |  ____/ __ \|  __ \   / __ \| |  | |  ____| |  | |  ____| #
#   \ \  /\  / /  \    | |    | |    | |__ | |  | | |__) | | |  | | |  | | |__  | |  | | |__    #
#    \ \/  \/ / /\ \   | |    | |    |  __|| |  | |  _  /  | |  | | |  | |  __| | |  | |  __|   #
#     \  /\  / ____ \ _| |_   | |    | |   | |__| | | \ \  | |__| | |__| | |____| |__| | |____  #
#      \/  \/_/    \_\_____|  |_|    |_|    \____/|_|  \_\  \___\_\\____/|______|\____/|______| #
#                                                                                               #
#################################################################################################
#wait_on_running_jobs()

Your job 5487973 ("merge_Ancylostoma_caninum_GCA_003336725") has been submitted
Your job 5487974 ("merge_Ancylostoma_ceylanicum_GCA_000688135") has been submitted
Your job 5487975 ("merge_Ancylostoma_duodenale_GCA_000816745") has been submitted
Your job 5487976 ("merge_Angiostrongylus_cantonensis_GCA_009735665") has been submitted
Your job 5487977 ("merge_Angiostrongylus_costaricensis_GCA_900624975") has been submitted
Your job 5487978 ("merge_Caenorhabditis_angaria_GCA_000165025") has been submitted
Your job 5487979 ("merge_Caenorhabditis_becei_GCA_900536315") has been submitted
Your job 5487980 ("merge_Caenorhabditis_bovis_GCA_902636285") has been submitted
Your job 5487981 ("merge_Caenorhabditis_brenneri_GCA_000143925") has been submitted
Your job 5487982 ("merge_Caenorhabditis_briggsae_GCA_000004555") has been submitted
Your job 5487983 ("merge_Caenorhabditis_elegans_GCF_000002985") has been submitted
Your job 5487984 ("merge_Caenorhabditis_inopinata_GCA_003052745") has been submit

Remove loci that fall in masked regions of the reference genome.

In [38]:
#strip repeats
for sample in group_taxa:
    
    o_dir = "results/phyluce/" + group + "/initial_intervals/"

    #input and output files
    i_merged_bed   = o_dir + sample + "_merged.bed"
    i_genome_2bit  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
    o_stripped_bed = o_dir + sample + "_stripped.bed"

    #repeat strip w/ phyluce
    phyluce_cmd = str("phyluce_probe_strip_masked_loci_from_set "
                          "--filter-mask 0.25 "
                          "--min-length 160 "
                          "--bed {} "
                          "--twobit {} " 
                          "--output {};").format(i_merged_bed,
                                                 i_genome_2bit,
                                                 o_stripped_bed)
    !{phyluce_cmd}
   

Screened 1581 sequences from Ancylostoma_caninum_GCA_003336725_merged.bed.  Filtered 1433 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 148.
Screened 1547 sequences from Ancylostoma_ceylanicum_GCA_000688135_merged.bed.  Filtered 1413 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 134.
Screened 1478 sequences from Ancylostoma_duodenale_GCA_000816745_merged.bed.  Filtered 1345 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 133.
Screened 903 sequences from Angiostrongylus_cantonensis_GCA_009735665_merged.bed.  Filtered 852 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 51.
Screened 753 sequences from Angiostrongylus_costaricensis_GCA_900624975_merged.bed.  Filtered 706 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 47.
Screened 123 sequences from Caenorhabditis_angaria_GCA_000165025_merged.bed.  Filtered 120 with > 25.0% masked bases or > 0 N-bases or < 160 length. Kept 3.
Screened 146 sequences from Caenorha

## Find conserved loci across taxa

We used phyluce to find regions that are consistently mapped ("conserved") across various taxa.

First, make a `conf` file that indicates where the stripped bed files (from above) are stored. 

```
[beds]
agrPla1:agrPla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
anoGla1:anoGla1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
denPon1:denPon1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
lepDec1:lepDec1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
ontTau1:ontTau1-to-triCas1-MAPPING.bam.sort.merge.strip.bed
```

In [39]:
o_dir      = "results/phyluce/{}/initial_intervals/".format(group)
o_bed_conf = "{}{}_bed-files.conf".format(o_dir, group)

#assemble the file contents first: the [beds] header, then one
#"taxon:path-to-stripped-bed" line per taxon in the group
conf_lines = ["[beds]\n"]

for sample in group_taxa:
    stripped_bed = o_dir + sample + "_stripped.bed"
    conf_lines.append(sample + ":" + stripped_bed + "\n")

#write everything in one go
with open(o_bed_conf, "w") as outfile:
    outfile.writelines(conf_lines)
 

Then merge all of the data into a single ```sqlite``` db.

In [40]:
#make output file
o_dir      = "results/phyluce/" + group + "/initial_intervals/"
i_bed_conf = o_dir + group + "_bed-files.conf"

#now run phyluce to get table of orthologous sequeneces across the genus
o_sqlite_db = o_dir + group + "-to-" + reference_taxon + ".sqlite"

phyluce_merge_cmd=str("phyluce_probe_get_multi_merge_table "
                          "--conf {} "
                          "--base-taxon {} "
                          "--output {};").format(o_bed_conf,
                                                 reference_taxon,
                                                 o_sqlite_db)
print(phyluce_cmd)
!{phyluce_merge_cmd}

phyluce_probe_strip_masked_loci_from_set --filter-mask 0.25 --min-length 160 --bed results/phyluce/nematoda-clade5/initial_intervals/Teladorsagia_circumcincta_GCA_002352805_merged.bed --twobit results/phyluce/nematoda-clade5/cleaned_genomes/Haemonchus_contortus_GCA_007637855_formatted.2bit --output results/phyluce/nematoda-clade5/initial_intervals/Teladorsagia_circumcincta_GCA_002352805_stripped.bed;
ancylostoma_caninum_gca_003336725.
ancylostoma_ceylanicum_gca_000688135.
ancylostoma_duodenale_gca_000816745.
angiostrongylus_cantonensis_gca_009735665.
angiostrongylus_costaricensis_gca_900624975.
caenorhabditis_angaria_gca_000165025.
caenorhabditis_becei_gca_900536315.
caenorhabditis_bovis_gca_902636285.
caenorhabditis_brenneri_gca_000143925.
caenorhabditis_briggsae_gca_000004555.
caenorhabditis_elegans_gcf_000002985.
caenorhabditis_inopinata_gca_003052745.
caenorhabditis_japonica_gca_000147155.
caenorhabditis_latens_gca_002259235.
caenorhabditis_nigoni_gca_002742825.
caenorhabditis_pana

Quantify probes and the number of targeted taxa for each.

In [41]:
#get genus and species
i_sqlite_db = o_sqlite_db

phyluce_query_cmd = "phyluce_probe_query_multi_merge_table --db {} --base-taxon {}".format(i_sqlite_db,
                                                                                           reference_taxon)

print(phyluce_query_cmd)
!{phyluce_query_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade5/initial_intervals/nematoda-clade5-to-Haemonchus_contortus_GCA_007637855.sqlite --base-taxon Haemonchus_contortus_GCA_007637855
Loci shared by Haemonchus_contortus_GCA_007637855 + 0 taxa:	2,710.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 1 taxa:	2,710.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 2 taxa:	272.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 3 taxa:	182.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 4 taxa:	140.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 5 taxa:	108.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 6 taxa:	90.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 7 taxa:	75.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 8 taxa:	56.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 9 taxa:	43.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 10 taxa:	29.0
Loci shared by Haemonchus_contortus_GCA_007637855 + 11 taxa:	24.0
Loci s

In [42]:
o_dir      = "results/phyluce/" + group + "/initial_intervals/"

#set up input and output files
num_taxa = 1
i_sqlite_db = o_sqlite_db
out_bed = o_dir + reference_taxon + "_+" + str(num_taxa) + ".bed"

phyluce_cmd = str("phyluce_probe_query_multi_merge_table "
                      "--db {} "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db,
                                                      reference_taxon,
                                                      out_bed,
                                                      num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_merge_table --db results/phyluce/nematoda-clade5/initial_intervals/nematoda-clade5-to-Haemonchus_contortus_GCA_007637855.sqlite --base-taxon Haemonchus_contortus_GCA_007637855 --output results/phyluce/nematoda-clade5/initial_intervals/Haemonchus_contortus_GCA_007637855_+1.bed --specific-counts 1;
Counter({'teladorsagia_circumcincta_gca_002352805': 2588, 'nippostrongylus_brasiliensis_gca_900200055': 153, 'ancylostoma_caninum_gca_003336725': 148, 'ancylostoma_duodenale_gca_000816745': 134, 'ancylostoma_ceylanicum_gca_000688135': 134, 'necator_americanus_gcf_000507365': 108, 'oesophagostomum_dentatum_gca_000797555': 87, 'strongylus_vulgaris_gca_900624965': 75, 'angiostrongylus_cantonensis_gca_009735665': 51, 'cylicostephanus_goldi_gca_900617965': 47, 'angiostrongylus_costaricensis_gca_900624975': 47, 'dictyocaulus_viviparus_gca_000816705': 40, 'oscheius_tipulae_gca_900184235': 15, 'caenorhabditis_elegans_gcf_000002985': 14, 'caenorhabditis_japonica_gca_000147155'

## Design temp set of baits

In [43]:
o_dir = "results/phyluce/" + group + "/validate_intervals/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.makedirs(o_dir)


i_2bit_genome  = "results/phyluce/" + group + "/cleaned_genomes/" + reference_taxon + "_formatted.2bit"
i_interval_bed = "results/phyluce/" + group + "/initial_intervals/" + reference_taxon + "_+" + str(num_taxa) + ".bed"
i_bp_buffer    = 160
o_interval_fas = o_dir + reference_taxon + "_+" + str(num_taxa) + ".fasta"

phyluce_cmd = str("phyluce_probe_get_genome_sequences_from_bed "
                      "--bed {} "
                      "--twobit {} "
                      "--buffer-to {} "
                      "--output {};").format(i_interval_bed,
                                             i_2bit_genome,
                                             i_bp_buffer,
                                             o_interval_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_genome_sequences_from_bed --bed results/phyluce/nematoda-clade5/initial_intervals/Haemonchus_contortus_GCA_007637855_+1.bed --twobit results/phyluce/nematoda-clade5/cleaned_genomes/Haemonchus_contortus_GCA_007637855_formatted.2bit --buffer-to 160 --output results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1.fasta;
Screened 2710 sequences.  Filtered 0 < 160 bp or with > 25.0% masked bases or > 0 N-bases. Kept 2710.


design the baits

In [44]:
i_interval_fas   = o_interval_fas
i_probe_prefix   = "uce_" + group + "_"
i_design_txt     = group + "_v1"
i_designer_txt   = "rnplattii"
i_tiling_density = "3"
o_probes_fas     = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas"



phyluce_cmd = str("phyluce_probe_get_tiled_probes "
                      "--input {} "
                      "--probe-prefix {} "
                      "--design {} "
                      "--designer {} "
                      "--tiling-density {} "
                      "--two-probes "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--output {}").format(i_interval_fas, 
                                            i_probe_prefix, 
                                            i_design_txt, 
                                            i_designer_txt, 
                                            i_tiling_density,
                                            o_probes_fas)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probes --input results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1.fasta --probe-prefix uce_nematoda-clade5_ --design nematoda-clade5_v1 --designer rnplattii --tiling-density 3 --two-probes --overlap middle --masking 0.25 --remove-gc --output results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas
Probes removed for masking (.) / low GC % (G) / ambiguous bases (N):
GGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 2707
Probe Count = 5397


## Find duplicate baited regions

In [45]:

#find duplicated probes within the probes set by a self v self lastz search
#(the temp probe fasta is passed as both --target and --query)
#set up input and output files
i_probes_fas   = o_probes_fas
#thresholds handed to --identity and --coverage (as strings)
i_identity_int = str(50)
i_coverage_int = str(50)

#lastz results are written next to the probes, under o_dir
o_probe_lastz  = o_dir + reference_taxon + "_+" + str(num_taxa) + "_temp_probes_vself.lastz"

phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity {} "
                      "--coverage {} "
                      "--output {};").format(i_probes_fas, 
                                             i_probes_fas, 
                                             i_identity_int, 
                                             i_coverage_int, 
                                             o_probe_lastz)

print(phyluce_cmd)
!{phyluce_cmd}


#remove duplicated probes, using the lastz hits found above
#NOTE(review): this command is given no explicit output path, so the screened probe
#set is presumably written to a new file chosen by phyluce; the later lastz cell
#still reads i_probes_fas (the unscreened file) -- confirm that is intended
#the backslash continuations below are inside the string literal, so the printed
#command contains the long runs of spaces seen in the output
phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz \
                      --fasta {}  \
                      --lastz {} \
                      --probe-prefix={};").format(i_probes_fas, 
                                                  o_probe_lastz,
                                                  i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}


phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas --query results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas --identity 50 --coverage 50 --output results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes_vself.lastz;
Started:  Tue Feb 11, 2020  15:53:13
Ended:  Tue Feb 11, 2020  15:53:15
Time for execution:  0.0468909343084 minutes
phyluce_probe_remove_duplicate_hits_from_probes_using_lastz                       --fasta results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas                        --lastz results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes_vself.lastz                       --probe-prefix=uce_nematoda-clade5_;
Parsing lastz file...
Screening results...
Screened 5396 fasta sequences.  Filtere

## Find orthologous seqs in other taxa (via lastz)

Create a modified genome dir structure to work with phyluce

In [46]:
# Give every taxon its own sub-directory containing "<taxon>.2bit".
# The 2bit file is copied (not linked) from "<taxon>_formatted.2bit".
genome_root = "results/phyluce/{}/cleaned_genomes/".format(group)

for sample in all_taxa:

    src_file = "{}{}_formatted.2bit".format(genome_root, sample)
    dst_dir  = "{}{}/".format(genome_root, sample)
    dst_file = "{}{}.2bit".format(dst_dir, sample)

    # always start from an empty per-taxon directory
    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    os.mkdir(dst_dir)

    copy(src_file, dst_file)

Use phyluce to align the probes against each genome in the group with lastz.

In [47]:
i_probes_fas    = i_probes_fas
i_genome_list   = " ".join(all_taxa)
i_genome_dir    = "results/phyluce/" + group + "/cleaned_genomes"
i_percident_int = str(30)
i_ncores_int    = str(4)
o_sqlite_db     = o_dir + group + "-to-" + reference_taxon + ".sqlite"
o_lastz_dir     = o_dir + "lastz/"

phyluce_cmd = str("phyluce_probe_run_multiple_lastzs_sqlite "
                      "--probefile {} "
                      "--scaffoldlist {} "
                      "--genome-base-path {} "
                      "--identity {} "
                      "--cores {} "
                      "--db {} "
                      "--output {};").format(i_probes_fas, 
                                             i_genome_list, 
                                             i_genome_dir,
                                             i_percident_int,
                                             i_ncores_int,
                                             o_sqlite_db,
                                             o_lastz_dir)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_run_multiple_lastzs_sqlite --probefile results/phyluce/nematoda-clade5/validate_intervals/Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas --scaffoldlist Ancylostoma_caninum_GCA_003336725 Ancylostoma_ceylanicum_GCA_000688135 Ancylostoma_duodenale_GCA_000816745 Angiostrongylus_cantonensis_GCA_009735665 Angiostrongylus_costaricensis_GCA_900624975 Caenorhabditis_angaria_GCA_000165025 Caenorhabditis_becei_GCA_900536315 Caenorhabditis_bovis_GCA_902636285 Caenorhabditis_brenneri_GCA_000143925 Caenorhabditis_briggsae_GCA_000004555 Caenorhabditis_elegans_GCF_000002985 Caenorhabditis_inopinata_GCA_003052745 Caenorhabditis_japonica_GCA_000147155 Caenorhabditis_latens_GCA_002259235 Caenorhabditis_nigoni_GCA_002742825 Caenorhabditis_panamensis_GCA_900536275 Caenorhabditis_remanei_GCF_000149515 Caenorhabditis_sp_GCA_900536305 Caenorhabditis_tropicalis_GCA_000186765 Cylicostephanus_goldi_GCA_900617965 Dictyocaulus_viviparus_GCA_000816705 Diploscapter_coronatus_GCA_002207785 Diplos

## Extract sequences from group genomes

Make a conf file that maps each genome name to its 2bit file, in the following format:

```
[scaffolds]
menMol1:/path/to/uce-coleoptera/genomes/menMol1/menMol1.2bit
agrPla1:/path/to/uce-coleoptera/genomes/agrPla1/agrPla1.2bit
anoGla1:/path/to/uce-coleoptera/genomes/anoGla1/anoGla1.2bit
denPon1:/path/to/uce-coleoptera/genomes/denPon1/denPon1.2bit
lepDec1:/path/to/uce-coleoptera/genomes/lepDec1/lepDec1.2bit
ontTau1:/path/to/uce-coleoptera/genomes/ontTau1/ontTau1.2bit
triCas1:/path/to/uce-coleoptera/genomes/triCas1/triCas1.2bit
```

In [48]:
o_dir = "results/phyluce/" + group + "/extract_probes_from_group/"
if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

o_scaff_conf = o_dir + group + "_genome.conf"

in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"
with open(o_scaff_conf, "w") as outfile:

    #print the header line
    outfile.write("[scaffolds]\n")

    #print bedinfo for each species for the genus
    for sample in all_taxa:
        in_2bit_dest  = "results/phyluce/" + group + "/cleaned_genomes/" + sample + "/" + sample + ".2bit"

        outfile.write(sample + ":" + in_2bit_dest + "\n")     


i_scaff_conf   = o_scaff_conf
i_lastz        = "results/phyluce/" + group + "/validate_intervals/lastz"
i_bp_int       = str(120)
i_name_pattern = "\"" + reference_taxon + "_+" + str(num_taxa) + "_temp_probes.fas_v_{}.lastz.clean\""
i_probe_prefix = "uce_" + group + "_"
o_probe_dir    = o_dir + "probe_fasta"

phyluce_cmd = str("phyluce_probe_slice_sequence_from_genomes "
                      "--conf {} "
                      "--lastz {} "
                      "--probes {} "
                      "--probe-prefix {} "
                      "--name-pattern {} "
                      "--output {};").format(i_scaff_conf, 
                                             i_lastz, 
                                             i_bp_int,
                                             i_probe_prefix,
                                             i_name_pattern, 
                                             o_probe_dir)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_slice_sequence_from_genomes --conf results/phyluce/nematoda-clade5/extract_probes_from_group/nematoda-clade5_genome.conf --lastz results/phyluce/nematoda-clade5/validate_intervals/lastz --probes 120 --probe-prefix uce_nematoda-clade5_ --name-pattern "Haemonchus_contortus_GCA_007637855_+1_temp_probes.fas_v_{}.lastz.clean" --output results/phyluce/nematoda-clade5/extract_probes_from_group/probe_fasta;
2020-02-11 16:38:16,061 - Phyluce - INFO - ------ Working on Ancylostoma_caninum_GCA_003336725 genome ------
2020-02-11 16:38:16,070 - Phyluce - INFO - Reading Ancylostoma_caninum_GCA_003336725 genome
2020-02-11 16:38:49,995 - Phyluce - INFO - Ancylostoma_caninum_GCA_003336725: 2196 uces, 690 dupes, 1506 non-dupes, 3 orient drop, 15 length drop, 1488 written
2020-02-11 16:38:49,995 - Phyluce - INFO - ----- Working on Ancylostoma_ceylanicum_GCA_000688135 genome ----
2020-02-11 16:38:49,998 - Phyluce - INFO - Reading Ancylostoma_ceylanicum_GCA_000688135 genome
2020-02-11 16:39:2

In [49]:
o_dir       = "results/phyluce/" + group + "/extract_probes_from_group/"
o_probe_dir = o_probe_dir
o_sqlite_db = o_dir + "multifastas.sqlite"

phyluce_cmd = str("phyluce_probe_get_multi_fasta_table "
                      "--fastas {} "
                      "--output {} "
                      "--base-taxon {};").format(o_probe_dir, 
                                                 o_sqlite_db, 
                                                 reference_taxon)
print(phyluce_cmd)
!{phyluce_cmd}

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {} "
                      "--base-taxon {};").format(o_sqlite_db, 
                                                 reference_taxon)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_multi_fasta_table --fastas results/phyluce/nematoda-clade5/extract_probes_from_group/probe_fasta --output results/phyluce/nematoda-clade5/extract_probes_from_group/multifastas.sqlite --base-taxon Haemonchus_contortus_GCA_007637855;
ancylostoma_caninum_gca_003336725..
ancylostoma_ceylanicum_gca_000688135..
ancylostoma_duodenale_gca_000816745..
angiostrongylus_cantonensis_gca_009735665..
angiostrongylus_costaricensis_gca_900624975..
caenorhabditis_angaria_gca_000165025.
caenorhabditis_becei_gca_900536315.
caenorhabditis_bovis_gca_902636285.
caenorhabditis_brenneri_gca_000143925.
caenorhabditis_briggsae_gca_000004555.
caenorhabditis_elegans_gcf_000002985.
caenorhabditis_inopinata_gca_003052745.
caenorhabditis_japonica_gca_000147155.
caenorhabditis_latens_gca_002259235.
caenorhabditis_nigoni_gca_002742825.
caenorhabditis_panamensis_gca_900536275.
caenorhabditis_remanei_gcf_000149515.
caenorhabditis_sp_gca_900536305.
caenorhabditis_tropicalis_gca_000186765.
cylicostephanus

In [50]:
i_sqlite_db     = o_sqlite_db
target_num_taxa = str(38)
o_conf          = "".join([o_dir, reference_taxon, "+", str(num_taxa), "-back-to-", target_num_taxa, ".conf"])

phyluce_cmd = str("phyluce_probe_query_multi_fasta_table "
                      "--db {}  "
                      "--base-taxon {} "
                      "--output {} "
                      "--specific-counts {};").format(i_sqlite_db, 
                                                      reference_taxon, 
                                                      o_conf, 
                                                      target_num_taxa)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_query_multi_fasta_table --db results/phyluce/nematoda-clade5/extract_probes_from_group/multifastas.sqlite  --base-taxon Haemonchus_contortus_GCA_007637855 --output results/phyluce/nematoda-clade5/extract_probes_from_group/Haemonchus_contortus_GCA_007637855+1-back-to-38.conf --specific-counts 38;
Counter({'caenorhabditis_nigoni_gca_002742825': 60, 'caenorhabditis_panamensis_gca_900536275': 60, 'dictyocaulus_viviparus_gca_000816705': 60, 'haemonchus_placei_gca_900617895': 60, 'caenorhabditis_bovis_gca_902636285': 60, 'caenorhabditis_inopinata_gca_003052745': 60, 'ancylostoma_ceylanicum_gca_000688135': 60, 'necator_americanus_gcf_000507365': 59, 'angiostrongylus_costaricensis_gca_900624975': 59, 'heterorhabditis_bacteriophora_gca_000223415': 59, 'caenorhabditis_tropicalis_gca_000186765': 59, 'caenorhabditis_becei_gca_900536315': 58, 'caenorhabditis_latens_gca_002259235': 58, 'heligmosomoides_polygyrus_gca_900096555': 58, 'oscheius_tipulae_gca_900184235': 58, 'caenorhabditis_

## Final group specific bait design

In [51]:
o_dir       = "results/phyluce/" + group + "/final_probe_design/"

if os.path.exists(o_dir):
    shutil.rmtree(o_dir)

os.mkdir(o_dir)

i_probe_dir = o_probe_dir
i_conf = o_conf
i_probe_prefix = "uce_" + group + "_"
i_designer_txt = "rnplattii"
i_design_txt   = group + "_v1"
i_tiling_density = "3"

o_probes_fas     = o_dir + i_design_txt + "-master_probe_list.fasta"


phyluce_cmd = str("phyluce_probe_get_tiled_probe_from_multiple_inputs "
                      "--fastas {} "
                      "--multi-fasta-output {} "
                      "--probe-prefix {} "
                      "--designer {} "
                      "--design {} "
                      "--probe-length 80 "
                      "--tiling-density 3 "
                      "--overlap middle "
                      "--masking 0.25 "
                      "--remove-gc "
                      "--two-probes "
                      "--output {};").format(i_probe_dir, 
                                             i_conf, i_probe_prefix, 
                                             i_designer_txt, 
                                             i_design_txt, 
                                             o_probes_fas)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_get_tiled_probe_from_multiple_inputs --fastas results/phyluce/nematoda-clade5/extract_probes_from_group/probe_fasta --multi-fasta-output results/phyluce/nematoda-clade5/extract_probes_from_group/Haemonchus_contortus_GCA_007637855+1-back-to-38.conf --probe-prefix uce_nematoda-clade5_ --designer rnplattii --design nematoda-clade5_v1 --probe-length 80 --tiling-density 3 --overlap middle --masking 0.25 --remove-gc --two-probes --output results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.fasta;
NGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG


Conserved locus count = 60
Probe Count = 4725


In [52]:
i_probes_fas   = o_probes_fas
o_lastz_tbl    = o_dir + i_design_txt + "-master_probe_list-TO-SELF-PROBES.lastz"


phyluce_cmd = str("phyluce_probe_easy_lastz "
                      "--target {} "
                      "--query {} "
                      "--identity 50 "
                      "--coverage 50 "
                      "--output {};").format(o_probes_fas, 
                                             o_probes_fas, 
                                             o_lastz_tbl)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_easy_lastz --target results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.fasta --query results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.fasta --identity 50 --coverage 50 --output results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list-TO-SELF-PROBES.lastz;
Started:  Tue Feb 11, 2020  16:50:40
Ended:  Tue Feb 11, 2020  16:50:55
Time for execution:  0.256766732534 minutes


In [53]:
i_lastz_tbl = o_lastz_tbl
i_probe_prefix = "uce_" + group + "_"

phyluce_cmd = str("phyluce_probe_remove_duplicate_hits_from_probes_using_lastz "
                      "--fasta {} "
                      "--lastz {} "
                      "--probe-prefix={};").format(i_probes_fas, 
                                                   i_lastz_tbl, 
                                                   i_probe_prefix)

print(phyluce_cmd)
!{phyluce_cmd}

phyluce_probe_remove_duplicate_hits_from_probes_using_lastz --fasta results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.fasta --lastz results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list-TO-SELF-PROBES.lastz --probe-prefix=uce_nematoda-clade5_;
Parsing lastz file...
Screening results...
Screened 4724 fasta sequences.  Filtered 0 duplicates. Kept 4725.


## CD-HIT to reduce probe numbers

In [54]:
i_probe_fasta = o_probes_fas 
o_clust_prfx = o_probes_fas.replace("fasta", "95P_cdhit")

cdhit_cmd = "cd-hit-est -c 0.95 -G 1 -T 4 -i {} -o {}".format(i_probe_fasta, o_clust_prfx)

!{cdhit_cmd}

Program: CD-HIT, V4.8.1 (+OpenMP), Oct 26 2019, 14:51:47
Command: cd-hit-est -c 0.95 -G 1 -T 4 -i
         results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.fasta
         -o
         results/phyluce/nematoda-clade5/final_probe_design/nematoda-clade5_v1-master_probe_list.95P_cdhit

Started: Tue Feb 11 16:56:43 2020
                            Output                              
----------------------------------------------------------------
total seq: 4725
longest and shortest : 80 and 80
Total letters: 378000
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 4 X 12M = 48M
Table           : 2 X 16M = 33M
Miscellaneous   : 4M
Total           : 87M

Table limit with the given memory limit:
Max number of representatives: 3956965
Max number of word counting entries: 89071294

# comparing sequences from          0  to        787
---------- new table with      615 representatives
# comparing seq