# Phylogeny of Phacelia section Glandulosae in South America



In [1]:
#import programs you need
import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp
import pandas as pd
import toytree
import toyplot

##print Version of ipyrad and toytree 
print("ipyrad v. {}".format(ip.__version__))
print("toytree v. {}".format(toytree.__version__))

##print version of Python
from platform import python_version
print("Python v.", python_version())

ipyrad v. 0.9.84
toytree v. 2.0.5
Python v. 3.10.5


### Parallel processes on independent Python kernels
To start a parallel client you must run the command-line program 'ipcluster'. This will essentially start a number of independent Python processes (kernels) which we can then send bits of work to do. The cluster can be stopped and restarted independently of this notebook, which is convenient for working on a cluster where connecting to many cores is not always immediately available.

Open a terminal and type the following command to start an ipcluster instance with N engines.

In [None]:
## ipcluster start --n=8

In [3]:
## connect to cluster
ipyclient = ipp.Client()
print(ip.cluster_info(ipyclient))

Parallel connection | Cryptantha: 64 cores
None


In [3]:
## Provide a name for the assembly 
data = ip.Assembly("Phacelia_ingroup")

New Assembly: Phacelia_ingroup


In [4]:
## set parameters
data.set_params("project_dir", "./Phac_Assembly_ingroup")
data.set_params("sorted_fastq_path", "./Phac_fastq_ingroup/*.gz")
data.set_params("clust_threshold", 0.85)
data.set_params("max_Hs_consens", 0.05)
data.set_params("max_SNPs_locus", 0.2)
data.set_params("restriction_overhang", ('TGCAG', 'GGCC'))
data.set_params("output_formats", "*")
data.set_params("datatype", "ddrad")

## see / print all parameters
data.get_params()

0   assembly_name               Phacelia_ingroup                             
1   project_dir                 ./Phac_Assembly_ingroup                      
2   raw_fastq_path                                                           
3   barcodes_path                                                            
4   sorted_fastq_path           ./Phac_fastq_ingroup/*.gz                    
5   assembly_method             denovo                                       
6   reference_sequence                                                       
7   datatype                    ddrad                                        
8   restriction_overhang        ('TGCAG', 'GGCC')                            
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                               

### Assemble the data from step 1 to 6

In [5]:
## run steps 1 & 2 of the assembly
data.run("12", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:04 | loading reads        | s1 |
[####################] 100% 0:00:34 | processing reads     | s2 |


In [6]:
##use data.branch to make branches with different cluster thresholds
## set cluster treshold to 85 && run assembly steps 3-6
data_clust85 = data.branch("data_clust85")
data_clust85.set_params("clust_threshold", 0.85)
data_clust85.run("3456", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:05 | dereplicating        | s3 |
[####################] 100% 0:17:18 | clustering/mapping   | s3 |
[####################] 100% 0:00:03 | building clusters    | s3 |
[####################] 100% 0:00:00 | chunking clusters    | s3 |
[####################] 100% 0:03:45 | aligning clusters    | s3 |
[####################] 100% 0:00:31 | concat clusters      | s3 |
[####################] 100% 0:00:01 | calc cluster stats   | s3 |
[####################] 100% 0:00:25 | inferring [H, E]     | s4 |
[####################] 100% 0:00:01 | calculating depths   | s5 |
[####################] 100% 0:00:02 | chunking clusters    | s5 |
[####################] 100% 0:00:45 | consens calling      | s5 |
[####################] 100% 0:00:01 | indexing alleles     | s5 |
[####################] 100% 0:00:01 | concatenating inputs | s6 |
[####################] 100% 0:00:35 | clustering across    | s6 |
[####################] 100% 0:00:

In [7]:
#show assemby stats until step 6
data_clust85.stats.sort_values(by=['hetero_est'])
data_clust85.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,158947,22156,0.004,0.002,21383
W5599,6,1136178,1135732,120337,10088,0.005,0.002,9614
W5610,6,1280762,1280243,171586,14465,0.008,0.002,13507
W5612,6,1299484,1298953,63731,11303,0.005,0.002,10723
W5636,6,899974,899681,53396,10781,0.006,0.002,10259
W5637,6,792010,791701,71385,10000,0.006,0.002,9431
W6001,6,1611986,1611340,84415,11877,0.006,0.002,11228
W6021_out,6,1553643,1552821,236570,18216,0.014,0.002,16106
W6024,6,1544846,1543628,199755,28576,0.015,0.002,25013
W6027,6,1257435,1257027,75042,11508,0.009,0.002,10648


In [8]:
## set cluster treshold to 90 && run assembly steps 3-6
data_clust90 = data.branch("data_clust90")
data_clust90.set_params("clust_threshold", 0.90)
data_clust90.run("3456", force = True)


Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:05 | dereplicating        | s3 |
[####################] 100% 0:19:57 | clustering/mapping   | s3 |
[####################] 100% 0:00:02 | building clusters    | s3 |
[####################] 100% 0:00:00 | chunking clusters    | s3 |
[####################] 100% 0:03:35 | aligning clusters    | s3 |
[####################] 100% 0:00:32 | concat clusters      | s3 |
[####################] 100% 0:00:01 | calc cluster stats   | s3 |
[####################] 100% 0:00:26 | inferring [H, E]     | s4 |
[####################] 100% 0:00:01 | calculating depths   | s5 |
[####################] 100% 0:00:02 | chunking clusters    | s5 |
[####################] 100% 0:00:46 | consens calling      | s5 |
[####################] 100% 0:00:01 | indexing alleles     | s5 |
[####################] 100% 0:00:02 | concatenating inputs | s6 |
[####################] 100% 0:00:37 | clustering across    | s6 |
[####################] 100% 0:00:

In [9]:
## show assemby stats until step 6
data_clust90.stats.sort_values(by=['reads_consens'])
data_clust90.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,163995,22219,0.003,0.002,21680
W5599,6,1136178,1135732,123345,10153,0.003,0.002,9856
W5610,6,1280762,1280243,176933,14443,0.005,0.002,13916
W5612,6,1299484,1298953,66612,11414,0.004,0.002,11026
W5636,6,899974,899681,55403,10889,0.004,0.002,10552
W5637,6,792010,791701,73786,10021,0.003,0.002,9681
W6001,6,1611986,1611340,87605,11959,0.004,0.002,11546
W6021_out,6,1553643,1552821,244224,18204,0.009,0.002,16641
W6024,6,1544846,1543628,209322,28888,0.011,0.002,26480
W6027,6,1257435,1257027,77996,11498,0.006,0.002,10993


In [10]:
## set cluster treshold to 95 && run assembly steps 3-6
data_clust95 = data.branch("data_clust95")
data_clust95.set_params("clust_threshold", 0.95)
data_clust95.run("3456", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:05 | dereplicating        | s3 |
[####################] 100% 0:22:58 | clustering/mapping   | s3 |
[####################] 100% 0:00:02 | building clusters    | s3 |
[####################] 100% 0:00:00 | chunking clusters    | s3 |
[####################] 100% 0:03:41 | aligning clusters    | s3 |
[####################] 100% 0:00:33 | concat clusters      | s3 |
[####################] 100% 0:00:02 | calc cluster stats   | s3 |
[####################] 100% 0:00:24 | inferring [H, E]     | s4 |
[####################] 100% 0:00:02 | calculating depths   | s5 |
[####################] 100% 0:00:02 | chunking clusters    | s5 |
[####################] 100% 0:00:44 | consens calling      | s5 |
[####################] 100% 0:00:02 | indexing alleles     | s5 |
[####################] 100% 0:00:02 | concatenating inputs | s6 |
[####################] 100% 0:00:42 | clustering across    | s6 |
[####################] 100% 0:00:

In [11]:
## show assemby stats until step 6
data_clust95.stats.sort_values(by=['reads_consens'])
data_clust95.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,175155,22294,0.002,0.002,22032
W5599,6,1136178,1135732,128661,10194,0.001,0.002,10106
W5610,6,1280762,1280243,185142,14463,0.003,0.002,14278
W5612,6,1299484,1298953,72782,11484,0.002,0.002,11351
W5636,6,899974,899681,59872,10975,0.002,0.002,10860
W5637,6,792010,791701,78327,10103,0.002,0.002,9965
W6001,6,1611986,1611340,94375,12034,0.002,0.002,11856
W6021_out,6,1553643,1552821,260061,17980,0.004,0.002,17502
W6024,6,1544846,1543628,225735,28610,0.006,0.002,27815
W6027,6,1257435,1257027,83824,11509,0.003,0.002,11342


### Final assembly with different min_samples_locus settings for different analyses

In case coming back to continue from here, load assembly object to continue after step 6 by using the command: data_clust85=ip.load_json("/home/marianna/Documents/Phacelia/Phac_Assembly/data_clust85.json")

But first lets exclude samples with low read number (< 1000 reads after step 6), which are outsite the target group or with odd placements in preliminary analysis:
Samples with low read number are:
>Phacelia_setigera_var_humahuaquense_W6371

In [7]:
## exclude samples from assembly with ...
keep_list = [i for i in data.samples.keys() if i not in [
    ## exclude samples with low read number (< 5000 )
    "W6371",
]]

## make a new data branch from the keep_list #excl stands for exclude
excl_clust85 = data_clust85.branch("excl_clust85", subsamples = keep_list, force = True)

excl_clust90 = data_clust90.branch("excl_clust90", subsamples = keep_list, force = True)

excl_clust95 = data_clust95.branch("excl_clust95", subsamples = keep_list, force = True)


## double check taxon sampling
#data.stats.sort_values(by=['reads_consens'])
excl_clust85.stats

NameError: name 'data' is not defined

In [13]:

## double check taxon sampling
#data.stats.sort_values(by=['reads_consens'])
excl_clust90.stats


Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,163995,22219,0.003,0.002,21680
W5599,6,1136178,1135732,123345,10153,0.003,0.002,9856
W5610,6,1280762,1280243,176933,14443,0.005,0.002,13916
W5612,6,1299484,1298953,66612,11414,0.004,0.002,11026
W5636,6,899974,899681,55403,10889,0.004,0.002,10552
W5637,6,792010,791701,73786,10021,0.003,0.002,9681
W6001,6,1611986,1611340,87605,11959,0.004,0.002,11546
W6021_out,6,1553643,1552821,244224,18204,0.009,0.002,16641
W6024,6,1544846,1543628,209322,28888,0.011,0.002,26480
W6027,6,1257435,1257027,77996,11498,0.006,0.002,10993


In [14]:

## double check taxon sampling
#data.stats.sort_values(by=['reads_consens'])
excl_clust95.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,175155,22294,0.002,0.002,22032
W5599,6,1136178,1135732,128661,10194,0.001,0.002,10106
W5610,6,1280762,1280243,185142,14463,0.003,0.002,14278
W5612,6,1299484,1298953,72782,11484,0.002,0.002,11351
W5636,6,899974,899681,59872,10975,0.002,0.002,10860
W5637,6,792010,791701,78327,10103,0.002,0.002,9965
W6001,6,1611986,1611340,94375,12034,0.002,0.002,11856
W6021_out,6,1553643,1552821,260061,17980,0.004,0.002,17502
W6024,6,1544846,1543628,225735,28610,0.006,0.002,27815
W6027,6,1257435,1257027,83824,11509,0.003,0.002,11342


### Step 7 of the assembly for the three clustering thresholds 85 90 95 without missing data

In [15]:
## run final assembly without outgroups and no missing data allowed for the ingroup
excl_clust85.set_params("min_samples_locus", 4) 
excl_clust85.run("7", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:04 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:04 | writing vcf output   | s7 |


In [16]:
excl_clust85.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,158947,22156,0.004,0.002,21383
W5599,6,1136178,1135732,120337,10088,0.005,0.002,9614
W5610,6,1280762,1280243,171586,14465,0.008,0.002,13507
W5612,6,1299484,1298953,63731,11303,0.005,0.002,10723
W5636,6,899974,899681,53396,10781,0.006,0.002,10259
W5637,6,792010,791701,71385,10000,0.006,0.002,9431
W6001,6,1611986,1611340,84415,11877,0.006,0.002,11228
W6021_out,6,1553643,1552821,236570,18216,0.014,0.002,16106
W6024,6,1544846,1543628,199755,28576,0.015,0.002,25013
W6027,6,1257435,1257027,75042,11508,0.009,0.002,10648


In [17]:
## run final assembly without outgroups and no missing data allowed for the ingroup
excl_clust90.set_params("min_samples_locus", 4) 
excl_clust90.run("7", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:00 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:04 | writing vcf output   | s7 |


In [18]:
excl_clust90.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,163995,22219,0.003,0.002,21680
W5599,6,1136178,1135732,123345,10153,0.003,0.002,9856
W5610,6,1280762,1280243,176933,14443,0.005,0.002,13916
W5612,6,1299484,1298953,66612,11414,0.004,0.002,11026
W5636,6,899974,899681,55403,10889,0.004,0.002,10552
W5637,6,792010,791701,73786,10021,0.003,0.002,9681
W6001,6,1611986,1611340,87605,11959,0.004,0.002,11546
W6021_out,6,1553643,1552821,244224,18204,0.009,0.002,16641
W6024,6,1544846,1543628,209322,28888,0.011,0.002,26480
W6027,6,1257435,1257027,77996,11498,0.006,0.002,10993


In [19]:
## run final assembly without outgroups and no missing data allowed for the ingroup
excl_clust95.set_params("min_samples_locus", 4) 
excl_clust95.run("7", force = True)

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:00 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:03 | writing vcf output   | s7 |


In [20]:
excl_clust95.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
W5145,6,2021555,2020701,175155,22294,0.002,0.002,22032
W5599,6,1136178,1135732,128661,10194,0.001,0.002,10106
W5610,6,1280762,1280243,185142,14463,0.003,0.002,14278
W5612,6,1299484,1298953,72782,11484,0.002,0.002,11351
W5636,6,899974,899681,59872,10975,0.002,0.002,10860
W5637,6,792010,791701,78327,10103,0.002,0.002,9965
W6001,6,1611986,1611340,94375,12034,0.002,0.002,11856
W6021_out,6,1553643,1552821,260061,17980,0.004,0.002,17502
W6024,6,1544846,1543628,225735,28610,0.006,0.002,27815
W6027,6,1257435,1257027,83824,11509,0.003,0.002,11342


### Calculate different sets of missing data and then run the step 7 of the assembly for each one of the three clustering thresholds

We will first write the results of the percentage loop into a dictionary which can subsequently be used in the following steps instead of making the dictionary by hand

In [16]:
## first check number of remaining samples 
ingroup = excl_clust85.stats.state.count() - 1
print("Number of ingroup taxa:", ingroup)
print("Calculate different sets of missing data:")

## for loop to calculate different values for min_sample_locus
percent = [10, 15, 20, 25, 30, 35, 40]
for i in percent:
    res = ingroup / 100 * i
    print(i,"% = ", round(res))

Number of ingroup taxa: 23
Calculate different sets of missing data:
10 % =  2
15 % =  3
20 % =  5
25 % =  6
30 % =  7
35 % =  8
40 % =  9


### Step 7 of the assembly for clustering threshold 85% with different sets of missing data

In [17]:
# Cluster Treshold 85

## Run the final assembly step 7 through for loop with different min_sample_locus
## based on estimated number of remaining samples MINUS outgroup

## make a dictionary with the percentage of missing data as keys and 
## the actual min_sample_locus specified as values based on the number of "ingroup samples"
sample_dict = {10: 2,
               15: 3,
               20: 5,
               25: 6,
               30: 7,
               35: 8,
               40: 9}

## loop over the dictionary 
for key, value in sample_dict.items():
    newname = "pops{}_clust85".format(key)
    newdata = excl_clust85.branch(newname)
    newdata.populations = {
        "ingroup":  (value, [i for i in newdata.samples if "out" not in i]),
        "outgroup": (0,     [i for i in newdata.samples if "out" in i]),
         }
    ## run final step on every interation of the loop
    newdata.run("7", force = True)

Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:01 | applying filters     | s7 |
[####################] 100% 0:00:04 | building arrays      | s7 |
[####################] 100% 0:00:04 | writing conversions  | s7 |
[####################] 100% 0:00:02 | indexing vcf depths  | s7 |
[####################] 100% 0:00:06 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:01 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[####################] 100% 0:00:02 | indexing vcf depths  | s7 |
[####################] 100% 0:00:05 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:01 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[############

### Step 7 of the aasembly for clustering threshold 90% with different sets of missing data

In [18]:
# Cluster Treshold 90

## Run the final assembly step 7 through for loop with different min_sample_locus
## based on estimated number of remaining samples MINUS outgroup

## make a dictionary with the percentage of missing data as keys and 
## the actual min_sample_locus specified as values based on the number of "ingroup samples"
sample_dict = {10: 2,
               15: 3,
               20: 5,
               25: 6,
               30: 7,
               35: 8,
               40: 9}

## loop over the dictionary 
for key, value in sample_dict.items():
    newname = "pops{}_clust90".format(key)
    newdata = excl_clust90.branch(newname)
    newdata.populations = {
        "ingroup":  (value, [i for i in newdata.samples if "out" not in i]),
        "outgroup": (0,     [i for i in newdata.samples if "out" in i]),
         }
    ## run final step on every interation of the loop
    newdata.run("7", force = True)

Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:04 | writing conversions  | s7 |
[####################] 100% 0:00:03 | indexing vcf depths  | s7 |
[####################] 100% 0:00:06 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:04 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[####################] 100% 0:00:02 | indexing vcf depths  | s7 |
[####################] 100% 0:00:05 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:01 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[############

### Step 7 of the aasembly for clustering threshold 95% with different sets of missing data

In [19]:
# Cluster Treshold 95

## Run the final assembly step 7 through for loop with different min_sample_locus
## based on estimated number of remaining samples MINUS outgroup

## make a dictionary with the percentage of missing data as keys and 
## the actual min_sample_locus specified as values based on the number of "ingroup samples"
sample_dict = {10: 2,
               15: 3,
               20: 5,
               25: 6,
               30: 7,
               35: 8,
               40: 9}

## loop over the dictionary 
for key, value in sample_dict.items():
    newname = "pops{}_clust95".format(key)
    newdata = excl_clust95.branch(newname)
    newdata.populations = {
        "ingroup":  (value, [i for i in newdata.samples if "out" not in i]),
        "outgroup": (0,     [i for i in newdata.samples if "out" in i]),
         }
    ## run final step on every interation of the loop
    newdata.run("7", force = True)

Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:04 | writing conversions  | s7 |
[####################] 100% 0:00:02 | indexing vcf depths  | s7 |
[####################] 100% 0:00:05 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:03 | writing conversions  | s7 |
[####################] 100% 0:00:02 | indexing vcf depths  | s7 |
[####################] 100% 0:00:04 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 40 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[############

### Step 7 - Creating an assembly with 50% misssing data which we will use for Structure analysis, PCA and Treemix

In [8]:
## load assembly object when comming back
excl_clust85 = ip.load_json("./Phac_Assembly_ingroup/excl_clust85.json")
excl_clust90 = ip.load_json("./Phac_Assembly_ingroup/excl_clust90.json")
excl_clust95 = ip.load_json("./Phac_Assembly_ingroup/excl_clust95.json")

loading Assembly: excl_clust85
from saved path: ~/Documents/Phacelia/Phac_Assembly_ingroup/excl_clust85.json
loading Assembly: excl_clust90
from saved path: ~/Documents/Phacelia/Phac_Assembly_ingroup/excl_clust90.json
loading Assembly: excl_clust95
from saved path: ~/Documents/Phacelia/Phac_Assembly_ingroup/excl_clust95.json


In [9]:
## first check number of remaining samples 
ingroup = excl_clust85.stats.state.count() - 1
print("Number of ingroup taxa:", ingroup)
print("Calculate different sets of missing data:")

## for loop to calculate different values for min_sample_locus
percent = [10, 15, 20, 25, 30, 35, 40, 50]
for i in percent:
    res = ingroup / 100 * i
    print(i,"% = ", round(res))

Number of ingroup taxa: 23
Calculate different sets of missing data:
10 % =  2
15 % =  3
20 % =  5
25 % =  6
30 % =  7
35 % =  8
40 % =  9
50 % =  12


In [11]:
## create a branch for outputs with min_samples = 12 (50% of missing data) for clustering threshold 85%
min12_clust85 = excl_clust85.branch("min12_clust85")
min12_clust85.set_params("min_samples_locus", 12)
min12_clust85.run("7")

## create a branch for outputs with min_samples = 12 (50% of missing data) for clustering threshold 90%
min12_clust90 = excl_clust90.branch("min12_clust90")
min12_clust90.set_params("min_samples_locus", 12)
min12_clust90.run("7")

## create a branch for outputs with min_samples = 12 (50% of missing data) for clustering threshold 95%
min12_clust95 = excl_clust95.branch("min12_clust95")
min12_clust95.set_params("min_samples_locus", 12)
min12_clust95.run("7")

Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:06 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:01 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:03 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:00 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:01 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:03 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 64 cores
[####################] 100% 0:00:00 | applying filters     | s7 |
[####################] 100% 0:00:02 | building arrays      | s7 |
[####################] 100% 0:00:02 | writing conversions  | s7 |
[############

## RAxML

### here we run RAxML analyses with only outgroup 6021(the rest were excluded from the assembly) for the three different clustering thresholds

RAxML is performed in the terminal and you don't have to run it in a specific enviroment

First create an .sh file in the terminal using the vim text editor. Then put this file in the folder with the .phy files that you want to run in parallel.

In the terminal we did the .sh executable using the command "chmod +x RAxML_ddRAD.sh" To execute the file we used the command "./RAxML_ddRAD.sh"

The text file contains the following:

/bin/bash
set -e
set -u
set -o pipefail

for i in *.phy
do raxml -f a -s $i -m GTRGAMMA -p 54321 -x 123456 -N 200 -T 20 -n $i
done


T stands for the cores. Raxml does not work better if you use all the cores available. -N stands for bootstrap and we could try to increase it up to 1000 maybe

### Plotting Raxml trees


In [20]:
## Load trees clust85_ingroup_20220930
tre15 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops15_clust85.phy")
tre20 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops20_clust85.phy")
tre25 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops25_clust85.phy")
tre30 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops30_clust85.phy")
tre35 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops35_clust85.phy")
tre40 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops40_clust85.phy")

tre15 = tre15.root(names=["Phacelia_artemisioides_W6021_out"])
tre20 = tre20.root(names=["Phacelia_artemisioides_W6021_out"])
tre25 = tre25.root(names=["Phacelia_artemisioides_W6021_out"])
tre30 = tre30.root(names=["Phacelia_artemisioides_W6021_out"])
tre35 = tre35.root(names=["Phacelia_artemisioides_W6021_out"])
tre40 = tre40.root(names=["Phacelia_artemisioides_W6021_out"])


## set dimensions of the canvas
canvas = toyplot.Canvas(width = 2000, height = 2000)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '47.5%'))
ax1 = canvas.cartesian(bounds=('33%', '63%', '5%',  '47.5%'))
ax2 = canvas.cartesian(bounds=('66%', '96%', '5%',  '47.5%'))
ax3 = canvas.cartesian(bounds=('2%',  '30%', '50%', '97.5%'))
ax4 = canvas.cartesian(bounds=('33%', '63%', '50%', '97.5%'))
ax5 = canvas.cartesian(bounds=('66%', '96%', '50%', '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "14px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
tre15.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre20.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre25.ladderize(1).draw(
    axes = ax2,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre30.ladderize(1).draw(
    axes = ax3,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre35.ladderize(1).draw(
    axes = ax4,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre40.ladderize(1).draw(
    axes = ax5,
    **style,
    node_sizes = 0,
    node_labels = 'support');

## hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;
ax3.show = False; ax4.show = False; ax5.show = False;

## add names for the single trees
canvas.text(1000, 50, 'RAxML - Clustering threshold 85 %', style = {"font-size": "24px"})
canvas.text(150, 125, '85 % missing data', style={"font-size": "18px"})
canvas.text(800, 125, '80 % missing data', style={"font-size": "18px"})
canvas.text(1450, 125, '75 % missing data', style={"font-size": "18px"})
canvas.text(150, 1025, '70 % missing data', style={"font-size": "18px"})
canvas.text(800, 1025, '65 % missing data', style={"font-size": "18px"})
canvas.text(1450, 1025, '60 % missing data', style={"font-size": "18px"});


In [21]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/marianna/Documents/Phacelia/Figures/RAxML__clust85ingroup_20220930.pdf");

In [26]:
## Load trees clust90_ingroup_20220930
tre15 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops15_clust90.phy")
tre20 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops20_clust90.phy")
tre25 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops25_clust90.phy")
tre30 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops30_clust90.phy")
tre35 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops35_clust90.phy")
tre40 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops40_clust90.phy")

tre15 = tre15.root(names=["Phacelia_artemisioides_W6021_out"])
tre20 = tre20.root(names=["Phacelia_artemisioides_W6021_out"])
tre25 = tre25.root(names=["Phacelia_artemisioides_W6021_out"])
tre30 = tre30.root(names=["Phacelia_artemisioides_W6021_out"])
tre35 = tre35.root(names=["Phacelia_artemisioides_W6021_out"])
tre40 = tre40.root(names=["Phacelia_artemisioides_W6021_out"])


## set dimensions of the canvas
canvas = toyplot.Canvas(width = 2000, height = 2000)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '47.5%'))
ax1 = canvas.cartesian(bounds=('33%', '63%', '5%',  '47.5%'))
ax2 = canvas.cartesian(bounds=('66%', '96%', '5%',  '47.5%'))
ax3 = canvas.cartesian(bounds=('2%',  '30%', '50%', '97.5%'))
ax4 = canvas.cartesian(bounds=('33%', '63%', '50%', '97.5%'))
ax5 = canvas.cartesian(bounds=('66%', '96%', '50%', '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "14px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
tre15.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre20.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre25.ladderize(1).draw(
    axes = ax2,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre30.ladderize(1).draw(
    axes = ax3,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre35.ladderize(1).draw(
    axes = ax4,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre40.ladderize(1).draw(
    axes = ax5,
    **style,
    node_sizes = 0,
    node_labels = 'support');

## hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;
ax3.show = False; ax4.show = False; ax5.show = False;

## add names for the single trees
canvas.text(1000, 50, 'RAxML - Clustering threshold 90 %', style = {"font-size": "24px"})
canvas.text(150, 125, '85 % missing data', style={"font-size": "18px"})
canvas.text(800, 125, '80 % missing data', style={"font-size": "18px"})
canvas.text(1450, 125, '75 % missing data', style={"font-size": "18px"})
canvas.text(150, 1025, '70 % missing data', style={"font-size": "18px"})
canvas.text(800, 1025, '65 % missing data', style={"font-size": "18px"})
canvas.text(1450, 1025, '60 % missing data', style={"font-size": "18px"});

In [27]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/marianna/Documents/Phacelia/Figures/RAxML__clust90ingroup_20220930.pdf");

In [23]:
## Load trees clust95_ingroup_20220930
tre15 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops15_clust95.phy")
tre20 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops20_clust95.phy")
tre25 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops25_clust95.phy")
tre30 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops30_clust95.phy")
tre35 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops35_clust95.phy")
tre40 = toytree.tree("/home/marianna/Documents/Phacelia/Phac_Analyses/RAxML/RAxML_ingroup_20220930/RAxML_bipartitions.pops40_clust95.phy")

tre15 = tre15.root(names=["Phacelia_artemisioides_W6021_out"])
tre20 = tre20.root(names=["Phacelia_artemisioides_W6021_out"])
tre25 = tre25.root(names=["Phacelia_artemisioides_W6021_out"])
tre30 = tre30.root(names=["Phacelia_artemisioides_W6021_out"])
tre35 = tre35.root(names=["Phacelia_artemisioides_W6021_out"])
tre40 = tre40.root(names=["Phacelia_artemisioides_W6021_out"])


## set dimensions of the canvas
canvas = toyplot.Canvas(width = 2000, height = 2000)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '47.5%'))
ax1 = canvas.cartesian(bounds=('33%', '63%', '5%',  '47.5%'))
ax2 = canvas.cartesian(bounds=('66%', '96%', '5%',  '47.5%'))
ax3 = canvas.cartesian(bounds=('2%',  '30%', '50%', '97.5%'))
ax4 = canvas.cartesian(bounds=('33%', '63%', '50%', '97.5%'))
ax5 = canvas.cartesian(bounds=('66%', '96%', '50%', '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "14px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
tre15.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre20.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre25.ladderize(1).draw(
    axes = ax2,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre30.ladderize(1).draw(
    axes = ax3,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre35.ladderize(1).draw(
    axes = ax4,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre40.ladderize(1).draw(
    axes = ax5,
    **style,
    node_sizes = 0,
    node_labels = 'support');

## hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;
ax3.show = False; ax4.show = False; ax5.show = False;

## add names for the single trees
canvas.text(1000, 50, 'RAxML - Clustering threshold 95 %', style = {"font-size": "24px"})
canvas.text(150, 125, '85 % missing data', style={"font-size": "18px"})
canvas.text(800, 125, '80 % missing data', style={"font-size": "18px"})
canvas.text(1450, 125, '75 % missing data', style={"font-size": "18px"})
canvas.text(150, 1025, '70 % missing data', style={"font-size": "18px"})
canvas.text(800, 1025, '65 % missing data', style={"font-size": "18px"})
canvas.text(1450, 1025, '60 % missing data', style={"font-size": "18px"});

In [25]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/marianna/Documents/Phacelia/Figures/RAxML__clust95ingroup_20220930.pdf");