# All Norfolk samples, 2021-12-01 to 2022-02-28

The metadata file was `03.nn_geodate_min.csv.xz`, anonymized by removing the `adm2` column.

The exact parameters can be found in `ubuntu@madeline-01:/home/ubuntu/scripts_old/preprocessing.nf`:
```
params.cog_fasta = "/home/ubuntu/ongaeshi-mnt/civet/cog_global_*.fasta.xz"
params.cog_meta = "/home/ubuntu/ongaeshi-mnt/civet/cog_global_*public.csv.xz"
params.wuhan_hu_1 = "/home/ubuntu/Wuhan-Wu-1-linear.fasta"
params.nns = "/home/ubuntu/03.nn_geodate_min_noadm2.csv.xz"
params.start_date = "2021-12-01"
params.end_date = "2022-02-28"
params.N = 4
params.K = 100
```

```
executor >  local (2)
[69/4e8026] process > chooseSamples            [100%] 1 of 1, cached: 1 ✔
[56/85bbc8] process > getFastaRecords (1)      [100%] 1 of 1, cached: 1 ✔
[04/9ff2c4] process > maskProblematicSites (1) [100%] 1 of 1, cached: 1 ✔
[02/d4d1fa] process > removeWuhanHu1 (1)       [100%] 1 of 1, cached: 1 ✔
[78/1065eb] process > reduceRefs (1)           [100%] 1 of 1, cached: 1 ✔
[4b/ba24d1] process > buildReducedRefsTree (1) [100%] 1 of 1, cached: 1 ✔
[01/da398e] process > buildAllSamplesTree (1)  [100%] 1 of 1, cached: 1 ✔
[6d/e6360e] process > runPangolin (1)          [100%] 1 of 1, failed: 1 ✘
[44/260c79] process > getCOGUKMetadata (1)     [100%] 1 of 1 ✔
[-        ] process > joinAllMetadata          -
[-        ] process > treeTime                 -
[-        ] process > treeTimeMugration        -
Error executing process > 'runPangolin (1)'
```


At the time of running, the COG-UK version was `cog_global_2022-03-04.fasta.xz`.

I selected all the Norfolk samples between December 1, 2021 and February 28, 2022 from the nearest neighbours file (`nns`), and the first 4 nearest neighbours for each.

Then I extracted the `fasta` records from `cog_global_2022-03-04.fasta.xz` and added Wuhan-Hu-1.  I masked the problematic sites, then removed Wuhan-Hu-1 from the analysis.

I made a neighbour-joining tree out of the neighbour sequences with `rapidnj`, and found the 100 most diverse neighbours with `iqtree -k 100`.  I built a tree out of just these 100 with `iqtree`.

Then, I made a final tree of all the sequences (queries + neighbours), using the neighbours-only tree as a constraint:
```
iqtree -s allseqs.aln -m HKY+G -g reduced_refs.aln.treefile -t PARS
```
    
This part took 3.5 days to run, since there were 6408 tips.

Finally, I visualized this in R:

![title](img/norfolk_dectofeb.png)

The brown samples on the left side are all Omicron, and the more colourful cluster on the right is Delta.  Three samples had no lineage assigned to them.

**More plots of this data**

In [None]:
#' Plot a tree with branches coloured by one metadata column
#'
#' Reads a tree and a comma-separated metadata table, aligns the metadata rows
#' to the tree tips through the `sequence_name` column, groups the tips by the
#' values of `factor`, and draws a rectangular ggtree plot coloured by group.
#'
#' NOTE(review): uses read.tree/read.nexus, groupOTU, ggtree, brewer.pal,
#' new_scale_color/new_scale_fill and theme_tree2 unqualified, so the
#' packages providing them must already be attached by the notebook -- no
#' library() calls are visible in this section; confirm.
#'
#' @param treefile Path to the tree file.
#' @param metadata Path to a CSV file containing a `sequence_name` column and
#'   the column named by `factor`. It may contain more rows than the tree has
#'   tips, but every tip must be present.
#' @param factor Name (string) of the metadata column to colour by.
#' @param treetype Either "newick" or "nexus".
#' @param mrsd Forwarded unchanged to `ggtree(mrsd = ...)`.
#' @return The ggtree plot object.
plot_tree_by_factor <- function(treefile, metadata, factor, treetype, mrsd) {
  # Fail early with a clear message; previously an unknown treetype left `tre`
  # undefined and surfaced as "object 'tre' not found".
  if (!is.character(treetype) || length(treetype) != 1L ||
      !treetype %in% c("newick", "nexus")) {
    stop(
      "treetype must be \"newick\" or \"nexus\", got: ",
      paste(format(treetype), collapse = ", "),
      call. = FALSE
    )
  }

  # read in everything
  tbl0 <- read.csv(metadata, sep = ",")
  tbl0[is.na(tbl0)] <- ""
  tre <- if (treetype == "newick") read.tree(treefile) else read.nexus(treefile)

  missing_cols <- setdiff(c("sequence_name", factor), names(tbl0))
  if (length(missing_cols) > 0) {
    stop(
      "metadata is missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # One metadata row per tip, in tip order (so the whole COG-UK metadata can
  # be passed in). A tip without a row would shift every later tip into the
  # wrong group, so refuse to continue instead of plotting wrong colours.
  tip_rows <- match(tre$tip.label, tbl0[["sequence_name"]])
  if (anyNA(tip_rows)) {
    unmatched <- tre$tip.label[is.na(tip_rows)]
    stop(
      length(unmatched), " tree tip(s) have no metadata row, e.g. ",
      paste(utils::head(unmatched, 3), collapse = ", "),
      call. = FALSE
    )
  }
  traits <- tbl0[tip_rows, , drop = FALSE]

  # get plot inputs
  group_values <- traits[[factor]]
  groupInfo <- split(tre$tip.label, group_values)
  grouptre <- groupOTU(tre, groupInfo, group_name = "group1")
  # Notebook plot size; left set on purpose so it applies when the returned
  # plot is rendered by the caller.
  options(repr.plot.width = 26, repr.plot.height = 20)
  par(oma = c(0, 0, 0, 0))
  # One colour per distinct value plus one spare (presumably for branches
  # that groupOTU leaves outside every group -- confirm).
  pal_colors <- colorRampPalette(brewer.pal(8, "Dark2"))(
    length(unique(group_values)) + 1
  )

  # make plot
  ggtree(
    grouptre, aes(color = group1),
    size = 0.4, layout = "rectangular", mrsd = mrsd
  ) +
    theme(legend.position = "right") +
    theme(legend.text = element_text(size = 15)) +
    ggplot2::scale_color_manual(values = pal_colors, name = factor) +
    new_scale_color() +
    new_scale_fill() +
    theme_tree2()
}

In [None]:
#' Plot the subtree containing only the tips of one lineage
#'
#' Reads a tree and a comma-separated metadata table, aligns the metadata rows
#' to the tree tips through the `sequence_name` column, keeps the tips whose
#' `lineage` value is `lineage_to_plot`, and draws that subtree with tip
#' labels.
#'
#' NOTE(review): uses read.tree/read.nexus, get_subtree_with_tips (castor),
#' ggtree, theme_tree2 and geom_tiplab unqualified, so the packages providing
#' them must already be attached by the notebook -- confirm.
#'
#' @param treefile Path to the tree file.
#' @param metadata Path to a CSV file containing `sequence_name` and `lineage`
#'   columns. It may contain more rows than the tree has tips, but every tip
#'   must be present.
#' @param lineage_to_plot Lineage value to keep; also used as the plot title.
#' @param treetype Either "newick" or "nexus".
#' @param mrsd Forwarded unchanged to `ggtree(mrsd = ...)`.
#' @return The ggtree plot object.
plot_one_lineage_from_tree <- function(treefile, metadata, lineage_to_plot,
                                       treetype, mrsd) {
  # Fail early with a clear message; previously an unknown treetype left `tre`
  # undefined and surfaced as "object 'tre' not found".
  if (!is.character(treetype) || length(treetype) != 1L ||
      !treetype %in% c("newick", "nexus")) {
    stop(
      "treetype must be \"newick\" or \"nexus\", got: ",
      paste(format(treetype), collapse = ", "),
      call. = FALSE
    )
  }

  # read in everything
  tbl0 <- read.csv(metadata, sep = ",")
  tbl0[is.na(tbl0)] <- ""
  tre <- if (treetype == "newick") read.tree(treefile) else read.nexus(treefile)

  missing_cols <- setdiff(c("sequence_name", "lineage"), names(tbl0))
  if (length(missing_cols) > 0) {
    stop(
      "metadata is missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # One metadata row per tip, in tip order. A tip without a row would shift
  # every later tip onto the wrong lineage, so stop instead.
  tip_rows <- match(tre$tip.label, tbl0[["sequence_name"]])
  if (anyNA(tip_rows)) {
    unmatched <- tre$tip.label[is.na(tip_rows)]
    stop(
      length(unmatched), " tree tip(s) have no metadata row, e.g. ",
      paste(utils::head(unmatched, 3), collapse = ", "),
      call. = FALSE
    )
  }
  traits <- tbl0[tip_rows, , drop = FALSE]

  groupInfo <- split(tre$tip.label, traits[["lineage"]])
  # char vector of tips to plot
  tip_subset <- unlist(groupInfo[lineage_to_plot], use.names = FALSE)
  if (length(tip_subset) == 0) {
    stop(
      "no tips in the tree have lineage: ",
      paste(format(lineage_to_plot), collapse = ", "),
      call. = FALSE
    )
  }
  subtree <- get_subtree_with_tips(tre, only_tips = tip_subset)$subtree # castor

  # make plot
  # Notebook plot size; left set on purpose so it applies when the returned
  # plot is rendered by the caller.
  options(repr.plot.width = 26, repr.plot.height = 20)
  par(oma = c(0, 0, 0, 0))
  ggtree(
    subtree,
    color = "blue", size = 0.4, layout = "rectangular", mrsd = mrsd
  ) +
    theme(legend.position = "right") +
    theme(legend.text = element_text(size = 15)) +
    theme_tree2() +
    geom_tiplab() +
    labs(title = lineage_to_plot)
}

In [None]:
#' Plot the subtree for one metadata value, coloured by another column
#'
#' Aligns the metadata rows to the tree tips through the `sequence_name`
#' column, keeps the tips whose `factor_to_subset_by` value equals
#' `factor_value_to_subset_by`, and draws that subtree with tip labels and
#' branches coloured by `color_by`.
#'
#' NOTE(review): uses read.tree/read.nexus, get_subtree_with_tips (castor),
#' groupOTU, ggtree, brewer.pal, new_scale_color/new_scale_fill, theme_tree2
#' and geom_tiplab unqualified, so the packages providing them must already be
#' attached by the notebook -- confirm.
#'
#' @param treefile Path to the tree file.
#' @param metadata A data frame with a `sequence_name` column plus the columns
#'   named by `factor_to_subset_by` and `color_by`. A path to a CSV file is
#'   also accepted, matching the sibling plotting functions.
#' @param factor_to_subset_by Name (string) of the column used to pick tips.
#' @param factor_value_to_subset_by Value of that column to keep; also used as
#'   the plot title.
#' @param color_by Name (string) of the column to colour by.
#' @param treetype Either "newick" or "nexus".
#' @param mrsd Forwarded unchanged to `ggtree(mrsd = ...)`.
#' @return The ggtree plot object.
plot_tree_subset_and_color <- function(treefile, metadata, factor_to_subset_by,
                                       factor_value_to_subset_by, color_by,
                                       treetype, mrsd) {
  # Fail early with a clear message; previously an unknown treetype left `tre`
  # undefined and surfaced as "object 'tre' not found".
  if (!is.character(treetype) || length(treetype) != 1L ||
      !treetype %in% c("newick", "nexus")) {
    stop(
      "treetype must be \"newick\" or \"nexus\", got: ",
      paste(format(treetype), collapse = ", "),
      call. = FALSE
    )
  }

  # read in everything
  tbl0 <- if (is.character(metadata)) read.csv(metadata, sep = ",") else metadata
  if (!is.data.frame(tbl0)) {
    stop("metadata must be a data frame or a path to a CSV file", call. = FALSE)
  }
  tbl0[is.na(tbl0)] <- ""
  tre <- if (treetype == "newick") read.tree(treefile) else read.nexus(treefile)

  needed_cols <- c("sequence_name", factor_to_subset_by, color_by)
  missing_cols <- setdiff(needed_cols, names(tbl0))
  if (length(missing_cols) > 0) {
    stop(
      "metadata is missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # One metadata row per tip, in tip order. A tip without a row would shift
  # every later tip onto the wrong metadata, so stop instead.
  tip_rows <- match(tre$tip.label, tbl0[["sequence_name"]])
  if (anyNA(tip_rows)) {
    unmatched <- tre$tip.label[is.na(tip_rows)]
    stop(
      length(unmatched), " tree tip(s) have no metadata row, e.g. ",
      paste(utils::head(unmatched, 3), collapse = ", "),
      call. = FALSE
    )
  }
  traits <- tbl0[tip_rows, , drop = FALSE]

  # get subtree and traits subset corresponding to factor value
  keep <- which(traits[[factor_to_subset_by]] == factor_value_to_subset_by)
  if (length(keep) == 0) {
    stop(
      "no tips have ", factor_to_subset_by, " == ",
      paste(format(factor_value_to_subset_by), collapse = ", "),
      call. = FALSE
    )
  }
  traits_subset <- traits[keep, , drop = FALSE]
  tip_subset <- traits_subset[["sequence_name"]]
  tree_subset <- get_subtree_with_tips(tre, only_tips = tip_subset)$subtree # castor

  # Re-align the metadata to the subtree's own tip order so that split() below
  # pairs every tip with its own row regardless of how castor orders tips.
  traits_subset <- traits_subset[
    match(tree_subset$tip.label, traits_subset[["sequence_name"]]), ,
    drop = FALSE
  ]

  # get color scheme for color_by
  color_values <- traits_subset[[color_by]]
  groupInfo <- split(tree_subset$tip.label, color_values)
  grouptre <- groupOTU(tree_subset, groupInfo, group_name = "group1")
  # Notebook plot size; left set on purpose so it applies when the returned
  # plot is rendered by the caller.
  options(repr.plot.width = 26, repr.plot.height = 20)
  par(oma = c(0, 0, 0, 0))
  # Count distinct values of the column itself. The earlier
  # length(unique(traits_subset[color_by])) measured a one-column data frame
  # and was therefore always 1, giving a 3-colour palette however many groups
  # there were.
  pal_colors <- colorRampPalette(brewer.pal(8, "Dark2"))(
    length(unique(color_values)) + 2
  )

  # make plot
  ggtree(
    grouptre, aes(color = group1),
    size = 0.4, layout = "rectangular", mrsd = mrsd
  ) +
    theme(legend.position = "right") +
    theme(legend.text = element_text(size = 15)) +
    ggplot2::scale_color_manual(values = pal_colors, name = color_by) +
    new_scale_color() +
    new_scale_fill() +
    theme_tree2() +
    geom_tiplab() +
    labs(title = factor_value_to_subset_by)
}

In [1]:
# Inputs: the all-samples tree and its tab-separated annotation table.
treefile <- "/home/madeline/Desktop/git_temp/QIB_Internship/data/allseqs.aln.treefile"
annotations <- "/home/madeline/Desktop/git_temp/QIB_Internship/data/figtree_annotation.tsv"

# Blank out NAs after loading: missing country info messes with startsWith().
tbl0 <- read.csv(file = annotations, sep = "\t")
tbl0[is.na(tbl0)] <- ""
tre <- read.tree(file = treefile)

# Keep one annotation row per tip, in tip order. These already match, but
# selecting by tip label means the entire cog_uk metadata could be used.
traits <- slice(tbl0, match(tre$tip.label, sequence_name))
row.names(traits) <- traits$sequence_name
# First four characters of the central sample id.
traits$code <- substr(traits$central_sample_id, start = 1, stop = 4)

SyntaxError: invalid syntax (3399687293.py, line 5)

In [None]:
# Group the tips by lineage and attach the grouping to the tree as "group1".
groupInfo <- split(tre$tip.label, traits["lineage"])
grouptre <- groupOTU(tre, groupInfo, group_name = "group1")

# Notebook plot size and outer margins.
options(repr.plot.width = 26, repr.plot.height = 20)
par(oma = c(0, 0, 0, 0))

# One Dark2-interpolated colour per lineage, plus one spare.
# (Alternative palettes tried: RdYlGn, Set2.)
getPal_lineage <- colorRampPalette(brewer.pal(8, "Dark2"))(
  length(unique(traits$lineage)) + 1
)

# Rectangular tree coloured by lineage, legend on the right.
p <- ggtree(
  grouptre, aes(color = group1),
  size = 0.4, layout = "rectangular"
) +
  theme(legend.position = "right") +
  theme(legend.text = element_text(size = 15)) +
  ggplot2::scale_color_manual(values = getPal_lineage, name = "Lineage") +
  new_scale_color() +
  new_scale_fill()
p

In [None]:
# Draw only the lineage "B" tips from the full-timeframe tree.
plot_one_lineage_from_tree(
  treefile = "full_timeframe/reformatted.aln.treefile",
  metadata = "full_timeframe/metadata_reformatted.csv",
  lineage_to_plot = "B",
  treetype = "newick",
  mrsd = "2020-06-30"
)

**Notes about running `treetime`, which ultimately failed due to memory**

On March 24 I ran `treetime` with the relaxed clock, first reformatting the names everywhere:

```
sed 's/\//_/g' work/01/da398e7bd11c0ac3f4d1843bbd18ec/allseqs.aln > norfolk_dectofeb_allseqs_formatted.aln
```

``(base) ubuntu@madeline-01:~/scripts$ python cog_subset_to_figtree_annotation.py --cog_meta_file /home/ubuntu/scripts_old/work/44/260c79a94c014e4a64b18ba83c57a9/leaftips_cog_metadata.csv --out /home/ubuntu/norfolk_dectofeb_leaftips_cog_metadata_formatted.csv``

And then converting to .csv again, because `treetime` couldn't find `sample_date`...

And finally running `treetime` for real:

``(base) ubuntu@madeline-01:~/scripts_old$ nohup treetime --aln norfolk_dectofeb_allseqs_formatted.aln --dates /home/ubuntu/norfolk_dectofeb_leaftips_cog_metadata_formatted.csv --name-column sequence_name --date-column sample_date --relax 1.0 0 --clock-rate 0.008 --outdir timetree-Norfolk_DectoFeb``