# Produce phylograms cross-referencing data from `fastStructure` and geographical locations of clones

## Install and load `phytools` R package

In [None]:
# install.packages("phytools")

In [None]:
library(phytools)

## Load our data and format as needed

### Load phylogram data from `SNPhylo`

In [None]:
# Read the tree file found in the SNPhylo output directory.
tree <- read.tree(file = "~/GUIs/01.6_Phylogenetics/SNPhylo/outputa4.ml.tree")

In [None]:
# Keep the tip labels separately so the other tables can be matched to them.
tiplabels <- tree[["tip.label"]]

In [None]:
# Number of tips in the tree.
length(tiplabels)

In [None]:
# Preview the first few labels to see how they are formatted.
head(tiplabels)

<div class="alert alert-block alert-warning"> Note that each tip label in the `SNPhylo` output is the genotype name, followed by `_` and the genotype name again, with the whole label truncated to 10 characters. </div>

### Load and format geographical data

In [None]:
# Tab-separated location table; the first column supplies the row names.
geo <- read.delim("BESC_LocationInfoAll_Final.txt", row.names = 1)

Change the row names to match the tip labels from `SNPhylo`

In [None]:
# Rewrite every row name as "<name>_<name>", cut it to 10 characters, and
# replace "Nisqually" with "NISQUALLY", in a single assignment.
rownames(geo) <- gsub(
  pattern = "Nisqually",
  replacement = "NISQUALLY",
  x = stringr::str_trunc(
    paste0(rownames(geo), "_", rownames(geo)),
    width = 10,
    ellipsis = ""
  )
)

In [None]:
# Row names of geo that are not tip labels.
setdiff(rownames(geo), tiplabels)

In [None]:
# Tip labels that have no row in geo.
setdiff(tiplabels, rownames(geo))

In [None]:
# Dimensions before filtering.
dim(geo)

In [None]:
# Keep only the rows whose name is one of the tip labels.
geo <- geo[rownames(geo) %in% tiplabels, ]

In [None]:
# Dimensions after filtering.
dim(geo)

In [None]:
names(geo) <- c("lat", "long")

In [None]:
# Copies of geo with one of the two coordinate columns removed.
geo_lat_only <- geo
geo_lat_only[["long"]] <- NULL

geo_long_only <- geo
geo_long_only[["lat"]] <- NULL

In [None]:
# geo <- as.matrix(geo)

In [None]:
# geo_lat_only <- as.matrix(geo_lat_only)
# geo_long_only <- as.matrix(geo_long_only)

### Load and format `fastStructure` data

#### Load `fastStructure` data

In [None]:
# Tab-separated, header-less table from the K = 7 `meanQ` file.
structure7 <- read.delim(
  "fastStructure_mean_results/SNPs72kfiltered_mean.7.meanQ",
  header = FALSE
)

In [None]:
head(structure7)

In [None]:
# Same layout, read from the K = 6 `meanQ` file.
structure6 <- read.delim(
  "fastStructure_mean_results/SNPs72kfiltered_mean.6.meanQ",
  header = FALSE
)

#### Add rownames for genotype name to `fastStructure` data and convert to matrix

##### Obtain correctly ordered row names

In [None]:
# Read the tab-separated id list with its header row; only the column names
# are used afterwards.
genotype_names <- data.table::fread("id_list_1323geno.txt", sep = "\t", header = TRUE)

In [None]:
# Column names from the 10th column onward are taken as the genotype names.
genotype_names <- names(genotype_names)[10:length(genotype_names)]

In [None]:
head(genotype_names)

##### Add the rownames to `fastStructure` dataframe, then convert to matrix

In [None]:
# The row count here should equal the number of genotype names printed next.
dim(structure7)

In [None]:
length(genotype_names)

In [None]:
# Label the rows of both tables with the genotype names (K = 6 first, then
# K = 7, the order in which the chained assignment evaluated them).
rownames(structure6) <- genotype_names
rownames(structure7) <- genotype_names

In [None]:
# Convert both data frames to matrices.
structure6 <- as.matrix(structure6)
structure7 <- as.matrix(structure7)

In [None]:
head(structure7)

##### Format the `fastStructure` row names to reflect the duplication and 10-character limit of the `SNPhylo` tip labels

In [None]:
# Put the fastStructure row names into the same form as the tree tip labels:
# "<genotype>_<genotype>", truncated to 10 characters. "Nisqually" is also
# upper-cased, exactly as this notebook does for the `geo` and `PC_matrix` row
# names, so that all three tables use the same label for that genotype. The
# substitution is a no-op when the names are already upper case.
rownames(structure7) <- paste0(rownames(structure7), "_", rownames(structure7))
rownames(structure7) <- stringr::str_trunc(rownames(structure7), width = 10, ellipsis = "")
rownames(structure7) <- gsub("Nisqually", "NISQUALLY", rownames(structure7))

rownames(structure6) <- paste0(rownames(structure6), "_", rownames(structure6))
rownames(structure6) <- stringr::str_trunc(rownames(structure6), width = 10, ellipsis = "")
rownames(structure6) <- gsub("Nisqually", "NISQUALLY", rownames(structure6))

##### Produce a vector/matrix of the "dominant" cluster for each genotype

In [None]:
# K = 6: for each row, take the name of the column holding the largest value,
# store the result as a one-column matrix keyed by the row names, then strip
# the "V" from the column name.
dominant_subpop_k6 <- as.matrix(
  colnames(structure6)[apply(X = structure6, MARGIN = 1, FUN = which.max)]
)
rownames(dominant_subpop_k6) <- rownames(structure6)
dominant_subpop_k6 <- gsub(pattern = "V", replacement = "", x = dominant_subpop_k6)

In [None]:
# K = 7: same procedure on the seven-column matrix.
dominant_subpop_k7 <- as.matrix(
  colnames(structure7)[apply(X = structure7, MARGIN = 1, FUN = which.max)]
)
rownames(dominant_subpop_k7) <- rownames(structure7)
dominant_subpop_k7 <- gsub(pattern = "V", replacement = "", x = dominant_subpop_k7)

### Load and format PCs from SNPs

In [None]:
# Space-separated, header-less eigenvector table.
PC_in <- read.csv(
  "1323_cohort_maf05_defaultmissingrates.pca.eigenvec",
  sep = " ",
  header = FALSE
)

In [None]:
head(PC_in)

In [None]:
# Columns 3 to 22 are kept as the 20 principal components.
PC_matrix <- as.matrix(PC_in[, 3:22])

In [None]:
# The first column supplies the row names.
rownames(PC_matrix) <- PC_in[["V1"]]

In [None]:
# Same relabelling as for the other tables: "<name>_<name>", cut to 10
# characters, with "Nisqually" replaced by "NISQUALLY".
rownames(PC_matrix) <- gsub(
  pattern = "Nisqually",
  replacement = "NISQUALLY",
  x = stringr::str_trunc(
    paste0(rownames(PC_matrix), "_", rownames(PC_matrix)),
    width = 10,
    ellipsis = ""
  )
)

In [None]:
colnames(PC_matrix) <- paste0("PC", seq_len(20))

In [None]:
# head(PC_matrix)

## Make plots

We will use code in this `phytools` exercise: http://phytools.org/mexico2018/ex/12/Plotting-methods.html

### Phylogram with map

In [None]:
# Region names of the "world" map database, fetched without drawing anything.
nams <- map(database = "world", plot = FALSE, namesonly = TRUE)

In [None]:
# Plot size used by the notebook for the figures that follow.
options(repr.plot.width = 30, repr.plot.height = 30)

In [None]:
# Link every tip of the full tree to its coordinates. The map window is the
# bounding box of the coordinates in `geo`; nothing is drawn yet.
obj <- phylo.to.map(
  tree, geo,
  plot = FALSE,
  regions = c(".*usa", "Canada"),
  xlim = range(geo$long),
  ylim = range(geo$lat),
  direction = "rightwards"
)

In [None]:
# options(repr.plot.width=60, repr.plot.height=60)

In [None]:
# plot(obj, direction="rightwards", cex.points=c(0,1),
#     lwd=c(3,1),ftype="off", asp=1.3, pts=FALSE)

#### Prune tree

In [None]:
# obj<-phylo.to.map(drop.leaves(tree),geo,plot=FALSE,
#                   regions=c(".*usa", "Canada"),
#                   xlim = c(min(geo$long), max(geo$long)),
#                   ylim = c(min(geo$lat), max(geo$lat)),
#                   direction="rightwards")

In [None]:
# plot(obj, direction="rightwards", cex.points=c(0,1),
#     lwd=c(3,1),ftype="off", asp=1.3, pts=FALSE)

In [None]:
# Apply phytools' drop.leaves to the full tree, asking it to keep tip labels.
# Note: the plotting cells further down use `prune_tree`, not this object.
pruned_tree <- drop.leaves(tree, keep.tip.labels = TRUE)

Problem: We can't see all genotypes in a clade due to character limit

##### Attempt using `drop.tip`

In [None]:
# Fix the seed so the same random subset of tips is drawn on every run.
set.seed(5)
tips_to_keep <- 130
# Drop a random sample of tips so that `tips_to_keep` of them remain.
prune_tree <- drop.tip(
  tree,
  tip = sample(tree$tip.label, size = length(tree$tip.label) - tips_to_keep)
)

In [None]:
options(repr.plot.width = 20, repr.plot.height = 20)

In [None]:
# Dominant-cluster assignments restricted to the tips that remain.
dominant_subpop_subset_k6 <- dominant_subpop_k6[
  rownames(dominant_subpop_k6) %in% prune_tree$tip.label,
]
dominant_subpop_subset_k7 <- dominant_subpop_k7[
  rownames(dominant_subpop_k7) %in% prune_tree$tip.label,
]

###### K=6

In [None]:
# One hex colour per retained tip, derived from its dominant K = 6 cluster and
# named like the cluster vector.
cols <- setNames(
  gplots::col2hex(dominant_subpop_subset_k6),
  names(dominant_subpop_subset_k6)
)
# Build the tree-to-map object for the reduced tree without drawing it. The
# map window still spans all coordinates in `geo`, not only the retained tips.
obj <- phylo.to.map(
  prune_tree,
  geo[rownames(geo) %in% prune_tree$tip.label, ],
  colors = cols,
  plot = FALSE,
  regions = c(".*usa", "Canada"),
  xlim = range(geo$long),
  ylim = range(geo$lat),
  direction = "rightwards",
  rotate = TRUE
)

In [None]:
plot(
  obj,
  direction = "rightwards",
  cex.points = c(0, 1),
  lwd = c(3, 1),
  ftype = "off",
  asp = 1.3,
  pts = FALSE,
  colors = cols
)

###### K=7

In [None]:
# One hex colour per retained tip, derived from its dominant K = 7 cluster.
# The colours must be rebuilt here: reusing the `cols` vector from the K = 6
# section would simply redraw the K = 6 colouring under the K = 7 heading.
cols <- gplots::col2hex(dominant_subpop_subset_k7)
names(cols) <- names(dominant_subpop_subset_k7)

In [None]:
# Build the tree-to-map object for the reduced tree with the K = 7 colours.
# Nothing is drawn yet; the map window spans all coordinates in `geo`.
obj <- phylo.to.map(prune_tree, geo[which(rownames(geo) %in% prune_tree$tip.label), ],
                  colors = cols,
                  plot = FALSE, regions = c(".*usa", "Canada"),
                  xlim = c(min(geo$long), max(geo$long)),
                  ylim = c(min(geo$lat), max(geo$lat)), direction = "rightwards", rotate = TRUE)

In [None]:
# Draw the tree next to the map, coloured by dominant K = 7 cluster.
plot(obj, direction = "rightwards", cex.points = c(0, 1),
     lwd = c(3, 1), ftype = "off", asp = 1.3, pts = FALSE, colors = cols)

#### Convert genotype names to letter code

In [None]:
# Show the first few tip labels of the full tree.
head(tree[["tip.label"]])

### Heatmaps for subpopulation

#### Without pruning

In [None]:
# Use a tall plot area (30 wide by 60 high) for the figures that follow.
options(repr.plot.height = 60, repr.plot.width = 30)

#### With pruning

###### K = 7

In [None]:
# Readable column labels for the seven clusters: "Subpop. 1" ... "Subpop. 7".
colnames(structure7) <- paste("Subpop.", 1:7)

In [None]:
# Heatmap of the K = 7 matrix beside the reduced tree. Values are plotted as
# they are (no standardisation) and the tree takes 80% of the width. The
# palette is passed under its full argument name, `colors`, instead of relying
# on partial matching of `color`.
phylo.heatmap(prune_tree, structure7, standardize = FALSE, labels = TRUE, pts = FALSE,
              split = c(0.8, 0.2),
              fsize = c(0.01, 3, 1),
              type = "phylogram",
              colors = rainbow(200))

###### K = 6

In [None]:
# Reuse the first six cluster labels of the K = 7 matrix for the K = 6 matrix.
colnames(structure6) <- colnames(structure7)[1:6]

In [None]:
# Heatmap of the K = 6 matrix beside the reduced tree. Values are plotted as
# they are (no standardisation) and the tree takes 80% of the width. The
# palette is passed under its full argument name, `colors`, instead of relying
# on partial matching of `color`.
phylo.heatmap(prune_tree, structure6, standardize = FALSE, labels = TRUE, pts = FALSE,
              split = c(0.8, 0.2),
              fsize = c(0.01, 3, 1),
              type = "phylogram",
              colors = rainbow(200))

### Heatmaps for PCs

In [None]:
# Heatmap of the first six PCs beside the reduced tree. Each PC value is passed
# through the logistic function, 1 / (1 + exp(-x)), and multiplied by 10, so
# every plotted value lies between 0 and 10. `plogis()` is the built-in form of
# that function. The palette is passed under its full argument name, `colors`,
# instead of relying on partial matching of `color`.
phylo.heatmap(prune_tree,
              plogis(PC_matrix[, 1:6]) * 10,
              standardize = FALSE, labels = TRUE, pts = FALSE, split = c(0.8, 0.2),
              fsize = c(0.01, 3, 1.5),
              type = "phylogram",
              colors = rainbow(200)[20:180])