# Produce SNP density plots for ***common*** SNPs
## for 1323-genotype SNP set filtered to all SNPs with MAF *above* 0.05

##### We can make SNP density plots for 
###### 1) 1323-genotype SNP set 
###### -and- 
###### 2) 917-genotype SNP set 
###### -both with- 
###### A) minor allele frequency (MAF) lower threshold of 0.05 and 
###### B) maximum MAF of 0.05

<div class="alert alert-block alert-info"> Note: In this initial version of this notebook, the only SNP density plots produced are for 1A: the 1323-genotype SNP set filtered to SNPs with a MAF of at least 0.05. After inspecting these plots alongside the complementary notebook that analyzes rare SNPs, we will decide on next steps.  </div>

##### Let's also write an R function for identifying gaps between SNPs (at bottom of notebook, after making SNP density plots)

Load data in PLINK binary format into `R` using `snpStats::read.plink` (available through Bioconductor repository)

In [None]:
# snpStats is distributed through Bioconductor, not CRAN, so install.packages()
# cannot find it in the default CRAN repository. The call is kept here only for
# reference; install the package with BiocManager::install("snpStats") instead.
# install.packages("snpStats")

In [None]:
# Bootstrap BiocManager, the installer for Bioconductor packages.
# requireNamespace() only checks whether the package is available; unlike
# require(), it does not attach it, and the calls that follow are
# namespace-qualified (BiocManager::) anyway.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
# Pin the Bioconductor release used for this analysis.
# NOTE(review): release 3.10 presumably matches the R version this notebook was
# originally run with -- confirm before re-running on a newer R installation.
BiocManager::install(version = "3.10")

In [None]:
# Install snpStats from Bioconductor (provides read.plink, used below).
BiocManager::install(pkgs = "snpStats")

#### 1A) 1323-genotype SNP set filtered with lower MAF threshold of 0.05

In [None]:
library(snpStats)

In [None]:
# Read the PLINK binary fileset for the 1323-genotype, MAF >= 0.05 SNP set.
# Only the .bed path is given; the companion .bim/.fam files are presumably
# located from the same file stem -- confirm against ?read.plink.
geno1323_maf05 <- read.plink(
    bed = "../00_SNP_format_conversions/1323_cohort_maf0.05_defaultmissingrates.snp.pass.bed"
)

This data is loaded in `snpMatrix` format, explained in the vignette documentation here http://www.bioconductor.org/packages//2.7/bioc/vignettes/snpMatrix/inst/doc/snpMatrix-vignette.pdf

##### Let's inspect this data to determine what steps are needed to get it into the appropriate format for `CMplot`

In [None]:
# show(geno1323_maf05)

The `map` element of the list returned by `read.plink` holds the per-SNP annotation (SNP name, chromosome, position) – this is the data we need to pass to `CMplot`

In [None]:
#install.packages("CMplot")

In [None]:
library(CMplot)

In [None]:
#?CMplot

The `CMplot` documentation notes a few important criteria for preparing our data in the appropriate format.<br>
1. Data must be in a `dataframe` object.
2. "The first column is the name of SNP, the second column is the chromosome of SNP, the third column is the position of SNP, and the remaining columns are the P-value of each trait(Note:each trait a column)"

<div class="alert alert-block alert-warning"> How does the format of our existing data, as read in by `read.plink`, differ from what `CMplot` requires? </div>


In [None]:
# CMplot needs a data frame: confirm that the SNP map already is one.
is.data.frame(geno1323_maf05[["map"]])

In [None]:
#colnames(geno1323_maf05$map)

In [None]:
# Copy the SNP map with its first four columns rearranged (2, 1, 4, 3) so that
# the leading columns follow the order CMplot expects: SNP name, chromosome,
# position. Columns 5 and 6 keep their places.
data_CMplot <- geno1323_maf05[["map"]][, c(2, 1, 4, 3, 5, 6), drop = FALSE]

In [None]:
# Keep only SNPs on anchored chromosomes: drop every row whose chromosome
# label contains "scaffold" (unanchored scaffolds).
data_CMplot <- data_CMplot[
    !grepl(pattern = "scaffold", x = data_CMplot$chromosome),
]

In [None]:
# Number of SNPs anchored to a chromosome in this filtered SNP set
NROW(data_CMplot)

In [None]:
#levels(factor(data_CMplot$chromosome)) # Make sure we only have contiguous, anchored scaffolds

In [None]:
# SNP density plot with 1 Mb bins, drawn inline in the notebook.
CMplot(
    data_CMplot,
    plot.type = "d",      # "d" requests a SNP density plot
    type = "p",           # Taken from the CMplot density-plot example; not explained in its docs
    bin.size = 1e6,       # Bin (window) size in bp: 1 Mb
    chr.den.col = c("red", "orange", "yellow",
                    "darkgreen", "blue", "purple"), # Heat map color scale
    file.output = FALSE,  # Plot in Jupyter instead of saving to a file (e.g. file = "jpg")
    memo = "",
    dpi = 300,            # Nice high resolution
    width = 9,
    height = 6,
    verbose = TRUE
)

Let's take a look with a smaller window size (1kb)

In [None]:
# SNP density plot with 1 kb bins, drawn inline in the notebook.
CMplot(
    data_CMplot,
    plot.type = "d",      # "d" requests a SNP density plot
    type = "p",           # Taken from the CMplot density-plot example; not explained in its docs
    bin.size = 1e3,       # Bin (window) size in bp: 1 kb
    chr.den.col = c("darkgreen", "yellow", "red"), # Heat map color scale
    file.output = FALSE,  # Plot in Jupyter instead of saving to a file (e.g. file = "jpg")
    memo = "",
    dpi = 300,            # Nice high resolution
    width = 9,
    height = 6,
    verbose = TRUE
)

<div class="alert alert-block alert-warning"> It is rather difficult to get much information from this SNP density plot with 1kb bins, since it appears most 1kb regions have a single SNP. However, we do not see any big white spots, indicative of long regions with no SNPs. </div>

##### Once more with bin size of 10kb...

In [None]:
# SNP density plot with 10 kb bins, drawn inline in the notebook.
CMplot(
    data_CMplot,
    plot.type = "d",      # "d" requests a SNP density plot
    type = "p",           # Taken from the CMplot density-plot example; not explained in its docs
    bin.size = 1e4,       # Bin (window) size in bp: 10 kb
    chr.den.col = c("darkgreen", "yellow", "red"), # Heat map color scale
    file.output = FALSE,  # Plot in Jupyter instead of saving to a file (e.g. file = "jpg")
    memo = "",
    dpi = 300,            # Nice high resolution
    width = 9,
    height = 6,
    verbose = TRUE
)

### Provide an R function for identifying gaps between SNPs

In [None]:
# Size of inline notebook plots (used for the gap histograms below).
options(
    repr.plot.width = 10,
    repr.plot.height = 5
)

In [None]:
# Summarise the gaps between adjacent SNPs on one chromosome.
#
# Arguments:
#   data        data frame with one row per SNP and (at least) the columns
#               `chromosome` and `position`.
#               NOTE(review): rows are assumed to be ordered by position within
#               each chromosome; unsorted input would give negative gaps.
#   Chr         chromosome identifier, matched against `data$chromosome` with `==`.
#   gap_min     only gaps strictly larger than this many bp are reported.
#   bin_width   width (bp) of the histogram bins.
#   plot        if TRUE, draw a histogram of the reported gap lengths.
#   output_dir  directory that receives the CSV; created if it does not exist.
#
# Side effects: prints two summary lines, optionally draws a histogram, and
# writes "<output_dir>/Chr<Chr>_gt<gap_min>bp_gaps.csv" with fwrite()
# (from data.table, which must be attached by the caller).
#
# Returns, invisibly, the table of SNPs whose distance from the previous SNP
# exceeds `gap_min`, ordered from the largest to the smallest gap.
SNP_gap_hist <- function(data,
                         Chr,
                         gap_min,
                         bin_width,
                         plot,
                         output_dir){

    data_this_Chr <- data[which(data$chromosome == Chr), ]

    # Distance of every SNP from the SNP before it. The first SNP on a
    # chromosome has no predecessor, so its gap is NA.
    positions <- data_this_Chr$position
    SNP_gaps <- if (length(positions) > 0) c(NA, diff(positions)) else positions

    # A chromosome with fewer than two SNPs has no gaps; max() of an empty
    # vector would warn and report -Inf, so handle that case explicitly.
    observed_gaps <- SNP_gaps[!is.na(SNP_gaps)]
    if (length(observed_gaps) > 0) {
        print(paste0("Maximum length of any gap between two SNPs on Chromosome ",
                     Chr,
                     " is ",
                     max(observed_gaps)))
    } else {
        print(paste0("Chromosome ",
                     Chr,
                     " has fewer than two SNPs, so there are no gaps to measure"))
    }

    data_this_Chr$distance_from_last <- SNP_gaps

    # Render gap_min without scientific notation so that messages and file
    # names stay readable for large thresholds (e.g. 100000 rather than 1e+05).
    gap_min_label <- format(gap_min, scientific = FALSE, trim = TRUE)

    # Table of all SNP data for SNPs preceded by a gap larger than gap_min
    data_this_Chr_gaps_over_Xbp <- data_this_Chr[which(data_this_Chr$distance_from_last > gap_min), ]

    print(paste0("There exist ",
                 nrow(data_this_Chr_gaps_over_Xbp),
                 " gaps between adjacent SNPs on Chromosome ",
                 Chr,
                 " greater than ",
                 gap_min_label,
                 " bp"))

    # Order from largest to smallest gap
    SNPS_w_gaps_gt_gap_min <- data_this_Chr_gaps_over_Xbp[order(data_this_Chr_gaps_over_Xbp$distance_from_last,
                                                                 decreasing = TRUE), ]

    # Skip the histogram when there is nothing to plot: with no qualifying
    # gaps the break points cannot be computed and hist() would fail.
    if(isTRUE(as.logical(plot)) && nrow(SNPS_w_gaps_gt_gap_min) > 0){
        hist(SNPS_w_gaps_gt_gap_min$distance_from_last,
             col = "lightblue",
             main = paste0("Histogram of lengths of >",
                           gap_min/1e3,
                           "kb gaps between adjacent SNPs on Chr. ", Chr),
             xlab = "Distance (in base pairs) of a given SNP from previous SNP",
             breaks = seq(0,
                          max(SNPS_w_gaps_gt_gap_min$distance_from_last) + bin_width,
                          by = bin_width),
             ylab = "Number of SNPs in bin")
    }

    if(!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

    # Write the table inside output_dir (it used to land in the working
    # directory even though output_dir had just been created).
    fwrite(SNPS_w_gaps_gt_gap_min,
           file.path(output_dir,
                     paste0("Chr", Chr, "_gt", gap_min_label, "bp_gaps.csv")))

    invisible(SNPS_w_gaps_gt_gap_min)
}


In [None]:
# Preview the first six rows of the table passed to SNP_gap_hist
head(data_CMplot, n = 6L)

In [None]:
library(data.table)

In [None]:
# Run the gap summary once per chromosome, visiting the chromosomes in
# ascending numeric order. Each call prints its summary, draws a histogram of
# gaps longer than 10 kb (1 kb bins) and writes a CSV of those gaps.
for (Chr in sort(as.numeric(unique(as.character(data_CMplot$chromosome))))) {
    SNP_gap_hist(
        data = data_CMplot,
        Chr = Chr,
        gap_min = 1e4,
        bin_width = 1e3,
        plot = TRUE,
        output_dir = "Common_SNP_gaps_gt10kb"
    )
}