# Further investigate which `windowSize` values tend to give the best `R` when `alpha` is fixed at 0.5

In [1]:
library(CpGWAS)

In [2]:
# Want big view for facet plots
options(repr.plot.width=10, repr.plot.height=8)

In [3]:
# Collect the serialized scaffold results. The pattern is anchored so that only
# file names ending in ".rds" are returned; a bare "rds" would also match any
# file that merely contains those three letters somewhere in its name.
scaff_paths <- list.files("../output/", pattern = "\\.rds$", full.names = TRUE)

# Extract the run timestamp embedded in a scaffold output file name.
#
# Args:
#   path: a single file path containing a `YYYYMMDD-HHMMSS` stamp.
#
# Returns:
#   A length-2 vector c(date, time) formatted as "YYYY-MM-DD" and "HH:MM:SS",
#   or c(NA, NA) when no stamp is present.
extract_info <- function(path) {
  # Require a non-digit (or string edge) on both sides of the stamp. Without
  # these boundaries, a coordinate range with 8+ digit positions such as
  # "10110000-10119999" would be picked up as the "timestamp" because it
  # occurs earlier in the file name.
  stamp_pattern <- "(?<![0-9])[0-9]{8}-[0-9]{6}(?![0-9])"
  matches <- regmatches(path, regexpr(stamp_pattern, path, perl = TRUE))

  if (length(matches) == 0) {
    return(c(NA, NA))
  }

  datetime <- strsplit(matches, "-", fixed = TRUE)[[1]]
  ymd <- datetime[1]
  hms <- datetime[2]

  date <- paste(substr(ymd, 1, 4), substr(ymd, 5, 6), substr(ymd, 7, 8), sep = "-")
  time <- paste(substr(hms, 1, 2), substr(hms, 3, 4), substr(hms, 5, 6), sep = ":")
  c(date, time)
}

# One row per output file: its path plus the date and time parsed from its name.
path_rows <- lapply(scaff_paths, function(p) {
  stamp <- extract_info(p)
  data.frame(path = p, date = stamp[1], time = stamp[2], stringsAsFactors = FALSE)
})
data_frame <- do.call(rbind, path_rows)

# Date and time are zero-padded strings, so ordering them as text is chronological.
chronological <- order(data_frame$date, data_frame$time)
sorted_data_frame <- data_frame[chronological, ]

# Show the most recent runs.
tail(sorted_data_frame)


Unnamed: 0_level_0,path,date,time
Unnamed: 0_level_1,<chr>,<chr>,<chr>
38,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1011000-1011999-dynamic-1corestotal-1corepera-20240205-165817.rds,2024-02-05,16:58:17
36,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1010000-1010999-dynamic-1corestotal-1corepera-20240208-104421.rds,2024-02-08,10:44:21
39,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1011000-1011999-dynamic-1corestotal-1corepera-20240208-104422.rds,2024-02-08,10:44:22
34,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1009000-1009999-dynamic-1corestotal-1corepera-20240212-125021.rds,2024-02-12,12:50:21
37,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1010000-1010999-dynamic-1corestotal-1corepera-20240212-125022.rds,2024-02-12,12:50:22
40,../output//libd_chr1-chr1_AA-libd_chr1-chr1_AA-1011000-1011999-dynamic-1corestotal-1corepera-20240212-125023.rds,2024-02-12,12:50:23


In [None]:
# Keep the three most recent runs. Taking the tail of the chronologically
# sorted table (rather than hard-coded row numbers 38:40) keeps this correct
# when the number of files in the output directory changes.
scaff_paths <- tail(sorted_data_frame$path, 3)

In [None]:
scaff_paths

In [None]:
# Flatten a MethylationScaff object into a data frame with one row per model.
#
# Args:
#   object: an object inheriting from class "MethylationScaff".
#
# Returns:
#   A data frame with columns scaffoldIdentifier, methylationPosition,
#   windowSize, nSNPs, cor, mse, alpha and lambda. Stops with an error if
#   `object` is not a MethylationScaff.
convertToDataFrame <- function(object) {
  if (!inherits(object, "MethylationScaff")) {
    stop("The object must be of class 'MethylationScaff'.")
  }

  scaffold_id <- object@scaffoldIdentifier

  # Build the single-row summary for one fitted model.
  model_to_row <- function(model) {
    metrics <- model@evaluation_results
    data.frame(
      scaffoldIdentifier = scaffold_id,
      methylationPosition = model@methylationPosition,
      windowSize = model@windowSize,
      nSNPs = model@n_SNPs,
      cor = metrics['cor'],
      mse = metrics['mse'],
      alpha = model@alpha,
      lambda = model@lambda
    )
  }

  per_model_rows <- lapply(object@models, model_to_row)
  do.call("rbind", per_model_rows)
}

In [None]:
library(dplyr)

In [None]:
df <- data.frame()

In [None]:
# Read every selected scaffold and stack the per-model rows in a single
# bind_rows() call. Compared with appending inside a loop this avoids
# re-copying `df` on each iteration, and re-running the cell rebuilds `df`
# from scratch instead of appending duplicate rows to the previous result.
df <- bind_rows(lapply(scaff_paths, function(scaff_path) {
  convertToDataFrame(readRDS(scaff_path))
}))

Let's get the first and last methylation-site positions so we can properly label the genomic region covered in our plots

In [None]:
# Smallest and largest methylation-site positions present in `df`; these are
# pasted into the plot titles below to label the region covered.
start_pos <- min(df$methylationPosition)
end_pos <- max(df$methylationPosition)

In [None]:
start_pos

In [None]:
end_pos

In [None]:
head(levels(factor(df$methylationPosition))) # make sure these don't all end in 0

What percentage of models have all coefficients dropped during regularization?

In [None]:
# Fraction (0-1, not a percentage) of models whose `cor` is NA.
mean(is.na(df$cor))

In [None]:
nrow(df)

In [None]:
t(t(table(df$alpha)))

In [None]:
table(df$windowSize)

What if we limit to the best model for each position?

## Which alpha, window_size tend to give most `NA`?

We get `NA` values for cor when all SNPs are dropped during regularization.

In [None]:
library(dplyr)
library(ggplot2)

# Percentage of models with NA `cor` for every (alpha, windowSize) pair.
# Window sizes carry a "window =" prefix here because they label the facets.
na_percentage.1 <- df %>%
  mutate(
    alpha = factor(alpha),
    windowSize = factor(windowSize,
                        labels = paste("window =", levels(factor(windowSize))))
  ) %>%
  group_by(alpha, windowSize) %>%
  summarise(PercentageNA = 100 * mean(is.na(cor)), .groups = 'drop')

# Bars: alpha on the x-axis, one facet per window size.
ggplot(na_percentage.1, aes(x = alpha, y = PercentageNA)) +
  geom_col(fill = "skyblue") +
  facet_wrap(~windowSize) +
  labs(
    title = paste0("Percentage of sites with NA R by\nalpha (x-axis) and window_size (facet)\nChr1:",
                   start_pos, ":", end_pos),
    x = "alpha",
    y = "Percentage of NA"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

# Same summary with the roles swapped: alpha carries the facet label prefix.
na_percentage.2 <- df %>%
  mutate(
    alpha = factor(alpha, labels = paste("alpha =", levels(factor(alpha)))),
    windowSize = factor(windowSize)
  ) %>%
  group_by(alpha, windowSize) %>%
  summarise(PercentageNA = 100 * mean(is.na(cor)), .groups = 'drop')

# Bars: window size on the x-axis, one facet per alpha.
ggplot(na_percentage.2, aes(x = windowSize, y = PercentageNA)) +
  geom_col(fill = "lightgreen") +
  facet_wrap(~alpha) +
  labs(
    title = paste0("Percentage of sites with NA R by\nwindow size (x-axis) and alpha (facet)\nChr1:",
                   start_pos, ":", end_pos),
    x = "Window Size",
    y = "Percentage of NA"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )


In [None]:
library(dplyr)
library(ggplot2)

# Shared y-axis breaks: every 0.1 across the observed range of `cor`,
# rounded outwards to whole numbers.
cor_range <- range(df$cor, na.rm = TRUE)
cor_breaks <- seq(floor(cor_range[1]), ceiling(cor_range[2]), by = 0.1)

# Version 1: alpha on the x-axis, window size (with prefix) as the facet.
df_formatted.1 <- df %>%
  mutate(
    alpha = factor(alpha),
    windowSize = factor(windowSize,
                        labels = paste("window =", levels(factor(windowSize))))
  )

p_alpha_formatted <- ggplot(df_formatted.1, aes(x = alpha, y = cor)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.05) +
  geom_violin(fill = "skyblue", alpha = 0.3) +
  facet_wrap(~windowSize) +
  scale_y_continuous(breaks = cor_breaks) +
  labs(
    title = paste0("Correlation (R) by alpha (x-axis) and window_size (facet)\nChr1:",
                   start_pos, ":", end_pos),
    x = "Alpha",
    y = "Correlation (R)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

# Version 2: window size on the x-axis, alpha (with prefix) as the facet.
df_formatted.2 <- df %>%
  mutate(
    alpha = factor(alpha, labels = paste("alpha =", levels(factor(alpha)))),
    windowSize = factor(windowSize)
  )

p_windowSize_formatted <- ggplot(df_formatted.2, aes(x = windowSize, y = cor)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.05) +
  geom_violin(fill = "skyblue", alpha = 0.3) +
  facet_wrap(~alpha) +
  scale_y_continuous(breaks = cor_breaks) +
  labs(
    title = paste0("Correlation (R) by window size (x-axis) and alpha (facet)\nChr1:",
                   start_pos, ":", end_pos),
    x = "Window Size",
    y = "Correlation (R)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

print(p_alpha_formatted)
print(p_windowSize_formatted)


<div class="alert alert-block alert-info">Including two additional window sizes (200k and 2M) it becomes increasingly clear to the eye that there is in fact a trend.</div>

# Track trends of changes in `alpha`, `window_size` for `cor` of specific given CpG sites

## By plotting

#### Without scaling `cor`

In [None]:
# Add a human-readable, correctly ordered window-size label to a data frame.
#
# Args:
#   df: data frame containing a numeric window-size column (in base pairs).
#   windowSizeColumn: name of that column (default 'windowSize').
#
# Returns:
#   `df` with an added `windowSizeLabel` factor ("500bp", "5kb", "1Mb", ...)
#   whose levels are ordered by the numeric window size. Previously the label
#   was a plain character column, so plots sorted it alphabetically
#   ("100kb" < "10kb" < "1Mb") and the computed ordering was discarded.
#   Missing window sizes yield an NA label.
convert_and_order_windowSizes <- function(df, windowSizeColumn = 'windowSize') {
  # Vectorised bp -> "Mb"/"kb"/"bp" conversion.
  convert_to_genetic_terms <- function(size) {
    divisor <- ifelse(size >= 1e6, 1e6, ifelse(size >= 1e3, 1e3, 1))
    unit <- ifelse(size >= 1e6, "Mb", ifelse(size >= 1e3, "kb", "bp"))
    label <- paste0(size / divisor, unit)
    label[is.na(size)] <- NA_character_
    label
  }

  sizes <- df[[windowSizeColumn]]

  # sort() drops NA, so the levels list only real sizes, smallest first.
  ordered_labels <- unique(convert_to_genetic_terms(sort(unique(sizes))))

  df$windowSizeLabel <- factor(convert_to_genetic_terms(sizes),
                               levels = ordered_labels)
  df
}

df <- convert_and_order_windowSizes(df, 'windowSize')


In [None]:
# One faint line per methylation site tracing its R across window sizes.
ggplot(
  df,
  aes(x = windowSizeLabel, y = cor, group = methylationPosition,
      color = factor(methylationPosition))
) +
  geom_line(alpha = 0.2) +
  labs(
    title = paste0("Correlation (R) by Methylation Position across Window Sizes\nfor 2000 windows Chr1:",
                   start_pos, ":", end_pos),
    x = "Window Size",
    y = "Correlation (R)"
  ) +
  theme_minimal() +
  theme(
    # One colour per site would make the legend unreadable, so hide it.
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )


In [None]:
# Keep all rows belonging to a random subset of methylation sites.
#
# Args:
#   df: data frame with a `methylationPosition` column.
#   x:  number of distinct sites to keep; capped at the number available.
#
# Returns:
#   The rows of `df` whose `methylationPosition` is among the sampled sites.
filter_random_methylation_sites <- function(df, x) {
  sites <- unique(df$methylationPosition)
  x <- min(x, length(sites))

  # Sample indices, not the values themselves: sample(v, n) treats a
  # length-one numeric `v` as 1:v, which would return an arbitrary integer
  # instead of the only site when `df` holds a single position.
  selected_sites <- sites[sample.int(length(sites), x)]

  df[df$methylationPosition %in% selected_sites, , drop = FALSE]
}

filtered_df <- filter_random_methylation_sites(df, x = 20)
#head(filtered_df)


In [None]:
# Same per-site trajectories, restricted to the randomly sampled sites and
# drawn with heavier lines so individual sites can be followed.
ggplot(
  filtered_df,
  aes(x = windowSizeLabel, y = cor, group = methylationPosition,
      color = factor(methylationPosition))
) +
  geom_line(alpha = 0.75, size = 1.5) +
  labs(
    title = "Correlation (R) by methylation position across window sizes\n for 20 randomly selected windows",
    x = "Window Size",
    y = "Correlation (R)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )


#### With scaling `cor`

In [None]:
library(dplyr)

# Rescale `cor` to a 0-100 range separately within each methylation site.
#
# Args:
#   df: data frame with `methylationPosition` and `cor` columns.
#
# Returns:
#   `df` (ungrouped) with an added `scaled_cor` column. NA `cor` values stay
#   NA but no longer turn every value of their site into NA: the per-site
#   minimum and maximum are computed over the non-missing values only.
#   A site whose non-missing `cor` values are all equal yields NaN (0 / 0).
scale_cor_values <- function(df) {
  rescale_to_percent <- function(values) {
    # Nothing to scale against; also avoids min()/max() warnings on empty input.
    if (all(is.na(values))) {
      return(rep(NA_real_, length(values)))
    }
    bounds <- range(values, na.rm = TRUE)
    (values - bounds[1]) / (bounds[2] - bounds[1]) * 100
  }

  df %>%
    group_by(methylationPosition) %>%
    mutate(scaled_cor = rescale_to_percent(cor)) %>%
    ungroup()
}

df_scaled <- scale_cor_values(filtered_df)


In [None]:
max(df_scaled$scaled_cor, na.rm = TRUE)

In [None]:
min(df_scaled$scaled_cor, na.rm = TRUE)

In [None]:
# Per-site trajectories again, using the within-site rescaled correlation.
ggplot(
  df_scaled,
  aes(x = windowSizeLabel, y = scaled_cor, group = methylationPosition,
      color = factor(methylationPosition))
) +
  geom_line(alpha = 0.75, size = 1.5) +
  labs(
    title = "Correlation (R) by methylation position across window sizes",
    x = "Window Size",
    y = "Correlation (R), scaled"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

<div class="alert alert-block alert-info">What a mess... This didn't clear things up like I hoped. Let's try a nonvisual approach to see if decline of R over window sizes is significant.</div>

### Statistical models

Let's start with a simple linear model and ANOVA to see if we're able to detect an effect of window size on R. (Alpha is fixed at 0.5 in these runs, so alpha and its interaction with window size are not included in the model below.)

In [None]:
model <- lm(cor ~ windowSize,
            data = df)

In [None]:
model

In [None]:
anova(model)

<div class="alert alert-block alert-info">Yes, clear effect</div>

## What's the relationship between SNP effect size and distance from methylation site... and how does this depend on window size?

First, extract all SNP coefficients, IDs, and corresponding methylation sites for our models.<br><br>Let's only make this df for one window size at a time, for fair comparisons.

In [None]:
# Expand a scaffold object into one row per (model, SNP) pair.
#
# Args:
#   object: an object with `scaffoldIdentifier` and `models` slots, where each
#           model carries a named `snpWeights` vector.
#
# Returns:
#   A data frame with columns scaffoldIdentifier, methylationPosition,
#   windowSize, SNP_ID, SNP_Weight, cor, plus the `windowSizeLabel` column
#   added by convert_and_order_windowSizes().
SNPs_to_df <- function(object) {
  scaffold_id <- object@scaffoldIdentifier

  # Model-level attributes are repeated once for every SNP in the model.
  expand_model <- function(model) {
    weights <- model@snpWeights
    ids <- names(weights)
    n_snps <- length(ids)

    data.frame(
      scaffoldIdentifier = rep(scaffold_id, n_snps),
      methylationPosition = rep(model@methylationPosition, n_snps),
      windowSize = rep(model@windowSize, n_snps),
      SNP_ID = ids,
      SNP_Weight = weights,
      cor = rep(model@evaluation_results[['cor']], n_snps)
    )
  }

  stacked <- do.call("rbind", lapply(object@models, expand_model))
  convert_and_order_windowSizes(stacked, 'windowSize')
}

In [None]:
snp_df <- data.frame()

In [None]:
# Read every selected scaffold and stack the per-SNP rows in a single
# bind_rows() call. This avoids re-copying the (large) `snp_df` on each
# iteration, and re-running the cell rebuilds it instead of appending
# duplicate rows to the previous result.
snp_df <- bind_rows(lapply(scaff_paths, function(scaff_path) {
  SNPs_to_df(readRDS(scaff_path))
}))

In [None]:
# Pull the digits that sit between two colons in the SNP ID and use them as
# the SNP's position (presumably IDs look like "chr:pos:ref:alt" - TODO confirm).
snp_position_text <- gsub(".*:(\\d+):.*", "\\1", snp_df[["SNP_ID"]])
snp_df$SNP_Position <- as.numeric(snp_position_text)

# Unsigned distance between each SNP and the methylation site it was modelled for.
snp_df$Distance <- abs(snp_df$methylationPosition - snp_df$SNP_Position)


In [None]:
dim(snp_df)

In [None]:
# Per window size, R^2 of a linear fit of SNP effect size on distance from the
# methylation site. The response is SNP_Weight (the quantity plotted and
# discussed below); the earlier version regressed the model-level `cor`, which
# is constant across all SNPs of a model and does not measure effect size.
# NOTE(review): if effect-size magnitude is what matters, abs(SNP_Weight) may
# be the better response - confirm.
r_squared_df <- snp_df %>%
  group_by(windowSize) %>%
  summarise(
    R2 = summary(lm(SNP_Weight ~ Distance))$r.squared,
    .groups = "drop"
  ) %>%
  mutate(windowSize = as.factor(windowSize))

print(r_squared_df)


<div class="alert alert-block alert-info">Very weak correlation between distance from methylation site and SNP effect size is surprising</div>

In [None]:
library(ggplot2)

# Every SNP weight against its distance from the methylation site, coloured
# by window size (legend hidden).
ggplot(
  snp_df,
  aes(x = Distance, y = SNP_Weight, color = factor(windowSize))
) +
  geom_point(alpha = 0.1) +
  labs(
    title = "SNP effect size vs. distance by window size",
    x = "Distance between SNP and methylation site",
    y = "SNP effect size"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

In [None]:
library(ggplot2)

# Same scatter, zoomed to weights between 0 and 0.1; points outside that
# range are dropped by ylim().
ggplot(
  snp_df,
  aes(x = Distance, y = SNP_Weight, color = factor(windowSize))
) +
  geom_point(alpha = 0.2) +
  ylim(0, 0.1) +
  labs(
    title = "SNP effect size vs. distance by window size (y-truncated)",
    x = "Distance between SNP and methylation site",
    y = "SNP effect size"
  ) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

Does this trend hold up if we only look at large windows?

In [None]:
library(ggplot2)

# Restrict to windows of at least 500 kb (rows with a missing window size are dropped).
snp_df_subset <- subset(snp_df, windowSize >= 500000)

# Scatter for the large windows only, zoomed to weights between 0 and 0.025.
ggplot(
  snp_df_subset,
  aes(x = Distance, y = SNP_Weight, color = factor(windowSize))
) +
  geom_point(alpha = 0.2) +
  ylim(0, 0.025) +
  labs(
    title = "SNP effect size vs. distance by window size (y-truncated)",
    x = "Distance between SNP and methylation site",
    y = "SNP effect size"
  ) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

In [None]:
library(ggplot2)

# Restrict to windows of at least 2 Mb (rows with a missing window size are dropped).
snp_df_subset <- subset(snp_df, windowSize >= 2000000)

# Scatter for the largest windows only, zoomed to weights between 0 and 0.025.
ggplot(
  snp_df_subset,
  aes(x = Distance, y = SNP_Weight, color = factor(windowSize))
) +
  geom_point(alpha = 0.2) +
  ylim(0, 0.025) +
  labs(
    title = "SNP effect size vs. distance by window size (y-truncated)",
    x = "Distance between SNP and methylation site",
    y = "SNP effect size"
  ) +
  theme(
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 22),
    text = element_text(size = 20)
  )

## When window size varies, how similar are the effect sizes for given SNPs?

<div class="alert alert-block alert-info">In these plots, each dot represents a SNP included in runs for both window sizes, with its effect size (SNP weight) for each window size shown on a different axis.</div>

In [None]:
# Compare SNP effect sizes estimated under two window sizes.
#
# Args:
#   data:   per-SNP data frame with columns SNP_ID, methylationPosition,
#           SNP_Weight and windowSizeLabel.
#   labelX: window-size label shown on the x-axis (e.g. "5kb").
#   labelY: window-size label shown on the y-axis (e.g. "500kb").
#
# Side effects:
#   Prints a scatter plot with a linear trend line and the R^2 of
#   SNP_Weight_y ~ SNP_Weight_x.
#
# Returns:
#   The R^2 value, invisibly (NA if the two window sizes share no SNP/site pair).
plotSNPEffectSizes <- function(data, labelX, labelY) {
  # A SNP has one weight per methylation-site model, so pairs must be matched
  # on SNP *and* site. Joining on SNP_ID alone pairs a SNP's weight for one
  # site with its weight for every other site, which inflates the point count
  # and dilutes any agreement.
  keys <- c("SNP_ID", "methylationPosition")
  wanted <- c(keys, "SNP_Weight")

  dataX <- data[which(data$windowSizeLabel == labelX), wanted, drop = FALSE]
  dataY <- data[which(data$windowSizeLabel == labelY), wanted, drop = FALSE]

  # Inner join: keeps only SNP/site pairs present under both window sizes.
  commonSNPs <- merge(dataX, dataY, by = keys, suffixes = c("_x", "_y"))

  if (nrow(commonSNPs) == 0) {
    warning("No SNP/site pairs shared between window sizes ", labelX,
            " and ", labelY, ".", call. = FALSE)
    return(invisible(NA_real_))
  }

  plot <- ggplot(commonSNPs, aes(x = SNP_Weight_x, y = SNP_Weight_y)) +
    geom_point(alpha = 0.05) +
    geom_smooth(method = "lm", se = FALSE, color = "blue", size = 2) +
    labs(title = paste("SNP effect size comparison between window sizes", labelX, "and", labelY),
         x = paste("SNP Effect Size - Window Size", labelX),
         y = paste("SNP Effect Size - Window Size", labelY)) +
    theme_minimal() +
    theme(legend.position = "none",
          text = element_text(size = 20),
          plot.title = element_text(size = 22),
          axis.title = element_text(size = 20),
          axis.text.x = element_text(size = 14),
          axis.text.y = element_text(size = 14))

  print(plot)

  # Calculate and print R^2 for the linear model of the weight comparison.
  model <- lm(SNP_Weight_y ~ SNP_Weight_x, data = commonSNPs)
  rSquared <- summary(model)$r.squared
  cat("R^2:", rSquared, "\n")

  invisible(rSquared)
}

plotSNPEffectSizes(snp_df, "5kb", "500kb")


<div class="alert alert-block alert-info">Weak agreement when window sizes are very different</div>

In [None]:
levels(factor(snp_df$windowSize))

In [None]:
plotSNPEffectSizes(snp_df, "10kb", "20kb")

<div class="alert alert-block alert-info">Agreement is not much stronger when window sizes are closer</div>

In [None]:
plotSNPEffectSizes(snp_df, "50kb", "500kb")

## How frequently does each possible window size give highest cor for given methylation site?

In [None]:
# For each methylation site, find the window size whose model has the highest
# `cor`. The previous aggregate() call used which.max() - an index *within*
# one site's values - to index the full `df$windowSizeLabel` column, so the
# reported labels came from unrelated rows at the top of `df`.
scored <- df[!is.na(df$cor), c("methylationPosition", "cor", "windowSizeLabel")]

# Sort best-first within each site; order() is stable, so ties keep the row
# that appears first in `df`, matching which.max() semantics.
scored <- scored[order(scored$methylationPosition, -scored$cor), ]
best_per_site <- scored[!duplicated(scored$methylationPosition), ]

# Column `cor` holds the winning window-size *label*, mirroring the shape of
# the original aggregate() result.
highest_cor_labels <- data.frame(
  methylationPosition = best_per_site$methylationPosition,
  cor = as.character(best_per_site$windowSizeLabel),
  stringsAsFactors = FALSE
)
frequency_of_max_cor <- table(highest_cor_labels$cor)
print(frequency_of_max_cor)


In [None]:
percentage_of_max_cor <- prop.table(frequency_of_max_cor) * 100
print(percentage_of_max_cor)
sum(percentage_of_max_cor)

<div class="alert alert-block alert-info">All considered, there seems to be no easy answer for best window size</div>