# Compare MWAS methods for sanity tests and troubleshooting

In version e, we unify the elastic.net functions and make sure 1se and 1min produce different results.

In version f, we try old covariates with all new data for everything else

In version g, we do another quick sanity test with "all" sample methylation data instead of AA.

In version h, we try old covariates, old BSseq data, and new everything else. Since we're using old BSseq data, we must change back to Chr 7 for the selected peak for which we have sample data. We got normal results!

In version i, change back to Chr 1 and see if we still get normal results.... We did not!

Ok, now let's try other chromosomes/regions with random sites (version l)

version m: add manhattan plots and MAF filter

version n: go back to region in Shizhong's `BSsample.rda` file

Down the road (o), let's change covariates back to ours

Also in o, we found that Shizhong's old results still don't quite match up with ours.

So now in p, we will use original BSsample and see if they finally do... In p, we still get different results.

Now in q, we use old covar in addition to old BSsample.

In r, we also use old SNPs. Ding ding ding, now we get approximately same results (although slightly different I guess due to stochasticity)

In s, we use new SNPs for reference population in stage 2, but old SNPs for LIBD in stage 1. We get different result again.

In t, we switch back to old reference population SNPs for stage 2

In u, we closely inspect Shizhong's snp.1kg.eur2 and compare it to ours. We discovered alleles are flipped differently for 10 SNPs and not the other 56.

In v, we do another sanity check where we use our re-created snp.1kg.eur2 made from PLINK files, but use the exact same 56 SNPs as in Shizhong's version. Of note, 10 of these don't match, only the other 46.

In [1]:
Sys.time()

[1] "2024-08-06 13:08:28 PDT"

In [2]:
# Run-level settings for this sanity test.
chr <- 7 # code set up for old bsseq chr 7, or anything with new bsseq
# MAF cutoff; in the cells visible here it is only referenced by the
# commented-out MAF filter.
maf <- 0.05

# Position bounds for the "range" site-selection mode; in the cells visible
# here they are only referenced by the commented-out NEW-methylation cell.
min_site_to_test_pos <- 1980077
max_site_to_test_pos <- 1989957

In [3]:
sites_to_test_pos <- "range"
#n_samples <- 500

In [4]:
# # Chr 1 sites
# sites_to_test_pos <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)

In [5]:
filter_snps <- FALSE # speed things up by pre-filtering whole chromosomes to desired regions

## Be ready with matched up SNP and covariate files

In [6]:
library("glmnet")
library("e1071")
library("doParallel")
library("data.table")

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [7]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

In [8]:
df <- df[which(df$Chr == chr), ]

In [9]:
# Keep only the rows for the "all" population in the caudate ("caud") region.
keep_rows <- which(df$population == "all" & df$brain_region == "caud")
df <- df[keep_rows, ]

In [10]:
df

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,subpopulation,brain_region,population,region,cov_file,cov_file2,cov_file3,cov_file4
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
7,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr7.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr7_all.rda,159334659,49742,1490198,1,all,caud,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth-no-dx.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-dx.csv


In [11]:
i <- 1

In [12]:
df[1, ]

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,subpopulation,brain_region,population,region,cov_file,cov_file2,cov_file3,cov_file4
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
7,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr7.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr7_all.rda,159334659,49742,1490198,1,all,caud,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth-no-dx.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-dx.csv


In [13]:
set.seed(2018)
wind <- c(10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

## Functions

### Shizhong's original

In [14]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
# elastic.net <- function(trainX,trainY){
#     if(nrow(trainX)!=length(trainY)){
#             stop("Number of observations is differerent")
#     } 

#     # optimize alpha---mixing parameter  
#     a <- 0.5
#     search <- foreach(ai = a, .combine = rbind) %dopar% {
#         cv.fit <- cv.glmnet(
#                         trainX,
#                         trainY,
#                         nfold = 5,
#                         type.measure = "mse",
#                         paralle = TRUE,
#                         alpha = ai
#                         )
#         data.frame(
#                         cvm = min(cv.fit$cvm),
#                         lambda = cv.fit$lambda.min,
#                         alpha = ai
#                         )
#         } 
#     cv.opt <- search[search$cvm == min(search$cvm),] 

#         # fit model by optimized alpha and lambda
#         yfit = glmnet(
#         trainX,
#         trainY,
#         lambda = cv.opt$lambda,
#         alpha = cv.opt$alpha
#                 )       
#         idf <- coef(yfit)
#         idx <- which(idf != 0)
#         selectf <- data.frame(
#                 features = idf@Dimnames[[1]][idx], 
#                 coefs = idf [idx]
#         )
# }

# Compute the MWAS association statistic for one methylation site.
#
# gwas:   numeric vector of GWAS z-scores, one per SNP in the model.
# weight: numeric vector of SNP weights (model coefficients), in the same
#         SNP order as `gwas`.
# geno:   genotype matrix with one column per SNP, in the same order as
#         `weight`; only its column-wise correlation matrix is used.
#
# Returns c(z, p): the weighted sum of z-scores divided by sqrt(w' R w),
# where R = cor(geno), followed by its two-sided normal p-value.
MWAS <- function(gwas, weight, geno){
        # `%*%` promotes a length-1 vector so that the product becomes an outer
        # product instead of failing, so mismatched inputs are rejected here.
        if (length(gwas) != length(weight) || length(weight) != NCOL(geno)) {
                stop("gwas, weight and the columns of geno must have the same length (",
                     length(gwas), ", ", length(weight), ", ", NCOL(geno), ")")
        }
        z <- drop(gwas %*% weight)
        z.cor <- cor(geno)
        se <- sqrt(drop(weight %*% z.cor %*% weight))
        z <- z / se
        p <- 2 * pnorm(abs(z), lower.tail = FALSE)
        c(z, p)
}

#### Modified `elastic.net` to use `lambda.1se`

In [15]:
# modified to use lambda 1se and appropriate cvm
# Fit an elastic-net model (alpha fixed at 0.5) and return its non-zero terms.
#
# trainX:        predictor matrix, one row per observation.
# trainY:        response vector, one entry per row of trainX.
# lambda.choice: "1se" (default) uses cv.glmnet's lambda.1se, "min" uses
#                lambda.min; anything else is rejected up front.
#
# Returns a data.frame with columns `features` (coefficient names, including
# "(Intercept)" when it is non-zero) and `coefs` (their values).
# This variant runs the alpha search through foreach/%dopar% and asks
# cv.glmnet for parallel folds.
# NOTE(review): parallel = TRUE presumably needs a registered backend - confirm.
elastic.net <- function(trainX,trainY, lambda.choice = "1se"){
    lambda.choice <- match.arg(lambda.choice, c("1se", "min"))
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is different")
    }

    # optimize alpha---mixing parameter (currently a single candidate)
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfolds = 5, # spelled out; `nfold` only worked via partial matching
                        type.measure = "mse",
                        parallel = TRUE,
                        alpha = ai
                        )

        if(lambda.choice == "1se"){
            chosen_lambda <- cv.fit$lambda.1se
            # CV error at the lambda actually used, not the overall minimum
            chosen_cvm <- cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se]
        } else {
            chosen_lambda <- cv.fit$lambda.min
            chosen_cvm <- min(cv.fit$cvm)
        }

        data.frame(
                        cvm = chosen_cvm,
                        lambda = chosen_lambda,
                        alpha = ai
                        )
        }
    # which.min() guarantees exactly one row even if several alphas tie
    cv.opt <- search[which.min(search$cvm),]

    # fit model by optimized alpha and lambda
    yfit <- glmnet(
    trainX,
    trainY,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha)

    idf <- coef(yfit)
    idx <- which(idf != 0)
    data.frame(
            features = rownames(idf)[idx],
            coefs = idf[idx]
    )
}

In [16]:
# version using for loop for easier debugging

# Fit an elastic-net model (alpha fixed at 0.5) and return its non-zero terms.
# Sequential variant (plain for loop, no parallel folds) for easier debugging.
#
# trainX:        predictor matrix, one row per observation.
# trainY:        response vector, one entry per row of trainX.
# lambda.choice: "1se" (default) uses cv.glmnet's lambda.1se, "min" uses
#                lambda.min; anything else is rejected up front.
#
# Returns a data.frame with columns `features` (coefficient names, including
# "(Intercept)" when it is non-zero) and `coefs` (their values).
elastic.net <- function(trainX, trainY, lambda.choice = "1se") {
    lambda.choice <- match.arg(lambda.choice, c("1se", "min"))
    if (nrow(trainX) != length(trainY)) {
        stop("Number of observations is different")
    }

    a <- 0.5 # candidate mixing parameters (currently a single value)
    search <- vector("list", length(a))

    for (j in seq_along(a)) {
        ai <- a[j]
        cv.fit <- cv.glmnet(
            trainX,
            trainY,
            nfolds = 5, # spelled out; `nfold` only worked via partial matching
            type.measure = "mse",
            parallel = FALSE,
            alpha = ai
        )

        if (lambda.choice == "1se") {
            chosen_lambda <- cv.fit$lambda.1se
            # CV error at the lambda actually used, not the overall minimum
            chosen_cvm <- cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se]
        } else {
            chosen_lambda <- cv.fit$lambda.min
            chosen_cvm <- min(cv.fit$cvm)
        }

        search[[j]] <- data.frame(
            cvm = chosen_cvm,
            lambda = chosen_lambda,
            alpha = ai
        )
    }
    search <- do.call(rbind, search)

    # which.min() guarantees exactly one row even if several alphas tie
    cv.opt <- search[which.min(search$cvm), ]

    # refit on all data at the selected alpha and lambda
    yfit <- glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
    )

    idf <- coef(yfit)
    idx <- which(idf != 0)
    data.frame(
        features = rownames(idf)[idx],
        coefs = idf[idx]
    )
}


## Replace all old objects with new objects in same format

### Methylation data

In [17]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following object is masked from ‘package:utils’:

    findMatches


The

In [18]:
# # # The code in this block is for NEW methylation data, which can be for any chromosome or portion

# # # load data for mwas
# # # load("./rda/caudate_mwas_data_chr22.rda")
# load(df$methylation_data[i])

# p <- getMeth(BSobj2)


# rownames(p) <- start(BSobj2)

# if(sites_to_test_pos[1] == "random"){
#     sites_to_test_pos <- sample(start(BSobj2), n_samples)
# } else {
#     sites_to_test_pos <- start(BSobj2)[which(start(BSobj2) >= min_site_to_test_pos & start(BSobj2) <= max_site_to_test_pos)]
# }

# sites_to_test <- which(start(BSobj2) %in% sites_to_test_pos)
# p <- p[sites_to_test, ]
# cg <- as.numeric(rownames(p))

In [19]:
# length(sites_to_test_pos)

In [20]:
# head(sites_to_test_pos)

Subset our p object to the same sites as the old one, compare values

In [21]:
# Old dataset
load("BSsample.rda", verbose = TRUE) # we get the @colData attribute as a data.frame here
load("p1.rda", verbose = TRUE) # and the p matrix from getMeth() here

BSobj2 <- BSsample
cg <- as.numeric(rownames(p)) # This line same whether we use old or new version
sites_to_test_pos <- cg

Loading objects:
  BSsample
Loading objects:
  snp.gwas2
  snp.1kg.eur2
  map.1kg.eur2
  snp2
  map2
  p
  BSsample


### covariates

In [22]:
# #The code in this block is for NEW covariate data

# covs <- fread(df$cov_file[i])
# covs <- t(covs)
# colnames(covs) <- covs[1, ]
# covs <- covs[2:nrow(covs), ]
# # transpose so we have same orientation as original code

In [23]:
# This is the OLD covariate data

load("covs_for_meqtl.rda")

### Regress methylation data over covariates

In [24]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [25]:
# Put covariate columns in the same sample order as BSobj2, then transpose so
# that samples are rows. Samples without a covariate column become NA.
mat <- match(BSobj2$brnum, colnames(covs))
covs <- t(covs[, mat])
# Empty container for the residuals, same shape as the methylation matrix p.
p.residual <- matrix(NA, nrow = dim(p)[1], ncol = dim(p)[2])

In [26]:
# This is something we only need to do for new covariates
if("genoPC1" %in% colnames(covs)){
    rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]
}

In [27]:
colnames(p.residual) <- BSobj2$brnum

In [28]:
covs <- as.data.frame(covs)
# Every covariate except Dx and Sex is coerced to numeric in one pass.
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))
covs[cols_to_convert] <- lapply(covs[cols_to_convert], as.numeric)

# Print the modified data frame to check the conversion
#print(dat)


In [29]:
# Regress each methylation site on the covariates and store the residuals.
# Samples with a missing methylation value or any missing covariate keep NA
# in p.residual.

# The covariates are the same for every site, so their completeness is
# checked once instead of inside the loop.
covs_complete <- complete.cases(covs)

for (i in seq_len(nrow(p))) { # For each methylation site (safe when p has 0 rows)
    dat <- as.data.frame(cbind(y = p[i,], covs))

    # Rows usable for the fit: complete covariates and an observed y
    valid_rows <- covs_complete & !is.na(dat$y)

    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)

        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

In [30]:
dim(p.residual)

In [31]:
p.residual[1:8, 1:8]

Br836,Br845,Br848,Br863,Br914,Br948,Br949,Br963
0.01273526,0.003955796,-0.006492196,-0.003937108,-0.003108317,-0.026351,-0.01780211,0.02198214
0.01244238,0.004791327,-0.006684356,-0.004304248,-0.002787528,-0.02590003,-0.01859511,0.02199029
0.01199289,0.005968815,-0.006910408,-0.004612377,-0.002502153,-0.02531498,-0.01932406,0.02187912
0.01185321,0.006292368,-0.006985009,-0.004655232,-0.002462299,-0.02516852,-0.01943433,0.02181896
0.01165765,0.006722936,-0.007091745,-0.004689633,-0.002431018,-0.02498177,-0.019532,0.02172186
0.01077923,0.008466136,-0.007601638,-0.004627312,-0.002500745,-0.02429218,-0.01948105,0.02116477
0.01066253,0.008680782,-0.007672761,-0.004601013,-0.002526803,-0.02421065,-0.01943371,0.02108082
0.0103596,0.009223814,-0.007860604,-0.004518253,-0.002606966,-0.02400573,-0.01927932,0.02085531


In [32]:
sum(is.na(p.residual))

In [33]:
sum(rowSums(is.na(p.residual)) == ncol(p.residual))
sum(colSums(is.na(p.residual)) == nrow(p.residual))

In [34]:
# p.residual <- p.residual[, colSums(is.na(p.residual)) != nrow(p.residual)]

In [35]:
sum(is.na(p.residual))

In [36]:
snp.gwas2 <- NULL

In [37]:
#load("p1.rda", verbose = TRUE)

### summary stats

In [38]:
library(data.table)
library(CpGWAS)

In [39]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [40]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [41]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [42]:
# Select and reorder the summary-stat columns by position. Per the head()
# output printed below, this yields SNP, CHR, BP, BP, INFO, A1, A2, z, P:
# column 3 (BP) is taken twice and column 20 is the z column.
# NOTE(review): presumably this mimics the layout of the old snp.gwas2 - confirm.
# NOTE(review): positional indices break silently if the input file's column
# order or count changes (the z column is only at index 20 for this file).
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [43]:
head(snp.gwas2, n = 1)

SNP,CHR,BP,BP,INFO,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs62513865,8,100579985,100579985,0.963,C,T,0.7016221,0.4847


In [44]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [45]:
CHR <- chr # to avoid R df local env mixing variables of columns name and subset variable

In [46]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == CHR), ]

In [47]:
if(filter_snps == TRUE){
    snp.gwas2 <- snp.gwas2[which(snp.gwas2$pos_hg38 >= (min(sites_to_test_pos)-10000) & snp.gwas2$pos_hg38 <= (max(sites_to_test_pos) + 10000)), ]
}

In [48]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [49]:
head(BSobj2)

DataFrame with 6 rows and 10 columns
               ID       brnum has_genotype   source  agedeath      sex     race
      <character> <character>    <logical> <factor> <numeric> <factor> <factor>
1 LIBD1633_190509       Br836         TRUE       DC     20.77        M       AA
2 LIBD1654_190509       Br845         TRUE       DC     24.79        M       AA
3 LIBD1508_190509       Br848         TRUE       DC     22.71        M       AA
4 LIBD1655_190509       Br863         TRUE       DC     43.85        M       AA
5 LIBD1604_190509       Br914         TRUE       DC     19.69        F       AA
6 LIBD1389_190509       Br948         TRUE       DC     45.91        M       AA
  primarydx       pmi        ph
   <factor> <numeric> <numeric>
1   Control      25.5      6.46
2   Control      43.5      6.94
3   Control      38.5      6.57
4   Control      24.5      6.57
5   Control      19.0      6.98
6   Control      38.5      6.66

In [50]:
dim(BSobj2)

In [51]:
BSobj2 <- BSobj2[which(BSobj2$brnum %in% colnames(p.residual)), ]

In [52]:
dim(BSobj2)

In [53]:
# build prediction models
# Logical mask of samples whose race is "CAUC", in BSobj2 row order.
# NOTE(review): p.residual and snp3 columns are re-sorted by name further down
# the notebook; confirm this mask is still aligned wherever it is applied.
idx.ea <- BSobj2$race == "CAUC"

In [54]:
levels(factor(snp.gwas2$chr))

### SNPs in LIBD population

#### Old dataset

#### New dataset prep: For reference, first load Shizhong's formatted SNPs on Chr7

In [55]:
# Take Shizhong's LIBD genotypes (snp2) and SNP map (map2) from p1.rda.
#
# p1.rda also contains snp.gwas2, snp.1kg.eur2, map.1kg.eur2, p and BSsample.
# A plain load() into the global environment at this point overwrites the
# snp.gwas2 that was just built from ss_path, so the file is loaded into a
# scratch environment and only the objects needed here are copied out.
# NOTE(review): if the OLD snp.gwas2 is what this version is meant to use,
# copy it explicitly with `snp.gwas2 <- p1_env$snp.gwas2`.
p1_env <- new.env()
load("p1.rda", envir = p1_env, verbose = TRUE)

snp2 <- p1_env$snp2
map2 <- p1_env$map2
# Old reference-panel objects are restored as well, as the plain load() did.
snp.1kg.eur2 <- p1_env$snp.1kg.eur2
map.1kg.eur2 <- p1_env$map.1kg.eur2

# colnames() works for both a matrix and a data.frame (names() is NULL for a matrix)
snp2_sorted <- snp2[, order(colnames(snp2))]

# Harmonise sample IDs with the methylation data ("Br0836" -> "Br836")
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

# Keep only samples that also have methylation residuals
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

# Second ":"-separated field of each row name
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

snp3 <- snp2
map3 <- map2

Loading objects:
  snp.gwas2
  snp.1kg.eur2
  map.1kg.eur2
  snp2
  map2
  p
  BSsample


#### New dataset: Now let's load ours on Chr1

In [56]:
# paths <- list(
#   pvar_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".pvar"),
#   pgen_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".pgen"),
#   psam_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".psam")
# )

# my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [57]:
# filter_snps <- TRUE

In [58]:
# if(filter_snps == TRUE){
#     snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= min(sites_to_test_pos)-100000 & my_SNPs$pvar_dt$POS <= max(sites_to_test_pos) + 100000)
# }

In [59]:
# if(filter_snps == TRUE){
#     snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
#                                variant_subset = snp_indices_of_interest)
#     colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
# } else {
#     snp3 <- pgenlibr::ReadList(my_SNPs$pgen)
#     colnames(snp3) <- my_SNPs$pvar_dt$ID
# }

In [60]:
# snp3[1:10, 1:10]

In [61]:
# rownames(snp3) <- my_SNPs$psam$`#IID`

filter by `maf`

In [62]:
# dim(snp3)

In [63]:
# verbose <- FALSE
# if (maf > 0){
#     mafs <- colMeans(snp3, na.rm = TRUE) / 2
#     mafs_below_threshold <- mafs < maf
#     if (any(mafs_below_threshold)) {
#       if (verbose) {
#         message(paste0("removing ", sum(mafs_below_threshold), " SNP(s) with MAF < ",
#                        maf, " for position ", meth_site_pos,
#                       " with window size ", window_size, ".\n\n"))
#       }
#       snp3 <- snp3[, !mafs_below_threshold, drop = FALSE] 
#     }
# }

In [64]:
# dim(snp3)

In [65]:
# map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [66]:
# snp3 <- t(snp3)

In [67]:
# map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [68]:
# dim(map3)

### SNPs in reference population

In [122]:
# Load reference-population (ref_EUR) genotypes for chromosome `chr` from
# PLINK2 pvar/pgen/psam files and build snp.1kg.eur3 / map.1kg.eur3.
paths <- list(
  pvar_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".pvar"),
  pgen_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".pgen"),
  psam_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".psam")
)

# NOTE(review): loadSNPData is presumably provided by CpGWAS; the code below
# relies on it returning $pvar_dt, $pgen and $psam - confirm.
my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

# Displays the flag only; the generic window filter below is commented out.
filter_snps

#if(filter_snps == TRUE){
    #snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= min(sites_to_test_pos)-10000 & my_SNPs$pvar_dt$POS <= max(sites_to_test_pos) + 10000)
#}

# Hard-coded position range used instead of the window above.
# NOTE(review): presumably the min/max POS of Shizhong's map.1kg.eur2 - confirm.
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 1963098 & my_SNPs$pvar_dt$POS <= 2009071)

# Read only the selected variants; columns are named by variant ID and rows
# by sample ID (samples x variants at this point).
snp.1kg.eur3 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur3) <- my_SNPs$psam$`IID`

# Variant map restricted to the same variants, in the same order.
map.1kg.eur3 <- my_SNPs$pvar_dt

map.1kg.eur3 <- map.1kg.eur3[snp_indices_of_interest, ]

# Transpose to variants x samples, the orientation used for snp.1kg.eur2.
snp.1kg.eur3 <- t(snp.1kg.eur3)

dim(map.1kg.eur3)

dim(snp.1kg.eur3)

### Compare reference population SNPs between Shizhong's old object and my new one

In [123]:
snp.1kg.eur2[1:10, 1:10]

Unnamed: 0_level_0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rs11773627,2,1,2,1,1,2,2,2,2,2
rs6972374,2,2,1,2,1,1,1,2,0,1
rs12666575,0,1,1,1,2,1,1,0,2,1
rs11766575,0,1,1,1,2,1,1,0,2,1
rs4721264,0,1,1,1,1,1,1,0,2,1
rs62442944,2,1,2,1,1,2,2,2,2,2
rs12699547,2,2,1,2,2,1,1,2,0,1
rs6461049,2,1,1,1,1,1,1,2,0,1
rs6969587,2,2,1,2,2,1,1,2,0,1
rs12699561,2,2,1,2,2,1,1,2,0,1


In [124]:
dim(snp.1kg.eur2)

In [125]:
dim(snp.1kg.eur3)

In [126]:
head(map.1kg.eur3)

#CHROM,POS,ID
<int>,<int>,<chr>
7,1963098,rs11773627
7,1963408,rs6972374
7,1963697,rs7795303
7,1964758,rs6946691
7,1964786,rs12666575
7,1964869,rs61467855


In [127]:
min(map.1kg.eur2$POS)
max(map.1kg.eur2$POS)

In [128]:
min(map.1kg.eur3$POS)
max(map.1kg.eur3$POS)

Is it possible that order is the only problem? What happens if we change order to match?

In [129]:
colnames(snp.1kg.eur3) <- paste0(colnames(snp.1kg.eur3), "_", colnames(snp.1kg.eur3))

In [130]:
all(colnames(snp.1kg.eur3) == colnames(snp.1kg.eur2))

I also want to compute MAF for all the SNPs.

I want to know the MAF of all SNPs (rows) that appear in snp.1kg.eur3 but not snp.1kg.eur2

In [131]:
# Minor-allele frequency for every SNP (row) of the re-created reference panel.
# rowMeans()/2 is the frequency of whichever allele is being counted; it can
# exceed 0.5 and flips with the allele coding, so it is folded to the minor
# allele. The result is stored as maf_eur3 so that the scalar `maf` threshold
# set at the top of the notebook is not overwritten.
af_eur3 <- rowMeans(snp.1kg.eur3, na.rm = TRUE) / 2
maf_eur3 <- pmin(af_eur3, 1 - af_eur3)

# Find SNPs in snp.1kg.eur3 but not in snp.1kg.eur2
snp_diff <- setdiff(rownames(snp.1kg.eur3), rownames(snp.1kg.eur2))

# MAF of SNPs that appear in snp.1kg.eur3 but not snp.1kg.eur2
maf_diff <- maf_eur3[snp_diff]

maf_diff

What's the minimum maf for snp.1kg.eur2

In [132]:
# Minor-allele frequency for every SNP (row) in snp.1kg.eur2.
# rowMeans()/2 is the frequency of the counted allele, which can exceed 0.5,
# so it is folded to the minor allele before taking the minimum.
af_eur2 <- rowMeans(snp.1kg.eur2, na.rm = TRUE) / 2
maf_eur2 <- pmin(af_eur2, 1 - af_eur2)

# Find the minimum MAF for snp.1kg.eur2
min_maf_eur2 <- min(maf_eur2)

min_maf_eur2

In [133]:
# Subset snp.1kg.eur3 to include SNPs found in snp.1kg.eur2
snp.1kg.eur3 <- snp.1kg.eur3[rownames(snp.1kg.eur3) %in% rownames(snp.1kg.eur2), ]

In [134]:
dim(snp.1kg.eur3)

In [135]:
dim(snp.1kg.eur2)

In [136]:
# Reorder rows of snp.1kg.eur3 to match snp.1kg.eur2
snp.1kg.eur3 <- snp.1kg.eur3[rownames(snp.1kg.eur2), ]

In [137]:
snp.1kg.eur3[1:10, 1:10]

Unnamed: 0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109
rs11773627,0,1,0,1,1,0,0,0,0,0
rs6972374,0,0,1,0,1,1,1,0,2,1
rs12666575,2,1,1,1,0,1,1,2,0,1
rs11766575,2,1,1,1,0,1,1,2,0,1
rs4721264,2,1,1,1,1,1,1,2,0,1
rs62442944,0,1,0,1,1,0,0,0,0,0
rs12699547,0,0,1,0,0,1,1,0,2,1
rs6461049,2,1,1,1,1,1,1,2,0,1
rs6969587,0,0,1,0,0,1,1,0,2,1
rs12699561,0,0,1,0,0,1,1,0,2,1


In [138]:
snp.1kg.eur2[1:10, 1:10]

Unnamed: 0_level_0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rs11773627,2,1,2,1,1,2,2,2,2,2
rs6972374,2,2,1,2,1,1,1,2,0,1
rs12666575,0,1,1,1,2,1,1,0,2,1
rs11766575,0,1,1,1,2,1,1,0,2,1
rs4721264,0,1,1,1,1,1,1,0,2,1
rs62442944,2,1,2,1,1,2,2,2,2,2
rs12699547,2,2,1,2,2,1,1,2,0,1
rs6461049,2,1,1,1,1,1,1,2,0,1
rs6969587,2,2,1,2,2,1,1,2,0,1
rs12699561,2,2,1,2,2,1,1,2,0,1


In [139]:
dim(snp.1kg.eur2)

In [140]:
dim(snp.1kg.eur3)

In [141]:
snp.1kg.eur3 <- 2 - snp.1kg.eur3

In [144]:
diff <- snp.1kg.eur3 - snp.1kg.eur2

In [146]:
snp.1kg.eur2[1:10, 1:10]

Unnamed: 0_level_0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rs11773627,2,1,2,1,1,2,2,2,2,2
rs6972374,2,2,1,2,1,1,1,2,0,1
rs12666575,0,1,1,1,2,1,1,0,2,1
rs11766575,0,1,1,1,2,1,1,0,2,1
rs4721264,0,1,1,1,1,1,1,0,2,1
rs62442944,2,1,2,1,1,2,2,2,2,2
rs12699547,2,2,1,2,2,1,1,2,0,1
rs6461049,2,1,1,1,1,1,1,2,0,1
rs6969587,2,2,1,2,2,1,1,2,0,1
rs12699561,2,2,1,2,2,1,1,2,0,1


In [147]:
snp.1kg.eur3[1:10, 1:10]

Unnamed: 0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109
rs11773627,2,1,2,1,1,2,2,2,2,2
rs6972374,2,2,1,2,1,1,1,2,0,1
rs12666575,0,1,1,1,2,1,1,0,2,1
rs11766575,0,1,1,1,2,1,1,0,2,1
rs4721264,0,1,1,1,1,1,1,0,2,1
rs62442944,2,1,2,1,1,2,2,2,2,2
rs12699547,2,2,1,2,2,1,1,2,0,1
rs6461049,0,1,1,1,1,1,1,0,2,1
rs6969587,2,2,1,2,2,1,1,2,0,1
rs12699561,2,2,1,2,2,1,1,2,0,1


I want to create two new dfs, titled snp.1kg.eur2.diff and snp.1kg.eur3.diff, which only show rows that aren't perfect matches between snp.1kg.eur2 and snp.1kg.eur3. This is AFTER we flip alleles so that they match (or at least, so *most* of them match...)

In [148]:
# Per-SNP flag: TRUE when every sample's genotype agrees between the two panels
rows_identical <- apply(snp.1kg.eur2 == snp.1kg.eur3, MARGIN = 1, FUN = all)
mismatch_indices <- which(!rows_identical)

# Side-by-side views restricted to the SNPs that disagree somewhere
snp.1kg.eur2.diff <- snp.1kg.eur2[mismatch_indices, ]
snp.1kg.eur3.diff <- snp.1kg.eur3[mismatch_indices, ]

# Show both
snp.1kg.eur2.diff
snp.1kg.eur3.diff


Unnamed: 0_level_0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109,⋯,NA20814_NA20814,NA20815_NA20815,NA20818_NA20818,NA20819_NA20819,NA20821_NA20821,NA20822_NA20822,NA20826_NA20826,NA20827_NA20827,NA20828_NA20828,NA20832_NA20832
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rs6461049,2,1,1,1,1,1,1,2,0,1,⋯,0,2,1,1,0,0,1,2,0,1
rs3996330,0,0,1,0,1,1,1,0,2,1,⋯,2,1,1,1,1,1,1,0,0,0
rs11763870,1,1,1,1,1,1,1,2,0,1,⋯,0,1,1,1,0,0,1,2,0,0
rs1403174,0,0,1,0,0,1,1,0,2,1,⋯,2,0,1,1,1,1,0,0,0,1
rs3857706,0,0,1,0,0,1,1,0,2,1,⋯,2,0,1,1,1,1,0,0,0,1
rs872464,0,0,1,0,0,1,1,0,2,1,⋯,2,0,1,1,1,1,0,0,0,1
rs10950503,0,0,1,0,1,1,1,0,2,1,⋯,2,0,1,1,1,1,0,0,1,1
rs10479762,2,1,1,1,1,1,1,2,0,1,⋯,0,1,1,1,0,0,1,2,0,0
rs871925,2,1,1,1,1,1,1,2,0,1,⋯,0,1,1,1,0,0,1,2,0,0
rs13227554,2,1,1,1,1,1,1,2,0,1,⋯,0,1,1,1,0,0,1,2,0,0


Unnamed: 0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109,⋯,NA20814_NA20814,NA20815_NA20815,NA20818_NA20818,NA20819_NA20819,NA20821_NA20821,NA20822_NA20822,NA20826_NA20826,NA20827_NA20827,NA20828_NA20828,NA20832_NA20832
rs6461049,0,1,1,1,1,1,1,0,2,1,⋯,2,0,1,1,2,2,1,0,2,1
rs3996330,2,2,1,2,1,1,1,2,0,1,⋯,0,1,1,1,1,1,1,2,2,2
rs11763870,1,1,1,1,1,1,1,0,2,1,⋯,2,1,1,1,2,2,1,0,2,2
rs1403174,2,2,1,2,2,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,2,1
rs3857706,2,2,1,2,2,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,2,1
rs872464,2,2,1,2,2,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,2,1
rs10950503,2,2,1,2,1,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,1,1
rs10479762,0,1,1,1,1,1,1,0,2,1,⋯,2,1,1,1,2,2,1,0,2,2
rs871925,0,1,1,1,1,1,1,0,2,1,⋯,2,1,1,1,2,2,1,0,2,2
rs13227554,0,1,1,1,1,1,1,0,2,1,⋯,2,1,1,1,2,2,1,0,2,2


Now let's show the rows that ARE perfect matches.

In [150]:
# Per-SNP flag: TRUE when every sample's genotype agrees between the two panels
rows_agree <- apply(snp.1kg.eur2 == snp.1kg.eur3, MARGIN = 1, FUN = all)
match_indices <- which(rows_agree)

# Views restricted to the SNPs that agree for all samples
snp.1kg.eur2.match <- snp.1kg.eur2[match_indices, ]
snp.1kg.eur3.match <- snp.1kg.eur3[match_indices, ]

# How many SNPs agree
dim(snp.1kg.eur2.match)
dim(snp.1kg.eur3.match)

# Peek at the first few
head(snp.1kg.eur2.match)
head(snp.1kg.eur3.match)


Unnamed: 0_level_0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109,⋯,NA20814_NA20814,NA20815_NA20815,NA20818_NA20818,NA20819_NA20819,NA20821_NA20821,NA20822_NA20822,NA20826_NA20826,NA20827_NA20827,NA20828_NA20828,NA20832_NA20832
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
rs11773627,2,1,2,1,1,2,2,2,2,2,⋯,2,2,2,2,1,1,1,2,0,1
rs6972374,2,2,1,2,1,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,2,1
rs12666575,0,1,1,1,2,1,1,0,2,1,⋯,2,0,1,1,2,2,1,0,2,2
rs11766575,0,1,1,1,2,1,1,0,2,1,⋯,2,0,1,1,2,2,1,0,2,2
rs4721264,0,1,1,1,1,1,1,0,2,1,⋯,2,0,1,0,2,2,1,0,2,2
rs62442944,2,1,2,1,1,2,2,2,2,2,⋯,2,2,2,2,1,1,1,2,0,1


Unnamed: 0,HG00096_HG00096,HG00097_HG00097,HG00099_HG00099,HG00101_HG00101,HG00102_HG00102,HG00103_HG00103,HG00105_HG00105,HG00107_HG00107,HG00108_HG00108,HG00109_HG00109,⋯,NA20814_NA20814,NA20815_NA20815,NA20818_NA20818,NA20819_NA20819,NA20821_NA20821,NA20822_NA20822,NA20826_NA20826,NA20827_NA20827,NA20828_NA20828,NA20832_NA20832
rs11773627,2,1,2,1,1,2,2,2,2,2,⋯,2,2,2,2,1,1,1,2,0,1
rs6972374,2,2,1,2,1,1,1,2,0,1,⋯,0,2,1,1,1,1,2,2,2,1
rs12666575,0,1,1,1,2,1,1,0,2,1,⋯,2,0,1,1,2,2,1,0,2,2
rs11766575,0,1,1,1,2,1,1,0,2,1,⋯,2,0,1,1,2,2,1,0,2,2
rs4721264,0,1,1,1,1,1,1,0,2,1,⋯,2,0,1,0,2,2,1,0,2,2
rs62442944,2,1,2,1,1,2,2,2,2,2,⋯,2,2,2,2,1,1,1,2,0,1


As another sanity check, let's now rerun with our new, but subsetted to kinda match, snp.1kg.eur2 files.

In [None]:
# Swap the re-created reference panel in for stage 2.
#
# snp.1kg.eur3 has been subset and reordered to the SNPs of Shizhong's panel,
# but map.1kg.eur3 still lists every SNP in the region in file order. Stage 2
# looks up genotype rows through the map (match(pos, map.1kg.eur2$POS)), so
# the map is put into exactly the same row order as the genotype matrix.
snp.1kg.eur2 <- snp.1kg.eur3
map_rows <- match(rownames(snp.1kg.eur3), map.1kg.eur3$ID)
stopifnot(!anyNA(map_rows)) # every genotype row must have a map entry
map.1kg.eur2 <- map.1kg.eur3[map_rows, ]

Also remember to look at Manhattan plots and compare them

### Set window size and any other parameters

In [None]:
wind <- 10000

Is 1se vs min for lambda the problem?

#### Final formatting steps to prepare for MWAS - sort samples

In [None]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [None]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [None]:
snp3 <- snp3[, order(colnames(snp3))]

## MWAS

### With `lambda.1se`

#### Stage 1

Quick debug: A version to see why we end up with empty `models.aa` and `models.ea`

In [None]:
head(map3)

In [None]:
head(cg)

In [None]:
head(wind)

In [None]:
# Stage 1: for every methylation site, fit an elastic-net model of the
# covariate-adjusted methylation level on the SNPs within +/- wind[k] bp,
# once using all samples (models.all) and once using EA samples only
# (models.ea). As in the original, the models are reset for each window size,
# so only the last entry of `wind` is retained.
set.seed(42)

# elastic.net() needs the rows of trainX and the entries of trainY to be the
# same samples in the same order; fail early with a clear error otherwise.
stopifnot(identical(colnames(snp3), colnames(p.residual)))

# idx.ea was computed in BSobj2 row order, but p.residual and snp3 have been
# re-sorted by sample name since then, so the EA mask is re-derived here in
# the current column order.
ea_samples <- BSobj2$race[match(colnames(p.residual), BSobj2$brnum)] %in% "CAUC"

for (k in seq_along(wind)) {
    # Collect per-site fits in lists and bind once at the end
    fits.all <- list()
    fits.ea <- list()
    for (i in seq_along(cg)) {
        print(i)
        range1 <- max(cg[i] - wind[k], 0)
        range2 <- cg[i] + wind[k]
        idx <- map3$POS > range1 & map3$POS < range2
        # go to next cg if no snps within window
        if (sum(idx) <= 1) {
            next
        }
        geno <- snp3[idx, ] # changed snp2 to snp3
        rownames(geno) <- map3$POS[idx]
        trainX <- t(geno)
        trainY <- p.residual[i, ]

        # Single guarded fit. The extra unguarded call that used to precede
        # this one aborted the whole loop on the first failing site and
        # doubled the run time.
        fit <- tryCatch(
            elastic.net(trainX, trainY, "1se"),
            error = function(e) NULL
        )
        if (is.null(fit) || nrow(fit) == 0) next

        fit$cg <- cg[i]
        fits.all[[length(fits.all) + 1]] <- fit

        # EA only
        trainX <- trainX[ea_samples, ]
        # need at least two SNPs that vary among the EA samples
        if (sum(apply(trainX, 2, var) != 0) <= 1) {
            next
        }
        trainY <- trainY[ea_samples]
        fit <- tryCatch(
            elastic.net(trainX, trainY, "1se"),
            error = function(e) NULL
        )
        if (is.null(fit) || nrow(fit) == 0) next

        fit$cg <- cg[i]
        fits.ea[[length(fits.ea) + 1]] <- fit
    }
    # NULL when no site produced a model, matching the previous c() start value
    models.all <- do.call(rbind, fits.all)
    models.ea <- do.call(rbind, fits.ea)
}

In [None]:
# Remove the intercept rows so that only SNP terms remain; the first column
# holds the term label.
drop_intercept <- function(models) {
    models[models[, 1] != "(Intercept)", ]
}
models.ea <- drop_intercept(models.ea)
models.all <- drop_intercept(models.all)

#### Stage 2

In [None]:
# Create the output directory on first use.
if (!dir.exists(outd)) {
    dir.create(outd)
}

In [None]:
set.seed(42)
# mwas by models of all samples
#
# For every CpG that has a Stage 1 model, combine the GWAS z-scores of the
# model SNPs with the model weights and the reference-panel genotypes via
# MWAS(), storing the two returned values (z, p) in one row of mwas.all.
cg2 <- unique(models.all$cg)
# NA (not 0) marks a CpG that could not be tested.
mwas.all <- matrix(NA_real_, nrow = length(cg2), ncol = 2)

for (i in seq_along(cg2)) {
    model_rows <- models.all$cg == cg2[i]
    pos <- models.all[model_rows, 1]
    weight <- models.all[model_rows, 2]

    # Look each model SNP up by position so that gwas, weight and geno stay in
    # the same order and have the same length. The previous is.element()
    # filter returned z-scores in snp.gwas2 order and changed the length when
    # a position was missing from (or duplicated in) snp.gwas2, which leads to
    # "non-conformable arguments" in MWAS().
    gwas_indices <- match(pos, snp.gwas2$pos_hg38)
    match_indices <- match(pos, map.1kg.eur2$POS)

    # NOTE(review): SNPs absent from the GWAS or the reference panel are
    # dropped from the model here -- confirm this is the intended policy.
    usable <- !is.na(gwas_indices) & !is.na(match_indices)
    if (!all(usable)) {
        message("cg ", cg2[i], ": dropping ", sum(!usable), " of ",
                length(usable),
                " model SNP(s) missing from the GWAS or the reference panel")
    }
    if (!any(usable)) {
        next
    }
    pos <- pos[usable]
    weight <- weight[usable]
    gwas <- snp.gwas2$z[gwas_indices[usable]]
    match_indices <- match_indices[usable]
    geno <- snp.1kg.eur2[match_indices, , drop = FALSE]

    tryCatch({
        mwas.all[i,] <- MWAS(gwas, weight, t(geno))
    }, error = function(e) {
        # Dump the inputs of the failing CpG before re-raising.
        cat("Error at iteration:", i, "\n")
        cat("cg2[i]:", cg2[i], "\n")
        cat("gwas:\n")
        print(gwas)
        cat("weight:\n")
        print(weight)
        stop(e)
    })
}

rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

In [None]:
gwas

In [None]:
weight

In [None]:
i

In [None]:
pos

In [None]:
gwas

In [None]:
length(pos)

In [None]:
length(gwas)

In [None]:
models.all[models.all$cg == cg2[i], ]

In [None]:
snp.gwas2[is.element(snp.gwas2$pos_hg38, pos), ]

In [None]:
# Why do we get non-conformable arguments for iteration 57?

In [None]:
weight

In [None]:
length(weight)

In [None]:
gwas

In [None]:
length(gwas)

In [None]:
head(gwas)

In [None]:
head(map3)

In [None]:
head(geno)

In [None]:
# mwas by models of EA samples
#
# Same procedure as for the all-sample models: per CpG, pass the GWAS
# z-scores, model weights and reference-panel genotypes of the model SNPs to
# MWAS() and store the two returned values (z, p).
cg2 <- unique(models.ea$cg)
# NA (not 0) marks a CpG that could not be tested.
mwas.ea <- matrix(NA_real_, nrow = length(cg2), ncol = 2)
for (i in seq_along(cg2)) {
        model_rows <- models.ea$cg == cg2[i]
        pos <- models.ea[model_rows, 1]
        weight <- models.ea[model_rows, 2]

        # Position-wise lookup keeps gwas, weight and geno aligned and equally
        # long; is.element() returned z-scores in snp.gwas2 order and changed
        # the length when a position was missing or duplicated.
        gwas_indices <- match(pos, snp.gwas2$pos_hg38)
        match_indices <- match(pos, map.1kg.eur2$POS)

        # NOTE(review): SNPs absent from the GWAS or the reference panel are
        # dropped from the model here -- confirm this is the intended policy.
        usable <- !is.na(gwas_indices) & !is.na(match_indices)
        if (!all(usable)) {
            message("cg ", cg2[i], ": dropping ", sum(!usable), " of ",
                    length(usable),
                    " model SNP(s) missing from the GWAS or the reference panel")
        }
        if (!any(usable)) {
            next
        }
        pos <- pos[usable]
        weight <- weight[usable]
        gwas <- snp.gwas2$z[gwas_indices[usable]]
        match_indices <- match_indices[usable]
        geno <- snp.1kg.eur2[match_indices, , drop = FALSE]

        mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

In [None]:
head(mwas.all)

In [None]:
# Keep a copy of the lambda.1se results: mwas.all is reassigned by the
# lambda.min Stage 2 run further down.
mwas.all.1se <- mwas.all

### With `lambda.min`

#### Stage 1

In [None]:
# Stage 1 (lambda.min): for every CpG in `cg`, fit an elastic-net model of its
# methylation residual on the SNPs inside the +/- wind window, once on all
# samples (models.all) and once on the idx.ea subset (models.ea).
#
# Changes from the previous version of this cell:
#   * The extra, unguarded elastic.net(trainX, trainY) call that preceded the
#     tryCatch() was removed. It ran with the default lambda choice, its
#     result was discarded, and any error in it aborted the whole loop. If
#     elastic.net() draws random numbers, removing it also changes the random
#     stream, so results can differ slightly from earlier runs.
#   * seq_along() replaces 1:length(), which iterates over c(1, 0) on empty
#     input.
#   * Per-CpG fits are collected in lists and bound once per window size
#     instead of growing a data frame with rbind() on every iteration.
set.seed(42)
for (k in seq_along(wind)) {
    # Only the models of the last window size survive the loop, as before.
    fits.all <- vector("list", length(cg))
    fits.ea <- vector("list", length(cg))
    for (i in seq_along(cg)) {
        # Window bounds; the lower bound is clamped at 0.
        range1 <- max(cg[i] - wind[k], 0)
        range2 <- cg[i] + wind[k]
        idx <- map3$POS > range1 & map3$POS < range2
        # go to next cg if fewer than two snps within window
        if (sum(idx) <= 1) {
            next
        }
        geno <- snp3[idx, ] # changed snp2 to snp3
        rownames(geno) <- map3$POS[idx]
        trainX <- t(geno)
        # NOTE(review): assumes row i of p.residual corresponds to cg[i] -- confirm.
        trainY <- p.residual[i, ]

        # All samples
        fit <- tryCatch(
            elastic.net(trainX, trainY, "min"),
            error = function(e) "err")
        if (identical(fit, "err")) next
        if (nrow(fit) == 0) next
        fit$cg <- cg[i]
        fits.all[[i]] <- fit

        # EA only
        trainX <- trainX[idx.ea, ]
        # need at least two SNPs that vary within the EA subset
        if (sum(apply(trainX, 2, var) != 0) <= 1) {
            next
        }
        trainY <- trainY[idx.ea]
        fit <- tryCatch(
            elastic.net(trainX, trainY, "min"),
            error = function(e) "err")
        if (identical(fit, "err")) next
        if (nrow(fit) == 0) next
        fit$cg <- cg[i]
        fits.ea[[i]] <- fit
    }
    # NULL entries (skipped CpGs) are ignored by rbind(); the result is NULL
    # when nothing was fitted, matching the old empty c() starting value.
    models.all <- do.call(rbind, fits.all)
    models.ea <- do.call(rbind, fits.ea)
}

In [None]:
# Remove the intercept rows so that only SNP terms remain; the first column
# holds the term label.
drop_intercept <- function(models) {
    models[models[, 1] != "(Intercept)", ]
}
models.ea <- drop_intercept(models.ea)
models.all <- drop_intercept(models.all)

#### Stage 2

In [None]:
# Create the output directory on first use.
if (!dir.exists(outd)) {
    dir.create(outd)
}

In [None]:
set.seed(42)
# mwas by models of all samples
#
# For every CpG that has a Stage 1 model, combine the GWAS z-scores of the
# model SNPs with the model weights and the reference-panel genotypes via
# MWAS(), storing the two returned values (z, p) in one row of mwas.all.
cg2 <- unique(models.all$cg)
# NA (not 0) marks a CpG that could not be tested.
mwas.all <- matrix(NA_real_, nrow = length(cg2), ncol = 2)
for (i in seq_along(cg2)) {
    model_rows <- models.all$cg == cg2[i]
    pos <- models.all[model_rows, 1]
    weight <- models.all[model_rows, 2]

    # Position-wise lookup keeps gwas, weight and geno aligned and equally
    # long; is.element() returned z-scores in snp.gwas2 order and changed the
    # length when a position was missing or duplicated.
    gwas_indices <- match(pos, snp.gwas2$pos_hg38)
    match_indices <- match(pos, map.1kg.eur2$POS)

    # NOTE(review): SNPs absent from the GWAS or the reference panel are
    # dropped from the model here -- confirm this is the intended policy.
    usable <- !is.na(gwas_indices) & !is.na(match_indices)
    if (!all(usable)) {
        message("cg ", cg2[i], ": dropping ", sum(!usable), " of ",
                length(usable),
                " model SNP(s) missing from the GWAS or the reference panel")
    }
    if (!any(usable)) {
        next
    }
    pos <- pos[usable]
    weight <- weight[usable]
    gwas <- snp.gwas2$z[gwas_indices[usable]]
    match_indices <- match_indices[usable]
    geno <- snp.1kg.eur2[match_indices, , drop = FALSE]

    mwas.all[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

In [None]:
# mwas by models of EA samples
#
# Same procedure as for the all-sample models: per CpG, pass the GWAS
# z-scores, model weights and reference-panel genotypes of the model SNPs to
# MWAS() and store the two returned values (z, p).
cg2 <- unique(models.ea$cg)
# NA (not 0) marks a CpG that could not be tested.
mwas.ea <- matrix(NA_real_, nrow = length(cg2), ncol = 2)
for (i in seq_along(cg2)) {
    model_rows <- models.ea$cg == cg2[i]
    pos <- models.ea[model_rows, 1]
    weight <- models.ea[model_rows, 2]

    # Position-wise lookup keeps gwas, weight and geno aligned and equally
    # long; is.element() returned z-scores in snp.gwas2 order and changed the
    # length when a position was missing or duplicated.
    gwas_indices <- match(pos, snp.gwas2$pos_hg38)
    match_indices <- match(pos, map.1kg.eur2$POS)

    # NOTE(review): SNPs absent from the GWAS or the reference panel are
    # dropped from the model here -- confirm this is the intended policy.
    usable <- !is.na(gwas_indices) & !is.na(match_indices)
    if (!all(usable)) {
        message("cg ", cg2[i], ": dropping ", sum(!usable), " of ",
                length(usable),
                " model SNP(s) missing from the GWAS or the reference panel")
    }
    if (!any(usable)) {
        next
    }
    pos <- pos[usable]
    weight <- weight[usable]
    gwas <- snp.gwas2$z[gwas_indices[usable]]
    match_indices <- match_indices[usable]
    geno <- snp.1kg.eur2[match_indices, , drop = FALSE]

    mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

In [None]:
head(mwas.all)

In [None]:
# Keep the lambda.min results under their own name for the comparison with
# mwas.all.1se below.
mwas.all.min <- mwas.all

## Compare

In [None]:
mwas.all.1se

In [None]:
mwas.all.min

In [None]:
library(ggplot2)

In [None]:
# Reshape each Stage 2 result matrix into a data frame whose CpG position
# (previously the row names) is an explicit `pos` column, and give the p-value
# column a lambda-specific name so both survive the merge.
mwas.all.1se <- as.data.frame(mwas.all.1se)
mwas.all.1se$pos <- rownames(mwas.all.1se)
rownames(mwas.all.1se) <- NULL
names(mwas.all.1se)[2] <- "pval.lambda.1se"

mwas.all.min <- as.data.frame(mwas.all.min)
mwas.all.min$pos <- rownames(mwas.all.min)
rownames(mwas.all.min) <- NULL
names(mwas.all.min)[2] <- "pval.lambda.min"

# Inner join: only CpGs tested under both lambda choices remain. The shared
# `z` column comes out as z.x (1se) and z.y (min).
df <- merge(x = mwas.all.1se, y = mwas.all.min, by = "pos")

# One point per CpG: lambda.1se against lambda.min on the -log10 scale.
ggplot(
  data = df,
  mapping = aes(
    x = -log(pval.lambda.1se, base = 10),
    y = -log(pval.lambda.min, base = 10)
  )
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Scattergram of MWAS p-values",
    subtitle = "Comparing lambda.1se and lambda.min",
    x = "-log_10 of p-values (lambda.1se)",
    y = "-log_10 of p-values (lambda.min)"
  ) +
  theme_minimal()

In [None]:
min(df$pval.lambda.1se)

In [None]:
min(df$pval.lambda.1se)

In [None]:
head(df)

In [None]:
# pos was taken from row names and is therefore character; make it numeric so
# it can be used as a genomic coordinate on a plot axis.
df$pos <- as.numeric(df$pos)

In [None]:
min(df$pval.lambda.1se)

In [None]:
min(df$pval.lambda.min)

In [None]:
library(ggplot2)
library(scales)  # For comma formatting

In [None]:
# Convert numbers to comma-separated format
formatted_min_pos <- comma(min_site_to_test_pos)
formatted_max_pos <- comma(max_site_to_test_pos)

# Converting the data to long format: one row per CpG and lambda choice, with
# `variable` holding the source column name and `pval` the p-value.
df_long <- tidyr::pivot_longer(df, 
                               cols = c("pval.lambda.1se", "pval.lambda.min"), 
                               names_to = "variable", 
                               values_to = "pval")

# Plot with detailed title.
# The colour names must equal the values of `variable`, i.e. the original
# column names "pval.lambda.1se" / "pval.lambda.min"; the shorter
# "lambda.1se" / "lambda.min" matched nothing, so no manual colour was applied.
# The y axis shows -log10(p), and the label now says so.
ggplot(df_long, aes(x = pos, y = -log(pval, base = 10), color = variable)) +
  geom_point(alpha = 0.6) +
  labs(x = "Genomic Position",
       y = "-log10 of p-values",
       title = paste("Scattergrams of MWAS p-values by Genomic Position\nchr:", chr,
                     "MAF:", maf, 
                     "Positions:", formatted_min_pos, "to", formatted_max_pos),
       subtitle = "Faceted by lambda type") +
  scale_color_manual(values = c("pval.lambda.1se" = "blue", "pval.lambda.min" = "red")) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal()

### Plot Shizhong's old results to compare

In [None]:
library(data.table)
library(qqman)

In [None]:
# Read the earlier results file (Shizhong's old results, per the section
# heading) from the current working directory.
old <- fread("mwas.all.wind.10000")

In [None]:
head(old)

In [None]:
# Name the columns chromosome / position / z-score / p-value.
# Assumes the file has exactly these four columns in this order -- TODO confirm.
colnames(old) <- c("CHR", "BP", "Z", "P")

In [None]:
# Strip the "chr" text from the chromosome labels and store them as numbers.
old$CHR <- as.numeric(gsub("chr", "", old$CHR))

In [None]:
# Build a per-site label of the form "Chr<CHR>:<BP>" in a SNP column.
old$SNP <- paste0("Chr", old$CHR, ":", old$BP)

In [None]:
manhattan(old)

### Merging/overlap of Shizhong's results and ours

In [None]:
head(old)

In [None]:
head(models.all)

In [None]:
head(df)

In [None]:
# Tag our results with the chromosome under test. Use the notebook-level `chr`
# parameter instead of a hard-coded 7, so that the later merge with `old`
# (which shares the CHR column) still lines up when another chromosome is run.
df$CHR <- chr

In [None]:
colnames(df)

In [None]:
# Rename the first column (the `pos` merge key) to BP, the name used for
# positions in `old`.
colnames(df)[1] <- "BP"

### What % of CpG sites in our results are also found in Shizhong's?

In [None]:
head(df)

In [None]:
# Columns 2 and 4 are the z-scores of the lambda.1se and lambda.min runs
# (z.x / z.y after the merge); give them descriptive names.
names(df)[c(2, 4)] <- c("Z_lambda.1se", "Z_lambda.min")

In [None]:
# Rename the z-score and p-value columns of the old results so they stay
# distinguishable from ours after merging.
colnames(old)[3:4] <- c("Z_Shizhongs_old", "pval.shizhongs_old")

In [None]:
head(old)

In [None]:
# Inner join of the old results with ours on the two columns they share
# (chromosome and position).
merged <- merge(x = old, y = df, by = c("CHR", "BP"))

In [None]:
head(merged)

In [None]:
dim(old)

In [None]:
dim(df)

In [None]:
dim(merged)

In [None]:
length(setdiff(old$BP, df$BP))
#setdiff(old$BP, df$BP)

In [None]:
length(setdiff(df$BP, old$BP))
setdiff(df$BP, old$BP)

In [None]:
length(intersect(old$BP, df$BP))
intersect(old$BP, df$BP)

### Among those found in both, what is the agreement?

In [None]:
head(merged)

In [None]:
library(dplyr)
library(tidyr)
library(ggplot2)

# `merged` holds the sites present in both the old results and ours.

# Positions as numeric so they can serve as a continuous plot axis.
merged$BP <- as.numeric(merged$BP)

# -log10 transform of each p-value column, stored alongside the originals.
merged$log_pval_shizhong <- -log10(merged[["pval.shizhongs_old"]])
merged$log_pval_lambda_1se <- -log10(merged[["pval.lambda.1se"]])
merged$log_pval_lambda_min <- -log10(merged[["pval.lambda.min"]])


In [None]:
# Overlay the three -log10 p-value series along the genome, one panel per
# chromosome. Each layer maps a constant label to colour, and the palette
# below assigns a colour to each label.
series_colours <- c(
  "Shizhong's Old" = "green",
  "Lambda 1se" = "blue",
  "Lambda Min" = "red"
)

plot <- ggplot(data = merged, mapping = aes(x = BP)) +
  geom_point(mapping = aes(y = log_pval_shizhong, color = "Shizhong's Old"),
             alpha = 0.6) +
  geom_point(mapping = aes(y = log_pval_lambda_1se, color = "Lambda 1se"),
             alpha = 0.6) +
  geom_point(mapping = aes(y = log_pval_lambda_min, color = "Lambda Min"),
             alpha = 0.6) +
  scale_color_manual(values = series_colours) +
  labs(
    title = "Comparative Scattergram of MWAS p-values by Genomic Position",
    subtitle = "Green: Shizhong's Old, Blue: Lambda.1se, Red: Lambda.min",
    x = "Genomic Position (BP)",
    y = "-log10 of p-values"
  ) +
  theme_minimal() +
  facet_wrap(~ CHR, scales = "free_y")  # one panel per chromosome

# Render the figure
print(plot)


In [None]:
library(ggplot2)
library(tidyr)
library(scales)  # for comma formatting

# Define your positions
# NOTE(review): these assignments overwrite the notebook-level
# min_site_to_test_pos / max_site_to_test_pos parameters with fixed values
# that differ from the one set at the top of the notebook -- confirm that this
# is intended and not left over from an earlier region.
min_site_to_test_pos <- 73274305
max_site_to_test_pos <- 73420076

# Convert numbers to comma-separated format (used in plot titles)
formatted_min_pos <- scales::comma(min_site_to_test_pos)
formatted_max_pos <- scales::comma(max_site_to_test_pos)


In [None]:
# Long format of our own results: one row per site and lambda choice, with
# `variable` holding the source column name and `pval` the p-value.
# Note that the position column of `df` has been renamed from pos to BP by
# this point.
df_long <- tidyr::pivot_longer(df, 
                               cols = c("pval.lambda.1se", "pval.lambda.min"), 
                               names_to = "variable", 
                               values_to = "pval")


In [None]:
head(df_long)

In [None]:
library(tidyr)
library(ggplot2)
library(scales)  # for comma formatting

# Comma-separated renderings of the tested range, for use in plot titles.
formatted_max_pos <- comma(max_site_to_test_pos)
formatted_min_pos <- comma(min_site_to_test_pos)

# Long format of the merged table: the three p-value columns (ours under both
# lambda choices plus Shizhong's) are stacked into `pval`, with `variable`
# recording which column each value came from.
df_long <- pivot_longer(
  data = merged,
  cols = c("pval.lambda.1se", "pval.lambda.min", "pval.shizhongs_old"),
  names_to = "variable",
  values_to = "pval"
)


In [None]:
library(data.table)
library(ggplot2)
library(scales)
library(tidyr)

# Convert 'BP' to numeric
merged$BP <- as.numeric(merged$BP)

# Minimum p-values per source (kept as variables for interactive inspection)
min_pval_lambda.1se <- min(merged$pval.lambda.1se)
min_pval_lambda.min <- min(merged$pval.lambda.min)
min_pval_shizhongs_old <- min(merged$pval.shizhongs_old)

# Convert numbers to comma-separated format
formatted_min_pos <- comma(min(merged$BP))
formatted_max_pos <- comma(max(merged$BP))

# Collapse the chromosome(s) into one string so that the title stays a single
# string even if `merged` ever spans more than one chromosome.
chr_label <- paste(unique(merged$CHR), collapse = ", ")

# Convert data to long format
df_long <- pivot_longer(merged, cols = c("pval.lambda.1se", "pval.lambda.min", "pval.shizhongs_old"), 
                        names_to = "variable", values_to = "pval")

# Plot with detailed title.
# The y axis is -log10(p), so it is labelled that way (it used to read "Log of
# p-values"). The facets are p-value sources, including Shizhong's old
# results, not only lambda types.
ggplot(df_long, aes(x = BP, y = -log10(pval), color = variable)) +
  geom_point(alpha = 0.6) +
  labs(x = "Genomic Position",
       y = "-log10 of p-values",
       title = paste("Scattergrams of MWAS p-values by Genomic Position\nCHR:", chr_label,
                     "Positions:", formatted_min_pos, "to", formatted_max_pos),
       subtitle = "Faceted by p-value source") +
  scale_color_manual(values = c("pval.lambda.1se" = "blue", "pval.lambda.min" = "red", "pval.shizhongs_old" = "green")) +
  facet_wrap(~ variable, scales = "free_y") +
  theme_minimal()


In [None]:
Sys.time()

In [None]:
min(sites_to_test_pos)

In [None]:
max(sites_to_test_pos)

## Check these sites on chr 7

In [None]:
# Shizhong's original result

# chr7    1987910 10.1255085321387        4.25759742738181e-24
# chr7    1987896 10.1255085321387        4.25759742738189e-24
# chr7    1987797 10.0241523897721        1.19379483108027e-23
# chr7    1987778 10.0105851568856        1.36940148731912e-23

In [None]:
# Positions of the four chr7 sites quoted from Shizhong's original result.
selected_pos <- c(1987910, 1987896, 1987797, 1987778)

In [None]:
head(merged)

In [None]:
# Show the merged rows for the selected positions.
merged[merged$BP %in% selected_pos, ]