# Try MWAS with covariates prepared in various ways

# Pick the regions we will test

In [1]:
library(data.table)

prev_hits <- fread("
Chr     pos        old_z       old_p
11   38247902        -27.1535308285104       2.30024742330298e-162
2    47933357        18.3327793004811        4.53147699327216e-75
7    1987910         10.1255085321387        4.25759742738181e-24
7    1987896         10.1255085321387        4.25759742738189e-24
7    1987797         10.0241523897721        1.19379483108027e-23
7    1987778         10.0105851568856        1.36940148731912e-23
12   2194742         -10.0072634920486       1.41615523554955e-23
")


In [2]:
#df <- fread("09.5-OUT_matched_SNP_meth_cov_chunked_EXPANSE_a2.csv")

In [3]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

## Try original code

In [141]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        data.frame(
                        cvm = min(cv.fit$cvm),
                        lambda = cv.fit$lambda.min,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
        yfit = glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
                )       
        idf <- coef(yfit)
        idx <- which(idf != 0)
        selectf <- data.frame(
                features = idf@Dimnames[[1]][idx], 
                coefs = idf [idx]
        )
}

# modified to use lambda 1se and appropriate cvm
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        set.seed(42)
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        print(paste0("Dim of trainX: ", dim(trainX)))
        print(paste0("Len of trainY: ", length(trainY)))
        coef_matrix <- as.matrix(coef(cv.fit))

        non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
        print("Coefficients when fitting: ")
        print(non_zero_coefs)

        data.frame(
                        cvm = cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se],
                        lambda = cv.fit$lambda.1se,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
    set.seed(42)
    yfit <- glmnet(
    trainX,
    trainY,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha)

    coef_matrix <- as.matrix(coef(yfit))

    non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
    print("Coefficients when optimal: ")
    print(non_zero_coefs)

    idf <- coef(yfit)
    idx <- which(idf != 0)
    selectf <- data.frame(
            features = idf@Dimnames[[1]][idx], 
            coefs = idf [idx]
    )
}

MWAS <- function(gwas, weight, geno){
        z <- gwas %*% weight
        z.cor <- cor(geno)
        se <- sqrt(weight %*%  z.cor %*%  weight)
        z <- z/se
        p=pnorm(abs(z),lower.tail=F)*2
        return(c(z, p))
}

In [5]:
df <- df[which(df$Chr == 7), ]

In [6]:
i <- 2

In [7]:
library("glmnet")
library("e1071")
library("doParallel")

set.seed(2018)
wind <- c(5000,10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [8]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The

In [9]:
# load data for mwas
# load("./rda/caudate_mwas_data_chr22.rda")
load(df$methylation_data[i])

p <- getMeth(BSobj2)


rownames(p) <- start(BSobj2)

sites_to_test <- which(start(BSobj2) >= (1987910 - 500) & start(BSobj2) <= (1987910 + 500))
p <- p[sites_to_test, ]

# candidate cg
cg <- as.numeric(rownames(p))

# regress out covariates
#load("covs_for_meqtl.rda")

In [10]:
sites_to_test

In [11]:
covs <- fread(df$cov_file[i])
covs <- t(covs)
colnames(covs) <- covs[1, ]
covs <- covs[2:nrow(covs), ]
# transpose so we have same orientation as original code

In [12]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [13]:
length(match(BSobj2$brnum,colnames(covs)))

In [14]:
mat <- match(BSobj2$brnum,colnames(covs)) 
covs <- t(covs[,mat])
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [15]:
rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]

In [16]:
colnames(p.residual) <- BSobj2$brnum

In [17]:
covs <- as.data.frame(covs)
# Convert all columns except Dx and Sex from character to numeric
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))

for (col in cols_to_convert) {
  covs[[col]] <- as.numeric(covs[[col]])
}

# Print the modified data frame to check the conversion
#print(dat)


In [18]:
for (i in 1:dim(p)[1]) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[i,], covs))
    
    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)
    
    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)
        
        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

In [19]:
load("p1.rda")

In [20]:
wind <- 10000

In [21]:
# built predition models
idx.ea <- BSobj2$race == "CAUC"

In [22]:
snp2_sorted <- snp2[, order(names(snp2))]

In [23]:
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

In [24]:
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

In [25]:
dim(snp2)

In [26]:
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

## Test for one

In [27]:
dim(snp2)

In [28]:
rownames(snp2)

In [29]:
k <- 1

i <- 1

range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
range2 <- cg[i] + wind[k]

range1
range2

In [30]:
cat(i,"\n")
range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
range2 <- cg[i] + wind[k]
range1
range2

idx <- map2$POS > range1 & map2$POS < range2

length(idx)

1 


In [31]:
range1

In [32]:
range2

In [33]:
# go to next cg if no snps within window
if(sum(idx) <= 1){
        next
}
geno <- snp2[idx,]
rownames(geno) <- map2$POS[idx]
trainX <- t(geno)
trainY <- p.residual[i,]

In [34]:
dim(snp2)

dim(geno)

head(trainY[order(names(trainY))])

In [35]:
inspect <- t(snp2)
inspect <- inspect[order(rownames(inspect)), ]

In [36]:
fit <- elastic.net(trainX,trainY)

“executing %dopar% sequentially: no parallel backend registered”


In [37]:
fit

features,coefs
<chr>,<dbl>
(Intercept),-0.0019839249
1979188,0.0019631282
1980240,0.0001047671


We're getting different elasticnet results here than with our package. Why?

Let's go inside elastic.net function

In [38]:
dim(trainX)

Deeper still

In [107]:
trainX2 <- readRDS("20-IN_sanity_test_Xmatrix.rds")

In [108]:
trainy2 <- readRDS("20-IN_sanity_test_y.rds")

In [109]:
fold_id <- readRDS("20-IN_sanity_test_fold_id.rds")

In [110]:
set.seed(42)
cv.fit <- cv.glmnet(
                trainX2,
                trainy2,
                foldid = fold_id,                    
        type.measure = "mse",
                    paralle = TRUE,
                    alpha = 0.5
                    )

In [111]:
cv.fit


Call:  cv.glmnet(x = trainX2, y = trainy2, type.measure = "mse", foldid = fold_id,      parallel = TRUE, alpha = 0.5) 

Measure: Mean-Squared Error 

      Lambda Index   Measure        SE Nonzero
min 0.001306    27 0.0004034 2.077e-05       7
1se 0.007649     8 0.0004224 2.365e-05       6

In [112]:
head(fold_id)

In [113]:
a <- 0.5
search <- foreach(ai = a, .combine = rbind) %dopar% {
    set.seed(42)
    cv.fit <- cv.glmnet(
                trainX2,
                trainy2,
                foldid = fold_id,                    
        type.measure = "mse",
                    paralle = TRUE,
                    alpha = ai
                    )
    data.frame(
                    cvm = cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se],
                    lambda = cv.fit$lambda.1se,
                    alpha = ai
                    )
    } 

In [114]:
cv.opt <- search[search$cvm == min(search$cvm),] 

In [115]:
cv.opt

Unnamed: 0_level_0,cvm,lambda,alpha
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
1,0.0004224127,0.007649007,0.5


In [116]:
cv.fit


Call:  cv.glmnet(x = trainX2, y = trainy2, type.measure = "mse", foldid = fold_id,      parallel = TRUE, alpha = 0.5) 

Measure: Mean-Squared Error 

      Lambda Index   Measure        SE Nonzero
min 0.001306    27 0.0004034 2.077e-05       7
1se 0.007649     8 0.0004224 2.365e-05       6

In [135]:
coef(cv.fit) # This matches what we get out of tune_alpha, without fitting optimized parameters on whole data

44 x 1 sparse Matrix of class "dgCMatrix"
                            s1
(Intercept)      -4.552958e-05
chr7:1977419:C:T  .           
chr7:1977810:C:T  7.693460e-04
chr7:1978402:C:T  .           
chr7:1978745:G:A  .           
chr7:1978783:G:T  .           
chr7:1978996:C:T  .           
chr7:1979170:C:T  .           
chr7:1979188:C:A  .           
chr7:1979235:G:A  .           
chr7:1979926:T:C  .           
chr7:1980240:G:A  .           
chr7:1980788:G:A  .           
chr7:1981269:G:C  .           
chr7:1981360:G:A -1.063717e-03
chr7:1981613:C:T  .           
chr7:1982304:G:T  .           
chr7:1982428:C:G  .           
chr7:1985366:G:C  .           
chr7:1985405:A:G  .           
chr7:1985487:G:A  .           
chr7:1987670:G:A  .           
chr7:1987676:G:T  .           
chr7:1987689:G:A  .           
chr7:1987719:C:T -1.223081e-03
chr7:1987813:T:C  1.830253e-04
chr7:1988307:C:T  .           
chr7:1989028:A:C  .           
chr7:1989333:G:T  .           
chr7:1989994:G:C  .         

In [136]:
a <- 0.5

In [137]:
search <- foreach(ai = a, .combine = rbind) %dopar% {
    set.seed(42)
    cv.fit <- cv.glmnet(
                    trainX2,
                    trainy2,
                    foldid = fold_id,
                    type.measure = "mse",
                    paralle = TRUE,
                    alpha = ai
                    )
    data.frame(
                    cvm = cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se],
                    lambda = cv.fit$lambda.1se,
                    alpha = ai
                    )
    } 
cv.opt <- search[search$cvm == min(search$cvm),] 

    # fit model by optimized alpha and lambda
    set.seed(42)
    yfit = glmnet(
    trainX2,
    trainy2,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha
            )       
    idf <- coef(yfit)
    idx <- which(idf != 0)
    selectf <- data.frame(
            features = idf@Dimnames[[1]][idx], 
            coefs = idf [idx]
    )

In [138]:
cv.opt # GOOD! should be 
#cvm	lambda	alpha
#<dbl>	<dbl>	<dbl>
#1	0.0004224127	0.007649007	0.5


Unnamed: 0_level_0,cvm,lambda,alpha
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
1,0.0004224127,0.007649007,0.5


In [139]:
search

cvm,lambda,alpha
<dbl>,<dbl>,<dbl>
0.0004224127,0.007649007,0.5


Make sure we have same results not only from fitting but for optimal model with fixed alpha, lambda also

In [140]:
selectf

features,coefs
<chr>,<dbl>
(Intercept),-4.062561e-05
chr7:1977810:C:T,0.0007759077
chr7:1981360:G:A,-0.001070675
chr7:1987719:C:T,-0.001224151
chr7:1987813:T:C,0.0001768736
chr7:1990232:T:C,-0.001298076
chr7:1994382:C:T,0.001105434


In [129]:
dim(trainX)

In [103]:
models.all <- c()

In [104]:
rbind(models.all, cv.fit)

Unnamed: 0_level_0,lambda,cvm,cvsd,cvup,cvlo,nzero,call,name,glmnet.fit,lambda.min,lambda.1se,index
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<language>,<chr>,<elnet>,<dbl>,<dbl>,"<int[,1]>"
cv.fit,0.014670....,0.000450....,2.912830....,0.000479....,0.000421....,"0, 3, 3,....",cv.glmne....,Mean-Squ....,c(s0 = -....,0.001305....,0.007649....,"27, 8"


Is 1se vs min for lambda the problem?

## Run for all

In [58]:
snp2

Unnamed: 0_level_0,Br836,Br845,Br848,Br863,Br914,Br948,Br949,Br963,Br983,Br991,⋯,Br5373,Br5398,Br5422,Br5426,Br5460,Br5467,Br5475,Br5488,Br5584,Br5590
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr7:1963098:T:C,2.0,2,2.0,2,2.0,1.999,2.0,1.974,1.0,2,⋯,1.954,2.0,2,2.0,2,2,2,1,1,1.0
chr7:1963408:C:T,0.001,2,1.991,0,1.969,0.006,0.004,0.14,2.0,0,⋯,2.0,0.0,1,2.0,2,1,1,2,2,2.0
chr7:1964786:C:T,2.0,2,1.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,0.001,2.0,2,0.0,2,2,2,1,1,2.0
chr7:1966112:T:C,2.0,2,1.001,2,2.0,2.0,2.0,2.0,1.002,2,⋯,0.015,2.0,2,0.0,2,2,2,1,1,2.0
chr7:1973362:G:A,2.0,2,1.095,2,2.0,2.0,0.004,2.0,1.002,2,⋯,0.015,2.0,2,0.0,2,1,2,1,1,2.0
chr7:1975412:T:G,2.0,2,2.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,2.0,2.0,2,2.0,2,2,2,1,1,1.0
chr7:1976335:C:T,0.0,2,1.991,0,1.98,0.024,2.0,0.015,2.0,0,⋯,2.0,0.0,1,2.0,2,2,1,2,2,2.0
chr7:1977810:C:T,0.0,1,1.0,0,0.0,0.0,2.0,0.0,1.0,0,⋯,2.0,0.0,1,2.0,1,1,1,1,1,2.0
chr7:1978402:C:T,0.008,2,1.991,0,1.981,0.024,2.0,0.015,2.0,0,⋯,2.0,0.0,1,2.0,2,2,1,2,2,2.0
chr7:1978783:G:T,0.0,2,1.99,0,1.979,0.024,2.0,0.019,2.0,0,⋯,2.0,0.0,1,2.0,2,2,1,2,2,2.0


In [49]:
dim(snp2)

In [56]:
dim(snp2)

In [59]:
snp3 <- readRDS("20-IN_check_SNP_window_pos1987413_win50000.rds")

In [60]:
dim(snp2)

In [62]:
snp2[1:10, 1:10]

Unnamed: 0_level_0,Br836,Br845,Br848,Br863,Br914,Br948,Br949,Br963,Br983,Br991
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr7:1963098:T:C,2.0,2,2.0,2,2.0,1.999,2.0,1.974,1.0,2
chr7:1963408:C:T,0.001,2,1.991,0,1.969,0.006,0.004,0.14,2.0,0
chr7:1964786:C:T,2.0,2,1.0,2,2.0,2.0,2.0,2.0,1.0,2
chr7:1966112:T:C,2.0,2,1.001,2,2.0,2.0,2.0,2.0,1.002,2
chr7:1973362:G:A,2.0,2,1.095,2,2.0,2.0,0.004,2.0,1.002,2
chr7:1975412:T:G,2.0,2,2.0,2,2.0,2.0,2.0,2.0,1.0,2
chr7:1976335:C:T,0.0,2,1.991,0,1.98,0.024,2.0,0.015,2.0,0
chr7:1977810:C:T,0.0,1,1.0,0,0.0,0.0,2.0,0.0,1.0,0
chr7:1978402:C:T,0.008,2,1.991,0,1.981,0.024,2.0,0.015,2.0,0
chr7:1978783:G:T,0.0,2,1.99,0,1.979,0.024,2.0,0.019,2.0,0


In [65]:
p.residual[1:10, 1:10]

Br1122,Br2285,Br1764,Br1464,Br5062,Br1446,Br1503,Br1946,Br5323,Br1297
-0.013815412,-0.02255169,-0.0004894733,0.03405644,-0.0329548084,0.02449903,-0.007555459,0.04273147,0.01996025,-0.017726087
-0.013468656,-0.02324804,-0.0011163647,0.03364334,-0.0317491642,0.02437112,-0.00783563,0.04251409,0.01989064,-0.016912883
-0.012375431,-0.02539337,-0.0030962099,0.03227539,-0.0278967474,0.02391744,-0.008657468,0.04179608,0.01965915,-0.014462871
-0.011375639,-0.02727904,-0.0049116765,0.03093498,-0.0243133479,0.02343828,-0.009326051,0.04109117,0.01943299,-0.012377671
-0.010788167,-0.02834558,-0.0059806655,0.03010585,-0.0221857636,0.02312806,-0.009679408,0.04065515,0.01929359,-0.011223515
-0.010146799,-0.02946925,-0.0071498671,0.02916358,-0.019847767,0.02276303,-0.010028423,0.04016044,0.0191353,-0.010024834
-0.008658997,-0.03187249,-0.0098681871,0.02681875,-0.0143782574,0.02181009,-0.010682415,0.03893497,0.01874422,-0.007498337
-0.007533749,-0.0334261,-0.011923206,0.02488868,-0.010221245,0.02098906,-0.011021317,0.03793001,0.01842268,-0.005837704
-0.006622785,-0.03448333,-0.0135801042,0.02321933,-0.0068590052,0.02026025,-0.011189766,0.03706415,0.01814511,-0.004663062
-0.004587837,-0.03612092,-0.017219174,0.01912286,0.0005377276,0.01841334,-0.011192005,0.0349543,0.01746762,-0.002556482


In [66]:
snp3 <- t(snp3)

In [67]:
snp3[1:10, 1:10]

Unnamed: 0,Br1003,Br1004,Br1007,Br1016,Br1017,Br1021,Br1023,Br1030,Br1033,Br1034
chr7:1937582:G:A,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
chr7:1938366:T:C,1,1.0009765625,1.0,0,0.007019043,1,1.0009765625,0.601013184,0.0,0.0159912109
chr7:1939381:C:T,0,0.0029907227,0.0,0,0.0,0,0.0,0.007019043,0.0,0.0100097656
chr7:1939743:G:A,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
chr7:1939789:C:T,2,0.0009765625,1.0,1,1.0,1,1.9489746094,1.599975586,1.0,1.0059814453
chr7:1940105:G:A,1,0.0,0.0,1,1.0,0,0.908996582,0.999023438,1.0,0.9940185547
chr7:1940114:G:A,1,1.9970092773,1.995972,1,1.0,1,1.0120239258,0.994995117,0.9990234,0.9799804688
chr7:1940370:T:C,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0009765625
chr7:1941354:G:C,0,0.0,0.0,0,0.0,0,0.0009765625,0.0,0.0,0.0
chr7:1941619:C:T,0,0.0,0.0,0,1.0,1,0.0,0.0,0.0,0.0


In [68]:
wind

In [152]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [148]:
head(map3)

In [145]:
snp3[1:10, 1:10]

Unnamed: 0,Br1003,Br1004,Br1007,Br1016,Br1017,Br1021,Br1023,Br1030,Br1033,Br1034
chr7:1937582:G:A,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
chr7:1938366:T:C,1,1.0009765625,1.0,0,0.007019043,1,1.0009765625,0.601013184,0.0,0.0159912109
chr7:1939381:C:T,0,0.0029907227,0.0,0,0.0,0,0.0,0.007019043,0.0,0.0100097656
chr7:1939743:G:A,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0
chr7:1939789:C:T,2,0.0009765625,1.0,1,1.0,1,1.9489746094,1.599975586,1.0,1.0059814453
chr7:1940105:G:A,1,0.0,0.0,1,1.0,0,0.908996582,0.999023438,1.0,0.9940185547
chr7:1940114:G:A,1,1.9970092773,1.995972,1,1.0,1,1.0120239258,0.994995117,0.9990234,0.9799804688
chr7:1940370:T:C,0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0009765625
chr7:1941354:G:C,0,0.0,0.0,0,0.0,0,0.0009765625,0.0,0.0,0.0
chr7:1941619:C:T,0,0.0,0.0,0,1.0,1,0.0,0.0,0.0,0.0


In [156]:
for(k in 1:length(wind)){
    models.ea <- c()
    models.all <- c()
    for(i in 1:length(cg)){
            cat(i,"\n")
            print(paste0("This cg is: ", cg))
            range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
            range2 <- cg[i] + wind[k]
            idx <- map2$POS > range1 & map2$POS < range2
            # go to next cg if no snps within window
            if(sum(idx) <= 1){
                    next
            }
            geno <- snp3[idx,] # changed snp2 to snp3
            rownames(geno) <- map3$POS[idx]
            trainX <- t(geno)
            trainY <- p.residual[i,]
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.all <- rbind(models.all,fit)
            # EA only
            trainX <- trainX[idx.ea,]
            if(sum(apply(trainX,2,var)!=0) <= 1){
                    next
            }
            trainY <- trainY[idx.ea]
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next
            fit$cg <- cg[i]
            models.ea <- rbind(models.ea,fit)
    }
}

1 
 [1] "This cg is: 1987413" "This cg is: 1987418" "This cg is: 1987434"
 [4] "This cg is: 1987449" "This cg is: 1987458" "This cg is: 1987468"
 [7] "This cg is: 1987492" "This cg is: 1987511" "This cg is: 1987527"
[10] "This cg is: 1987565" "This cg is: 1987604" "This cg is: 1987616"
[13] "This cg is: 1987627" "This cg is: 1987648" "This cg is: 1987656"
[16] "This cg is: 1987659" "This cg is: 1987665" "This cg is: 1987669"
[19] "This cg is: 1987680" "This cg is: 1987684" "This cg is: 1987688"
[22] "This cg is: 1987697" "This cg is: 1987705" "This cg is: 1987725"
[25] "This cg is: 1987732" "This cg is: 1987778" "This cg is: 1987797"
[28] "This cg is: 1987808" "This cg is: 1987853" "This cg is: 1987858"
[31] "This cg is: 1987861" "This cg is: 1987896" "This cg is: 1987910"
[34] "This cg is: 1987930" "This cg is: 1988010" "This cg is: 1988039"
[37] "This cg is: 1988059" "This cg is: 1988066" "This cg is: 1988078"
[40] "This cg is: 1988129" "This cg is: 1988133" "This cg is: 1988141"
[43

In [70]:
models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

In [71]:
if(!dir.exists(outd)) dir.create(outd)

In [72]:
# mwas by models of all samples
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
        pos <- models.all[models.all$cg == cg2[i],1]
        gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
        weight <- models.all[models.all$cg == cg2[i],2]
        geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
        mwas.all[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
        pos <- models.ea[models.ea$cg == cg2[i],1]
        gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
        weight <- models.ea[models.ea$cg == cg2[i],2]
        geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
        mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

# output models and mwas results
outf <- paste0(outd,"/models-cov1.all.wind.",wind[k])
write.csv(models.all,outf)
outf <- paste0(outd,"/models-cov1.ea.wind.",wind[k])
write.csv(models.ea,outf)
outf <- paste0(outd,"/mwas-cov1.all.wind.",wind[k])
write.csv(mwas.all,outf)
outf <- paste0(outd,"/mwas-cov1.ea.wind.",wind[k])
write.csv(mwas.ea,outf)

In [73]:
models.all

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
2,1979188,2.816307e-18,1987413
4,1979188,2.369840e-03,1987418
5,1980240,4.991343e-04,1987418
7,1979188,2.231990e-03,1987434
8,1980240,3.966771e-04,1987434
10,1979188,2.082817e-03,1987449
11,1980240,2.797247e-04,1987449
13,1979188,2.662321e-03,1987458
14,1980240,1.263084e-03,1987458
15,1987813,4.934992e-04,1987458


In [74]:
mwas.all

Unnamed: 0,z,p
1987413,5.483111,4.179106e-08
1987418,6.480977,9.11306e-11
1987434,6.349841,2.155381e-10
1987449,6.163944,7.095524e-10
1987458,6.512131,7.409196e-11
1987468,6.398813,1.56589e-10
1987492,6.418581,1.375502e-10
1987511,5.429066,5.664984e-08
1987527,5.634621,1.754435e-08
1987565,6.261898,3.80319e-10


In [39]:
models.all

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
2,1979188,2.8163070000000002e-18,1987413
4,1979188,2.0991940000000002e-18,1987418
6,1979188,6.860004999999999e-19,1987434
8,1979188,2.020601e-18,1987449
10,1979188,1.9988700000000002e-18,1987458
12,1979188,1.975426e-18,1987468
16,1979188,2.474245e-18,1987527
18,1979188,2.996334e-18,1987565
22,1979188,1.731503e-18,1987627
24,1979188,1.143716e-18,1987648


In [37]:
mwas.all

Unnamed: 0,z,p
1987413,5.483111,4.179106e-08
1987418,5.483111,4.179106e-08
1987434,5.483111,4.179106e-08
1987449,5.483111,4.179106e-08
1987458,5.483111,4.179106e-08
1987468,5.483111,4.179106e-08
1987527,5.483111,4.179106e-08
1987565,5.483111,4.179106e-08
1987627,5.483111,4.179106e-08
1987648,5.483111,4.179106e-08


In [None]:
head(models.ea)