# Look at previous code for GCTA, run with our data, become familiar

In [1]:
getwd()

In [1]:
library(data.table)

In [2]:
getwd()

In [3]:
outdir <- "/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/gcta_output"
if(!dir.exists(outdir)) dir.create(outdir)

In [4]:
chunk1 <- 1

In [5]:
chunk2 <- 10

## Check out these VMRs. Is this something we need to use, or is the same info in our newer file formats?

In [6]:
library(data.table)  # For efficient data handling

args <- commandArgs(trailingOnly=TRUE)

indir <- "/dcs04/lieber/statsgen/shizhong/AANRI/VMR2/99/caud/aa/"

setwd(outdir)

# Initialize lists to store sizes, dimensions, and loading times
data_sizes <- list()
data_dims <- list()
loading_times <- list()

# Load and analyze VMR files for each chromosome
for (i in 1:22) {
    cat("Processing chromosome", i, "...\n")
    
    # Measure loading time
    start_time <- Sys.time()
    load(paste0(indir, "out/chr", i, "_vmr.rda"))
    end_time <- Sys.time()
    loading_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
    
    # Store loading time
    loading_times[[i]] <- loading_time
    
    # Print the 5x5 slice of the methylation data and head of VMR data
    cat("Slice [1:5, 1:5] of meth for chromosome", i, ":\n")
    print(meth[1:5, 1:5])
    cat("Head of vmrs for chromosome", i, ":\n")
    print(head(vmrs))
    
    # Store the size and dimensions of the data
    data_sizes[[i]] <- list(
        meth_size = object.size(meth),
        vmrs_size = object.size(vmrs)
    )
    data_dims[[i]] <- list(
        meth_dim = dim(meth),
        vmrs_dim = dim(vmrs)
    )
    
    # Combine data across chromosomes
    if (i == 1) {
        meth2 <- meth
        vmrs2 <- vmrs
    } else {
        meth2 <- rbind(meth2, meth)
        vmrs2 <- rbind(vmrs2, vmrs)
    }
}

# Print summary information
cat("\nSummary of loading times (seconds):\n")
for (i in 1:22) {
    cat("Chromosome", i, ":", loading_times[[i]], "seconds\n")
}

cat("\nSummary of data sizes (bytes):\n")
for (i in 1:22) {
    cat("Chromosome", i, ":\n")
    cat("  meth size:", data_sizes[[i]]$meth_size, "bytes\n")
    cat("  vmrs size:", data_sizes[[i]]$vmrs_size, "bytes\n")
}

cat("\nSummary of data dimensions:\n")
for (i in 1:22) {
    cat("Chromosome", i, ":\n")
    cat("  meth dimensions:", data_dims[[i]]$meth_dim, "\n")
    cat("  vmrs dimensions:", data_dims[[i]]$vmrs_dim, "\n")
}


Processing chromosome 1 ...
Slice [1:5, 1:5] of meth for chromosome 1 :
        Br1122    Br5323    Br1297    Br1722    Br1135
[1,] 0.3322684 0.3956835 0.4000000 0.4524887 0.3846154
[2,] 0.4923077 0.7581699 0.4852071 0.5450237 0.5978836
[3,] 0.2715655 0.5034014 0.5667870 0.4641350 0.4366667
[4,] 0.3209877 0.4821429 0.4790210 0.4617347 0.4918033
[5,] 0.7005348 0.5483871 0.5877193 0.6000000 0.6746032
Head of vmrs for chromosome 1 :
    chr   start     end idxStart idxEnd cluster  n    meanSDS
10 chr1  998114  998499     3406   3424      16 19 0.08608334
12 chr1 1013950 1014316     3860   3884      16 25 0.08550160
14 chr1 1023551 1025300     4094   4120      17 27 0.09811719
16 chr1 1034284 1034680     4178   4201      20 24 0.09901654
17 chr1 1039541 1040148     4275   4287      20 13 0.08407402
22 chr1 1140761 1141016     6967   6981      24 15 0.10167590
Processing chromosome 2 ...
Slice [1:5, 1:5] of meth for chromosome 2 :
        Br1122    Br5323    Br1297    Br1722    Br1135
[1,] 

Interesting how the allele data and methylationd ata appears to be for teh same limited number of sites. We use more SNPs as explanatory variables to compute heritability, right?

## Load the VMRs

In [7]:
wind <- as.numeric(c("10000"))
indir <- "/dcs04/lieber/statsgen/shizhong/AANRI/VMR2/99/caud/aa/"
gwas <- "/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/AA/all/plink/"
gcta <- "/dcs04/lieber/statsgen/shizhong/software/gcta/gcta-1.94.1-linux-kernel-3-x86_64/gcta-1.94.1"

#setwd(paste0("out/",outdir))

# load vmrs
load(paste0(indir,"out/chr1_vmr.rda"))
meth2 <- meth
vmrs2 <- vmrs
for(i in 2:22){
        load(paste0(indir,"out/chr",i,"_vmr.rda"))
        meth2 <- rbind(meth2,meth)
        vmrs2 <- rbind(vmrs2,vmrs)
}
p <- t(meth2)
ind <- rownames(p)

In [8]:
p[1:10,1:10]

0,1,2,3,4,5,6,7,8,9,10
Br1122,0.3322684,0.4923077,0.2715655,0.3209877,0.7005348,0.5780347,0.4562648,0.307971,0.5833333,0.6520101
Br5323,0.3956835,0.7581699,0.5034014,0.4821429,0.5483871,0.6129032,0.6024096,0.4251497,0.7916667,0.835924
Br1297,0.4,0.4852071,0.566787,0.479021,0.5877193,0.5283019,0.3729216,0.3243243,0.8265306,0.6453408
Br1722,0.4524887,0.5450237,0.464135,0.4617347,0.6,0.5084746,0.6150121,0.4183381,0.7317073,0.621374
Br1135,0.3846154,0.5978836,0.4366667,0.4918033,0.6746032,0.6111111,0.422629,0.4156627,0.8015267,0.8030303
Br1004,0.26,0.5227273,0.3911672,0.5152838,0.6775956,0.3969466,0.4719764,0.4501109,0.7142857,0.726094
Br1040,0.4352332,0.5050761,0.5548589,0.4578755,0.6489362,0.4390244,0.4063158,0.6188235,0.5214286,0.7902913
Br1517,0.4829268,0.5710383,0.3458904,0.4738562,0.7014925,0.5916667,0.6101695,0.5719298,0.8617886,0.6854305
Br1522,0.54,0.740566,0.361204,0.452381,0.704698,0.5235602,0.3532009,0.4677419,0.7962963,0.8052486
Br1164,0.5135135,0.5439331,0.3355263,0.5527426,0.6057692,0.5350318,0.314121,0.4674923,0.9133858,0.6753813


In [9]:
dim(p)

In [10]:
ind

## Load covariates

In [11]:
# Load covariate data
cat("\nLoading covariate data...\n")
f_demo <- "/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC"
f_pc <- paste0(indir, "/out/sva.csv")
f_ances <- "/dcs04/lieber/statsgen/shizhong/AANRI/structure/structure_CEU_AFR/structure.out_ancestry_proportion_raceDemo_compare"

demo <- read.table(f_demo, header=TRUE)
pc <- read.csv(f_pc)
ances <- read.table(f_ances, header=TRUE)

cat("\nHead of demographic data:\n")
print(head(demo))
cat("\nDimensions of demographic data:", dim(demo), "\n")

cat("\nHead of principal components data:\n")
print(head(pc))
cat("\nDimensions of principal components data:", dim(pc), "\n")

cat("\nHead of ancestry proportion data:\n")
print(head(ances))
cat("\nDimensions of ancestry proportion data:", dim(ances), "\n")

# Align samples
cat("\nAligning samples...\n")
id <- intersect(intersect(demo$BrNum, ind), pc$ind)
cat("\nNumber of common IDs after intersection:", length(id), "\n")

demo <- demo[match(id, demo$BrNum), ]
pc <- pc[match(id, pc$ind), ]
p <- p[match(id, ind), ]
ances <- ances[match(id, ances$id), ]

cat("\nAligned demographic data dimensions:", dim(demo), "\n")
cat("\nAligned principal components data dimensions:", dim(pc), "\n")
cat("\nAligned methylation data dimensions:", dim(p), "\n")
cat("\nAligned ancestry proportion data dimensions:", dim(ances), "\n")

# Prepare covariates
cat("\nPreparing covariates...\n")
#covs <- as.data.frame(cbind(Age=demo$Age,Sex=demo$Sex,demo[,11:20],pc[,3:12]))
covs <- as.data.frame(cbind(Age=demo$Age, Sex=demo$Sex, Afr=ances$Afr, pc[, 3:12]))
covs$Sex[covs$Sex == "M"] <- 0
covs$Sex[covs$Sex == "F"] <- 1

cat("\nHead of covariates data:\n")
print(head(covs))
cat("\nDimensions of covariates data:", dim(covs), "\n")

# Overlap samples with genotype data
cat("\nOverlapping samples with genotype data...\n")
fam <- paste0(gwas, "AA_chr1.psam")
fam <- read.table(fam, skip=1, header=FALSE)
cat("\nHead of genotype family data:\n")
print(head(fam))
cat("\nDimensions of genotype family data:", dim(fam), "\n")

id <- intersect(fam[, 1], demo$ID)
cat("\nNumber of overlapping genotype IDs:", length(id), "\n")
write.table(id, "id", col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")

# Align samples
cat("\nAligning final sample sets...\n")
idx <- match(id, demo$ID)
p <- p[idx, ]
covs <- covs[idx, ]
covs <- cbind(0, id, covs)

cat("\nFinal aligned methylation data dimensions:", dim(p), "\n")
cat("\nFinal aligned covariates data dimensions:", dim(covs), "\n")
write.table(covs, "covs", col.names=FALSE, row.names=FALSE, quote=FALSE, sep="\t")



Loading covariate data...

Head of demographic data:
                 ID  BrNum Batch SNPnum  MissRate      Dx      Age Sex Race
1 5421787087_R01C01 Br1602    1M 992741 0.0001158  Schizo 83.14000   F CAUC
2 5421787087_R01C02 Br1203    1M 992741 0.0001380  Schizo 24.33000   M CAUC
3 4572348457_R01C02 Br1573    1M 992741 0.0005671  Schizo 57.58000   M   AA
4 4572348844_R01C01 Br1214    1M 992741 0.0074820 Control 61.13000   M CAUC
5 4572348844_R01C02 Br1276    1M 992741 0.0077590 Control 24.25000   M   AA
6 5532971095_R01C01 Br2147    1M 992741 0.0002216 Control 51.64823   M HISP
  PCArace     PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8
1    CAUC -0.0128  0.0082  0.0038 -0.0039 -0.0030 -0.0017  0.0057  0.0036
2    CAUC -0.0129  0.0077  0.0061 -0.0010  0.0011 -0.0076  0.0108  0.0006
3      AA  0.0233  0.0032  0.0108  0.0085  0.0001 -0.0004 -0.0017  0.0073
4    CAUC -0.0124  0.0078  0.0043 -0.0038 -0.0024 -0.0090  0.0090  0.0012
5      AA  0.0203  0.0035  0.0104  0.0075 -0

## Test the loop for a single methylation site and window

In [12]:
getwd()

In [13]:
i <- 10 # outer loop
w <- 1 # inner loop
res <- c() # store results

In [14]:
chr <- gsub("chr","",vmrs2[i,1])

In [15]:
vmrs[i, ]

Unnamed: 0_level_0,chr,start,end,idxStart,idxEnd,cluster,n,meanSDS
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>
56,chr22,18997916,18998123,27988,28001,192,14,0.08271953


For a given VMR, we extract SNPs within the window of the region and these are the only ones used to compute heritability.

In [16]:
# Calculate p1 and p2
start_time <- Sys.time()
cat("\nProcessing VMR", i, "with window size", wind[w], "...\n")
p1 <- ifelse(vmrs2[i,2] - wind[w] > 0, vmrs2[i,2] - wind[w], 0)
p2 <- vmrs2[i,3] + wind[w]

cat("p1:", p1, "\tp2:", p2, "\n")


Processing VMR 10 with window size 10000 ...
p1: 1234000 	p2: 1254681 


In [17]:
# have to add this line so code works
vmrs2$chr <- gsub("chr", "", vmrs2$chr)

In [18]:
# Prepare PLINK command
gwas_prefix <- paste0(gwas, "AA_chr", vmrs2[i, 1])

# The SNPs we extract with PLINK are only the ones within the VMR
command <- paste("/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/plink2 --pfile",
                 gwas_prefix,
                 "--silent --keep id",
                 "--chr", vmrs2[i, 1],
                 "--from-bp", p1, "--to-bp", p2, "--snps-only 'just-acgt' --make-bed --out temp", sep=" ")

cat("Running PLINK command:\n", command, "\n")
output <- system(command, intern = TRUE, ignore.stderr = FALSE)
cat("PLINK output:\n", output, "\n")

if (!file.exists("temp.bim")) {
  cat("PLINK output not found. Skipping to next iteration.\n")
  next
}

Running PLINK command:
 /dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/plink2 --pfile /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/AA/all/plink/AA_chr1 --silent --keep id --chr 1 --from-bp 1234000 --to-bp 1254681 --snps-only 'just-acgt' --make-bed --out temp 
PLINK output:
  


In [19]:
# Prepare phenotype file
pheno <- cbind(0, id, p[, i])
write.table(pheno, "pheno", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
cat("Phenotype data head:\n")
print(head(pheno))
cat("Phenotype data dimensions:", dim(pheno), "\n")

Phenotype data head:
           id                                     
Br1276 "0" "4572348844_R01C02" "0.579952267303103"
Br1007 "0" "4572348382_R01C02" "0.621262458471761"
Br1221 "0" "4463344431_R01C01" "0.71900826446281" 
Br1504 "0" "4578977042_R01C02" "0.81447963800905" 
Br1918 "0" "4578977010_R01C02" "0.69983948635634" 
Br1325 "0" "4463344524_R01C01" "0.639484978540773"
Phenotype data dimensions: 89 3 


In [20]:
# Create GRM using SNPs within VMR
command <- paste(gcta, "--bfile temp --make-grm-bin --out temp", sep=" ")
cat("Running GCTA command (GRM):\n", command, "\n")
output <- system(command, intern = TRUE, ignore.stderr = FALSE)
cat("GCTA GRM output:\n", output, "\n") # from here we get outputs including temp.grm.N.bin, temp.grm.bin temp.grm.id

Running GCTA command (GRM):
 /dcs04/lieber/statsgen/shizhong/software/gcta/gcta-1.94.1-linux-kernel-3-x86_64/gcta-1.94.1 --bfile temp --make-grm-bin --out temp 
GCTA GRM output:
 ******************************************************************* * Genome-wide Complex Trait Analysis (GCTA) * version v1.94.1 Linux * Built at Nov 15 2022 21:14:25, by GCC 8.5 * (C) 2010-present, Yang Lab, Westlake University * Please report bugs to Jian Yang <jian.yang@westlake.edu.cn> ******************************************************************* Analysis started at 10:41:15 EDT on Tue Jul 09 2024. Hostname: login31.cm.cluster  Accepted options: --bfile temp --make-grm-bin --out temp  Note: This is a multi-thread program. You could specify the number of threads by the --thread-num option to speed up the computation if there are multiple processors in your machine.  Reading PLINK FAM file from [temp.fam]. 89 individuals to be included from [temp.fam]. Reading PLINK BIM file from [temp.bim]. 155 SNPs 

In [21]:
# Estimate heritability
command <- paste(gcta, "--reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --out temp", sep=" ")
cat("Running GCTA command (heritability estimation):\n", command, "\n")
output <- system(command, intern = TRUE, ignore.stderr = FALSE)
cat("GCTA heritability output:\n", output, "\n") # from here we get outputs including temp.grm.N.bin, temp.grm.bin temp.grm.id

Running GCTA command (heritability estimation):
 /dcs04/lieber/statsgen/shizhong/software/gcta/gcta-1.94.1-linux-kernel-3-x86_64/gcta-1.94.1 --reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --out temp 
GCTA heritability output:
 ******************************************************************* * Genome-wide Complex Trait Analysis (GCTA) * version v1.94.1 Linux * Built at Nov 15 2022 21:14:25, by GCC 8.5 * (C) 2010-present, Yang Lab, Westlake University * Please report bugs to Jian Yang <jian.yang@westlake.edu.cn> ******************************************************************* Analysis started at 10:41:15 EDT on Tue Jul 09 2024. Hostname: login31.cm.cluster  Accepted options: --reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --out temp  Note: This is a multi-thread program. You could specify the number of threads by the --thread-num option to speed up the computation if there are multiple processors in your machine.  Reading IDs of the GRM from [temp.grm.id]

In [22]:
if (!file.exists("temp.hsq")) {
  cat("Heritability results not found. Skipping to next iteration.\n")
  next
}

# Collect results
temp <- read.table("temp.hsq", header = TRUE, fill = TRUE)
vmr <- paste0("chr", vmrs2[i, 1], "_", vmrs2[i, 2], "_", vmrs2[i, 3])
temp$vmr <- vmr
temp$wind <- wind[w]

cat("Collected results head:\n")
print(head(temp))

res <- rbind(res, temp)

# Clean up temporary files
cat("Cleaning up temporary files...\n")
system("rm temp*")

end_time <- Sys.time()
cat("Iteration runtime:", difftime(end_time, start_time, units = "secs"), "\n")

Collected results head:
   Source   Variance       SE                  vmr  wind
1    V(G)   0.002115 0.000924 chr1_1244000_1244681 10000
2    V(e)   0.002107 0.000432 chr1_1244000_1244681 10000
3      Vp   0.004222 0.000911 chr1_1244000_1244681 10000
4 V(G)/Vp   0.501021 0.132368 chr1_1244000_1244681 10000
5    logL 122.306000       NA chr1_1244000_1244681 10000
6   logL0 105.058000       NA chr1_1244000_1244681 10000
Cleaning up temporary files...
Iteration runtime: 4.4147 


## Try with new data

In [23]:
library(data.table)

In [24]:
df <- fread("/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/09.5-OUT_matched_SNP_meth_cov_chunked_JHPCE.csv")

In [25]:
df_row <- 1

In [26]:
#args = commandArgs(trailingOnly=TRUE)

#outdir <- args[1]
#chunk1 <- as.numeric(args[2])
#chunk2 <- as.numeric(args[3])

wind <- c(10000, 100000, 1000000)

#wind <- as.numeric(c("1000","2000","5000","10000","20000","50000","100000","200000","500000"))

# This directory contains vmr files, which are...
indir <- "/dcs04/lieber/statsgen/shizhong/AANRI/VMR2/99/caud/aa/"

# path to dir containing pgen, psam files for a given population/subpopulation
gwas <- paste0(dirname(df$SNP_data[df_row]), "/")
gcta <- "/dcs04/lieber/statsgen/shizhong/software/gcta/gcta-1.94.1-linux-kernel-3-x86_64/gcta-1.94.1"

outdir <- "/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/gcta_output"
if(!dir.exists(outdir)) dir.create(outdir)

setwd(outdir)

In [60]:
gwas

In [27]:
library(bsseq)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname

In [28]:
load(df$methylation_data[df_row])
chunk1 <- 1
chunk2 <- 1000

### Get our methylation data in the same format as the earlier `p` matrix (row per sample, column per site)

In [41]:
p <- t(as.matrix(getMeth(BSobj2, type = "smooth", what = "perBase")))

ind <- BSobj2@colData$brnum <- gsub("Br0", "Br", BSobj2@colData$brnum)

ind <- gsub("Br0", "Br", BSobj2@colData$brnum)
id <- ind

rownames(p) <- ind

covs <- fread(df$cov_file[df_row])
covs$Sex[covs$Sex=="M"] <- 0
covs$Sex[covs$Sex=="F"] <- 1

covs$Dx[covs$Dx=="Control"] <- 0
covs$Dx[covs$Dx=="SCZ"] <- 1

missing_ids <- ind[!ind %in% covs$ID]
if (length(missing_ids) > 0) {
  cat("Missing covariates for IDs:", paste(missing_ids, collapse = ", "), "\n")
  ind <- ind[ind %in% covs$ID]
}
covs <- covs[match(ind, covs$ID), ]
p <- p[match(ind, rownames(p)), ]

id <- colData(BSobj2)$ID[which(colData(BSobj2)$brnum %in% ind)]

covs <- cbind(0, covs)
colnames(covs)[1] <- "intercept"

write.table(ind,"id",col.names=F,row.names=F,quote=F,sep="\t")

write.table(covs,"covs",col.names=F,row.names=F,quote=F,sep="\t")

chr <- unfactor(unique(seqnames(rowRanges(BSobj2))))
if(length(chr) > 1) stop ("Should be just one chromosome per BSobj")
chr <- gsub("chr","",chr)

CpG_positions <- start(ranges(granges(BSobj2)))

#### With multithreading

In [44]:
# Sys.time()

[1] "2024-07-09 10:43:49 EDT"

In [45]:
# # loop over vmr between two chunks
# res <- c()
# for(i in chunk1:chunk2){
# 	cat(i,"\n")
# 	chr <- gsub("chr","",chr)
# 	# loop over each window size
# 	for(w in 1:length(wind)){
#         # plink subset
#         p1 <- ifelse(CpG_positions[i] - wind[w] > 0, CpG_positions[i] - wind[w],0)
#         p2 <- CpG_positions[i] + wind[w]
#         gwas_prefix <- paste0(gwas,"libd_chr",chr)
#         command <- paste("/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/plink2 --pfile ", gwas_prefix, "--silent --keep id",
#                          "--chr ",chr,
#                          "--from-bp",p1,"--to-bp",p2,
#                          "--snps-only 'just-acgt' --make-bed --out temp",
#                          sep=" ")
#         system(command)	
#         if(!file.exists("temp.bim")){
#             next;
#         }
#         # phenotype file 
#         #pheno <- cbind(0,id,p[,i])
#         pheno <- cbind(0,ind,p[,i])
#         write.table(pheno,"pheno",col.names=F,row.names=F,quote=F,sep="\t")
#         # grm
#         command <- paste(gcta, "--bfile temp --make-grm-bin --out temp", sep=" ")
#         system(command)
#         # h2 estimation
#         command <- paste(gcta, "--reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --out temp", sep=" ")
#         system(command)
#         # collect results
#         if(!file.exists("temp.hsq")){
#             next;
#         }
#         temp <- read.table("temp.hsq",header=T, fill=TRUE)
#         temp$site <- paste0("chr",chr,"_",CpG_positions[i])
#         temp$wind <- wind[w]
#         res <- rbind(res,temp)
#         # remove temp files
#         system("rm temp*")
#         }
	
# }
# write.table(res,"res.txt",col.names=T,row.names=F,quote=F,sep="\t") 

In [46]:
# Sys.time()

[1] "2024-07-09 10:43:49 EDT"

#### Single-threaded

In [47]:
Sys.time() # 2024-07-09 10:01:27 EDT

[1] "2024-07-09 10:43:49 EDT"

In [48]:
# # loop over vmr between two chunks
# res <- c()
# for(i in chunk1:chunk2){
# 	cat(i,"\n")
# 	chr <- gsub("chr","",chr)
# 	# loop over each window size
# 	for(w in 1:length(wind)){
#         # plink subset
#         p1 <- ifelse(CpG_positions[i] - wind[w] > 0, CpG_positions[i] - wind[w],0)
#         p2 <- CpG_positions[i] + wind[w]
#         gwas_prefix <- paste0(gwas,"libd_chr",chr)
#         command <- paste("/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/plink2 --pfile ", gwas_prefix, "--silent --keep id",
#                          "--chr ",chr,
#                          "--from-bp",p1,"--to-bp",p2,
#                          "--snps-only 'just-acgt' --make-bed --threads 1 --out temp",
#                          sep=" ")
#         system(command)	
#         if(!file.exists("temp.bim")){
#             next;
#         }
#         # phenotype file 
#         #pheno <- cbind(0,id,p[,i])
#         pheno <- cbind(0,ind,p[,i])
#         write.table(pheno,"pheno",col.names=F,row.names=F,quote=F,sep="\t")
#         # grm
#         command <- paste(gcta, "--bfile temp --make-grm-bin --thread-num 1 --out temp", sep=" ")
#         system(command)
#         # h2 estimation
#         command <- paste(gcta, "--reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --thread-num 1 --out temp", sep=" ")
#         system(command)
#         # collect results
#         if(!file.exists("temp.hsq")){
#             next;
#         }
#         temp <- read.table("temp.hsq",header=T, fill=TRUE)
#         temp$site <- paste0("chr",chr,"_",CpG_positions[i])
#         temp$wind <- wind[w]
#         res <- rbind(res,temp)
#         # remove temp files
#         system("rm temp*")
#         }
	
# }
# write.table(res,"res-singlethread.txt",col.names=T,row.names=F,quote=F,sep="\t") 

In [49]:
Sys.time() # 2024-07-09 10:22:36 EDT

[1] "2024-07-09 10:43:49 EDT"

In [50]:
# number seconds to run

In [51]:
n_sec <- (21*60)+9
n_min <- n_sec/60
n_hr <- n_min/60
print(n_hr)

[1] 0.3525


In [52]:
core_hr_per_cpg <- n_hr/1000

In [53]:
core_hr_per_cpg*26000000

In [55]:
Sys.time()

[1] "2024-07-09 10:43:49 EDT"

In [56]:
chunk2 <- 5000

In [57]:
Sys.time()

[1] "2024-07-09 10:43:49 EDT"

In [58]:
# loop over vmr between two chunks
res <- c()
for(i in chunk1:chunk2){
	cat(i,"\n")
	chr <- gsub("chr","",chr)
	# loop over each window size
	for(w in 1:length(wind)){
        # plink subset
        p1 <- ifelse(CpG_positions[i] - wind[w] > 0, CpG_positions[i] - wind[w],0)
        p2 <- CpG_positions[i] + wind[w]
        gwas_prefix <- paste0(gwas,"libd_chr",chr)
        command <- paste("/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/scripts/plink2 --pfile ", gwas_prefix, "--silent --keep id",
                         "--chr ",chr,
                         "--from-bp",p1,"--to-bp",p2,
                         "--snps-only 'just-acgt' --make-bed --threads 1 --out temp",
                         sep=" ")
        system(command)	
        if(!file.exists("temp.bim")){
            next;
        }
        # phenotype file 
        #pheno <- cbind(0,id,p[,i])
        pheno <- cbind(0,ind,p[,i])
        write.table(pheno,"pheno",col.names=F,row.names=F,quote=F,sep="\t")
        # grm
        command <- paste(gcta, "--bfile temp --make-grm-bin --thread-num 1 --out temp", sep=" ")
        system(command)
        # h2 estimation
        command <- paste(gcta, "--reml --grm-bin temp --pheno pheno --mpheno 1 --qcovar covs --thread-num 1 --out temp", sep=" ")
        system(command)
        # collect results
        if(!file.exists("temp.hsq")){
            next;
        }
        temp <- read.table("temp.hsq",header=T, fill=TRUE)
        temp$site <- paste0("chr",chr,"_",CpG_positions[i])
        temp$wind <- wind[w]
        res <- rbind(res,temp)
        # remove temp files
        system("rm temp*")
        }
	
}
write.table(res,"res-singlethread.txt",col.names=T,row.names=F,quote=F,sep="\t") 

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 
22 
23 
24 
25 
26 
27 
28 
29 
30 
31 
32 
33 
34 
35 
36 
37 
38 
39 
40 
41 
42 
43 
44 
45 
46 
47 
48 
49 
50 
51 
52 
53 
54 
55 
56 
57 
58 
59 
60 
61 
62 
63 
64 
65 
66 
67 
68 
69 
70 
71 
72 
73 
74 
75 
76 
77 
78 
79 
80 
81 
82 
83 
84 
85 
86 
87 
88 
89 
90 
91 
92 
93 
94 
95 
96 
97 
98 
99 
100 
101 
102 
103 
104 
105 
106 
107 
108 
109 
110 
111 
112 
113 
114 
115 
116 
117 
118 
119 
120 
121 
122 
123 
124 
125 
126 
127 
128 
129 
130 
131 
132 
133 
134 
135 
136 
137 
138 
139 
140 
141 
142 
143 
144 
145 
146 
147 
148 
149 
150 
151 
152 
153 
154 
155 
156 
157 
158 
159 
160 
161 
162 
163 
164 
165 
166 
167 
168 
169 
170 
171 
172 
173 
174 
175 
176 
177 
178 
179 
180 
181 
182 
183 
184 
185 
186 
187 
188 
189 
190 
191 
192 
193 
194 
195 
196 
197 
198 
199 
200 
201 
202 
203 
204 
205 
206 
207 
208 
209 
210 
211 
212 
213 
214 
215 
216 
217 
218 
219 
220 
221 
222

In [59]:
Sys.time()

[1] "2024-07-09 12:38:36 EDT"

5000 tests in 1hr54min47sec. 1.9136hr

In [1]:
(1.913/5000)*2.6*10^6

In [3]:
(1.913/5000)*26*10^6 # Estimated runtime for all

In [None]:
|