# Investigate NA values in our genotype data

In [1]:
library(CpGWAS)
library(data.table)

## Load complete SNP dataset from plink2 raw file

The raw file loaded below was produced with `plink2 --pfile libd_chr1 --export A --out libd_chr1_test`.

In [2]:
# Read the plink2 `.raw` export
file <- fread("~/data/libd_chr1_test.raw")

# Keep the sample IDs next to every column that holds at least one NA
has_na <- vapply(file, anyNA, logical(1))
na_columns <- cbind(file$IID, file[, .SD, .SDcols = has_na])

In [3]:
# Number of rows and columns in the full table read from the raw file
dim(file)

In [4]:
# Display the sample IDs (column V1) alongside the columns that contain NA
na_columns

V1,chr1:5215377:C:T_C,chr1:165457376:C:A_C,chr1:234308448:A:G_A
<chr>,<dbl>,<dbl>,<dbl>
Br1602,2,1.009,2.000
Br1203,2,2.000,2.000
Br1214,2,2.000,2.000
Br2149,2,2.000,1.000
Br1016,2,2.000,2.000
Br1580,2,2.000,2.000
Br1646,2,2.000,2.000
Br1823,2,2.000,1.002
Br1696,2,2.000,2.000
Br1513,2,2.000,2.000


Do these columns (SNPs) contain NA for exactly the same samples?

In [5]:
# TRUE when every row has NA in none, or in all, of the SNP columns
local({
  snp_is_na <- is.na(na_columns[, -1, with = FALSE])
  na_per_sample <- rowSums(snp_is_na)
  all(na_per_sample == 0 | na_per_sample == ncol(snp_is_na))
})

Which indices, and which samples?

In [6]:
# Count the rows where this SNP is NA
sum(is.na(na_columns[["chr1:165457376:C:A_C"]]))

In [7]:
# Number of rows in the inclusive range 1659..2189
2189 - 1659 + 1

<div class="alert alert-block alert-info">The last 531 samples in the SNP set (rows 1659 to 2189) have NA for all 3 of these SNPs.</div>

## Load `methylationInput` object containing partial SNP data loaded into R, methylation data, and various metadata

This function reloads the object in a new session (e.g., after it was subset to random samples in another session) and re-creates its pointers to the SNP data on disk so that they remain usable.

In [8]:
#' Reload a saved `MethylationInput` and rebuild its handles to SNP data
#'
#' Reads the object stored at `rds_path`, re-creates its `pvar_pointer`,
#' `pvar_dt`, `pgen` and `psam` slots from the PLINK 2 fileset identified by
#' `snp_data_path`, then recomputes `genotype_IDs` and filters the rows of
#' `methylations` to the samples present in both data sets.
#'
#' @param rds_path Path to an RDS file holding a `MethylationInput` object.
#' @param snp_data_path Path to the fileset's `.pgen`, `.pvar` or `.psam`
#'   file, or the prefix shared by the three files.
#' @param no_cores Not used by this function; kept so existing calls that
#'   pass it keep working.
#' @return The reloaded `MethylationInput` object with refreshed slots.
reinitializeMethylationInput <- function(rds_path, snp_data_path, no_cores = detectCores()) {
  if (!file.exists(rds_path)) {
    stop("RDS file does not exist: ", rds_path)
  }

  # Load the MethylationInput object from RDS
  loadedObject <- readRDS(rds_path)
  if (!inherits(loadedObject, "MethylationInput")) {
    stop("Loaded object is not a MethylationInput.")
  }

  # Derive the three fileset paths by swapping only the trailing extension.
  # Substituting "pgen"/"pvar" anywhere in the path would also rewrite
  # directory names or file stems that happen to contain those strings.
  snp_prefix <- sub("\\.(pgen|pvar|psam)$", "", snp_data_path)
  pgen_path <- paste0(snp_prefix, ".pgen")
  pvar_path <- paste0(snp_prefix, ".pvar")
  psam_path <- paste0(snp_prefix, ".psam")

  snp_paths <- c(pgen_path, pvar_path, psam_path)
  missing_paths <- snp_paths[!file.exists(snp_paths)]
  if (length(missing_paths) > 0) {
    stop(
      "One or more SNP data files not found at the specified paths: ",
      paste(missing_paths, collapse = ", ")
    )
  }

  # Reinitialize external pointers and tables for the SNP data
  loadedObject@pvar_pointer <- pgenlibr::NewPvar(pvar_path)
  loadedObject@pvar_dt <- data.table::fread(pvar_path)[, 1:3]
  loadedObject@pgen <- pgenlibr::NewPgen(pgen_path, pvar = loadedObject@pvar_pointer)
  loadedObject@psam <- data.table::fread(psam_path)

  # The sample ID column is "#IID" when it is the first header field and
  # "IID" when the header starts with "#FID". Fail loudly if neither exists
  # instead of silently ending up with no genotype IDs.
  iid_col <- intersect(c("#IID", "IID"), names(loadedObject@psam))
  if (length(iid_col) == 0) {
    stop("No sample ID column ('#IID' or 'IID') found in: ", psam_path)
  }
  psam_ids <- loadedObject@psam[[iid_col[1]]]

  # genotype_IDs: sorted IDs present in both the methylation rows and the psam
  methylation_ids <- rownames(loadedObject@methylations)
  shared_ids <- intersect(methylation_ids, psam_ids)
  loadedObject@genotype_IDs <- shared_ids[order(shared_ids)]

  # Keep only methylation rows with a matching genotype ID. drop = FALSE keeps
  # the two-dimensional shape even when a single row or column remains.
  # NOTE(review): rows keep their existing order while genotype_IDs is sorted;
  # confirm downstream code does not assume the two are positionally aligned.
  keep_rows <- which(methylation_ids %in% shared_ids)
  loadedObject@methylations <- loadedObject@methylations[keep_rows, , drop = FALSE]

  loadedObject
}

In [9]:
# Change the working directory so that relative paths resolve under ~/data/
setwd("~/data/")

In [10]:
# Reload the saved object and reattach it to the chr1 SNP fileset
methInput <- reinitializeMethylationInput(
  rds_path = "chr1_AA_methylation_10k_samples.rds",
  snp_data_path = "libd_chr1.pgen",
  no_cores = 4
)

Among the samples for which we have methylation data, how many are in the group of genotyped samples that has NA for the three mysterious chr1 SNPs?

In [11]:
# Identify the samples with NA in any of the SNP columns directly from the
# data instead of relying on a hard-coded row range, then count how many of
# the samples with methylation data are among them.
local({
  has_missing <- rowSums(is.na(na_columns[, -1, with = FALSE])) > 0
  length(intersect(methInput@genotype_IDs, na_columns$V1[has_missing]))
})

<div class="alert alert-block alert-info">All 111 of the genotyped samples in our dataset are found in the last 531 rows of the SNP set, which contain NA for all three of these SNPs.</div>