# Make Manhattan plots and histograms

In [1]:
library(data.table)
library(stringr)
library(qqman)

# Function to list and read files
read_files <- function(pattern, path = "./") {
  files <- list.files(path, pattern = pattern)
  lapply(files, fread)
}

# Function to subset data based on population and region
subset_data <- function(data, population, region) {
  data <- data[population == population & region == region]
  unique(data)
}

# Function to process data
process_data <- function(data) {
  data$neglogP <- -log10(data$p)
  data$chr_p <- paste0(data$chr, "_", data$pos)
  colnames(data) <- paste0("mwas_", colnames(data))
  colnames(data) <- gsub("mwas_chr_p", "chr_p", colnames(data))
  data <- data[!duplicated(data$chr_p, fromLast = TRUE)]
  setorder(data, mwas_p)
  data
}

# Function to clean and standardize summary statistics
clean_and_standardize_colnames <- function(summary_stats) {
  if (grepl("\t", colnames(summary_stats)[1])) {
    real_colnames <- str_split(colnames(summary_stats)[1], "\t")[[1]]
    colnames(summary_stats) <- real_colnames
  }
  colnames(summary_stats) <- gsub("chr|#CHROM", "CHR", colnames(summary_stats))
  colnames(summary_stats) <- gsub("pos|POS", "BP", colnames(summary_stats))
  colnames(summary_stats) <- gsub("MarkerName|ID", "SNP", colnames(summary_stats))
  colnames(summary_stats) <- gsub("LogOR", "logOR", colnames(summary_stats))
  if (!"logOR" %in% colnames(summary_stats) && "OR" %in% colnames(summary_stats)) {
    summary_stats[, logOR := log(OR)]
  }
  colnames(summary_stats) <- gsub("logOR", "BETA", colnames(summary_stats))
  setkey(summary_stats, SNP)
  summary_stats
}

# Function to load and clean summary stats
load_clean_summary_stats <- function(file) {
  ss <- fread(file)
  ss <- clean_and_standardize_colnames(ss)
  colnames(ss) <- paste0("gwas_", colnames(ss))
  ss$chr_p <- paste0(ss$gwas_CHR, "_", ss$gwas_BP)
  colnames(ss) <- gsub("gwas_P", "gwas_p", colnames(ss))
  setorder(ss, gwas_p)
  ss
}

# Function to omit NA and zeros
omit_na_and_zeros <- function(x) {
  if (is.data.frame(x)) {
    rows_to_keep <- which(rowSums(is.na(x) | x == 0) == 0)
    x_clean <- x[rows_to_keep, ]
  } else if (is.vector(x)) {
    x_clean <- x[which(x != 0 & !is.na(x))]
  } else {
    stop("Input must be a data.frame or vector")
  }
  x_clean
}

# Function to check if P is numeric
check_and_convert_P <- function(data, dataset_name) {
  if (!is.numeric(data$P)) {
    warning(paste("P is not numeric in", dataset_name, "- converting to numeric"))
    print(head(data))
    print(dim(data))
    print(names(data))
    data$P <- as.numeric(data$P)
  }
  data
}

# Function to create and save Manhattan plot
create_manhattan_plot <- function(data, main_title, filename) {
  data <- na.omit(data)
  file_in_subset <- data[1:10000, ]
  file_in_subset$SNP <- file_in_subset$chr_p
  png(filename)
  qqman::manhattan(file_in_subset, main = main_title)
  dev.off()
}

# Function to create and save QQ plot
create_qq_plot <- function(data, main_title, filename) {
  data_sample <- data[sample(nrow(data), 10000), ]
  png(filename)
  qqman::qq(data_sample$P, main = main_title)
  dev.off()
}

# Function to create and save histogram
create_histogram <- function(data, main_title, filename) {
  data_sample <- data[sample(nrow(data), 10000), ]
  png(filename)
  hist(data_sample$P, main = main_title, xlab = "P-value", breaks = 50)
  dev.off()
}

# Main function to process files and create plots
process_and_plot <- function(files_pattern, ss_files_path) {
  dir.create("19-OUT_plots", showWarnings = FALSE)
  
  ss_files <- list.files(ss_files_path, pattern = "gwas_stat", full.names = TRUE)
  
  for (file in list.files("./", pattern = files_pattern, full.names = TRUE)) {
    file_in <- fread(file)
    populations <- levels(factor(file_in$population))
    regions <- levels(factor(file_in$region))
    
    for (population in populations) {
      print(population)
      for (region in regions) {
        print(region)
        file_in_subset <- subset_data(file_in, population, region)
        file_in_subset <- process_data(file_in_subset)
        file_in_subset <- omit_na_and_zeros(file_in_subset)
        
        base_name <- paste0("19-OUT_plots/", basename(file), "_", population, "_", region)

        colnames(file_in_subset) <- gsub("mwas_chr", "CHR", colnames(file_in_subset))
        colnames(file_in_subset) <- gsub("mwas_pos", "BP", colnames(file_in_subset))
        colnames(file_in_subset) <- gsub("mwas_p", "P", colnames(file_in_subset))
        colnames(file_in_subset) <- gsub("gwas_chr", "CHR", colnames(file_in_subset))
        colnames(file_in_subset) <- gsub("gwas_pos", "BP", colnames(file_in_subset))
        colnames(file_in_subset) <- gsub("gwas_p", "P", colnames(file_in_subset))

        file_in_subset <- check_and_convert_P(file_in_subset, paste("MWAS -", population, region))
        
        create_manhattan_plot(file_in_subset, paste("MWAS Manhattan -", population, region), paste0(base_name, "_manhattan.png"))
        create_qq_plot(file_in_subset, paste("MWAS QQ -", population, region), paste0(base_name, "_qq.png"))
        create_histogram(file_in_subset, paste("MWAS Histogram -", population, region), paste0(base_name, "_hist.png"))
      }
    }
  }
  
  for (ss_file in ss_files) {
    ss <- load_clean_summary_stats(ss_file)
    ss <- check_and_convert_P(ss, basename(ss_file))
    
    top_hits <- ss[1:10000, ]
    top_hits_file <- sub("\\.csv$", "_10k-top.csv", ss_file)
    fwrite(top_hits, top_hits_file)
    
    ss_base_name <- paste0("19-OUT_plots/", basename(ss_file))
    
    create_manhattan_plot(ss, "SCZ GWAS associations", paste0(ss_base_name, "_manhattan.png"))
    create_qq_plot(ss, "SCZ GWAS QQ plot", paste0(ss_base_name, "_qq.png"))
    create_histogram(ss, "SCZ GWAS Histogram", paste0(ss_base_name, "_hist.png"))
  }
}

# Execute the function with the desired files and summary stats files path
process_and_plot("16a9", "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas")




For example usage please run: vignette('qqman')



Citation appreciated but not required:

Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.





[1] "AA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "all"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "EA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "AA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "all"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "EA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "AA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "all"
[1] "caud"
[1] "dlpfc"
[1] "hippo"
[1] "EA"
[1] "caud"
[1] "dlpfc"
[1] "hippo"


“Detected 1 column names but the data has 16 columns (i.e. invalid file). Added 15 extra default column names at the end.”


ERROR: Error: some columns are not in the data.table: [gwas_p]


In [None]:
head(ss)

In [None]:
getwd()