# Script for building lists of sample subsets to analyze with linear models

Author: Jose Jaime Martinez-Magana

Day: 14 February 2023

This script builds the lists of individuals used to run linear models on several sample subsets.

In [None]:
# Shell commands (not R) that prepare the session; only needed when a Slurm
# workload manager is available.
# Request an interactive bash shell (--pty) with 32G of memory on the
# partition named "interactive".
srun --pty --mem=32G -p interactive bash
# Load the miniconda module so that conda is available in the session.
module load miniconda
# Activate the conda environment used by this analysis.
conda activate ewas_saliva
# NOTE: this step should be included in a bash script

In [None]:
# Input is the QC'ed data produced by the quality-control notebook at
# https://github.com/martinezjaime/ewas_saliva_ses/blob/main/qc_data/probe_and_sample_quality_control.ipynb
# All run settings are collected in a single list:
#   file    - path to the QC'ed rds file holding the phenotype information
#   out     - output path where the sample lists are written
#   outname - filename prefix shared by every sample list
opt <- list(
  file = "/vast/palmer/scratch/montalvo-ortiz/jjm262/epigenomics/ewas_saliva_ses/databases/qced/qced_data_v02062023.rds",
  out = "/vast/palmer/scratch/montalvo-ortiz/jjm262/epigenomics/ewas_saliva_ses/samplelist/",
  outname = "sample_subset"
)

In [None]:
# loading library
library(dplyr)

# loading rds
# Stop early with a clear message when the input path is wrong, instead of
# relying on the less specific error raised by readRDS().
if (!file.exists(opt$file)) {
  stop("Input rds file not found: ", opt$file, call. = FALSE)
}
paste0("Start analysis of data:",Sys.time(),"---","###Analysis path[",opt$file,"]###")
file <- readRDS(opt$file)
# setting output file prefix: output path + filename prefix.
# paste0() has no `sep` argument (a `sep = ""` would just be pasted in as an
# extra empty string), so the pieces are simply concatenated here.
outfile <- paste0(opt$out, opt$outname)
paste0("Output of these script will be saved to:",Sys.time(),"---","###Analysis path[",outfile,"]###")

In [None]:
# Report the start of the phenotype-loading step with a timestamp
paste0("Loading pheno file:",Sys.time())
# Pull the phenotype table out of the object read from the rds file
pheno_data <- file$pheno
# Label each phenotype row with its SampleID
rownames(pheno_data) <- pheno_data$SampleID

In [None]:
# transforming Family_ID from character to numeric (dashes are removed first)
# this part of the script is unique for SES analysis in children, and has to be
# removed from new versions of the script
pheno_data$Family_ID <- as.numeric(gsub("-", "", pheno_data$Family_ID))

# group_by() places every missing Family_ID in one shared group, so all but
# one of those samples would be dropped silently below; make that visible.
n_missing_family <- sum(is.na(pheno_data$Family_ID))
if (n_missing_family > 0) {
  warning(
    n_missing_family,
    " sample(s) have a missing Family_ID and will be treated as one family",
    call. = FALSE
  )
}

# based on family ID
# table(duplicated(pheno_data$Family_ID))
# we have 108 samples with unique Family_ID and 37 in the same Family_ID

# select only one random sample per Family_ID, so that the resulting set
# contains only unrelated individuals
# setting seed for replicability
set.seed(123456)
# sample_n() is kept (rather than a newer equivalent) so the seeded random
# selection is not altered; ungroup() drops the grouping once the sampling is
# done, so later steps work on a plain, ungrouped table
pheno_data_unrel <- pheno_data %>%
  group_by(Family_ID) %>%
  sample_n(1) %>%
  ungroup()

In [None]:
# first select unrelated individuals for linear models
# self-reported race
# table(pheno_data_unrel$race)
# we have the following numbers
# 1   2   6   7  999 
# 84  7   7   2  8 

# get only unrelated individuals
pheno_data_unrel_list <- pheno_data_unrel$SampleID

# which() keeps only the positions where the comparison is TRUE, so samples
# with a missing gender are left out instead of adding NA entries to the lists

# subset females (gender == 0)
pheno_data_unrel_females_list <-
  pheno_data_unrel$SampleID[which(pheno_data_unrel$gender == 0)]

# subset males (gender == 1)
pheno_data_unrel_males_list <-
  pheno_data_unrel$SampleID[which(pheno_data_unrel$gender == 1)]

In [None]:
# subsetting only based on sex for mixed linear models, stratified by gender
# (related individuals are kept here: the full phenotype table is used)
# which() keeps only the positions where the comparison is TRUE, so samples
# with a missing gender are left out instead of adding NA entries to the lists

# subset related females (gender == 0)
pheno_data_rel_females_list <-
  pheno_data$SampleID[which(pheno_data$gender == 0)]

# subset related males (gender == 1)
pheno_data_rel_males_list <-
  pheno_data$SampleID[which(pheno_data$gender == 1)]

In [None]:
# writing lists for statistical testing
# one csv file is written per subset, with a single column of sample
# identifiers, because the scripts for statistical testing
# https://github.com/martinezjaime/ewas_saliva_ses/blob/main/assoc/assoc_linear_models.ipynb
# https://github.com/martinezjaime/ewas_saliva_ses/blob/main/assoc/assoc_mixed_linear_models.ipynb
# use this structure
# NOTE(review): the original note says the files carry the Array_Slide name;
# presumably SampleID holds those identifiers - confirm

# subset label (used in the file name) -> sample identifiers to save
sample_lists <- list(
  # unrelated individuals
  unrelated = pheno_data_unrel_list,
  unrelated_females = pheno_data_unrel_females_list,
  unrelated_males = pheno_data_unrel_males_list,
  # related individuals
  related_females = pheno_data_rel_females_list,
  related_males = pheno_data_rel_males_list
)

# Each list is wrapped in a one-column data frame so that write.csv() emits
# "SampleID" as the header. Writing the bare vector produced the header "x",
# which had to be corrected by hand before running the assoc scripts.
for (subset_name in names(sample_lists)) {
  write.csv(
    data.frame(SampleID = sample_lists[[subset_name]]),
    file = paste0(outfile, "_", subset_name, "_v02142023.csv"),
    quote = FALSE,
    row.names = FALSE
  )
}

paste0("Sample lists saved with SampleID as header: ", outfile, "_*_v02142023.csv")