# Flip allele order in the 1000 Genomes reference to match the alleles in the LIBD SNP dataset.

In [1]:
# Work from this directory: the cells below refer to the .pvar files and the
# plink2 binary by relative path.
cd /expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped/

In [10]:
# Activate the conda environment named "mwas" before running the commands below.
conda activate mwas

## Run for one chromosome

In [16]:
# Preview the first lines of the chromosome 1 reference variant file
# (columns shown in the output: #CHROM, POS, ID, REF, ALT).
head ref_EUR_chr1.pvar

#CHROM	POS	ID	REF	ALT
1	1079456	rs12401605	G	A
1	1079484	rs12411041	A	T
1	1079746	rs36027499	G	A
1	1079877	rs9442369	G	A
1	1080171	rs9442370	T	C
1	1080437	rs12746483	A	G
1	1081790	rs3766193	G	C
1	1081817	rs3766192	T	C
1	1081961	rs1133647	T	G


In [18]:
# Inputs: the reference and LIBD variant files for chromosome 1.
# Output: the list of SNPs whose allele order should be flipped in the reference.
ref_pvar="ref_EUR_chr1.pvar"
libd_pvar="libd_chr1.pvar"
output_file="snps_to_flip.txt"

# Path to the R script that compares the two .pvar files
r_script_path="/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/generate_flip_list.R"

# Generate the flip list. Arguments are quoted so paths containing spaces or
# glob characters are passed intact, and a failure is reported instead of
# being silently ignored.
if ! Rscript "$r_script_path" "$ref_pvar" "$libd_pvar" "$output_file"; then
  echo "ERROR: generate_flip_list.R failed for $ref_pvar vs $libd_pvar." >&2
fi

Total SNPs in ref file: 448584 
Total SNPs in libd file: 448584 
Percentage of SNPs to potentially flip in ref file: 22.55% 
Percentage of SNPs to potentially flip in libd file: 22.55% 


In [19]:
# Inspect the generated flip list: one SNP ID and one allele per line.
head snps_to_flip.txt

rs12401605	A
rs12411041	T
rs36027499	A
rs9442369	A
rs9442370	C
rs12746483	G
rs3766193	C
rs3766192	C
rs1133647	G
rs9442395	T


In [21]:
# Path to the PLINK 2 binary (relative to the current working directory)
plink_path="./plink2"

# Only run PLINK when the flip list exists and is non-empty.
# $output_file is the flip-list path set by the cell that ran the R script.
if [ -s "$output_file" ]; then

  # Use the same flip list that was just checked (not a hard-coded name).
  # "--ref-allele <file> 2 1": column 2 holds the allele, column 1 the variant ID.
  # Only report success when PLINK actually exits with status 0.
  if "$plink_path" --pfile ref_EUR_chr1 --ref-allele "$output_file" 2 1 --make-pgen --out ref_EUR_flipped_chr1; then
    echo "PLINK has been executed to flip the alleles."
  else
    echo "ERROR: PLINK failed while flipping alleles; see ref_EUR_flipped_chr1.log." >&2
  fi
else
  echo "No SNPs need flipping, or an error occurred."
fi

PLINK v2.00a5.12LM AVX2 AMD (25 Jun 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ref_EUR_flipped_chr1.log.
Options in effect:
  --make-pgen
  --out ref_EUR_flipped_chr1
  --pfile ref_EUR_chr1
  --ref-allele snps_to_flip.txt 2 1

Start time: Mon Aug 12 12:09:43 2024
257485 MiB RAM detected, ~214434 available; reserving 128742 MiB for main
workspace.
Using up to 12 threads (change this with --threads).
489 samples (0 females, 0 males, 489 ambiguous; 489 founders) loaded from
ref_EUR_chr1.psam.
448584 variants loaded from ref_EUR_chr1.pvar.
Note: No phenotype data present.
--ref-allele: 101161 sets of allele codes rotated.
Writing ref_EUR_flipped_chr1.psam ... done.
Writing ref_EUR_flipped_chr1.pvar ... 1010111112121313141415151616171718181919202021212222232324252526262727282829293030313132323333343435353636373738383939404041414242434344444545464647474848495050515152525353545455555656575758585959606061

## Run for all

In [22]:
# Path to the PLINK 2 binary (relative to the current working directory)
plink_path="./plink2"

# Path to the R script that writes the per-chromosome flip list
r_script_path="/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/generate_flip_list.R"

# Loop through chromosomes 1 to 22. A failure on one chromosome is reported on
# stderr and the loop moves on to the next chromosome.
for chr in {1..22}; do
  echo "Processing chromosome $chr..."

  # Define file paths for this chromosome
  ref_pvar="ref_EUR_chr${chr}.pvar"
  libd_pvar="libd_chr${chr}.pvar"
  output_file="snps_to_flip_chr${chr}.txt"

  # Generate the flip list. If the R script fails, skip PLINK: a non-empty
  # list left over from an earlier run must not be applied by accident.
  if ! Rscript "$r_script_path" "$ref_pvar" "$libd_pvar" "$output_file"; then
    echo "ERROR: generate_flip_list.R failed for chromosome $chr; skipping PLINK." >&2

  # Only run PLINK when the flip list exists and is non-empty.
  # NOTE: when the list is empty, this cell writes no ref_EUR_flipped_chr${chr}
  # fileset for that chromosome.
  elif [ -s "$output_file" ]; then
    # "--ref-allele <file> 2 1": column 2 holds the allele, column 1 the variant ID.
    # Only report success when PLINK actually exits with status 0.
    if "$plink_path" --pfile "ref_EUR_chr${chr}" --ref-allele "$output_file" 2 1 --make-pgen --out "ref_EUR_flipped_chr${chr}"; then
      echo "PLINK has been executed to flip the alleles for chromosome $chr."
    else
      echo "ERROR: PLINK failed for chromosome $chr; see ref_EUR_flipped_chr${chr}.log." >&2
    fi
  else
    echo "No SNPs need flipping for chromosome $chr, or an error occurred."
  fi

  # Print five blank lines for readability between iterations
  printf '\n\n\n\n\n'
done


Processing chromosome 1...
Total SNPs in ref file: 448584 
Total SNPs in libd file: 448584 
Percentage of SNPs to potentially flip in ref file: 22.55% 
Percentage of SNPs to potentially flip in libd file: 22.55% 
PLINK v2.00a5.12LM AVX2 AMD (25 Jun 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ref_EUR_flipped_chr1.log.
Options in effect:
  --make-pgen
  --out ref_EUR_flipped_chr1
  --pfile ref_EUR_chr1
  --ref-allele snps_to_flip_chr1.txt 2 1

Start time: Mon Aug 12 12:15:43 2024
257485 MiB RAM detected, ~214555 available; reserving 128742 MiB for main
workspace.
Using up to 12 threads (change this with --threads).
489 samples (0 females, 0 males, 489 ambiguous; 489 founders) loaded from
ref_EUR_chr1.psam.
448584 variants loaded from ref_EUR_chr1.pvar.
Note: No phenotype data present.
--ref-allele: 101161 sets of allele codes rotated.
Writing ref_EUR_flipped_chr1.psam ... done.
Writing ref_EUR_flippe

### Verify

#### If we re-run the script over the flipped files, 0% of SNPs should need to be flipped

In [23]:
# Path to the PLINK binary (not used by this verification cell; kept so the
# variable stays defined for the session)
plink_path="./plink2"

# Path to the R script that reports the percentage of SNPs to flip
r_script_path="/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/generate_flip_list.R"

# Loop through chromosomes 1 to 22, comparing the *flipped* reference files
# against the LIBD files.
for chr in {1..22}; do
  echo "Processing chromosome $chr..."

  # Define file paths for this chromosome
  ref_pvar="ref_EUR_flipped_chr${chr}.pvar"
  libd_pvar="libd_chr${chr}.pvar"
  # NOTE(review): this is the same file name the flipping cell used for its
  # flip list, so this run presumably overwrites that list - confirm that is
  # acceptable.
  output_file="snps_to_flip_chr${chr}.txt"

  # Re-run the comparison. Report a failure explicitly so that a missing
  # report is not mistaken for "nothing to flip".
  if ! Rscript "$r_script_path" "$ref_pvar" "$libd_pvar" "$output_file"; then
    echo "ERROR: verification failed to run for chromosome $chr." >&2
  fi

  # Print five blank lines for readability between iterations
  printf '\n\n\n\n\n'
done


Processing chromosome 1...
Total SNPs in ref file: 448584 
Total SNPs in libd file: 448584 
Percentage of SNPs to potentially flip in ref file: 0.00% 
Percentage of SNPs to potentially flip in libd file: 0.00% 





Processing chromosome 2...
Total SNPs in ref file: 502800 
Total SNPs in libd file: 502800 
Percentage of SNPs to potentially flip in ref file: 0.00% 
Percentage of SNPs to potentially flip in libd file: 0.00% 





Processing chromosome 3...
Total SNPs in ref file: 429002 
Total SNPs in libd file: 429002 
Percentage of SNPs to potentially flip in ref file: 0.00% 
Percentage of SNPs to potentially flip in libd file: 0.00% 





Processing chromosome 4...
Total SNPs in ref file: 435606 
Total SNPs in libd file: 435606 
Percentage of SNPs to potentially flip in ref file: 0.00% 
Percentage of SNPs to potentially flip in libd file: 0.00% 





Processing chromosome 5...
Total SNPs in ref file: 380613 
Total SNPs in libd file: 380613 
Percentage of SNPs to potentially flip in re

#### The pvar files should match now. Let's check this in notebook 30, using the R kernel.