# Data preparation final approach

## Overview
1. Read in files received from 23andMe
2. Fasta - generate correct REF allele
3. Annotation
4. Remove synonymous variants and benign/likely benign
5. Fix AA change for 23andMe file
6. Merge annotated clean file with original values
7. Extract info from 23andMe
8. Extract variants from UKB and AMP-PD
9. Association testing
10. Meta-analysis
11. Clean files for analysis
12. Add identifier for variants passing or failing internal QC

## 1. Read in files from 23andMe

## 2. Fasta - generate correct REF allele

Now run the REF allele correction with the reference fasta file
(the correct reference file had to be downloaded, chosen according to this guide: https://lh3.github.io/2017/11/13/which-human-reference-genome-to-use)

## 3. Annotation

Write file for annotation

In [None]:
# Keep columns 1-3, 7 and 8 of the REF-corrected table as the annotation input
cut -f 1-3,7,8 963_snps_alleles_correct_fasta.txt > for_annotation.txt

# Same table without its first (header) line; this is the file handed to the annotation step
sed '1d' for_annotation.txt > for_annotation_nohead.txt

Annotate

In [None]:
module load annovar

# One -operation entry per -protocol entry, in the same order
# ("." is written wherever an annotation is missing)
table_annovar.pl for_annotation_nohead.txt $ANNOVAR_DATA/hg38/ \
    -buildver hg38 \
    -protocol refGene,avsnp150,clinvar_20220320,gnomad211_genome \
    -operation g,f,f,f \
    -nastring . \
    -outfile 963_23andMe_annotated_all

Check file in R

## 4. Remove synonymous variants and benign/likely_benign for certain genes

In [None]:
# Parkinson's
anti %>% filter(grepl("arkinson", CLNDN)) %>% tally()
    n
1 281

anti %>% filter(grepl("arkinson", CLNDN)) %>% group_by(CLNSIG) %>% tally()
# A tibble: 12 × 2
CLNSIG	n
<chr>	<int>
Benign	34
Benign/Likely_benign	18
Conflicting_interpretations_of_pathogenicity	43
Conflicting_interpretations_of_pathogenicity|_risk_factor	1
Likely_benign	29
Likely_pathogenic	4
Pathogenic	39
Pathogenic/Likely_pathogenic	6
Pathogenic/Likely_pathogenic|_risk_factor	1
Pathogenic|_risk_factor	1
risk_factor	6
Uncertain_significance	99

# another 81 Parkinson-annotated rows to be removed
# (34 Benign + 18 Benign/Likely_benign + 29 Likely_benign in the tally above)
benign_labels = c("Benign", "Benign/Likely_benign", "Likely_benign")

# Rows whose ClinVar disease name mentions Parkinson AND whose significance is one of the benign labels
remove2 = anti %>%
  filter(grepl("arkinson", CLNDN) & CLNSIG %in% benign_labels)

# Drop exactly those rows from the working table
anti2 = anti_join(anti, remove2)
dim(anti2)
[1] 839  33

anti2 %>% filter(grepl("arkinson", CLNDN)) %>% group_by(CLNSIG) %>% tally()

#A tibble: 9 × 2
CLNSIG	n
<chr>	<int>
Conflicting_interpretations_of_pathogenicity	43
Conflicting_interpretations_of_pathogenicity|_risk_factor	1
Likely_pathogenic	4
Pathogenic	39
Pathogenic/Likely_pathogenic	6
Pathogenic/Likely_pathogenic|_risk_factor	1
Pathogenic|_risk_factor	1
risk_factor	6
Uncertain_significance	99

In [2]:
# Write file with chr, start, REF, ALT, rsID (avsnp150) -- this is the column order of the
# resulting file (header: Chr, Start, Ref, Alt, avsnp150)

cut -f 1,2,4,5,11 834_23andMe_rare_variants_annotated_fullfile.txt > 834_23andMe_rare_variants_annotated_short.txt

In [27]:
head 834_23andMe_rare_variants_annotated_short.txt

Chr	Start	Ref	Alt	avsnp150
chr1	155235024	T	C	rs536425950
chr1	155235057	C	T	rs121908301
chr1	155235195	C	T	rs80356772
chr1	155235196	G	A	rs80356771
chr1	155235197	G	C	.
chr1	155235231	T	C	rs76071730
chr1	155235252	A	G	rs421016
chr1	155235699	T	C	rs74752878
chr1	155235703	A	C	.


## 5. Fix AA change for 23andMe file only

In [None]:
# Use this file: 834_23andMe_rare_variants_annotated_fullfile.txt
#
# and this script: AA_change_column.ipynb
# these chunks are only the differences to the above mentioned script
# (the notes above are kept as comments so that this cell parses as R)

#write.table(fulljoin2, "NM_transcript_list.txt.txt", row.names=F, sep = "\t", quote = F)

# Show only the rows that carry an amino-acid change ("." is the missing-value marker
# written by the annotation step)
merge_empty %>% filter(AAChange != ".")

In [15]:
# write.table(leftjoin, "Edited_AAChange_23andMe_834_clean.txt", quote = F, sep = "\t", row.names = F) # this has all the VariantNames automatically edited to only one name
wc -l Edited_AAChange_23andMe_834_clean.txt

835 Edited_AAChange_23andMe_834_clean.txt


## 6. Merge annotated clean file with original values

In [17]:
head variants_to_grep.txt

CHR.BP.REF.ALT
chr1:155235024:T:C
chr1:155235057:C:T
chr1:155235195:C:T
chr1:155235196:G:A
chr1:155235197:G:C
chr1:155235231:T:C
chr1:155235252:A:G
chr1:155235699:T:C
chr1:155235703:A:C


## 7. Extract info from 23andMe sumstats

## 8. Extract variants from UKB and AMP

In [18]:
# Write file for that
head variants_to_grep.txt
#cut -f 2 23andMe_834variants_annotated_and_stats.txt > variants_to_grep.txt
# format chr1:bp:REF:ALT

CHR.BP.REF.ALT
chr1:155235024:T:C
chr1:155235057:C:T
chr1:155235195:C:T
chr1:155235196:G:A
chr1:155235197:G:C
chr1:155235231:T:C
chr1:155235252:A:G
chr1:155235699:T:C
chr1:155235703:A:C


In [19]:
# Drop the "CHR.BP.REF.ALT" header so that the file holds variant IDs only
sed '1d' variants_to_grep.txt > variants_to_grep_nohead.txt

# Sanity check: line count and first entries
wc -l variants_to_grep_nohead.txt

head variants_to_grep_nohead.txt

834 variants_to_grep_nohead.txt
chr1:155235024:T:C
chr1:155235057:C:T
chr1:155235195:C:T
chr1:155235196:G:A
chr1:155235197:G:C
chr1:155235231:T:C
chr1:155235252:A:G
chr1:155235699:T:C
chr1:155235703:A:C
chr1:155235704:C:T


### 8.1 UKB

In [10]:
# Clean UKB first

#mkdir UKB_and_AMP
# cd ./UKB_and_AMP
# UKB data contains indels
# PLINK file merged with no relateds
# /data/CARD/UKBIOBANK/EXOME_DATA_200K/PVCF_FILES/MERGED_UKB_first_pass.*

In [None]:
# these files are in plink2 format - convert to plink1 binary files (bed/bim/fam)

## .psam IDs
#IID	SEX
#-000001	NA
#-000002	NA

module load plink/2
plink2 --pfile /data/CARD/UKBIOBANK/EXOME_DATA_200K/PVCF_FILES/MERGED_UKB_first_pass --make-bed --out MERGED_UKB_first_pass

wc -l MERGED_UKB_first_pass.bim
# 16285684 MERGED_UKB_first_pass.bim


# Remove potential indels from UKB
## Write "no_indel" file
#In bash
# The last two .bim columns are the two alleles: a variant counts as "no indel"
# only when both alleles are exactly one character long.
awk 'length($NF)==1 && length($(NF-1))==1' MERGED_UKB_first_pass.bim > MERGED_UKB_no_indels.txt
wc -l MERGED_UKB_no_indels.txt
# 14908659 MERGED_UKB_no_indels.txt

# Complement of the filter above, kept for reference
awk 'length($NF)>1 || length($(NF-1))>1' MERGED_UKB_first_pass.bim > UKB_indels.txt
wc -l UKB_indels.txt
# 1377025 UKB_indels.txt

# Column 2 of the .bim is the variant ID, which is what --extract needs
cut -f 2 MERGED_UKB_no_indels.txt > UKB_noindels_tokeep.txt
# Pin the plink 1.9 build used by the other cells of this notebook instead of the cluster default
module load plink/1.9.0-beta4.4

plink --bfile MERGED_UKB_first_pass --extract UKB_noindels_tokeep.txt --make-bed --out MERGED_UKB_no_indels
# 14908659 variants and 200648 people pass filters and QC.
# Note: No phenotypes present.

# new working files: MERGED_UKB_no_indels

In [None]:
# Grep variants

In [1]:
# files written to ./UKB_and_AMP
head variants_to_grep_nohead.txt

# Pull the .bim rows of the 23andMe variants out of the UKB data.
# -F: the IDs are literal strings, not regular expressions (also much faster on a large .bim)
# -w: the ID has to match as a whole word, not as a prefix of a longer ID
grep -F -w -f variants_to_grep_nohead.txt MERGED_UKB_no_indels.bim > UKB_variants_extracted_from23andMe.txt

wc -l UKB_variants_extracted_from23andMe.txt
# 608 variants

chr1:155235024:T:C
chr1:155235057:C:T
chr1:155235195:C:T
chr1:155235196:G:A
chr1:155235197:G:C
chr1:155235231:T:C
chr1:155235252:A:G
chr1:155235699:T:C
chr1:155235703:A:C
chr1:155235704:C:T
608 UKB_variants_extracted_from23andMe.txt


#### 8.1.1 Write binary file

In [2]:
module load plink/1.9.0-beta4.4

# --extract expects variant IDs. UKB_variants_extracted_from23andMe.txt holds complete .bim rows,
# so every token (chromosome, position, alleles) would be read as an ID.
# Hand over the ID column (column 2) only, as done for the indel filter.
awk '{ print $2 }' UKB_variants_extracted_from23andMe.txt > UKB_variants_extracted_from23andMe_IDs.txt

plink --bfile MERGED_UKB_no_indels --extract UKB_variants_extracted_from23andMe_IDs.txt --make-bed --out UKB_608_from23andme

# 608 variants and 200648 people pass filters and QC.
# Note: No phenotypes present.


[+] Loading plink  1.9.0-beta4.4  on cn4271 
PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to UKB_608_from23andme.log.
Options in effect:
  --bfile MERGED_UKB_no_indels
  --extract UKB_variants_extracted_from23andMe.txt
  --make-bed
  --out UKB_608_from23andme

515537 MB RAM detected; reserving 257768 MB for main workspace.
14908659 variants loaded from .bim file.
200648 people (0 males, 0 females, 200648 ambiguous) loaded from .fam.
Ambiguous sex IDs written to UKB_608_from23andme.nosex .
--extract: 608 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 200648 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
To

In [3]:
head UKB_608_from23andme.fam

0 -000001 0 0 0 -9
0 -000002 0 0 0 -9
0 -000003 0 0 0 -9
0 -000004 0 0 0 -9
0 -000005 0 0 0 -9
0 -000006 0 0 0 -9
0 -000007 0 0 0 -9
0 -000008 0 0 0 -9
0 -000009 0 0 0 -9
0 -000010 0 0 0 -9


In [4]:
# This file needs to be fixed because of an odd format where column 1 is all zeros (column 1 & 2 need to be same)
#cd ./UKB_and_AMP/

# Back up the untouched .fam exactly once. A plain local copy is enough (no scp needed), and
# an existing backup is never overwritten, so re-running this cell cannot destroy the original.
[ -e UKB_608_from23andme_ORIGINAL.fam ] || cp UKB_608_from23andme.fam UKB_608_from23andme_ORIGINAL.fam

# Always rebuild from the backup: column 2 is written twice (as column 1 and column 2),
# followed by columns 3-6. The layout (tab after column 1, spaces afterwards) is the same
# as the previous cut/paste approach produced, without the intermediate column*.txt files.
awk '{ print $2 "\t" $2, $3, $4, $5, $6 }' UKB_608_from23andme_ORIGINAL.fam > UKB_608_from23andme.fam

In [5]:
head UKB_608_from23andme.fam

-000001	-000001 0 0 0 -9
-000002	-000002 0 0 0 -9
-000003	-000003 0 0 0 -9
-000004	-000004 0 0 0 -9
-000005	-000005 0 0 0 -9
-000006	-000006 0 0 0 -9
-000007	-000007 0 0 0 -9
-000008	-000008 0 0 0 -9
-000009	-000009 0 0 0 -9
-000010	-000010 0 0 0 -9


#### 8.1.2 Write frequency file

In [6]:
# Allele frequencies for the 608 extracted UKB variants (written next to the binary files)
module load plink/1.9.0-beta4.4
plink --bfile UKB_608_from23andme --freq --out UKB_608_from23andme
# Total genotyping rate is 0.999036.
# 200648 people (0 males, 0 females, 200648 ambiguous) loaded from .fam.
#Ambiguous sex IDs written to UKB_608_from23andme.nosex .

[-] Unloading plink  1.9.0-beta4.4  on cn4271 
[+] Loading plink  1.9.0-beta4.4  on cn4271 
PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to UKB_608_from23andme.log.
Options in effect:
  --bfile UKB_608_from23andme
  --freq
  --out UKB_608_from23andme

515537 MB RAM detected; reserving 257768 MB for main workspace.
608 variants loaded from .bim file.
200648 people (0 males, 0 females, 200648 ambiguous) loaded from .fam.
Ambiguous sex IDs written to UKB_608_from23andme.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 200648 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.999036.
--freq: Allele f

In [7]:
head UKB_608_from23andme.frq

 CHR                SNP   A1   A2          MAF  NCHROBS
   1   chr1:7965399:G:A    A    G     0.000304   401272
   1   chr1:7984930:A:C    C    A    3.987e-05   401284
   1   chr1:7984971:G:A    A    G    2.492e-06   401286
   1   chr1:7985019:G:A    A    G    0.0004685   401250
   1  chr1:16985990:C:T    T    C    0.0004585   401280
   1  chr1:16986065:C:T    T    C    0.0001744   401276
   1  chr1:16986091:G:A    A    G     0.006444   401014
   1  chr1:16986097:G:A    A    G      0.05694   400590
   1  chr1:16986101:T:A    A    T     0.006382   401128


#### 8.1.3 Cohort age per group

In [8]:
module load plink/1.9.0-beta4.4
# updatePheno_UKB.txt is used twice: as phenotype source (--pheno) and as sample list (--keep)
plink --bfile UKB_608_from23andme \
    --allow-no-sex \
    --pheno updatePheno_UKB.txt \
    --keep updatePheno_UKB.txt \
    --make-bed \
    --out UKB_608_from23andme_PhenoUpdate
# 608 variants and 45857 people pass filters and QC.
# Among remaining phenotypes, 7806 are cases and 38051 are controls.

[-] Unloading plink  1.9.0-beta4.4  on cn4271 
[+] Loading plink  1.9.0-beta4.4  on cn4271 
PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to UKB_608_from23andme_PhenoUpdate.log.
Options in effect:
  --allow-no-sex
  --bfile UKB_608_from23andme
  --keep updatePheno_UKB.txt
  --make-bed
  --out UKB_608_from23andme_PhenoUpdate
  --pheno updatePheno_UKB.txt

515537 MB RAM detected; reserving 257768 MB for main workspace.
608 variants loaded from .bim file.
200648 people (0 males, 0 females, 200648 ambiguous) loaded from .fam.
Ambiguous sex IDs written to UKB_608_from23andme_PhenoUpdate.nosex .
45857 phenotype values present after --pheno.
--keep: 45857 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 45857 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031

In [9]:
head UKB_608_from23andme_PhenoUpdate.fam
# this step also removes weird IDs

1000012 1000012 0 0 0 1
1000047 1000047 0 0 0 1
1000068 1000068 0 0 0 1
1000085 1000085 0 0 0 1
1000220 1000220 0 0 0 1
1000369 1000369 0 0 0 2
1000425 1000425 0 0 0 1
1000580 1000580 0 0 0 1
1000741 1000741 0 0 0 1
1000777 1000777 0 0 0 1


In [10]:
head /data/CARD/UKBIOBANK/PHENOTYPE_DATA/disease_groups/UKB_EXOM_ALL_PD_PHENOTYPES_CONTROL_2021_with_PC.txt

FID	IID	BIRTH_YEAR	TOWNSEND	AGE_OF_RECRUIT	BATCH	GENETIC_SEX	EUROPEAN	PHENO	SEX	EXOMEBATCH	PHENO_NAME	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10
1000012	1000012	1949	-4.90174	61	3	1	1	1	1	1	CONTROL	0.004693123	0.01022435	0.002983434	-0.02391978	-0.0009319563	-0.009654937	-2.472167e-05	0.02195255	-0.03584618	0.03827368
1000047	1000047	1943	-2.44014	65	34	0	1	1	2	2	CONTROL	-0.01540129	0.01213101	-0.0419949	0.01037456	0.002587751	-0.008139144	0.006751324	-0.007508589	-0.03538605	-0.009730054
1000068	1000068	1948	-4.37721	61	41	1	1	1	1	2	CONTROL	-0.01141674	0.01838264	-0.002818214	0.006112886	0.00955852	-0.004763706	0.01867819	-0.007523244	0.005544525	-0.009509542
1000085	1000085	1944	-0.774111	65	69	0	1	1	2	2	CONTROL	0.01709672	0.01037233	0.004323209	0.02104864	-0.007535406	0.007889541	-0.01520523	-0.004904777	-0.0005962849	-0.02620082
1000220	1000220	1940	-2.52582	68	25	0	1	1	2	2	CONTROL	0.002730831	0.02571671	0.008347779	-0.01671972	0.005950087	-0.005401692	0.0005578872	0.0114738	-0.04742

### 8.2 AMP-PD

#### 8.2.1 Write binary files

In [11]:
# Pull the .bim rows of the 23andMe variants out of the AMP-PD data.
# -F: the IDs are literal strings, not regular expressions; -w: whole-word match only
grep -F -w -f variants_to_grep_nohead.txt /data/CARD/PD/AMP_NIH/no_relateds/DALGB_12MAR2022/AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins.bim > AMPonly_variants_extracted_from23andMe.txt
wc -l AMPonly_variants_extracted_from23andMe.txt
# 282 AMPonly_variants_extracted_from23andMe.txt

282 AMPonly_variants_extracted_from23andMe.txt


In [12]:
# Write new binary files
#module load plink/1.9.0-beta4.4

# --extract expects variant IDs. AMPonly_variants_extracted_from23andMe.txt holds complete .bim rows,
# so every token (chromosome, position, alleles) would be read as an ID.
# Hand over the ID column (column 2) only.
awk '{ print $2 }' AMPonly_variants_extracted_from23andMe.txt > AMPonly_variants_extracted_from23andMe_IDs.txt

plink --bfile /data/CARD/PD/AMP_NIH/no_relateds/DALGB_12MAR2022/AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins \
--extract AMPonly_variants_extracted_from23andMe_IDs.txt \
--make-bed \
--out AMP_282_from23andme

# 282 variants and 4007 people pass filters and QC.
# Among remaining phenotypes, 1451 are cases and 2556 are controls.
# --make-bed to AMP_282_from23andme.bed + AMP_282_from23andme.bim +
# AMP_282_from23andme.fam ... done.
# AMP only contains 282 out of the 834 variants provided by 23andme

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AMP_282_from23andme.log.
Options in effect:
  --bfile /data/CARD/PD/AMP_NIH/no_relateds/DALGB_12MAR2022/AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins
  --extract AMPonly_variants_extracted_from23andMe.txt
  --make-bed
  --out AMP_282_from23andme

515537 MB RAM detected; reserving 257768 MB for main workspace.
Allocated 193326 MB successfully, after larger attempt(s) failed.
937983 variants loaded from .bim file.
4007 people (2197 males, 1810 females) loaded from .fam.
4007 phenotype values loaded from .fam.
--extract: 282 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 4007 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758

#### 8.2.2 Generate frequency file

In [13]:
# module load plink/1.9.0-beta4.4
# Allele frequencies for the extracted AMP-PD variants (written next to the binary files)
plink --bfile AMP_282_from23andme \
    --freq \
    --out AMP_282_from23andme

# 4007 people (2197 males, 1810 females) loaded from .fam.
# 4007 phenotype values loaded from .fam.

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AMP_282_from23andme.log.
Options in effect:
  --bfile AMP_282_from23andme
  --freq
  --out AMP_282_from23andme

515537 MB RAM detected; reserving 257768 MB for main workspace.
282 variants loaded from .bim file.
4007 people (2197 males, 1810 females) loaded from .fam.
4007 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 4007 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
--freq: Allele frequencies (founders only) written to AMP_282_from23andme.frq .


#### 8.2.3 Cohort age per group

In [None]:
# Sex (percentage)

# binary pheno: count samples per PD status and sex, then express each count
# as a percentage of its PD_PHENO group
join_1 %>%
  group_by(PD_PHENO, SEX) %>%
  tally() %>%
  mutate(perc = n / sum(n) * 100)

## 9. Association testing

### 9.1 Association testing in Plink

#### 9.1.1 UKB

In [14]:
module load plink/1.9.0-beta4.4
# Unadjusted association test; case/control status is read from the PHENO column of the phenotype table
plink --bfile UKB_608_from23andme \
    --assoc \
    --pheno /data/CARD/UKBIOBANK/PHENOTYPE_DATA/disease_groups/UKB_EXOM_ALL_PD_PHENOTYPES_CONTROL_2021_with_PC.txt \
    --pheno-name PHENO \
    --allow-no-sex \
    --out UKB_608_from23andme_ALL_PD

[-] Unloading plink  1.9.0-beta4.4  on cn4271 
[+] Loading plink  1.9.0-beta4.4  on cn4271 
PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to UKB_608_from23andme_ALL_PD.log.
Options in effect:
  --allow-no-sex
  --assoc
  --bfile UKB_608_from23andme
  --out UKB_608_from23andme_ALL_PD
  --pheno /data/CARD/UKBIOBANK/PHENOTYPE_DATA/disease_groups/UKB_EXOM_ALL_PD_PHENOTYPES_CONTROL_2021_with_PC.txt
  --pheno-name PHENO

515537 MB RAM detected; reserving 257768 MB for main workspace.
608 variants loaded from .bim file.
200648 people (0 males, 0 females, 200648 ambiguous) loaded from .fam.
Ambiguous sex IDs written to UKB_608_from23andme_ALL_PD.nosex .
45857 phenotype values present after --pheno.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 200648 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718

#### 9.1.2 AMP-PD

In [15]:
# Unadjusted association test; case/control status is read from the PD_PHENO column of the covariate table
plink --bfile AMP_282_from23andme \
    --assoc \
    --pheno /data/CARD/PD/AMP_NIH/no_relateds/COV_PD_NIH_AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins.txt \
    --pheno-name PD_PHENO \
    --out AMP_282_from23andme_pheno

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AMP_282_from23andme_pheno.log.
Options in effect:
  --assoc
  --bfile AMP_282_from23andme
  --out AMP_282_from23andme_pheno
  --pheno /data/CARD/PD/AMP_NIH/no_relateds/COV_PD_NIH_AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins.txt
  --pheno-name PD_PHENO

515537 MB RAM detected; reserving 257768 MB for main workspace.
282 variants loaded from .bim file.
4007 people (2197 males, 1810 females) loaded from .fam.
4007 phenotype values present after --pheno.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 4007 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
282 

### 9.2 Association testing in Rvtest

#### 9.2.1 UKB

In [16]:
# Convert binary files to vcf for input
module load plink/2.0-dev-20191128
module load samtools
# The output prefix is fixed: the rvtest cell reads UKB_608_from23andme.vcf.gz.
# (The former VARIANT_FILE=$1 / OUTNAME suffix was a leftover from a stand-alone script;
# in a notebook cell $1 is empty, so the suffix never had any effect.)
# --mac 1 drops variants without a single minor-allele copy.
plink2 --bfile UKB_608_from23andme \
--export vcf bgz id-paste=iid --out UKB_608_from23andme --mac 1

# Index the bgzipped VCF
tabix -p vcf  UKB_608_from23andme.vcf.gz

[-] Unloading plink  1.9.0-beta4.4  on cn4271 
[+] Loading plink  2.0-dev-20191128 

The following have been reloaded with a version change:
  1) plink/1.9.0-beta4.4 => plink/2.0-dev-20191128

[+] Loading samtools 1.17  ... 
PLINK v2.00a2LM 64-bit Intel (28 Nov 2019)     www.cog-genomics.org/plink/2.0/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to UKB_608_from23andme.log.
Options in effect:
  --bfile UKB_608_from23andme
  --export vcf bgz id-paste=iid
  --mac 1
  --out UKB_608_from23andme

Start time: Tue Mar 14 11:25:16 2023
515537 MiB RAM detected; reserving 257768 MiB for main workspace.
Using up to 128 threads (change this with --threads).
200648 samples (0 females, 0 males, 200648 ambiguous; 200648 founders) loaded
from UKB_608_from23andme.fam.
608 variants loaded from UKB_608_from23andme.bim.
Note: No phenotype data present.
Calculating allele frequencies... done.
0 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/

In [17]:
# Now run rvtest
module load rvtests

# Phenotype and covariates are read from the same table
UKB_PHENO_COVAR=/data/CARD/UKBIOBANK/PHENOTYPE_DATA/disease_groups/UKB_EXOM_ALL_PD_PHENOTYPES_CONTROL_2021_with_PC.txt

# Single-variant Wald and score tests, adjusted for sex, age at recruitment, Townsend index and PC1-PC5
rvtest --inVcf UKB_608_from23andme.vcf.gz \
    --pheno $UKB_PHENO_COVAR \
    --pheno-name PHENO \
    --covar $UKB_PHENO_COVAR \
    --covar-name GENETIC_SEX,AGE_OF_RECRUIT,TOWNSEND,PC1,PC2,PC3,PC4,PC5 \
    --single wald,score \
    --out UKB_608_from23andme_withcovars_score_ALL_PD


[+] Loading rvtests  2.1.0  on cn4271 
Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output: --inVcf [UKB_608_from23andme.vcf.gz], --inBgen []
                          --inBgenSample [], --inKgg []
                          --out [UKB_608_from23andme_withcovars_score_ALL_PD]
                          --outputRaw
       Specify Covariate:
                          --covar [/data/CARD/UKBIOBANK/PHENOTYPE_DATA/disease_groups/UKB_EXOM_ALL_PD_PHENOTYPES_CONTROL_2021_with_PC.txt]
                          --covar-name [GENETIC_SEX,AGE_OF_RECRUIT,TOWNSEND,PC1
                         PC2,PC3,PC4,PC5], --sex
 

#### 9.2.2 AMP-PD

In [18]:
# convert binary plink files to vcf for rvtest
module load plink/2.0-dev-20191128
module load samtools
# The output prefix is fixed: the rvtest cell reads AMP_282_from23andme.vcf.gz.
# (The former VARIANT_FILE=$1 / OUTNAME suffix was a leftover from a stand-alone script;
# in a notebook cell $1 is empty, so the suffix never had any effect.)
# --mac 1 drops variants without a single minor-allele copy.
plink2 --bfile AMP_282_from23andme \
--export vcf bgz id-paste=iid --out AMP_282_from23andme --mac 1

# Index the bgzipped VCF
tabix -p vcf AMP_282_from23andme.vcf.gz


[-] Unloading plink  2.0-dev-20191128 
[+] Loading plink  2.0-dev-20191128 
[-] Unloading samtools 1.17  ... 
[+] Loading samtools 1.17  ... 
PLINK v2.00a2LM 64-bit Intel (28 Nov 2019)     www.cog-genomics.org/plink/2.0/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AMP_282_from23andme.log.
Options in effect:
  --bfile AMP_282_from23andme
  --export vcf bgz id-paste=iid
  --mac 1
  --out AMP_282_from23andme

Start time: Tue Mar 14 11:27:20 2023
515537 MiB RAM detected; reserving 257768 MiB for main workspace.
Using up to 128 threads (change this with --threads).
4007 samples (1810 females, 2197 males; 4007 founders) loaded from
AMP_282_from23andme.fam.
282 variants loaded from AMP_282_from23andme.bim.
1 binary phenotype loaded (1451 cases, 2556 controls).
Calculating allele frequencies... done.
26 variants removed due to allele frequency threshold(s)
(--maf/--max-maf/--mac/--max-mac).
256 variants remaining after main filters.
reconstruct the

In [None]:
# new files
# (listed with ls: bare file names in a code cell would be executed as commands and fail)
ls -lh AMP_282_from23andme.vcf.gz AMP_282_from23andme.vcf.gz.tbi

In [19]:
# Run Rvtest
module load rvtests

# Phenotype and covariates are read from the same table
AMP_PHENO_COVAR=/data/CARD/PD/AMP_NIH/no_relateds/COV_PD_NIH_AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins.txt

# don't correct for age in AMP
# Single-variant Wald and score tests, adjusted for sex and PC1-PC5.
# Input is the AMP_282 VCF; the output prefix is named AMP_256.
rvtest --inVcf AMP_282_from23andme.vcf.gz \
    --pheno $AMP_PHENO_COVAR \
    --pheno-name PD_PHENO \
    --covar $AMP_PHENO_COVAR \
    --covar-name SEX,PC1,PC2,PC3,PC4,PC5 \
    --single wald,score \
    --out AMP_256_from23andme_withcovars_score

[-] Unloading rvtests  2.1.0  on cn4271 
[+] Loading rvtests  2.1.0  on cn4271 
Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

The following parameters are available.  Ones with "[]" are in effect:

Available Options
      Basic Input/Output: --inVcf [AMP_282_from23andme.vcf.gz], --inBgen []
                          --inBgenSample [], --inKgg []
                          --out [AMP_256_from23andme_withcovars_score]
                          --outputRaw
       Specify Covariate:
                          --covar [/data/CARD/PD/AMP_NIH/no_relateds/COV_PD_NIH_AMPv2.5_samplestoKeep_EuroOnly_noDups_noNIHDups_wPheno_wSex_no_cousins.txt]
                          --covar-name [SEX,PC1,PC2,PC3,PC4,PC5], --sex
       

## 10. Meta-analysis

### 10.1 Prepare files to match

#### 10.1.1 23andMe

In [None]:
head 23andMe_834variants_annotated_and_stats.txt

In [1]:
head toMETA_23andme_summary.txt

CHR.BP	REF	ALT	FREQ	effect	stderr	pvalue	N_INFORMATIVE
chr1:155235057	C	T	6.80999434732321e-07	0	0	NA	3090507
chr1:155235195	C	T	7.02901311999682e-06	1.27652395181766	1.05224015597278	0.314177783769655	3090507
chr1:155235196	G	A	0.000199293592070049	1.26221164344355	0.19902372591352	8.49569716680291e-08	3090507
chr1:155235231	T	C	6.58365554961904e-07	-1.00928042988576	17.4340750237578	0.910473718125855	3090507
chr1:155235699	T	C	3.57333009304295e-06	0	0	NA	3090507
chr1:155235704	C	T	1.78739315859211e-06	0	0	NA	3090507
chr1:155235708	G	C	4.03741203860442e-05	0.320929842946655	0.491069189760776	0.524141839042594	3090507
chr1:155235725	G	C	0	0	0	NA	3090507
chr1:155235726	T	A	0	NA	NA	NA	3090507


#### 10.1.2 UKB

In [2]:
head toMETA_SCORE_UKBALL.txt

CHR.BP	REF	ALT	N_INFORMATIVE	AF	U	V	STAT	DIRECTION	EFFECT	SE	PVALUE
chr1:7965399	G	A	45857	0	NA	NA	NA	NA	NA	NA	NA
chr1:7984930	A	C	45857	0	NA	NA	NA	NA	NA	NA	NA
chr1:7984971	G	A	45857	0	NA	NA	NA	NA	NA	NA	NA
chr1:7985019	G	A	45857	0.000403428	1.65211	3.81673	0.715137	+	0.432862	0.511864	0.397744
chr1:16985990	C	T	45857	9.81311e-05	1.75147	0.977177	3.13931	+	1.79238	1.01161	0.0764263
chr1:16986065	C	T	45857	0.000218069	-2.18282	2.37407	2.00699	-	-0.919445	0.649013	0.156576
chr1:16986091	G	A	45857	0.00666201	-14.5681	68.5895	3.0942	-	-0.212395	0.120746	0.0785719
chr1:16986097	G	A	45857	0.0615173	-16.4508	600.372	0.450767	-	-0.027401	0.0408122	0.501971
chr1:16986101	T	A	45857	0.0069237	4.77173	68.9193	0.330378	+	0.0692365	0.120456	0.565437


#### 10.1.3 AMP-PD

In [1]:
head toMETA_SCORE_AMP.txt

CHR.BP	REF	ALT	N_INFORMATIVE	AF	U	V	STAT	DIRECTION	EFFECT	SE	PVALUE
chr1:7985019	G	A	4007	0.00074869	1.21194	1.23887	1.18561	+	0.97827	0.898438	0.276217
chr1:16985990	C	T	4007	0.00074869	-1.08424	1.31297	0.895357	-	-0.825794	0.872717	0.34403
chr1:16986065	C	T	4007	0.000374345	0.0300306	0.648305	0.00139107	+	0.0463217	1.24197	0.970248
chr1:16986091	G	A	4007	0.00411779	-1.46934	7.54414	0.286178	-	-0.194766	0.364079	0.59268
chr1:16986097	G	A	4007	0.0504118	-1.67883	84.7755	0.0332463	-	-0.0198033	0.108609	0.855319
chr1:16986101	T	A	4007	0.00549039	-1.41294	9.16515	0.217825	-	-0.154165	0.330317	0.640702
chr1:16986246	G	A	4007	0.000124782	0.604212	0.238637	1.52982	+	2.53193	2.04706	0.216139
chr1:16986248	T	C	4007	0.484901	-18.5438	422.854	0.813223	-	-0.0438541	0.0486301	0.367169
chr1:16986291	C	T	4007	0.000249563	0.214196	0.448084	0.102391	+	0.478026	1.49389	0.748979


### 10.2 Check ID overlap

In [4]:
# Write the first tab-separated column (the CHR.BP marker ID, header line included)
# of each METAL input file to its own ID list
while read -r sumstats id_list; do
    cut -f 1 "$sumstats" > "$id_list"
done <<'EOF'
toMETA_23andme_summary.txt 23andMe_IDs.txt
toMETA_SCORE_AMP.txt AMP_IDs.txt
toMETA_SCORE_UKBALL.txt UKB_IDs.txt
EOF

In [5]:
# they're all in the same layout, let's merge them and see how much we have left

# Union: every ID that occurs in at least one of the three datasets
# (the shared header line "CHR.BP" is still part of each ID list and is counted once)
cat 23andMe_IDs.txt AMP_IDs.txt UKB_IDs.txt > merged_IDs_all3datasets.txt
sort merged_IDs_all3datasets.txt | uniq > merged_IDs_all3datasets_nodupli.txt

wc -l merged_IDs_all3datasets_nodupli.txt
# 780 merged_IDs_all3datasets_nodupli.txt
head merged_IDs_all3datasets_nodupli.txt

# NOTE: 780 is the size of the UNION of the three ID lists, not of their overlap.
# Overlap = IDs present in all three studies: drop each header, de-duplicate within each
# list, then keep the IDs that were seen exactly three times.
for id_list in 23andMe_IDs.txt AMP_IDs.txt UKB_IDs.txt; do
    tail -n +2 "$id_list" | sort -u
done | sort | uniq -c | awk '$1 == 3 { print $2 }' > shared_IDs_all3datasets.txt

wc -l shared_IDs_all3datasets.txt
# Sharing an ID does not guarantee usable statistics: rows can still have NA stderr / pvalue

780 merged_IDs_all3datasets_nodupli.txt
chr1:155235006
chr1:155235057
chr1:155235195
chr1:155235196
chr1:155235197
chr1:155235231
chr1:155235699
chr1:155235704
chr1:155235708
chr1:155235725


In [7]:
## Check LRRK2 p.G2019S
head -1 toMETA_SCORE_AMP.txt
grep chr12:40340400 toMETA_SCORE_AMP.txt

CHR.BP	REF	ALT	N_INFORMATIVE	AF	U	V	STAT	DIRECTION	EFFECT	SE	PVALUE
[01;31m[Kchr12:40340400[m[K	G	A	4007	0.00349389	13.8512	5.93653	32.3177	+	2.33321	0.410425	1.30912e-08


In [8]:
head -1 toMETA_SCORE_UKBALL.txt
grep chr12:40340400 toMETA_SCORE_UKBALL.txt

CHR.BP	REF	ALT	N_INFORMATIVE	AF	U	V	STAT	DIRECTION	EFFECT	SE	PVALUE
[01;31m[Kchr12:40340400[m[K	G	A	45857	0.000457945	9.4569	5.24759	17.0427	+	1.80214	0.436536	3.65493e-05


In [14]:
# Check LRRK2 p.G2019S (chr12:40340400) in the 23andMe summary statistics
# NOTE(review): this reads toMETA_23andme_summary_VP_new.txt, whereas the METAL control file
# processes toMETA_23andme_summary.txt -- confirm which of the two is the current file
head -1 toMETA_23andme_summary_VP_new.txt
grep chr12:40340400 toMETA_23andme_summary_VP_new.txt

CHR.BP	REF	ALT	FREQ	effect	stderr	pvalue	N_INFORMATIVE
[01;31m[Kchr12:40340400[m[K	G	A	0.000626183692731042	2.48010896869361	0.0600112561298007	3.77983849690177e-250	3090507


### 10.3 Create METAL file

In [10]:
pwd

/data/CARD/projects/23andme_annotation/Rare_variant_project_VP


In [3]:
# Print the METAL control file used for the meta-analysis
# NOTE(review): every study is declared with `ALLELE REF ALT`. METAL appears to take the first
# allele as the one FREQ / EFFECT refer to, while the input columns look ALT-based
# (e.g. chr12:40340400 in the 23andMe file: ALT = A, frequency ~0.0006, effect +2.48).
# The result table then reports allele `a` with Freq1 0.9993 and a negative effect --
# confirm whether `ALLELE ALT REF` was intended.
cat my_METAL.txt

#../generic-metal/metal metalAll.txt
#THIS SCRIPT EXECUTES AN ANALYSIS OF THREE STUDIES
#THE RESULTS FOR EACH STUDY ARE STORED IN FILES Inputfile1.txt THROUGH Inputfile3.txt
SCHEME  STDERR
AVERAGEFREQ ON
MINMAXFREQ ON
LABEL TotalSampleSize as N # If input files have a column for the sample size labeled as 'N'
# LOAD THE FIRST TWO INPUT FILES

# UNCOMMENT THE NEXT LINE TO ENABLE GenomicControl CORRECTION
# GENOMICCONTROL ON

# === DESCRIBE AND PROCESS THE FIRST INPUT FILE ===
MARKER CHR.BP
ALLELE REF ALT
FREQ AF
EFFECT EFFECT
STDERR SE
PVALUE PVALUE
WEIGHT N_INFORMATIVE 
PROCESS toMETA_SCORE_AMP.txt

# === DESCRIBE AND PROCESS THE SECOND INPUT FILE ===
MARKER CHR.BP
ALLELE REF ALT
FREQ AF
EFFECT EFFECT
STDERR SE
PVALUE PVALUE
WEIGHT N_INFORMATIVE 
PROCESS toMETA_SCORE_UKBALL.txt

# === DESCRIBE AND PROCESS THE THIRD INPUT FILE ===
MARKER CHR.BP
ALLELE REF ALT
FREQ FREQ
EFFECT effect
STDERR stderr
PVALUE pvalue
WEIGHT N_INFORMATIVE 
PROCESS toMETA_23andme_summary.txt

OUTFILE MY_META_AMP

### 10.4 Run METAL

In [None]:
module load metal
metal my_METAL.txt

[+] Loading metal  2020-05-05 
MetaAnalysis Helper - (c) 2007 - 2009 Goncalo Abecasis
This version released on 2020-05-05

# This program faciliates meta-analysis of genome-wide association studies.
# Commonly used commands are listed below:
#
# Options for describing input files ...
#   SEPARATOR        [WHITESPACE|COMMA|BOTH|TAB] (default = WHITESPACE)
#   COLUMNCOUNTING   [STRICT|LENIENT]            (default = 'STRICT')
#   MARKERLABEL      [LABEL]                     (default = 'MARKER')
#   ALLELELABELS     [LABEL1 LABEL2]             (default = 'ALLELE1','ALLELE2')
#   EFFECTLABEL      [LABEL|log(LABEL)]          (default = 'EFFECT')
#   FLIP
#
# Options for filtering input files ...
#   ADDFILTER        [LABEL CONDITION VALUE]     (example = ADDFILTER N > 10)
#                    (available conditions are <, >, <=, >=, =, !=, IN)
#   REMOVEFILTERS
#
# Options for sample size weighted meta-analysis ...
#   WEIGHTLABEL      [LABEL]                     (default = 'N')
#   PVALUELAB

#### 10.4.1 Check warning messages

AMP-PD

UKB

23andMe

In [9]:
# Header plus the rows of selected markers from the meta-analysis result table
head -1 MY_META_AMP_UKB_23andme1.tbl

# -F -w: match each marker as a literal whole word, so that e.g. chr1:16986248
# cannot also hit a longer position that merely starts with the same digits
for marker in chr12:40340400 chr1:16986248 chr15:61910283 chr15:61967438 chr3:184319745; do
    grep -F -w "$marker" MY_META_AMP_UKB_23andme1.tbl
done

MarkerName	Allele1	Allele2	Freq1	FreqSE	MinFreq	MaxFreq	Effect	StdErr	P-value	Direction	HetISq	HetChiSq	HetDf	HetPVal
[01;31m[Kchr12:40340400[m[K	a	g	0.9993	0.0004	0.9965	0.9995	-2.4648	0.0588	1.66e-383	---	19.1	2.472	2	0.2905
[01;31m[Kchr1:16986248[m[K	t	c	0.5222	0.0071	0.4849	0.5236	0.0324	0.0091	0.0003886	-?+	60.7	2.546	1	0.1106
[01;31m[Kchr15:61910283[m[K	t	c	0.5241	0.0082	0.4815	0.5257	0.0076	0.0091	0.4049	-?+	82.1	5.585	1	0.01812
[01;31m[Kchr15:61967438[m[K	t	c	0.5796	0.0330	0.4083	0.5860	-0.0098	0.0092	0.284	+?-	71.4	3.499	1	0.06142
[01;31m[Kchr3:184319745[m[K	a	g	0.4096	0.4885	0.0025	0.9974	-0.0395	0.0905	0.6626	---	0.0	1.218	2	0.5439


In [10]:
pwd

/data/CARD/projects/23andme_annotation/Rare_variant_project_VP


## 11. Clean files for analysis

Now move files to local and make plots

## 12. Add identifier for variants passing or failing internal QC

In [None]:
## add identifier to initial file


In [None]:
# Copy the QC-annotated meta-analysis results from the cluster into the local Sumstats folder
# (presumably run from the local machine, given the /Users/... destination)
REMOTE_RESULTS=pitzv2@biowulf.nih.gov:/data/CARD/projects/23andme_annotation/Rare_variant_project_VP/Meta_results_679_AMP_UKB_23andMe_annotation_QC.txt
LOCAL_SUMSTATS_DIR=/Users/pitzv2/Documents/Projects/23andme_rare_variants/Final_Dec_2022/Sumstats
scp $REMOTE_RESULTS $LOCAL_SUMSTATS_DIR