# Computation of GWAS summary statistics
### Author: Mihail Mihov

In [None]:
# Prepare the phenotype table used to build the GWAS phenotype and covariate
# files: read the raw phenotypes, apply the sample-level exclusions, and
# convert the analysis variables to numeric.
library(data.table)

# Read in the phenotypes data as a plain data.frame (not a data.table)
phenotypes <- fread("/home//mmihov/Projects/MS_TWAS_PheWAS_heritability/MS_Cox_PheWAS_Cor/data/V2/UoA_UKB_phenotypes_2021-19-11.txt", data.table = FALSE)

# All exclusions below use logical masks rather than `-which(...)`:
# `df[-which(cond), ]` returns ZERO rows when no row matches `cond`
# (because `-integer(0)` selects nothing), which would silently empty the data.

# Exclude individuals with sex chromosome abnormalities
# (any non-missing value in `sex_aneu` flags the individual).
has_sex_aneuploidy <- !is.na(phenotypes$sex_aneu)

sum(has_sex_aneuploidy)   # number of individuals excluded
sum(!has_sex_aneuploidy)  # number of individuals retained
phenotypes <- phenotypes[!has_sex_aneuploidy, ]

# Exclude individuals who failed genotyping QC
# (only rows with a missing `gen_exclude` are kept).
passed_genotype_qc <- is.na(phenotypes$gen_exclude)

sum(!passed_genotype_qc)  # number of individuals excluded
sum(passed_genotype_qc)   # number of individuals retained
phenotypes <- phenotypes[passed_genotype_qc, ]

# Exclude samples that are not genetically European (`gen_ethnicity == 1`).
# Rows with a missing `gen_ethnicity` are excluded as well.
is_genetic_european <- !is.na(phenotypes$gen_ethnicity) &
  phenotypes$gen_ethnicity == 1

sum(!is_genetic_european)  # number of individuals excluded
sum(is_genetic_european)   # number of individuals retained
phenotypes <- phenotypes[is_genetic_european, ]

# Exclude samples with missing maternal-smoking-during-pregnancy status,
# which is coded as the literal string "missing".
smoking_status_missing <- !is.na(phenotypes$maternal_smoking) &
  phenotypes$maternal_smoking == "missing"

sum(smoking_status_missing)   # number of individuals excluded
sum(!smoking_status_missing)  # number of individuals retained
phenotypes <- phenotypes[!smoking_status_missing, ]

# Convert the trait and covariates to numeric. The source must be the filtered
# `phenotypes` table itself so that values stay aligned with the remaining rows.
# NOTE(review): as.numeric() yields NA (with a warning) for any value that is
# not a number stored as text -- confirm the coding of these columns.
numeric_columns <- c(
  "maternal_smoking", "education", "alcohol",
  "lack_of_PA", "deprv_index", "smoking_status"
)
phenotypes[numeric_columns] <- lapply(phenotypes[numeric_columns], as.numeric)

In [None]:
# Build the phenotype table for regenie: FID and IID must be the first two
# columns, followed by the trait being tested. The study person ID is used for
# both FID and IID.
pheno <- data.frame(
  FID = phenotypes$studiepersonid,
  IID = phenotypes$studiepersonid,
  maternal_smoking = phenotypes$maternal_smoking
)

# Build the covariate table for regenie: FID and IID first, followed by the
# covariates used for adjustment. Each covariate must be its own element of the
# character vector -- a single comma-separated string is not a column name.
covariate_columns <- c(
  "sex", "education", "smoking_status", "lack_of_PA",
  paste0("genoPC", 1:10),
  "deprv_index", "x_agebase", "x_BMI"
)
cov <- data.frame(
  FID = phenotypes$studiepersonid,
  IID = phenotypes$studiepersonid,
  phenotypes[, covariate_columns, drop = FALSE]
)

# Save them to files for later use (tab-separated, header row, NA for missing).
# Replace the "~Your_directory" placeholder with a real path; these locations
# match the ones passed to plink2 (--keep) and regenie (--phenoFile/--covarFile).
fwrite(pheno, file = "~Your_directory/GWAS/Phenotypes/pheno.txt",
       sep = "\t", quote = FALSE, na = "NA")
fwrite(cov, file = "~Your_directory/GWAS/Covariates/cov.txt",
       sep = "\t", quote = FALSE, na = "NA")

#### Create a list of SNPs passing quality control to be used in the GWAS analysis

In [None]:
# Install PLINK 2 first.
# For more information visit 'https://www.cog-genomics.org/plink/2.0/general_usage'
#
# Create a list of SNPs with minor allele frequency >= 0.01, minimum allele
# count = 100 and Hardy-Weinberg equilibrium exact test p-value > 10^(-15).
#
# Inputs:
#   --bgen / --sample : imputed genotypes; replace "chr" in the file names with
#                       one of the 22 autosomes
#   --keep            : the phenotype file created earlier (restricts the
#                       analysis to the samples that passed phenotype QC)
#
# Every option line ends with a backslash so the shell reads this as ONE
# command; comments cannot follow a backslash, so they are collected here.
#
# NOTE(review): if this is run once per chromosome, each run writes
# snps_pass.snplist and overwrites the previous one -- use a per-chromosome
# --out prefix and concatenate the .snplist files, or confirm this is intended.
# NOTE(review): some PLINK 2 builds require an explicit REF/ALT mode after the
# .bgen path (e.g. 'ref-first') -- confirm against the installed version.
plink2 \
  --bgen ~Your_directory/Genotypes/Imputed/imputed_bgen_file_chr.bgen \
  --sample ~Your_directory/Genotypes/Imputed/imputed_sample_file_chr.sample \
  --keep ~Your_directory/GWAS/Phenotypes/pheno.txt \
  --mac 100 \
  --memory 200000 \
  --threads 10 \
  --maf 0.01 \
  --geno 0.1 \
  --hwe 1e-15 \
  --mind 0.1 \
  --write-snplist \
  --out ~Your_directory/snps_pass

#### Perform GWAS using regenie 

More information on the use of regenie can be found at 'https://rgcgithub.github.io/regenie/options/'.

In [None]:
# Install regenie first.
#
# Step 1: generate a set of genomic predictions to be used for SNP association
# testing in step 2.
#
# Inputs:
#   --bed        : prefix of the genotype data set covering the 22 autosomes
#   --phenoFile  : the phenotype file created earlier
#   --covarFile  : the covariate file created earlier
#   --extract    : the list of SNPs passing quality control created with Plink2
#   --bt         : the trait is analysed as a binary trait
#
# Every option line ends with a backslash so the shell reads this as ONE
# command; comments cannot follow a backslash, so they are collected here.
/regenie \
  --step 1 \
  --bed ~Your_directory/Genotypes/gen_data_chromo1_22_v2.01 \
  --phenoFile ~Your_directory/GWAS/Phenotypes/pheno.txt \
  --covarFile ~Your_directory/GWAS/Covariates/cov.txt \
  --catCovarList sex,education,smoking_status,lack_of_PA \
  --covarColList genoPC1,genoPC2,genoPC3,genoPC4,genoPC5,genoPC6,genoPC7,genoPC8,genoPC9,genoPC10,deprv_index,x_agebase,x_BMI \
  --extract ~Your_directory/snps_pass.snplist \
  --bt \
  --bsize 1000 \
  --threads 10 \
  --out ~Your_directory/fit_bin_out_all

# Step 2: fit the Firth logistic regression model for each of the 22 autosomes.
#
# Inputs (per chromosome, ${chr} = 1..22):
#   --bgen / --sample : imputed SNPs and the matching sample file
#   --phenoFile       : the phenotype file created earlier
#   --covarFile       : the covariate file created earlier
#   --extract         : the list of SNPs passing quality control created with Plink2
#   --pred            : the predictions list written by step 1; regenie names it
#                       "<step 1 --out prefix>_pred.list"
# Output:
#   --out             : per-chromosome output prefix
#
# Every option line ends with a backslash so the shell reads this as ONE
# command; comments cannot follow a backslash, so they are collected here.
for chr in $(seq 1 22); do
  /regenie \
    --step 2 \
    --bgen ~Your_directory/Genotypes/Imputed/gen_imp_${chr}.bgen \
    --ref-first \
    --sample ~Your_directory/Genotypes/Imputed/gen_imp_${chr}.sample \
    --phenoFile ~Your_directory/GWAS/Phenotypes/pheno.txt \
    --covarFile ~Your_directory/GWAS/Covariates/cov.txt \
    --catCovarList sex,education,smoking_status,lack_of_PA \
    --covarColList genoPC1,genoPC2,genoPC3,genoPC4,genoPC5,genoPC6,genoPC7,genoPC8,genoPC9,genoPC10,deprv_index,x_agebase,x_BMI \
    --extract ~Your_directory/snps_pass.snplist \
    --bt \
    --firth \
    --approx \
    --pred ~Your_directory/fit_bin_out_all_pred.list \
    --bsize 400 \
    --threads 10 \
    --out ~Your_directory/Step2/fit_bin_out_all_step2_${chr}
done

#combine all output files into one summary statistics file