### Calculating PRS using VCF files


In [60]:
# Make the notebook wider by overriding the width of the page container.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [32]:
from __future__ import division
from pyspark import SparkConf, SparkContext
from operator import add
import re
import glob, os
import csv
from collections import Counter
import ntpath
import functools
#from functools import reduce
from math import log
import itertools
import PRS_VCF_utils
from time import time

import pyspark 
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import *

#**ATTN: Python indices start at 0, so to specify the second column, use 1
#**ATTN: please remove the header of the GWAS file if there is one

# define column numbers for contents in the GWAS file

gwas_id=0    # column of SNP ID
gwas_p=7     # column of P value
gwas_or=5    # column of odds ratio
gwas_a1=3    # column of a1 in the GWAS (the column right after it is read as a2 further down)
gwas_maf= 10 # column index of maf in the GWAS

# define column numbers for contents in the genotype file
geno_id=2  # column number with rsID
geno_start=9 # column number of the 1st genotype, in the raw vcf files, after splitting by the delimiter of choice
geno_a1 = 3  # column number that contains the reference allele (the column right after it is read as the second allele)

# List of p-value thresholds; the largest one is processed first, the others afterwards:
thresholds=[0.5, 0.3, 0.2, 0.1, 0.05, 0.01, 0.001, 0.0001]

# file delimiters:
GWAS_delim="\t"
GENO_delim="\t"

# file names:
home="/home/nyao111/MAVAN_imputed_161121/MOMS_info03_vcf/"  # define home folder path

gwasFiles="/home/nyao111/PRS_imputed/pgc.mdd.clump.withAF.txt"       # Name of GWAS file 


def getFileFromPattern(*pattern): # Patterns may be passed as separate arguments or as one list
    """Return every file path matching any of the given glob patterns.

    Args:
        *pattern: glob pattern strings; a list/tuple of pattern strings is
            also accepted as an argument and is expanded element by element.

    Returns:
        A list of matching paths, in the order the patterns were given.
        Empty when nothing matches or no pattern is supplied.
    """
    files=[]
    for pathpattern in pattern:
        if isinstance(pathpattern, (list, tuple)):
            # A whole list of patterns was passed as one argument.
            for single in pathpattern:
                files.extend(glob.glob(single))
        else:
            files.extend(glob.glob(pathpattern))
    return files

# Glob pattern (here a single file) of the genotype VCF file(s) to score.
genoFileNamePattern=home+"21_info03.vcf"   

# Expanded list of matching genotype files; only used for the listing printed below.
genoFileNames=glob.glob(genoFileNamePattern)
# Alternatively, directly specify filename:
#genoFileName=[home+"fcgene_out_chr21comb.bierut1M_plus_filtered_chr21_c1_EA_COGA.gen",
              #home+"fcgene_out_chr21comb.bierut1M_plus_filtered_chr21_c1_EA_COGEND.gen",
              #home+"fcgene_out_chr22comb.bierut1M_plus_filtered_chr22_c1_EA_COGA.gen",
              #home+"fcgene_out_chr22comb.bierut1M_plus_filtered_chr22_c1_EA_COGEND.gen"]

genoExtension=".vcf"


# programme parameters
log_or=True  # specify whether you want to log your odds ratios
check_ref=True # if you know that there are mismatches between the top strand in the genotypes and that of the GWAS, set True. Not checking the reference allele will improve the speed

# sample file path and name
sampleFilePath=home+"MAVAN_35_impute161121_MOM_orderedSamples.csv" # include the full/relative path and name of the sample file
sampleFileDelim=","  # sample file delimiter
sampleFileID=0   # which column in the sample file has the ID
sample_skip=1  # how many lines to skip so that the sample names can be matched to the genotypes 1-to-1, taking into account the header of the sample file 
## output file information

outputPath=home+"MAVAN_MOMS_mddPRS_161125.csv"



In [134]:
# Stop any Spark context/session left over from a previous run so the next
# cell can create fresh ones. On a fresh kernel neither name exists yet, so
# look them up defensively instead of failing with NameError.
for _sparkName in ("sc", "spark"):
    _sparkHandle = globals().get(_sparkName)
    if _sparkHandle is not None:
        _sparkHandle.stop()

In [2]:


# We can give a name to our app (to find it in Spark WebUI) and configure execution mode

APP_NAME="MAVANvcfPRS"

# Create the SparkContext for this application and silence everything below WARN.
conf = pyspark.SparkConf().setAppName(APP_NAME)#.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = pyspark.SparkContext(conf=conf)
print(sc)
sc.setLogLevel("WARN")
# Obtain a JVM-side log4j logger through the py4j gateway.
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("Start Reading Files")
#def main(gwasFile, genoFileList, thresholdList):

# SparkSession is needed for the DataFrame operations (toDF / join) used below.
spark=SparkSession.builder.appName(APP_NAME).getOrCreate()
# Function-call form of print: same output on Python 2, valid on Python 3,
# and consistent with print(sc) above.
print(spark)


<pyspark.context.SparkContext object at 0x2abea40396d0>
<pyspark.sql.session.SparkSession object at 0x2abe98cbd290>


In [3]:
# Show which genotype files were picked up, capped at the first 20 names.
maxListed = 20
print("="*40)
print("Using these genoytpe files: ")

for listedName in genoFileNames[:maxListed]:
    print(listedName)
if len(genoFileNames) > maxListed:
    # Signal that the listing was truncated.
    print("and more....")

Using these genoytpe files: 
/home/nyao111/MAVAN_imputed_161121/MOMS_info03_vcf/21_info03.vcf


### 1. Load files 

In [4]:
# Read both inputs lazily as RDDs of raw text lines.
gwasfile=sc.textFile(gwasFiles)
genodata=sc.textFile(genoFileNamePattern)
print("Using the GWAS file: {}".format(ntpath.basename(gwasFiles)))

# Drop every line containing "snpid" (the header, if it is still present),
# then split the remaining lines into columns.
gwasBody=gwasfile.filter(lambda line: "snpid" not in line)
gwastable=gwasBody.map(lambda line: line.split(GWAS_delim))
# Cached because the table is read again by several later cells.
gwastableCA=gwastable.cache()


Using the GWAS file: pgc.mdd.clump.withAF.txt


### 1.1 Filter GWAS and prepare odds ratio


In [5]:
# Start with the largest (most permissive) p-value threshold.
maxThreshold=max(thresholds)
# Result is used further down with `in` and `[snp_id]` lookups, so it appears to be
# a mapping keyed by SNP ID — presumably SNP ID -> (log) odds ratio; confirm in PRS_VCF_utils.
gwasOddsMapMax=PRS_VCF_utils.filterGWASByP(GWASRdd=gwastableCA, pcolumn=gwas_p, idcolumn=gwas_id, oddscolumn=gwas_or, pHigh=maxThreshold, logOdds=log_or)
# NOTE(review): .value is taken immediately, so gwasOddsMapMaxCA is the plain driver-side
# object rather than the Broadcast handle; lambdas that reference it capture the object
# in their closure. Confirm whether keeping the Broadcast handle was intended.
gwasOddsMapMaxCA=sc.broadcast(gwasOddsMapMax).value

Taking the log of odds ratios


### 2. Initial processing 

In [6]:
# at this step, the genotypes are already filtered to keep only the ones in 'gwasOddsMap'
# Pipeline, in order:
#   1. drop VCF header/meta lines: these START with "#" (testing for "#" anywhere in the
#      line would also discard data rows that merely contain the character);
#   2. split each row into columns;
#   3. keep rows whose SNP ID is present in the GWAS map at the largest threshold;
#   4. keep the first 5 columns and, per sample, field index 3 of the colon-separated entry
#      (presumably the genotype-probability field — TODO confirm against the FORMAT column);
#   5. split that field on "," into its individual values.
genointermediate=genodata.filter(lambda line: not line.startswith("#"))\
.map(lambda line: line.split(GENO_delim))\
.filter(lambda line: line[geno_id] in gwasOddsMapMaxCA)\
.map(lambda line: line[0:5]+[chunk.split(":")[3] for chunk in line[geno_start::]])\
.map(lambda line: line[0:5]+[triplet.split(",") for triplet in line[5::]])

# (SNP ID, (allele in column geno_a1, allele in the following column))
genoAlleles=genointermediate.map(lambda line: (line[geno_id], (line[geno_a1], line[geno_a1+1])))
# (SNP ID, flat list of floats): the per-sample value lists concatenated in sample order
genotable=genointermediate.map(lambda line: (line[geno_id], list(itertools.chain.from_iterable(line[5::]))))\
.mapValues(lambda geno: [float(x) for x in geno])


### 2.1 Calculate and store MAF

In [7]:
reload(PRS_VCF_utils)

def _alleleFreqRow(line):
    """Build one (SNP ID, allele 1, allele 2, getMaf result) tuple from a parsed genotype row."""
    flatValues=[float(x) for x in itertools.chain.from_iterable(line[5::])]
    return (line[geno_id], line[geno_a1], line[geno_a1+1], PRS_VCF_utils.getMaf(flatValues))

# One DataFrame row per SNP kept in genointermediate.
genoa1f=genointermediate.map(_alleleFreqRow).toDF(["Snpid_geno", "GenoA1", "GenoA2", "GenoA1f"])

#genoa1f.map(lambda line:"\t".join([line[0], "\t".join(line[1]), str(line[2])])).saveAsTextFile("../MOMS_info03_maf")


### 3. Determine whether each SNP needs to be flipped

In [8]:
# GWAS-side allele table: SNP ID, the allele in column gwas_a1, the allele in the next column,
# and the MAF column. Values come straight from line.split(), so GwasMaf stays a string here.
gwasA1f=gwastableCA.map(lambda line:(line[gwas_id], line[gwas_a1], line[gwas_a1+1], line[gwas_maf])).toDF(["Snpid_gwas", "GwasA1", "GwasA2", "GwasMaf"])


In [9]:
# Inner join on SNP ID: only SNPs present in both the genotype table and the GWAS table are kept.
checktable=genoa1f.join(gwasA1f, genoa1f["Snpid_geno"]==gwasA1f["Snpid_gwas"], "inner")

In [11]:
# Preview the joined table to eyeball genotype vs GWAS alleles and frequencies side by side.
checktable.show()

+----------+------+------+-------------------+----------+------+------+---------+
|Snpid_geno|GenoA1|GenoA2|            GenoA1f|Snpid_gwas|GwasA1|GwasA2|  GwasMaf|
+----------+------+------+-------------------+----------+------+------+---------+
| rs1734920|     T|     G|0.39320754716981116| rs1734920|     T|     G| 0.454128|
| rs2831805|     C|     T| 0.8340566037735849| rs2831805|     T|     C|  0.16055|
| rs2837290|     C|     T| 0.8647169811320753| rs2837290|     T|     C| 0.133028|
| rs2837734|     G|     A| 0.9345283018867923| rs2837734|     A|     G|0.0688073|
| rs2839270|     A|     C| 0.6241509433962265| rs2839270|     A|     C| 0.643519|
| rs8134347|     G|     A| 0.9363207547169813| rs8134347|     A|     G|0.0458716|
| rs9977792|     G|     A| 0.8863207547169811| rs9977792|     A|     G| 0.140187|
|rs17230638|     C|     T|  0.873679245283019|rs17230638|     T|     C| 0.116822|
| rs2222994|     G|     T|0.28056603773584904| rs2222994|     T|     G| 0.729358|
| rs9982929|    

In [13]:
def sumscore(maf):
    """Return the total of `maf` as a Python float.

    Args:
        maf: an iterable of numbers, or a single number (the GenoA1f column
            this is applied to holds one value per row).

    Returns:
        float: the sum of the elements, or the value itself for a scalar.
        Always a float so the result matches a FloatType() UDF return type.
    """
    try:
        return float(sum(maf))
    except TypeError:
        # Scalars are not iterable; treat a single number as its own sum.
        return float(maf)

# Wrap sumscore as a Spark SQL UDF returning a float and apply it to the GenoA1f column.
# withColumn is lazy, so nothing is computed until `score` is acted on.
# NOTE(review): `score` is not referenced by any later cell shown here.
udfSumScore=udf(sumscore, FloatType())
score=checktable.withColumn("scoresum", udfSumScore(checktable["GenoA1f"]))

In [24]:
# Maps each nucleotide to its complementary base (A<->T, C<->G); passed to checkAlignmentDF below.
bpMap={"A":"T", "T":"A", "C":"G", "G":"C"}

In [28]:
%time
reload(PRS_VCF_utils)
flagMap=checktable.rdd.map(lambda line: PRS_VCF_utils.checkAlignmentDF(line, bpMap)).collectAsMap()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.05 µs


In [30]:
# filter the raw genotype file
reload(PRS_VCF_utils)
if check_ref:
    print("Calculating genotype dosage while taking into account strand alignment differences")
    genotypeMax=genotable.map(lambda line: PRS_VCF_utils.makeGenotypeCheckRef(line, checkMap=flagMap)).cache()
else:
    # Use the odds map built for the largest threshold; the previously referenced
    # name `gwasOddsMapCA` is never defined in this notebook.
    genotypeMax=genotable.map(lambda line: PRS_VCF_utils.makeGenotype(line, gwasOddsMapMaxCA)).cache()
# Length of the value list of the first record; .first() triggers evaluation of the
# cached RDD. Same in both branches, so computed once here.
samplesize=len(genotypeMax.first()[1])

Calculating genotype dosage while taking into account strand alignment differences


In [33]:
# Calculate the PRS with the maximum threshold
# calculate PRS from genotype
# Re-import the helper module so edits made to PRS_VCF_utils since it was first imported are picked up.
reload(PRS_VCF_utils)
def calcPRSFromGeno(genotypeRDD, oddsMap):
    """Sum weighted genotype values over all SNPs, per sample position.

    Args:
        genotypeRDD: RDD of (snp_id, [value per sample]) records.
        oddsMap: mapping snp_id -> weight; must contain every snp_id in the RDD.

    Returns:
        (totalcount, PRS): the number of SNP records, and a list holding, for
        each sample position, the sum over SNPs of value * weight. The sums
        are NOT divided by totalcount.

    Raises:
        ValueError: from RDD.reduce when genotypeRDD is empty.
        KeyError: when a snp_id is missing from oddsMap.
    """
    totalcount=genotypeRDD.count()
    multiplied=genotypeRDD.map(lambda line:[call * oddsMap[line[0]] for call in line[1]])
    # Element-wise sum built as a real list: map(add, a, b) is a lazy iterator on
    # Python 3, which would leave callers with an exhausted iterator instead of scores.
    PRS=multiplied.reduce(lambda a,b: [x + y for x, y in zip(a, b)])
    return (totalcount,PRS)
# Score at the largest threshold, timing the call, and seed the
# threshold -> (SNP count, scores) map with that result.
start=time()
prsMax=calcPRSFromGeno(genotypeMax, gwasOddsMapMaxCA)
elapsedMax=time()-start
prsDict={maxThreshold: prsMax}
print("finished calculating PRS at threshold of {}, used {:3.1f} seconds".format(str(maxThreshold), elapsedMax))
# Calculate PRS for the rest of the thresholds

finished calculating PRS at threshold of 0.5, used 1.5 seconds


In [34]:
def calcNoMax(genotypeRDD, gwasRDD, thresholdlist, prsMap):
    """Compute PRS for every threshold except the largest and add them to prsMap.

    The largest threshold in `thresholdlist` is skipped because the caller is
    expected to have scored it already (its genotypes are what `genotypeRDD`
    was built from). If only one threshold is given, it is scored as-is.

    Args:
        genotypeRDD: RDD of (snp_id, [value per sample]) at the largest threshold.
        gwasRDD: RDD of split GWAS rows, re-filtered here for each threshold.
        thresholdlist: p-value thresholds to evaluate.
        prsMap: dict threshold -> result of calcPRSFromGeno; updated IN PLACE.

    Returns:
        The same `prsMap` object, with one entry added per threshold that still
        had at least one SNP after filtering.
    """
    # Work from the thresholdlist argument rather than the notebook-level globals,
    # so the function honours whatever list the caller passes in.
    if len(thresholdlist)>1:
        largest=max(thresholdlist)
        thresholdNoMaxSorted=sorted((x for x in thresholdlist if x != largest), reverse=True)
    else:
        thresholdNoMaxSorted=thresholdlist

    for threshold in thresholdNoMaxSorted:
        tic=time()
        gwasFiltered=PRS_VCF_utils.filterGWASByP(GWASRdd=gwasRDD, pcolumn=gwas_p, idcolumn=gwas_id, oddscolumn=gwas_or, pHigh=threshold, logOdds=log_or)
        print("filtering GWAS at threshold of {} took {:3.2f} seconds".format( str(threshold), time()-tic) )

        checkpoint=time()

        # Broadcast the per-threshold map so the filter below reads it on the executors.
        gwasFilteredBC=sc.broadcast(gwasFiltered)
        filteredgenotype=genotypeRDD.filter(lambda line: line[0] in gwasFilteredBC.value)
        if not filteredgenotype.isEmpty():
            prsOther=calcPRSFromGeno(filteredgenotype, gwasFilteredBC.value)
            prsMap[threshold]=prsOther
            print("finished calculating PRS at threshold of {}, used {:3.1f} seconds".format(str(threshold), time()-checkpoint))
        else:
            # Make the skip visible instead of silently omitting the threshold from the results.
            print("no SNPs left at threshold of {}, skipping".format(str(threshold)))

    return prsMap

finalresult=calcNoMax(genotypeMax,gwastableCA, thresholds, prsDict)

Taking the log of odds ratios
filtering GWAS at threshold of 0.3 took 0.30 seconds
finished calculating PRS at threshold of 0.3, used 1.5 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.2 took 0.27 seconds
finished calculating PRS at threshold of 0.2, used 1.3 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.1 took 0.26 seconds
finished calculating PRS at threshold of 0.1, used 1.0 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.05 took 0.22 seconds
finished calculating PRS at threshold of 0.05, used 0.9 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.01 took 0.20 seconds
finished calculating PRS at threshold of 0.01, used 0.6 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.001 took 0.18 seconds
finished calculating PRS at threshold of 0.001, used 0.7 seconds
Taking the log of odds ratios
filtering GWAS at threshold of 0.0001 took 0.18 seconds


In [35]:
reload(PRS_VCF_utils)
# Read the sample IDs; the number of leading lines to skip comes from the
# `sample_skip` setting in the configuration cell instead of a hard-coded 1.
subjNames=PRS_VCF_utils.getSampleNames(sampleFilePath,sampleFileDelim,sampleFileID, skip=sample_skip)


In [66]:
reload(PRS_VCF_utils)
# Write the per-threshold results to outputPath, labelled with the sample names.
# NOTE(review): the return value is kept in `test` but not used by any cell shown here.
test=PRS_VCF_utils.writePRS(finalresult,  outputPath, samplenames=subjNames)

Successfully wrote scores to MAVAN_MOMS_mddPRS_161125.csv
