In [2]:
from __future__ import division
from operator import add
from math import log
import csv
import pickle
import sys
from collections import Counter
import re
import glob, os

import ntpath
import functools
import itertools

from time import time
import argparse
from PRS_run import *

In [3]:
# type of files, VCF or GEN
filetype="VCF"

# write snp log file or not 
snp_log= True
## Setting parameters
gwas_id=0 # column of SNP ID
gwas_p=1     # column of P value
gwas_or=2    # column of odds ratio
gwas_a1=3    # column of a1 in the GWAS
gwas_a2=4
gwas_a1f=5  # column index of maf in the GWAS

# defin column number for contents in genfile
if filetype.lower()=="vcf":
    chrom_id=0
    bp_id=1
    geno_id= 2 # column number with rsID
    geno_start=9 # column number of the 1st genotype, in the raw vcf files, after separated by the delimiter of choice
    geno_a1 = 3 # column number that contains the reference allele
    GENO_delim= "\t"
    
elif filetype.lower()=="gen":
    chrom_id=0
    geno_id = 1
    bp_id=2
    geno_start=5
    geno_a1=3
    GENO_delim= " "


# List of thresholds:
thresholds=[0.01, 0.05,0.1,0.2]
threshold_seq=[0.001 ,0.01,0.002]
            
step=0.01  # default step size
threshold_interval=[]
if threshold_seq is not None:
    if len(threshold_seq)==3:
        lower=min(threshold_seq[0:2])
        upper=max(threshold_seq[0:2])
        step=threshold_seq[2]
        threshold_interval=np.arange(lower, upper+step, step).tolist()
    else:
        raise("Invalid input for threshold sequence parameters")
        logger.error("Invalid input for threshold sequence parameters")
        
thresholds=thresholds+threshold_interval
# file delimiters:
GWAS_delim="\t"

# file names:
#home="/Volumes/mavan/Genotyping_161114/MAVAN_imputed_161121/KIDS_info03/"  #define homefolder path

# Name of GWAS file
gwasFiles="file:///home/meaney.lab/nyao/PRS/PRS/FirstTest/TestGWAS.txt"
GWAS_has_header=True

# programme parameter
log_or=False  # sepcify whether you want to log your odds ratios
check_ref=True # if you know that there are mismatch between the top strand in the genotypes and that of the GWAS, set True. Not checking the reference allele will improve the speed
use_maf=True   # whether to use MAF to check reference allele

# sample file path and name
sampleFilePath="../KIDS.sample" # include the full/relative path and name of the sample file
sampleFileDelim=" "  # sample File Delimiter
sampleFileID=[0]   # which column in the sample file has the ID
sample_skip=2  # how many lines to skip so that the sample names can be matched to the genotypes 1-to-1, taking into account the header of the sample file
##output file information

outputPath="../FirstTest/TestResult"

# Sepcify whether to check for duplicate SNPs
checkDup=False

In [4]:
# get the name of the genotype files
genoFileNamePattern="file:///home/meaney.lab/nyao/PRS/PRS/FirstTest/Test.vcf"
if "file:/" in genoFileNamePattern:
    genoFilePaths=re.sub("file://", "", genoFileNamePattern)


# get the whole list of the file names
genoFileNames=glob.glob(genoFilePaths)

# parameter for phenotype regression
pheno_file=None
#pheno_columns=results.pheno_columns
#pheno_delim=results.pheno_delim
#pheno_no_header=results.pheno_no_header
#covar_columns=results.covar_columns

In [5]:
""" configure logging control """

logger = logging.getLogger("Test")
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler(outputPath+".log")
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter1 = logging.Formatter('%(asctime)s %(levelname)s : %(message)s')
formatter2 = logging.Formatter('%(asctime)s %(levelname)s : %(message)s')

ch.setFormatter(formatter1)
fh.setFormatter(formatter2)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)



In [6]:
##  start spark context
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
APP_NAME="PRS"

spark=SparkSession.builder.appName(APP_NAME).config("spark.submit.pyFiles", "file:///home/meaney.lab/nyao/PRS/PRS/PRS-on-SPARK/PRS_run.py").getOrCreate()

# if using spark < 2.0.0, use the pyspark module to make Spark context
# conf = pyspark.SparkConf().setAppName(APP_NAME).set()#.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

#sc  = SparkContext("spark://172.100.100.101:7077")
sc=spark.sparkContext

sc.setLogLevel("WARN")
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
print("Start Reading Files")
print("Using these genoytpe files: ")

for filename in genoFileNames[:min(24, len(genoFileNames))]:
    print(filename)
if len(genoFileNames)>23:
    print("and more...")

print("total of {} files".format(str(len(genoFileNames))))
# 1. Load files

Start Reading Files
Using these genoytpe files: 
/home/meaney.lab/nyao/PRS/PRS/FirstTest/Test.vcf
total of 1 files


In [7]:
# read the raw data
genodata=sc.textFile(genoFileNamePattern)
#print("Using the GWAS file: {}".format(ntpath.basename(gwasFiles)))
print("Using the GWAS file: {}".format(gwasFiles))
gwastable=spark.read.option("header",GWAS_has_header).option("delimiter", "\t").csv(gwasFiles).cache()
print("Showing top 5 rows of GWAS file")
gwastable.show(5)

print("System recognizes the following information in the GWAS :")
print("SNP ID : Column {}".format(gwas_id))
print("P-values : Column {}".format(gwas_p))
print("Effect size : Column {}".format(gwas_or))
print("Allele A1 : Column {}".format(gwas_a1))
print("Allele A2 : Column {}".format(gwas_a1+1))
if use_maf:
    print("Allele Frequencies : Column {}".format(gwas_a1f))

Using the GWAS file: file:///home/meaney.lab/nyao/PRS/PRS/FirstTest/TestGWAS.txt
Showing top 5 rows of GWAS file
+----------+-------+-------------------+---+---+---------+
|       SNP|      P|               BETA| A1| A2|    CEUAF|
+----------+-------+-------------------+---+---+---------+
| rs3131967|0.06683| 0.0322157032979816|  T|  C|        .|
|rs12562034| 0.8489|-0.0030070181092943|  A|  G|0.0925926|
|rs12124819|0.05396| 0.0342272607705507|  A|  G|        .|
| rs4970383|0.05491| 0.0277572046905535|  A|  C| 0.201835|
| rs1806509|0.00927|-0.0338113190438628|  A|  C| 0.600917|
+----------+-------+-------------------+---+---+---------+
only showing top 5 rows

System recognizes the following information in the GWAS :
SNP ID : Column 0
P-values : Column 1
Effect size : Column 2
Allele A1 : Column 3
Allele A2 : Column 4
Allele Frequencies : Column 5


In [8]:
# filter the genotype to contain only the SNPs less than the maximum p value threshold in the GWAS
# Add a function that only keep the SNPs that match in chromosome and BP in geno and gwas

maxThreshold=max(thresholds)  # maximum p value
gwasOddsMapMax=filterGWASByP_DF(GWASdf=gwastable, pcolumn=gwas_p, idcolumn=gwas_id, oddscolumn=gwas_or, pHigh=maxThreshold, logOdds=log_or)
gwasOddsMapMaxCA=sc.broadcast(gwasOddsMapMax).value  # Broadcast the map

# ### 2. Initial processing
# at this step, the genotypes are already filtered to keep only the ones in 'gwasOddsMapMax'
bpMap={"A":"T", "T":"A", "C":"G", "G":"C"}
tic=time.time()


if filetype.lower()=="vcf":
    logger.info("Genotype data format : .VCF ")

    # Change to the format [snpid, A1, A2, *genotypelist]
    genointermediate=genodata.filter(lambda line: ("#" not in line)).map(lambda line: line.split(GENO_delim)).filter(lambda line: line[geno_id] in gwasOddsMapMaxCA).map(lambda line:([line[x] for x in [geno_id, geno_a1, geno_a1+1]],[chunk.strip('"').split(":")[3] for chunk in line[geno_start::]]))\
    .mapValues(lambda line:[float(x) for x in ",".join(line).split(",")])  
    
elif filetype.lower() == "gen":
    logger.info("Genotype data format : .GEN")
    # Change to the format [snpid, A1, A2, *genotypelist]
    genointermediate=genodata\
    .filter(lambda line: line.split(GENO_delim)[geno_id] in gwasOddsMapMaxCA)\
    .map(lambda line: ([line.split(GENO_delim)[x] for x in [geno_id,geno_a1, geno_a1+1]], [float(x) for x in line.split(GENO_delim)[geno_start::]]))
    

# Change to the format [snpid, *genotypelist]
genotable=genointermediate.map(lambda line: (line[0][0], line[1]))  


2017-03-21 10:37:43,531 INFO : Genotype data format : .VCF 


In [173]:
genodata.first().split("\t")

[u'##fileformat=VCFv4.2']

In [9]:
if check_ref:
    if use_maf:
        logger.info("Determining strand alignment, using MAF")
        genoA1f=genointermediate.map(lambda line: (line[0]+[getA1f(line[1])])).toDF(["Snpid_geno", "GenoA1", "GenoA2", "GenoA1f"])
        gwasA1f=gwastable.rdd.map(lambda line:(line[gwas_id], line[gwas_a1], line[gwas_a2], line[gwas_a1f])).toDF(["Snpid_gwas", "GwasA1", "GwasA2", "GwasA1f" ])
        checktable=genoA1f.join(gwasA1f, genoA1f["Snpid_geno"]==gwasA1f["Snpid_gwas"], "inner").cache()
        if checkDup:
            logger.info("Searching and removing duplicated SNPs")
            flagList = checktable.rdd.map(lambda line: checkAlignmentDF(line, bpMap)).collect()
            flagMap = rmDup(flagList)
        else:
            flagMap = checktable.rdd.map(lambda line: checkAlignmentDF(line, bpMap)).collectAsMap()
    else:
        logger.info("Determining strand alignment, without using MAF. SNPs with Alleles that are reverse compliments will be discarded")
        genoalleles=genotable.map(lambda line: (line[0])).toDF(["Snpid_geno", "GenoA1", "GenoA2"])
        gwasalleles=gwastable.rdd.map(lambda line:(line[gwas_id], line[gwas_a1], line[gwas_a2])).toDF(["Snpid_gwas", "GwasA1", "GwasA2"])
        checktable=genoalleles.join(gwasalleles, genoalleles["Snpid_geno"]==gwasalleles["Snpid_gwas"], "inner").cache()

        if checkDup:
            logger.info("Searching and removing duplicated SNPs")
            flagList = checktable.rdd.map(lambda line: checkAlignmentDFnoMAF(line, bpMap)).collect()
            flagMap = rmDup(flagList)
        else:
            flagMap = checktable.rdd.map(lambda line: checkAlignmentDFnoMAF(line, bpMap)).collectAsMap()

    logger.info("Generating genotype dosage while taking into account difference in strand alignment")
    flagMap=sc.broadcast(flagMap).value
    genotypeMax=genotable.filter(lambda line: line[0] in flagMap and flagMap[line[0]]!="discard" ).map(lambda line: makeGenotypeCheckRef(line, checkMap=flagMap)).cache()

else:
    logger.info("Generating genotype dosage without checking allele alignments")
    genotypeMax=genotable.mapValues(lambda line: makeGenotype(line)).cache()
    flagMap=False
    if checkDup:
        logger.info("Searching and removing duplicated SNPs")
        genotypeCount=genotypeMax.map(lambda line: (line[0], 1)).reduceByKey(lambda a,b: a+b).filter(lambda line: line[1]==1).collectAsMap()
        genotypeMax=genotypeMax.filter(lambda line: line[0] in genotypeCount)

logger.info("Dosage generated in {:.1f} seconds".format(time.time()-tic) )
samplesize=int(len(genotypeMax.first()[1]))
logger.info("Detected {} samples in genotype data" .format(str(samplesize)))

2017-03-21 10:37:53,528 INFO : Determining strand alignment, using MAF


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 9, 172.100.100.101): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/worker.py", line 161, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/worker.py", line 56, in read_command
    command = serializer.loads(command.value)
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 422, in loads
    return pickle.loads(obj)
ImportError: No module named PRS_run

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:85)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1438)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1437)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1437)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1659)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1884)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1897)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/worker.py", line 161, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/worker.py", line 56, in read_command
    command = serializer.loads(command.value)
  File "/share/apps/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 422, in loads
    return pickle.loads(obj)
ImportError: No module named PRS_run

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:85)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [19]:
## use the thresholds as bins, put each snp in the corresponding bins

gwasP=gwastable.rdd.filter(lambda line: float(line[gwas_p])< maxThreshold).map(lambda line: (line[gwas_id], float(line[gwas_p]))).collect()

def binTuple(snpwithP, thresholdList):
  results=[]
  snpwithPsorted=sorted(snpwithP,key=lambda x: x[1])
  thresholdSorted=sorted(thresholdList)
  thresholdidx=0
  for snp, p in snpwithPsorted:

    if p>thresholdSorted[thresholdidx]:
      thresholdidx+=1    
    results.append((snp,thresholdSorted[thresholdidx]))
  return results
  
snpBin=binTuple(gwasP, thresholds)

snpBinRDD=sc.parallelize(snpBin)

genotypeMaxRanked=snpBinRDD.join(genotypeMax)

NameError: name 'genotypeMax' is not defined

In [214]:
#genoa1f.map(lambda line:"\t".join([line[0], "\t".join(line[1]), str(line[2])])).saveAsTextFile("../MOMS_info03_maf")

# Calculate PRS at the sepcified thresholds
if flagMap:
  genocalltable=genotable.filter(lambda line: line[0] in flagMap and flagMap[line[0]]!="discard" ).mapValues(lambda geno: getCall(geno)).cache()
else:
  genocalltable=genotable.mapValues(lambda geno: getCall(geno))

assert len(genocalltable.first()[1])==samplesize, "Bug found, size of genotype and call table differ"

In [215]:

genocalltableRanked=snpBinRDD.join(genocalltable)

## multiply each call by the odds
## sum up the score, and the calls, within each rank


def calcIntervals(genotypeRDDRanked, gwasOddsMap, calltableRanked, logsnpON, logger=logger):
  logger.info("Calculating scores in each bin")
  genotypeRDDMultipled=genotypeRDDRanked.map(lambda line: (line[1][0], [x*gwasOddsMap[line[0]] for x in line[1][1]]))
  intervalScoreRDD=genotypeRDDMultipled.reduceByKey(lambda snp1, snp2: map(add, snp1, snp2))
  intervalScores=intervalScoreRDD.collect()
  
  logger.info("Calculating calls in each bin")
  intervalCallsRDD=calltableRanked.map(lambda line:line[1]).reduceByKey(lambda snp1, snp2: map(add, snp1, snp2))
  
  intervalCalls=intervalCallsRDD.collect()

  logger.info("Generating snp list in each bin")
  
  snpLists=False
  if logsnpON:
    snpLists=calltableRanked.map(lambda line:(line[1][0], line[0])).groupByKey().map(lambda line: (line[0], list(line[1]))).collect()
    
  return intervalScores, intervalCalls, snpLists

scoresBin, callsBin, snpBin=calcIntervals(genotypeMaxRanked, gwasOddsMapMaxCA, genocalltableRanked, snp_log)

(u'rs6870608', (0.05, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))


In [292]:
## Take the sum of scores, calls and snplist in each bin and gather them
def gatherScores(binScores,binCalls,binSNPs, thresholdList, logger=logger):
  prsResults={}
  snpNames={}
  binScoresSorted=sorted(binScores)
  binCallsSorted=sorted(binCalls)
  binSNPsSorted=sorted(binSNPs)
  logger.info("Start gathering scores from each bin")
  binThresholds=[x[0] for x in binScoresSorted]
  for x in binThresholds:
    if x not in thresholdList:
      logger.info("No SNPs exist at threshold {}".format(x))
  
  assert binThresholds==[x[0] for x in binCallsSorted], "Error, scores and calls have different bins"
  assert binThresholds==[x[0] for x in binSNPsSorted], "Error, scores and SNP list have different bins"
  
  binScoresSortedvalues=[x[1] for x in binScoresSorted]
  binCallsSortedvalues=[x[1] for x in binCallsSorted]
  binSnpsSortedvalues=[x[1] for x in binSNPsSorted]
  totalNumbers=len(binScores)
  for i in range(len(binScoresSorted)):
    
    threshold=binThresholds[i]
    scores=[sum(x) for x in zip(*binScoresSortedvalues[:(i+1)])]
    calls=[sum(x) for x in zip(*binCallsSortedvalues[:(i+1)])]
    normalizedScores=[score/call for score, call in zip(scores, calls)]
    
    prsResults[threshold]=[calls,normalizedScores]
    combinedSNPs=reduce(lambda x,y: x+y, binSnpsSortedvalues[:(i+1)])
    snpNames[threshold]=combinedSNPs
    print("Processed {} / {} scores".format(i+1, totalNumbers))
    sys.stdout.flush()

  return prsResults, snpNames

In [287]:
a=[2,3,2,4]
b=[4,3,2,4]
c=[4,3,2,4]
reduce(lambda x,y:  map(add, x,y),[a,b,c])
#[score/call for score, call in zip(a[1], b[1])]

[10, 9, 6, 12]

In [294]:
prsDict, snpids=gatherScores(scoresBin, callsBin, snpBin, thresholds)

2017-03-13 11:58:30,963 INFO : Start gathering scores from each bin


Processed 1 / 9 scores
Processed 2 / 9 scores
Processed 3 / 9 scores
Processed 4 / 9 scores
Processed 5 / 9 scores
Processed 6 / 9 scores
Processed 7 / 9 scores
Processed 8 / 9 scores
Processed 9 / 9 scores


In [295]:
# log which SNPs are used in PRS
if snp_log:
    if flagMap:
        logoutput=writeSNPlog(snpids, outputPath, logger, flagMap=flagMap)
    else:
        logoutput=writeSNPlog(snpids, outputPath, logger)

# generate labels for samples
#if filetype.lower()=="vcf":
    #subjNames=genodata.filter(lambda line: "#CHROM" in line).map(lambda line: line.split(GENO_delim)[9::]).collect()[0]
    #output=writePRS(prsDict,  outputPath, samplenames=subjNames)

2017-03-13 11:58:53,670 INFO : Successfully output log to ../TestResult.snplog


In [298]:
if sampleFilePath!="NOSAMPLE":
    # get sample name from the provided sample file
    subjNames=getSampleNames(sampleFilePath,sampleFileDelim,sampleFileID, skip=sample_skip)

    output=writePRS(prsDict,  outputPath, logger, samplenames=subjNames)
else:
    output=writePRS(prsDict,  outputPath,logger=logger, samplenames=None)

2017-03-13 11:59:41,406 INFO : Collected 260 sample labels
2017-03-13 11:59:41,410 INFO : Successfully wrote scores to ../TestResult.score


In [299]:
if pheno_file is not None:
    phenotypes, thresholds, r2All, pAll=regression(prsDict,pheno_file, pheno_delim, pheno_columns, pheno_no_header, covarColumns=covar_columns, outputName=outputPath, logger=logger)

    r_square_plots(phenotypes,r2All,pAll, thresholds, outputName=outputPath, width = 3,bar_width = step)

#sc.stop()
seconds=time.time()-totalstart
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
logger.info("Total Calculation Time : {:d} hrs {:02d} min {:02d} sec".format(int(h), int(m), int(round(s))))

NameError: name 'totalstart' is not defined