pubParseDb

#!/usr/bin/env python

# load default python packages
import logging, optparse, sys, glob, gzip, collections, copy, gzip, os, doctest, re
from os.path import *
from collections import defaultdict

#try:
#from lxml import etree
#except ImportError:
    ## Python 2.5
    #print "py 2.5 fallback etree"
import xml.etree.cElementTree as etree
    # doesn't work anymore, we need the partial parsing option

# add <scriptDir>/lib/ to package search path
sys.path.insert(0, join(dirname(abspath(__file__)), "lib"))

import pubGeneric, maxCommon, pubConf, maxbio

# example file /hive/data/outside/pdb/aa/pdb1aa0.ent.gz

pdbHeaders = ["acc", "isMain", "authors", "title", "ref", "issn", "pmid", "doi"]
pdbToHeader = {'AUTH' : "authors", "TITL" : "title", "REFN": "issn", "PMID":"pmid", "DOI":"doi", "REF":"ref", "isMain":"isMain", "acc": "acc"}
PdbRefRec  = collections.namedtuple("pdbRef", pdbHeaders)

def parsePdbRefLine(data, line):
    " add line to data dict "
    keyword, entry = line[12:16].strip(), line[19:].strip()
    if keyword in data:
        data[keyword] = data[keyword]+" "+entry
    else:
        data[keyword] = entry

def parsePdb(pdbDir, outDir):
    " write pdb.tab to outDir, parsing an ftp mirror from PDB "
    # get list of infnames
    if isdir(pdbDir):
        logging.info("Scanning for input files in %s" % pdbDir)
        inDirs = [d for d in glob.glob(pdbDir+"/*") if isdir(d)]
        inFnames = [] 
        for inDir in inDirs:
            dirFnames = glob.glob(inDir+"/*.ent.gz")
            inFnames.extend(dirFnames)
        logging.info("Found %d input files under %s" % (len(inFnames), pdbDir))
    elif isfile(pdbDir):
        inFnames = [pdbDir]
    else:
        raise Exception("pdbDir %s does not exist" % pdbDir)

    # write headers and open outfile
    outFname = join(outDir, "pdb.tab")
    logging.info("Writing to %s" % outFname)
    ofh = open(outFname, "w")
    ofh.write("\t".join(pdbHeaders))
    ofh.write("\n")

    tp = maxCommon.ProgressMeter(len(inFnames))
    for inFname in inFnames:
        logging.debug("Parsing %s" % inFname)
        ifh = gzip.open(inFname)
        refs = []
        refData = {}
        for line in ifh:
            if line.startswith("HEADER "):
                acc = line.split()[-1]

            if line.startswith("JRNL"):
                refData["isMain"]="1"
                refData["acc"] = acc
                parsePdbRefLine(refData, line)

            elif line.startswith("REMARK   1 "):
                if line[11:].startswith("REFERENCE"):
                    refs.append(refData)
                    refData = {}
                    refData["isMain"] = "0"
                    refData["acc"] = acc
                    continue
                parsePdbRefLine(refData, line)
        refs.append(refData)

        # translate keys from PDB to our own ones and write to outfile
        newRefs = []
        for ref in refs:
            if '' in ref:
                del ref['']
            if 'EDIT' in ref:
                del ref['EDIT']
            if 'PUBL' in ref:
                del ref['PUBL']
            if 'REFE' in ref: # looks like a typo in /hive/data/outside/pdb/o9/pdb1o91.ent.gz
                logging.warn("REFE typo ignored")
                del ref['REFE']
            newRef = {}
            for k, v, in ref.iteritems():
                newRef[pdbToHeader[k]] = v
            for h in pdbHeaders:
                if not h in newRef:
                    newRef[h] = ""
            newRef["issn"] = newRef["issn"].replace("ISSN ","")
            row = PdbRefRec(**newRef)
            ofh.write("\t".join(row))
            ofh.write("\n")

        tp.taskCompleted()
        

# UNIPROT PARSING 

# remove these PMIDs from all evidences
pmidBlackList = set([17344846]) # high-throughput study

# only parse these feature types
featTypes = {
    "sequence variant": "variant",
    "mutagenesis site": "mutagen",
    "modified residue": "modif",
    "cross-link": "cross-link",
    "region of interest": "interest",
    "short sequence motif": "motif",
    "metal ion-binding site": "ion-binding",
    "site": "site",
    "topological domain" : "topo",
    "transmembrane region" : "transmem",
    "disulfide bond" : "disulf bond",
    "glycosylation site" : "glyco",
    "binding site" : "bind",
    "active site" : "enzyme act site",
    "signal peptide" : "signal pep",
    "transit peptide" : "trans pep",
    "calcium-binding region" : "calcium bind",
    "lipid moiety-binding region" : "lipid",
    "propeptide" : "propep",
    "intramembrane region" : "intramem",
    "peptide" : "peptide",
    "nucleotide phosphate-binding region" : "nucl phos bind",
    "helix" : "helix",
    "chain" : "chain",
    "coiled-coil region" : "coiled-coil",
    "turn" : "turn",
    "strand" : "beta",
    "domain" : "domain",
    "zinc finger region" : "zinc finger",
    "repeat" : "repeat",
    "compositionally biased region" : "biased",
    "initiator methionine" : "init Met",
    "non-standard amino acid" : "non-std",
    "non-consecutive residues" : "non-consec",
    "unsure residue" : "unsure",
    "DNA-binding region" : "DNA-binding",
    "non-terminal residue" : "nonTerm"
}

# main record info
entryHeaders = ["dataset", "acc", "mainIsoAcc", "orgName", "orgCommon", "taxonId", "name", "accList", \
    "protFullNames", "protShortNames", "protAltFullNames", "protAltShortNames", \
    "geneName", "geneSynonyms", "isoNames", \
    "geneOrdLocus", "geneOrf", \
    "hgncSym", "hgncId", "refSeq", "refSeqProt", "entrezGene", "ensemblGene", "ensemblProt", \
    "kegg", "emblMrna", "emblMrnaProt", "emblDna", "emblDnaProt", \
    "pdb", "ec", \
    "uniGene", "omimGene", "omimPhenotype", "subCellLoc", "functionText", "mainSeq", "isoIds", "isoSeqs"]
EntryRec = collections.namedtuple("uprec", entryHeaders)

# disease associated mutation
mutHeaders = ["acc", "mainIsoAcc", "varId", "featType", "shortFeatType", "begin", "end", "origAa", "mutAa", "dbSnpId", "disRelated", "disease", "disCode", "pmid", "comment"]
MutRec = collections.namedtuple("mutrec", mutHeaders)

# references from record
refHeaders = ["name", "citType", "year", "journal", "vol", "page", \
        "title", "authors", "doi", "pmid", "scopeList"]
RefRec = collections.namedtuple("refRec", refHeaders)
emptyRef = dict(zip(refHeaders, len(refHeaders)*[""]))

def strip_namespace_inplace(etree, namespace=None,remove_from_attr=True):
    """ Takes a parsed ET structure and does an in-place removal of all namespaces,
        or removes a specific namespacem (by its URL).
        
        Can make node searches simpler in structures with unpredictable namespaces
        and in content given to be non-mixed.

        By default does so for node names as well as attribute names.       
        (doesn't remove the namespace definitions, but apparently
         ElementTree serialization omits any that are unused)

        Note that for attributes that are unique only because of namespace,
        this may attributes to be overwritten. 
        For example: <e p:at="bar" at="quu">   would become: <e at="bar">

        I don't think I've seen any XML where this matters, though.
    """
    if namespace==None: # all namespaces                               
        for elem in etree.getiterator():
            tagname = elem.tag
            if not isinstance(elem.tag, basestring):
                continue
            if tagname[0]=='{':
                elem.tag = tagname[ tagname.index('}',1)+1:]

            if remove_from_attr:
                to_delete=[]
                to_set={}
                for attr_name in elem.attrib:
                    if attr_name[0]=='{':
                        old_val = elem.attrib[attr_name]
                        to_delete.append(attr_name)
                        attr_name = attr_name[attr_name.index('}',1)+1:]
                        to_set[attr_name] = old_val
                for key in to_delete:
                    elem.attrib.pop(key)
                elem.attrib.update(to_set)

    else: # asked to remove specific namespace.
        ns = '{%s}' % namespace
        nsl = len(ns)
        for elem in etree.getiterator():
            if elem.tag.startswith(ns):
                elem.tag = elem.tag[nsl:]

            if remove_from_attr:
                to_delete=[]
                to_set={}
                for attr_name in elem.attrib:
                    if attr_name.startswith(ns):
                        old_val = elem.attrib[attr_name]
                        to_delete.append(attr_name)
                        attr_name = attr_name[nsl:]
                        to_set[attr_name] = old_val
                for key in to_delete:
                    elem.attrib.pop(key)
                elem.attrib.update(to_set)


def parseDiseases(fname):
    " parse the file humanDiseases.txt from uniprot to resolve disease IDs to disease names "
    dis = {}
    for line in open(fname).read().splitlines():
        if line.startswith("ID"):
            name = line[5:].strip(".")
        if line.startswith("AR"):
            code = line[5:].strip(".")
            dis[code]=name
    return dis

def findSaveList(el, path, dataDict, key, attribKey=None, attribVal=None, useAttrib=None, subSubEl=None):
    """ find all text of subelemets matching path with given optionally attrib and save into dataDict with key
    You can specify a subSubEl of the element to get the text from.
    """
    l = []
    for se in el.findall(path):
        if attribKey!=None and se.attrib.get(attribKey, None)!=attribVal:
            continue
        if useAttrib:
            val = se.attrib[useAttrib]
        else:
            if subSubEl:
                val = se.find(subSubEl).text
            else:
                val = se.text
        l.append(val)
    s = "|".join(l)
    dataDict[key] = s

def openOutTabFile(subDir, outName, headers):
    " create outdir and open outfile, write headers "
    #subDir = join(outDir, outSubDir) 
    if not isdir(subDir):
        logging.info("Creating dir %s" % subDir)
        os.makedirs(subDir)
    outPath = join(subDir, outName)
    logging.info("Writing output to %s" % outPath)
    ofh = open(outPath, "w")
    ofh.write("\t".join(headers)+"\n")
    return ofh

def findDisCodes(text, disToName):
    """ find disease codes in text, return as a set of disease codes 
    >>> findDiseases("Defects in HAL are the cause of histidinemia (HISTID) ")
    set(['HISTID'])
    """
    disSet = set()
    for m in re.finditer("[(]([a-zA-Z0-9- ]+)[)]", text):
        word = m.group(1)
        if word in disToName:
            disSet.add(word)
    return disSet
            

#def findDiseases(text):
#    """ find disease codes and their names in text, return as dict code -> name 
#    >>> findDiseases("Defects in CEACAM16 are the cause of deafness autosomal dominant type 4B (DFNA4B) [MIM:614614].")
#    {'DFNA4B': 'deafness autosomal dominant type 4B'}
#    >>> findDiseases("Defects in ALX4 are the cause of parietal foramina 2 (PFM2) [MIM:609597]; also known as foramina parietalia permagna (FPP). PFM2 is an autosomal dominant disease characterized by oval defects of the parietal bones caused by deficient ossification around the parietal notch, which is normally obliterated during the fifth fetal month. PFM2 is also a clinical feature of Potocki-Shaffer syndrome.")
#    {'PFM2': 'parietal foramina 2', 'FPP': 'foramina parietalia permagna'}
#
#    # disease is only one word, but long enough
#    >>> findDiseases("Defects in HAL are the cause of histidinemia (HISTID) ")
#    {'HISTID': 'histidinemia'}
#    """
#    result = {}
#    phrases = re.split("[;.] ", text)
#    notDisease = set(["of", "with", "to", "as", "or", "also", "in"])
#
#    for phrase in phrases:
#        words = phrase.split()
#        revWords = reversed(words)
#
#        grabWords = False
#        disWords = []
#        disCode = None
#        # go backwords over words and look for acronym, then grab all words before that
#        # until we find a common English word
#        for word in revWords:
#            m = re.match("[(]([A-Z0-9-]+)[)]", word)
#            if m!=None:
#                disCode = m.group(1)
#                grabWords = True
#                continue
#
#            if word in notDisease and (len(disWords)>1 or len("".join(disWords))>=9):
#                disName = " ".join(list(reversed(disWords)))
#                if disCode==None:
#                    logging.debug("Found disease %s, but no code for it" % disName)
#                    continue
#                result[disCode] = disName
#                disCode = None
#                disWords = []
#                grabWords = False
#
#            if grabWords:
#                disWords.append(word)
#
#    return result

#def parseDiseaseComment(entryEl):
#    """ return two dicts 
#    one with evidence code -> disease code
#    one with disease code -> disease name 
#    """
#    disRefs = {}
#    disCodes = {}
#    for commentEl in entryEl.findall("comment"):
#        textEl = commentEl.find("text")
#        if commentEl.attrib["type"]=="disease":
#            refStr = commentEl.attrib.get("evidence", None)
#            # website xml is different, has evidence attribute on text element
#            if refStr==None:
#                refStr = textEl.attrib.get("evidence", None)
#                if refStr==None:
#                    continue
#
#            refs = refStr.split(" ")
#
#            text = textEl.text
#            logging.debug("Disease comment: %s, evidence %s" % (text, refStr))
#            disCodes.update(findDiseases(text))
#
#            for refId in refs:
#                disRefs[refId] = disCodes
#
#    logging.debug("Found disease evidences: %s" % disRefs)
#    logging.debug("Found disease names: %s" % disCodes)
#    return disRefs, disCodes

def parseDiseaseComment(entryEl, disToName):
    """ 
    parse the general comments, disease section from up record 
    return evidence codes that refer to diseases 
    also return disease codes 
    """
    disRefs = {}
    disCodes = set()
    for commentEl in entryEl.findall("comment"):
        textEl = commentEl.find("text")
        if commentEl.attrib["type"]=="disease":
            refStr = commentEl.attrib.get("evidence", None)
            # website xml is different, has evidence attribute on text element
            if refStr==None:
                refStr = textEl.attrib.get("evidence", None)
                if refStr==None:
                    continue

            refs = refStr.split(" ")

            text = textEl.text
            logging.debug("Disease comment: %s, evidence %s" % (text, refStr))
            disCodes.update(findDisCodes(text, disToName))

            for refId in refs:
                disRefs[refId] = disCodes

    logging.debug("Found disease evidences: %s" % disRefs)
    #logging.debug("Found disease names: %s" % disCodes)
    return disRefs, disCodes

def parseIsoforms(acc, mainSeq, entryEl, isoSeqs):
    " parse sequences of isoforms, returns lists: isoIds, isoNames, seqs "
    isoDefined = False
    isoIds = []
    isoNames = []
    seqs = []
    for isoEl in entryEl.findall("comment/isoform"):
        isoDefined = True
        # get id
        idEl = isoEl.find("id")
        isoId = idEl.text

        # get names (just as gene synonyms)
        for nameEl in isoEl.find("name"):
            isoNames.append(nameEl.text)
        seqEl = isoEl.find("sequence")

        # get sequences
        seqType = seqEl.attrib["type"]
        if seqType=="displayed":
            seqs.append(mainSeq)
            isoIds.append(isoId)
        elif seqType=="external":
            pass # weird anyways
        else:
            if isoId not in isoSeqs:
                logging.debug("sequence %s does not exist" % isoId)
            else:
                seqs.append(isoSeqs[isoId])
                isoIds.append(isoId)

    if not isoDefined:
        isoIds = [acc]
        seqs = [mainSeq]

    assert(len(seqs)==len(isoIds))

    return isoIds, isoNames, seqs

def parseDbRefs(entryEl):
    " return dict with db -> id (various special cases) "
    dbRefs = defaultdict(set)
    dbRefs["emblMrna"] =  []
    dbRefs["emblMrnaProt"] =  []
    dbRefs["emblDna"] =  []
    dbRefs["emblDnaProt"] =  []
    for dbRefEl in entryEl.findall("dbReference"):
        db = dbRefEl.attrib["type"]
        mainId = dbRefEl.attrib["id"]
        if db=="EMBL": # special case, don't add yet
            emblId = mainId
        else:
            dbRefs[db].add(mainId)
        propEls = dbRefEl.findall("property")
        emblProtId = "na"
        id = None
        for propEl in propEls:
            propType = propEl.attrib["type"]
            propDb = db
            if (db, propType) ==("RefSeq", "nucleotide sequence ID"):
                id = propEl.attrib["value"]
                propDb = "refseqNucl"
            elif db=="HGNC" and propType=="gene designation":
                id = propEl.attrib["value"]
                propDb = "hgncGene"
            elif db=="Ensembl" and propType=="gene ID":
                id = propEl.attrib["value"]
                propDb = "ensemblGene"
            elif db=="Ensembl" and propType=="protein sequence ID":
                id = propEl.attrib["value"]
                propDb = "ensemblProt"
            elif db=="EMBL" and propType=="protein sequence ID":
                emblProtId = propEl.attrib["value"]
                continue # don't add yet
            elif db=="MIM" and propType=="type":
                omimCat = propEl.attrib["value"]
                if omimCat=="phenotype":
                    dbRefs["omimPhenotype"].add(mainId)
                elif omimCat=="gene" or omimCat=="gene+phenotype":
                    dbRefs["omimGene"].add(mainId)
                else:
                    assert(False)
            elif db=="EMBL" and propType=="molecule type":
                val = propEl.attrib["value"]
                if val=="mRNA":
                    # add now
                    dbRefs["emblMrna"].append(emblId)
                    dbRefs["emblMrnaProt"].append(emblProtId)
                else:
                    dbRefs["emblDna"].append(emblId)
                    dbRefs["emblDnaProt"].append(emblProtId)
                continue # don't add any id
            else:
                id = dbRefEl.attrib["id"]
            if id!=None:
                dbRefs[propDb].add(id)

    result = {}
    for db, valList in dbRefs.iteritems():
        result[db] = "|".join(valList)
        
    logging.debug("dbRefs: %s" % result)
    return result

def splitAndResolve(disName, disCodes, splitWord):
    " split and split word, try to resolve via disCodes and rejoin again "
    subDises = disName.split(splitWord)
    newDises = []
    for subDis in subDises:
        subDis = subDis.strip()
        if subDis in disCodes:
            newDises.append(disCodes[subDis])
        else:
            newDises.append(subDis)
    disName = ",".join(newDises)
    return disName

def parseFeatDesc(text, disToName):
    """ 
    parse the description of a feature to find code name of disease, snpId and comments 
    return tuple: (disease name, dbSnpId, otherComments)
    >>> parseFeatDesc("In sporadic cancers; somatic mutation; dbSNP:rs11540654.", {})
    ('sporadic cancers', 'rs11540654', 'somatic mutation')
    >>> parseFeatDesc("In RIEG1; pointless comment", {"RIEG1" : "Axel-Riegerfeldt syndrome"})
    ('Axel-Riegerfeldt syndrome', '', 'pointless comment')
    """
    # find disease name and try to resolve via disToNames
    logging.debug("Feature description: %s " % (text))
    text = text.strip(".").strip()
    parts = text.split("; ")
    disCode = ""
    comments = []
    for part in parts:
        part = part.replace("a patient with", "")
        part = part.replace("in a ", "in ")
        partLow = part.lower()
        if partLow.startswith("in ") and "dbSNP" not in part and "allele" not in part:
            disCode = " ".join(part.split()[1:])
            # some entries contain two disease names
        else:
            if "dbSNP" not in part:
                comments.append(part)
                    
    # we got a plain disease code
    if disCode in disToName:
        disLongName = disToName[disCode]
    # or two dis codes with and
    elif " and " in disCode:
        disLongName = splitAndResolve(disCode, disToName, " and ")
    else:
        # there are dis code somewhere inside the text
        intDisCodes = findDisCodes(disCode, disToName)
        if len(intDisCodes)!=0:
            disLongName = disCode
            disCode = ",".join(intDisCodes)
        # ok nothing worked, keep it as it is
        else:
            disLongName = disCode

    # find snpId
    snpId = ""
    for m in re.finditer("dbSNP:(rs[0-9]+)", text):
        if m!=None:
            #assert(snpId=="")
            snpId = m.group(1)

    logging.debug("Disease: %s, snpId: %s" % (disLongName, snpId))
    return disCode, disLongName, snpId, "; ".join(comments)


ignoredTypes = collections.Counter()

def parseFeatures(entryEl, disRefs, defaultDisCodes, disToName, evidPmids, mainIsoAcc):
    " go over features and yield mutation records "

    acc = entryEl.find("accession").text

    mutations = []
    for featEl in entryEl.findall("feature"):
        featType = featEl.attrib["type"]
        if featType not in featTypes:
            ignoredTypes[featType] += 1 
            continue
        if featType in ["sequence variant"]:
            isVariant = True
        else:
            isVariant = False
        shortFeatType = featTypes[featType]
        logging.debug("type: %s" % featType)

        varId = featEl.attrib.get("id", "")
        logging.debug("Variant ID %s" % varId)

        origEl = featEl.find("original")
        if origEl==None:
            #logging.debug("No original residue")
            #continue
            orig = ""
        else:
            orig = origEl.text

        varEl = featEl.find("variation")
        if varEl==None:
            variant = ""
        else:
            variant = varEl.text
            logging.debug("residue change: %s->%s" % (orig, variant))

        posEl = featEl.find("location/position")
        if posEl!=None:
            begin = posEl.attrib["position"]
            end = str(int(begin)+1)
        else:
            beginEl = featEl.find("location/begin")
            begin = beginEl.attrib.get("position", None)
            if begin==None:
                logging.debug("Unknown start, skipping a feature")
                continue
            endEl = featEl.find("location/end")
            end = endEl.attrib.get("position", None)
            if end==None:
                logging.debug("Unknown end, skipping a feature")
                continue
            end = str(int(end)+1)

        desc = featEl.attrib.get("description", None)
        if desc==None:
            #logging.debug("No description")
            #continue
            desc = ""
        if "sulfinic" in desc:
            shortFeatType = "sulfo"

        descWords = desc.split()
        if len(descWords)>0:
            desc1 = descWords[0].lower()
            if "phos" in desc1:
                shortFeatType = "phos"
            elif "acetyl" in desc1:
                shortFeatType = "acetyl"
            elif "methyl" in desc1:
                shortFeatType = "methyl"
            elif "lipo" in desc1:
                shortFeatType = "lipo"
            elif "hydroxy" in desc1:
                shortFeatType = "hydroxy"
            elif "nitro" in desc1:
                shortFeatType = "nitro"

        evidStr = featEl.attrib.get("evidence", "")
        logging.debug("variant pos %s-%s, desc %s, evidence %s" % (begin, end, desc, evidStr))
        desc = desc.strip("() ")
        #if desc=="":
            #logging.debug("No description")
            #continue
        #if evidStr==None:
            #logging.debug("No evidence")
            #continue
        evidList = evidStr.split()

        if isVariant:
            # only do this for natural variants
            disCode, disName, snpId, comments = parseFeatDesc(desc, disToName)
            # if no disease annotated to feature, use the one from the record
            if disCode=="" and len(defaultDisCodes)==1:
                disCode = list(defaultDisCodes)[0]
                disName = disToName.get(disCode, disCode)+" (not annotated on variant but on gene record)"
                disCode = disCode + "?"
        else:
            disCode, disName, snpId, comments = "", "", "", desc

        varPmids = []
        if disCode!="":
            diseaseRelated = "disRelated"
        else:
            diseaseRelated = "noEvidence"
        for evidId in evidList:
            if evidId in disRefs:
                diseaseRelated="disRelated"
            else:
                diseaseRelated="notDisRelated"
                logging.debug("evidence is not a disease evidence or blacklisted, check description")

            pmids = evidPmids.get(evidId, [])
            assert(len(pmids)<=1)
            if len(pmids)>0:
                pmid = list(pmids)[0]
                varPmids.append(pmid)

        if len(varPmids)==1 and set(varPmids).intersection(pmidBlackList)==len(varPmids):
            logging.debug("only blacklist pmids, skipping feature")

        mut = MutRec(acc, mainIsoAcc, varId, featType, shortFeatType, begin, end, orig, variant, snpId, diseaseRelated, disName, disCode, ",".join(varPmids), comments)
        logging.debug("Accepted variant: %s" % str(mut))

        # rewrite disulfide bonds to two separate features, one for each cysteine involved
        if featType=="disulfide bond":
            end = mut.end
            comment = mut.comment
            if comment!="":
                comment+= "; "
            newComment = comment + "disulfide bond to position %s" % str(int(mut.end)-1)
            mut1 = mut._replace(end=str(int(mut.begin)+1), comment=newComment)
            yield mut1
            newComment = comment + "disulfide bond to position %s" % mut.begin
            mut2 = mut._replace(begin=str(int(mut.end)-1), comment=newComment)
            yield mut2
        else:
            yield mut

def parseEvidence(entryEl):
    " return a dict with evidCode -> PMID "
    result = {}
    for evidEl in entryEl.findall("evidence"):
        evidCode = evidEl.attrib["key"]
        for dbRefEl in evidEl.findall("source/dbReference"):
            dbType = dbRefEl.attrib["type"]
            if dbType=="PubMed":
                pmid = dbRefEl.attrib["id"]
                #if pmid in pmidBlackList:
                    #continue
                result.setdefault(evidCode, [])
                result[evidCode].append(pmid)
    return result
    
def parseVariants(entryEl, mainIsoAcc, disToName):
    " return MutRecs with disease associated variants "
    # parse the general record comment about diseases
    disRefs, allDiseaseCodes = parseDiseaseComment(entryEl, disToName)

    #if len(disRefs)==0:
        #logging.debug("No disease evidence")
        #return []
    acc = entryEl.find("accession").text
    logging.debug("Diseases in %s" % acc)

    evidPmids = parseEvidence(entryEl)
    mutRecs = list(parseFeatures(entryEl, disRefs, allDiseaseCodes, disToName, evidPmids, mainIsoAcc))
    return mutRecs

def parseRecInfo(entryEl, entry, isoSeqs):
    """parse uniprot general record info into entry dict
    use isoform sequences from isoSeqs
    only process certain taxonIds
    """
    dataset = entryEl.attrib["dataset"]
    entry["dataset"] = dataset

    findSaveList(entryEl, "name", entry, "name")
    findSaveList(entryEl, "accession", entry, "accList")
    entry["acc"] = entry["accList"].split("|")[0]
    logging.debug("Parsing rec info for acc %s" % entry["acc"])

    findSaveList(entryEl, "protein/recommendedName/fullName", entry, "protFullNames")
    findSaveList(entryEl, "protein/recommendedName/shortName", entry, "protShortNames")
    findSaveList(entryEl, "protein/alternativeName/fullName", entry, "protAltFullNames")
    findSaveList(entryEl, "protein/alternativeName/shortName", entry, "protAltShortNames")
    findSaveList(entryEl, "gene/name", entry, "geneName", attribKey="type", attribVal="primary")
    findSaveList(entryEl, "gene/name", entry, "geneSynonyms", attribKey="type", attribVal="synonym")
    findSaveList(entryEl, "gene/name", entry, "geneOrdLocus", attribKey="type", attribVal="ordered locus")
    findSaveList(entryEl, "gene/name", entry, "geneOrf", attribKey="type", attribVal="ORF")
    findSaveList(entryEl, "organism/name", entry, "orgName", attribKey="type", attribVal="scientific")
    findSaveList(entryEl, "organism/name", entry, "orgCommon", attribKey="type", attribVal="common")
    findSaveList(entryEl, "organism/dbReference", entry, "taxonId", useAttrib="id")
    findSaveList(entryEl, "comment/isoform/id", entry, "isoIds")
    findSaveList(entryEl, "comment/isoform/name", entry, "isoNames")
    findSaveList(entryEl, "comment/subcellularLocation/location", entry, "subCellLoc")
    findSaveList(entryEl, "comment", entry, "functionText", attribKey="type", attribVal="function", subSubEl="text")

    mainSeq = entryEl.find("sequence").text
    entry["mainSeq"] = mainSeq

    isoIds, isoNames, seqs = parseIsoforms(entry["acc"], mainSeq, entryEl, isoSeqs)
    dbRefs = parseDbRefs(entryEl)
    entry["mainIsoAcc"] = isoIds[0]

    entry["hgncSym"] = dbRefs.get("hgncGene", "")
    entry["hgncId"] = dbRefs.get("HGNC", "")
    entry["refSeq"] = dbRefs.get("refseqNucl", "")
    entry["refSeqProt"] = dbRefs.get("RefSeq", "")
    entry["ensemblProt"] = dbRefs.get("ensemblProt", "")
    entry["ensemblGene"] = dbRefs.get("ensemblGene", "")
    entry["entrezGene"] = dbRefs.get("GeneID", "")
    entry["kegg"] = dbRefs.get("KEGG", "")
    entry["uniGene"] = dbRefs.get("UniGene", "")
    entry["omimGene"] = dbRefs.get("omimGene", "")
    entry["omimPhenotype"] = dbRefs.get("omimPhenotype", "")
    entry["emblMrna"] = dbRefs.get("emblMrna", "") # mrnas
    entry["emblMrnaProt"] = dbRefs.get("emblMrnaProt", "") # the protein accessions for mrnas
    entry["emblDna"] = dbRefs.get("EmblDna", "") # anything not an mrna
    entry["emblDnaProt"] = dbRefs.get("EmblDnaProt", "") # protein accessions for non-mrnas
    entry["pdb"] = dbRefs.get("PDB", "")
    entry["ec"] = dbRefs.get("EC", "")
        
    entry["isoIds"]="|".join(isoIds)
    entry["isoSeqs"]="|".join(seqs)
    entry["isoNames"]="|".join(isoNames)

    entryRow = EntryRec(**entry)
    return entryRow

def parseRefInfo(entryEl, recName):
    for refEl in entryEl.findall("reference"):
        ref = copy.copy(emptyRef)
        ref["name"] = recName
        citEl = refEl.find("citation")
        ref["citType"] = citEl.attrib["type"]
        year = citEl.attrib.get("date", "")
        ref["year"] = year.split("-")[0]
        ref["journal"] = citEl.attrib.get("name", "")
        if ref["journal"]=="":
            ref["journal"] = citEl.attrib.get("db", "") # for submissions
        ref["vol"] = citEl.attrib.get("volume", "")
        ref["page"] = citEl.attrib.get("first", "")
        for titleEl in citEl.findall("title"):
            ref["title"] = titleEl.text
        authorList = []
        for personEl in citEl.findall("authorList/person"):
            if "name" in personEl.attrib:
                name = personEl.attrib["name"]
                name = name.replace(" ", ",", 1)
                authorList.append(name)
        ref["authors"]=";".join(authorList)
        for dbRefEl in citEl.findall("dbReference"):
            if "type" in dbRefEl.attrib:
                if dbRefEl.attrib["type"]=="DOI":
                    ref["doi"] = dbRefEl.attrib["id"]
                if dbRefEl.attrib["type"]=="PubMed":
                    ref["pmid"] = dbRefEl.attrib["id"]

        findSaveList(refEl, "scope", ref, "scopeList")
        refRow = RefRec(**ref)
        yield refRow

def readIsoforms(inDir):
    " return all isoform sequences as dict isoName (eg. P48347-2) -> sequence "
    isoFname = join(inDir, "uniprot_sprot_varsplic.fasta.gz")
    isoFile = gzip.open(isoFname)
    logging.info("reading isoform sequences from %s" % isoFname)
    isoSeqs = maxbio.parseFastaAsDict(isoFile)
    result = {}
    seqNames = []
    for id, seq in isoSeqs.iteritems():
        idParts = id.split("|")
        isoName = idParts[1]
        result[isoName] = seq
        seqNames.append(idParts[2].split()[0])
    logging.info("Found %d isoform sequences" % len(result))
    return result, len(set(seqNames))


def writeFaSeqs(entry, faFiles, allVariants=False):
    """ write main sequence to faFile with the right taxonId 
    base sequence always has accession as ID 
    """
    #seqIds = entry.isoIds.split("|")
    if allVariants:
        seqIds = entry.isoIds.split("|")
        seqs = entry.isoSeqs.split("|")
    else:
        seqIds = [entry.acc]
        seqs   = [entry.isoSeqs.split("|")[0]]
    taxonId = entry.taxonId
    if "all" in faFiles:
        ofh = faFiles["all"]
    else:
        ofh = faFiles[taxonId]
    #for seqId, seq in zip(seqIds, seqs):
        #ofh.write(">%s\n%s\n" % (seqId, seq))
    c = 0
    for seqId, seq in zip(seqIds, seqs):
        # this was to make sure that the first variant has the uniprot ID
        # sounds like not such a good idea but maybe necessary for the 
        # uniprot lifter file?
        #if c==0 and allVariants:
            #seqId = entry.acc
        ofh.write(">%s\n%s\n" % (seqId.strip(), seq.strip()))
        c+=1

def openFaFiles(taxonIds, outDir, outPrefix, seqType="base"):
    faFiles = {}
    if taxonIds == None:
        taxonIds = ["all"]

    for taxonId in taxonIds:
        taxonId = str(taxonId)
        seqQual = ""
        if seqType!="base":
            seqQual = "."+seqType
        faFname = join(outDir, outPrefix+"."+taxonId+seqQual+".fa.gz")
        faFiles[taxonId] = gzip.open(faFname, "w")
        logging.info("Writing fasta seqs for taxon %s to %s (seqType: %s)" % (taxonId, faFname, seqType))
    return faFiles

def parseUniprot(db, inDir, outDir, taxonIds):
    " parse uniprot, write records and refs to outdir "

    if options.parse:
        fname = options.parse
        logging.info("Debug parse of %s" % fname)
        xmlFile = open(fname)
        isoSeqs, recCount = {}, 1
        outDir = "."
        outPrefix = "temp"
    else:
        isoSeqs, recCount = readIsoforms(inDir)
        if db=="uniprot":
            xmlBase = "uniprot_sprot.xml.gz"
            outPrefix = "uniprot"
            recCount = 500000
        elif db=="uniprotTrembl":
            xmlBase = "uniprot_trembl.xml.gz"
            outPrefix = "uniprotTrembl"
            recCount = 500000*37
        else:
            raise Exception("unknown db")
        xmlFile = gzip.open(join(inDir, xmlBase))
        logging.info("Parsing main XML file %s" % xmlFile.name)

    # create a dict taxonId -> output file handles for record info, pmid reference info and mutation info
    outFhs = {}
    for taxId in taxonIds:
        entryOf = openOutTabFile(outDir, "%s.%s.tab" % (outPrefix, taxId), entryHeaders)
        refOf = openOutTabFile(outDir, "%s.%s.refs.tab" % (outPrefix, taxId), refHeaders)
        mutOf = openOutTabFile(outDir, "%s.%s.mut.tab" % (outPrefix, taxId), mutHeaders)
        outFhs[taxId] = (entryOf, refOf, mutOf)

    disToName = parseDiseases(join(inDir, "humdisease.txt"))
    # base and variant sequence filehandles
    faFiles = openFaFiles(taxonIds, outDir, outPrefix)
    varFaFiles = openFaFiles(taxonIds, outDir, outPrefix, "var")

    emptyEntry = dict(zip(entryHeaders, len(entryHeaders)*[""]))

    pm = maxCommon.ProgressMeter(recCount)
    #for _, entryEl in etree.iterparse(xmlFile.name, tag='{http://uniprot.org/uniprot}entry'):
    for _, entryEl in etree.iterparse(xmlFile):
        if entryEl.tag!="{http://uniprot.org/uniprot}entry":
            continue
        strip_namespace_inplace(entryEl) # die, die stupid namespaces!!
        entry = copy.copy(emptyEntry)

        pm.taskCompleted()

        entryTax = int(entryEl.find("organism/dbReference").attrib["id"])
        if taxonIds==['all']: 
            taxId = "all"
        else:
            if entryTax not in taxonIds:
                continue
        entryOf, refOf, mutOf = outFhs[taxId]

        entryRow = parseRecInfo(entryEl, entry, isoSeqs)
        writeFaSeqs(entryRow, faFiles)
        writeFaSeqs(entryRow, varFaFiles, allVariants=True)

        entryOf.write("\t".join(entryRow)+"\n")
        recName = entryRow.name

        refRows = list(parseRefInfo(entryEl, recName))
        for refRow in refRows:
            refOf.write("\t".join(refRow)+"\n")

        mutRecs = parseVariants(entryEl, entryRow.mainIsoAcc, disToName)
        for mutRow in mutRecs:
            logging.debug("writing row %s" % str(mutRow))
            mutOf.write("\t".join(mutRow)+"\n")

        entryEl.clear()
    logging.info("Skipped annotation types: %s" % ignoredTypes.most_common())

def main(args, options):
    #logFname = join(outDir, "dbParse.log")
    if options.test:
        import doctest
        doctest.testmod()
        sys.exit(0)

    pubGeneric.setupLogging("pubParseDb", options)
    db = args[0]

    refDir = pubConf.dbRefDir
    maxCommon.mustExistDir(refDir, makeDir=True)

    if db=="pdb":
        dbDir = pubConf.pdbBaseDir
        #parsePdb("/hive/data/outside/pdb/o9/pdb1o91.ent.gz", refDir)
        parsePdb(dbDir, refDir)
    elif db in ["uniprot", "uniprotTrembl"]:
        dbDir = pubConf.uniProtBaseDir
        #taxonIds = set(pubConf.uniProtTaxonIds)
        if len(args)==1:
            raise Exception("Please specify a taxonId or list of taxonIds, like 9606, or 'all'")

        taxonIds = args[1]
        if taxonIds=="all":
            taxonIds = ['all']
        else:
            taxonIds=[int(x) for x in taxonIds.split(",")]
        parseUniprot(db, dbDir, refDir, taxonIds)
    else:
        assert(False) # illegal db arg


# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] pdb or uniprot - parse PDB or UniProt to tab-sep files

uniprot parser:
- goes over the disease comment evidences and tries to
  classify annotations as disease-related or not.  
- resolves disease codes to full disease names
- blacklists some PMIDs, annotations from these are skipped


""")

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="show more debug messages")
parser.add_option("-t", "--test", dest="test", action="store_true", help="run tests")
parser.add_option("-p", "--parse", dest="parse", action="store", help="parse a single uniprot xml file (debugging)")
parser.add_option("", "--taxon", dest="taxon", action="store", help="uniprot: only parse specified comma-sep list of taxons and add suffix .<taxonId>. to output files, default %default", default="9606")
(options, args) = parser.parse_args()

if args==[] and not options.test:
    parser.print_help()
    exit(1)

main(args, options)