In [61]:
from seqann.gfe import GFE

dbversion = "3420"

verbose = True
verbosity = 1

gfe_maker = GFE(verbose=verbose, 
    verbosity=verbosity,
    load_features=False, 
    store_features=True,
    loci=hla_loci)

# gfedb_utils.py

In [18]:
import os
import sys

# Uncomment for Jupyter Notebook
sys.path.append(['../','../src/'])

import logging
import re
import ast
import time
import urllib.request
from Bio import AlignIO
from Bio.SeqFeature import SeqFeature
from Bio.SeqRecord import SeqRecord
from seqann.models.annotation import Annotation
from Bio import SeqIO
from pyard import ARD
from csv import DictWriter
from pathlib import Path
from constants import *
import hashlib

In [19]:
# Output memory profile to check for leaks
_mem_profile = True if '-p' in sys.argv else False

if _mem_profile:
    from pympler import tracker, muppy, summary
    tr = tracker.SummaryTracker()

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO)


logging.debug(f'args: {sys.argv}')

In [20]:
def seq_hasher(seq, n=32):
    """Takes a nucleotide or amino acid sequence and returns a reproducible
    integer UUID. Used to create shorter unique IDs since Neo4j cannot index 
    a full sequence. Can be also be used for any string."""

    m = hashlib.md5()
    m.update(seq)

    return str(int(m.hexdigest(), 16))[:n]

In [21]:
def hla_alignments(dbversion):
    gen_aln = {l: {} for l in hla_loci}
    nuc_aln = {l: {} for l in hla_loci}
    prot_aln = {l: {} for l in hla_loci}

    #logging.info(f'HLA alignments:\n{hla_align}')

    for loc in hla_align:
        msf_gen = ''.join([data_dir, dbversion, "/", loc.split("-")[1], "_gen.msf"])
        msf_nuc = ''.join([data_dir, dbversion, "/", loc.split("-")[1], "_nuc.msf"])
        msf_prot = ''.join([data_dir, dbversion, "/", loc.split("-")[1], "_prot.msf"])

        logging.info(f'Loading {"/".join(msf_gen.split("/")[-3:])}')
        align_gen = AlignIO.read(open(msf_gen), "msf")
        gen_seq = {"HLA-" + a.name: str(a.seq) for a in align_gen}
        del align_gen
        logging.info(f'{str(len(gen_seq))} genomic alignments loaded')
        gen_aln.update({loc: gen_seq})

        logging.info(f'Loading {"/".join(msf_nuc.split("/")[-3:])}')
        align_nuc = AlignIO.read(open(msf_nuc), "msf")
        nuc_seq = {"HLA-" + a.name: str(a.seq) for a in align_nuc}
        del align_nuc
        logging.info(f'{str(len(nuc_seq))} nucleotide alignments loaded')
        nuc_aln.update({loc: nuc_seq})

        # https://github.com/ANHIG/IMGTHLA/issues/158
        # if str(dbversion) == ["3320", "3360"]:
        #    continue

        logging.info(f'Loading {"/".join(msf_prot.split("/")[-3:])}')
        align_prot = AlignIO.read(open(msf_prot), "msf")
        prot_seq = {"HLA-" + a.name: str(a.seq) for a in align_prot}
        del align_prot
        logging.info(f'{str(len(prot_seq))} protein alignments loaded')
        prot_aln.update({loc: prot_seq})

    return gen_aln, nuc_aln, prot_aln

In [22]:
def get_features(seqrecord):
    j = 3 if len(seqrecord.features) > 3 else len(seqrecord.features)
    fiveutr = [["five_prime_UTR", SeqRecord(seq=seqrecord.features[i].extract(seqrecord.seq), id="1")] for i in
               range(0, j) if seqrecord.features[i].type != "source"
               and seqrecord.features[i].type != "CDS" and isinstance(seqrecord.features[i], SeqFeature)
               and not seqrecord.features[i].qualifiers]
    feats = [[''.join([str(feat.type), "_", str(feat.qualifiers['number'][0])]), SeqRecord(seq=feat.extract(seqrecord.seq), id="1")]
             for feat in seqrecord.features if feat.type != "source"
             and feat.type != "CDS" and isinstance(feat, SeqFeature)
             and 'number' in feat.qualifiers]

    threeutr = []
    if len(seqrecord.features) > 1:
        threeutr = [["three_prime_UTR", SeqRecord(seq=seqrecord.features[i].extract(seqrecord.seq), id="1")] for i in
                    range(len(seqrecord.features) - 1, len(seqrecord.features)) if
                    seqrecord.features[i].type != "source"
                    and seqrecord.features[i].type != "CDS" and isinstance(seqrecord.features[i], SeqFeature)
                    and not seqrecord.features[i].qualifiers]

    feat_list = fiveutr + feats + threeutr
    annotation = {k[0]: k[1] for k in feat_list}

    return annotation

In [23]:
# Returns base pair and amino acid sequences from CDS data
def get_cds(allele):

    feat_types = [f.type for f in allele.features]
    bp_seq = None
    aa_seq = None
    
    if "CDS" in feat_types:
        cds_features = allele.features[feat_types.index("CDS")]
        if 'translation' in cds_features.qualifiers:

            if cds_features.location is None:
                logging.info(f"No CDS location for feature in allele: {allele.name}")
            else:
                bp_seq = str(cds_features.extract(allele.seq))
                aa_seq = cds_features.qualifiers['translation'][0]
                
    return bp_seq, aa_seq

In [24]:
# Streams dictionaries as rows to a file
def append_dict_as_row(file_path, dict_row):

    header = list(dict_row.keys())

    # Check if file exists
    csv_file = Path(file_path)
    if not csv_file.is_file():

        # Create the file and add the header
        with open(file_path, 'a+', newline='') as write_obj:
            dict_writer = DictWriter(write_obj, fieldnames=header)
            dict_writer.writeheader()

    # Do not add an else statement or the first line will be skipped
    with open(file_path, 'a+', newline='') as write_obj:
        dict_writer = DictWriter(write_obj, fieldnames=header)
        dict_writer.writerow(dict_row)

    return

In [25]:
# Outputs memory of objects during execution to sheck for memory leaks
if _mem_profile:
    def memory_profiler(mode='all'):

        # Print a summary of memory usage every n alleles
        all_objects = muppy.get_objects()
        sum2 = summary.summarize(all_objects)

        original_stdout = sys.stdout

        if mode == 'all' or mode == 'agg':
            with open("summary_agg.txt", "a+") as f:
                sys.stdout = f
                summary.print_(sum2)
                sys.stdout = original_stdout;

        if mode == 'all' or mode == 'diff':
            with open("summary_diff.txt", "a+") as f:
                sys.stdout = f
                tr.print_diff()
                sys.stdout = original_stdout;    

        return

In [49]:
### Refactor build alignments

def build_alignments(allele, gfe, dbversion, stream=False):
    """Builds genomic, nucleotide and protein alignments"""
    
    hla_name = allele.description.split(",")[0]
    loc = allele.description.split(",")[0].split("*")[0]
    imgt_release = f'{dbversion[0]}.{dbversion[1:3]}.{dbversion[3]}'
    
    if allele.description.split(",")[0] in gen_aln[loc]:
        aligned_gen = gen_aln[loc][allele.description.split(",")[0]]
        gen_alignment = {
            "label": "GEN_ALIGN",
            "seq_id": seq_hasher(aligned_gen.encode('utf-8')),
            "gfe_name": gfe,
            "hla_name": hla_name,
            "a_name": hla_name.split("-")[1],
            "length": len(aligned_gen),
            "rank": "0", # TO DO: confirm how this value is derived
            "bp_sequence": aligned_gen,
            "aa_sequence": "",
            "imgt_release": imgt_release # 3.24.0 instead of 3240
        }       
        
    if allele.description.split(",")[0] in nuc_aln[loc]:
        aligned_nuc = nuc_aln[loc][allele.description.split(",")[
            0]]

        nuc_alignment = {
            "label": "NUC_ALIGN",
            "seq_id": seq_hasher(aligned_nuc.encode('utf-8')),
            "gfe_name": gfe,
            "hla_name": hla_name,
            "a_name": a_name, # hla_name.split("-")[1]
            "length": len(aligned_nuc),
            "rank": "0", # TO DO: confirm how this value is derived
            "bp_sequence": aligned_nuc,
            "aa_sequence": "",
            "imgt_release": imgt_release
        }

    if allele.description.split(",")[0] in prot_aln[loc]:
        aligned_prot = prot_aln[loc][allele.description.split(",")[
            0]]

        prot_alignment = {
            "label": "PROT_ALIGN",
            "seq_id": seq_hasher(aligned_prot.encode('utf-8')),
            "gfe_name": gfe,
            "hla_name": hla_name,
            "a_name": a_name, # hla_name.split("-")[1]
            "length": len(aligned_prot),
            "rank": "0", # TO DO: confirm how this value is derived
            "bp_sequence": "",
            "aa_sequence": aligned_prot,
            "imgt_release": imgt_release
        }

        
    if stream:
        logging.info(f'Streaming alignments to file...')
        
        file_path = f'{data_dir}csv/all_alignments.{dbversion}.csv'
        
        for alignment in [gen_alignment, nuc_alignment, prot_alignment]:
            append_dict_as_row(file_path, alignment)
    
        del aligned_nuc
        del aligned_gen
        del aligned_prot
        
        return
    else:
        return gen_alignment, nuc_alignment, prot_alignment

In [None]:
# Build the datasets for the HLA graph
def build_hla_graph(**kwargs):

    dbversion, alignments, verbose, debug, gfe_maker, limit = \
        kwargs.get("dbversion"), \
        kwargs.get("alignments", False), \
        kwargs.get("verbose", False), \
        kwargs.get("debug", False), \
        kwargs.get("gfe_maker"), \
        kwargs.get("limit", None) #, \
        # kwargs.get("mem_profile", False)
    
    #num_alleles = limit if limit else kwargs.get("num_alleles")

    def _stream_to_csv(a_gen, alignments, limit):

        i = 0
        total_time_elapsed = 0

        for idx, allele in enumerate(a_gen):
            
            start_time = time.time()
            
            # build_gfe()
            # build_alignments()
            # build_features()

            if hasattr(allele, 'seq'):
                
                # TO DO - remove if not used
                hla_name = allele.description.split(",")[0]
                loc = allele.description.split(",")[0].split("*")[0]

                if (loc in hla_loci or loc == "DRB5") and (len(str(allele.seq)) > 5):

                    try:

                        # Retrieve and stream the genomic, nucleotide and protein alignments
                        if alignments:
                            
                            ###

                    except Exception as err:
                        logging.error(f'Failed to get data for allele ID {allele.id}')
                        logging.error(err)                        
                
                # Build and stream the GFE rows
                try:

                    _seq = str(allele.seq)

                    gfe_sequence = {
                        "gfe_name": gfe,
                        "allele_id": allele.id,
                        "locus": loc,
                        "hla_name": hla_name,
                        "a_name": a_name, # hla_name.split("-")[1]
                        "seq_id": seq_hasher(_seq.encode('utf-8')),
                        "sequence": _seq,
                        "length": len(_seq),
                        "imgt_release": imgt_release
                    }

                    del _seq

                    logging.info(f'Streaming GFEs to file...')
                    file_name = ''.join([data_dir, f'csv/gfe_sequences.{dbversion}.csv'])
                    append_dict_as_row(file_name, gfe_sequence)

                except Exception as err:
                    logging.error(f'Failed to write GFE data for allele ID {allele.id}')
                    logging.error(err)   

                # Build and stream the ARD group rows
                try:
                    logging.info(f'Streaming groups to file...')
                    for group in groups:
                        group_dict = {
                            "gfe_name": gfe,
                            "allele_id": allele.id,
                            "hla_name": hla_name,
                            "a_name": a_name,
                            "ard_id": group[0],
                            "ard_name": group[1],
                            "locus": loc,
                            "imgt_release": imgt_release
                        }

                        file_path = f'{data_dir}csv/all_groups.{dbversion}.csv'
                        append_dict_as_row(file_path, group_dict)

                    del groups

                except Exception as err:
                    logging.error(f'Failed to write groups for allele {allele.id}')
                    logging.error(err)

                # Build and stream the CDS rows
                try:
                    # Build CDS dict for CSV export, foreign key: allele_id, hla_name
                    bp_seq, aa_seq = get_cds(allele)

                    cds = {
                        "gfe_name": gfe,
                        # "gfe_sequence": str(allele.seq),
                        # "allele_id": allele.id,
                        # "hla_name": hla_name,
                        "bp_seq_id": seq_hasher(bp_seq.encode('utf-8')),
                        "bp_sequence": bp_seq,
                        "aa_seq_id": seq_hasher(aa_seq.encode('utf-8')),
                        "aa_sequence": aa_seq,
                        # "imgt_release": imgt_release
                    }

                    logging.info(f'Streaming CDS to file...')
                    file_path = f'{data_dir}csv/all_cds.{dbversion}.csv'
                    append_dict_as_row(file_path, cds)

                    del bp_seq
                    del aa_seq

                except Exception as err:
                    logging.error(f'Failed to write CDS data for allele {allele.id}')
                    logging.error(err)

                # Build and stream the features rows
                def build_features()
                try:
                    # features preprocessing steps
                    # 1) Convert seqann type to python dict using literal_eval
                    # 2) add GFE foreign keys: allele_id, hla_name
                    # 3) calculate columns: length

                    # features contains list of seqann objects, converts to dict, destructive step
                    features = \
                        [ast.literal_eval(str(feature) \
                            .replace('\'', '"') \
                            .replace('\n', '')) \
                            for feature in features]               

                    # Append allele id's
                    # Note: Some alleles may have the same feature, but it may not be the same rank, 
                    # so a feature should be identified with its allele by allele_id or HLA name
                    
                    logging.info(f'Streaming features to file...')
                    for feature in features:
                        feature["gfe_name"] = gfe
                        feature["term"] = feature["term"].upper()
                        feature["allele_id"] = allele.id 
                        feature["hla_name"] = hla_name
                        feature["imgt_release"] = imgt_release

                        # Avoid null values in CSV for Neo4j import
                        feature["hash_code"] = "none" if not feature["hash_code"] else feature["hash_code"]

                        file_path = f'{data_dir}csv/all_features.{dbversion}.csv'
                        append_dict_as_row(file_path, feature)

                    del features

                except Exception as err:
                    logging.error(f'Failed to write features for allele {allele.id}')
                    logging.error(err)

            elapsed_time = time.time() - start_time
            # alleles_remaining = num_alleles - (idx + 1)
            total_time_elapsed += elapsed_time
            avg_time_elapsed = total_time_elapsed / (idx + 1)
            # total_time_elapsed += ((alleles_remaining * elapsed_time) / 60)
            # avg_time_elapsed = total_time_elapsed / num_alleles
            # time_remaining = elapsed_time * alleles_remaining
            
            logging.info(f'Alleles processed: {idx + 1}')
            # logging.info(f'Alleles remaining: {alleles_remaining}')
            logging.info(f'Elapsed time: {round(elapsed_time, 4)}')
            logging.info(f'Avg elapsed time: {round(avg_time_elapsed, 4)}')
            #logging.info(f'Estimated time remaining: {time.strftime("%H:%M:%S", time.gmtime(time_remaining))} minutes')
            
            # Break point for testing
            if limit and idx + 1 == limit:
                    break

            # Output memory profile to check for leaks; TO DO: make a parameter for frequency of profiling
            if _mem_profile and idx % 20 == 0:
                memory_profiler()

        return


    # TO DO - remove if imgt_release is not used
    imgt_release = f'{dbversion[0]}.{dbversion[1:3]}.{dbversion[3]}'
    # dbversion = ''.join(dbversion.split("."))
    
    logging.debug(f'dbversion: {dbversion}')
    logging.debug(f'imgt_release: {imgt_release}')
    logging.debug(f'dbversion: {dbversion}')

    if alignments:
        gen_aln, nuc_aln, prot_aln = hla_alignments(dbversion)
        # TO DO - build alignments data
        # build_alignments()



    ###### TO DO: move DAT download to build.sh ######
    # Downloading DAT file
    # The github URL changed from 3350 to media
    if int(dbversion) < 3350:
        dat_url = ''.join([imgt_hla_raw_url, dbversion, '/hla.dat'])
    else:
        dat_url = ''.join([imgt_hla_media_url, dbversion, '/hla.dat'])

        
    # TO DO - delete
    dat_file = ''.join([data_dir, 'hla.', dbversion, ".dat"])

    logging.info("Downloading DAT file...")
    if not os.path.isfile(dat_file):
        if verbose:
            logging.info("Downloading dat file from " + dat_url)
            
        urllib.request.urlretrieve(dat_url, dat_file)

    ###################################################    

    logging.info("Streaming rows CSV files...")
    
    # TO DO - refactor this into multiple functions for each process: 
    _stream_to_csv(
        a_gen=a_gen, 
        alignments=alignments, 
        limit=limit)

    return

In [39]:
def parse_dat(data_dir, dbversion):
    
    logging.info("Parsing DAT file...")
    dat_file = ''.join([data_dir, 'hla.', dbversion, ".dat"])
    
    return SeqIO.parse(dat_file, "imgt")

In [51]:
def get_groups(allele):
    a_name = allele.description.split(",")[0].split("-")[1]
    groups = [["HLA-" + ard.redux(a_name, grp), grp] if ard.redux(a_name, grp) != a_name else None for
                grp in ard_groups]

    # expre_chars = ['N', 'Q', 'L', 'S']
    # to_second = lambda a: ":".join(a.split(":")[0:2]) + \
    #    list(a)[-1] if list(a)[-1] in expre_chars and \
    #    len(a.split(":")) > 2 else ":".join(a.split(":")[0:2])
    # seco = [[to_second(a_name), "2nd_FIELD"]]

    return list(filter(None, groups)) # + seco

## Testing

In [62]:
alleles = parse_dat(data_dir, "3420")

logging.info("Loading ARD...")
ard = ARD(dbversion)

2021-04-17 17:56:07 - root - INFO - Parsing DAT file...
2021-04-17 17:56:07 - root - INFO - Loading ARD...


In [63]:
for allele in alleles:
    
    if hasattr(allele, 'seq'):
        
        locus = allele.description.split(",")[0].split("*")[0]
        
        if (locus in hla_loci or loc == "DRB5") and (len(str(allele.seq)) > 5):
            
            groups = get_groups(allele)
            
            complete_annotation = get_features(allele)

            ann = Annotation(annotation=complete_annotation,
                    method='match',
                    complete_annotation=True)

            # This process takes a long time
            logging.info(f"Getting GFE data for allele {allele.id}...")
            features, gfe = gfe_maker.get_gfe(ann, loc)
            
            alignments = build_alignments(allele, gfe, dbversion, stream=False)
            
            print(allele.seq)
    
    break

2021-04-17 17:56:12 - root - INFO - Getting GFE data for allele HLA00001.1...
2021-04-17 17:56:14 - Logger.seqann.gfe - INFO - GFE = HLA-Aw2-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-4


NameError: name 'gen_aln' is not defined