# gfe-db / Build graph

This notebook contains an updated data pipeline for [`nmdp-bioinformatics/gfe-db`](https://github.com/nmdp-bioinformatics/gfe-db). The goal of the pipeline is to create flat CSV files that can be read and parsed by Cypher's `LOAD CSV` clause to either create new nodes and relationships or merge existing ones.

# Libraries

In [1]:
import os
import sys
sys.path[0] = '../'
import logging

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from bin.build_gfedb import *

# Environment

In [2]:
imgt_hla = 'https://www.ebi.ac.uk/ipd/imgt/hla/docs/release.html'
imgt_hla_media_url = 'https://media.githubusercontent.com/media/ANHIG/IMGTHLA/'
imgt_hla_raw_url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

imgt_kir = 'https://www.ebi.ac.uk/ipd/kir/docs/version.html'
kir_url = 'ftp://ftp.ebi.ac.uk/pub/databases/ipd/kir/KIR.dat'


data_dir = "../../data/" #os.path.dirname(__file__) + "/../../data/"

expre_chars = ['N', 'Q', 'L', 'S']

In [3]:
lastseqid = 1
lastid = 1
lastcdsid = 1

seqids = {}
cdsids = {}
alleleids = {}
group_edges = {}
trans_edges = {}

# The alleles are removed when the allele_nodes.csv is built
skip_alleles = ["HLA-DRB5*01:11", "HLA-DRB5*01:12", "HLA-DRB5*01:13",
                "HLA-DRB5*02:03", "HLA-DRB5*02:04", "HLA-DRB5*02:05",
                "HLA-DRB5*01:01:02", "HLA-DRB5*01:03", "HLA-DRB5*01:05",
                "HLA-DRB5*01:06", "HLA-DRB5*01:07", "HLA-DRB5*01:09",
                "HLA-DRB5*01:10N", "HLA-C*05:208N", "HLA-C*05:206"]

hla_loci = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
            'HLA-DPB1', 'HLA-DQA1', 'HLA-DPA1', 'HLA-DRB3',
            'HLA-DRB4', 'HLA-DRB5']

hla_align = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
             'HLA-DPB1', 'HLA-DQA1', 'HLA-DPA1']

kir_loci = ["KIR3DS1", "KIR3DP1", "KIR3DL3", "KIR3DL2", "KIR3DL1",
            "KIR2DS5", "KIR2DS4", "KIR2DS3", "KIR2DS2", "KIR2DS1",
            "KIR2DP1", "KIR2DL5B", "KIR2DL5A", "KIR2DL4"]

kir_aligloci = ["KIR2DL4", "KIR2DP1", "KIR2DS1", "KIR2DS2", "KIR2DS3",
                "KIR2DS4", "KIR2DS5", "KIR3DL1", "KIR3DL2", "KIR3DL3",
                "KIR3DP1"]

ard_groups = ['G', 'lg', 'lgx']

align = True

In [4]:
kir = None

if kir:
    load_loci = hla_loci + kir_loci
else:
    load_loci = hla_loci

from seqann import gfe
gfe_maker = gfe.GFE(verbose=True, verbosity=2,
                load_features=False, store_features=True,
                loci=load_loci)

# Processes for KIR & HLA

## Setup

In [5]:
dbversions = ["3360"]

verbose = True

In [6]:
out_dir = '' #args.out_dir
release_n = 1 #args.number
releases = '3360'#args.releases
verbosity = 1

align = True
kir = True
debug = False
verbose = True

#if args.kir:
kir = True

#if args.align:
align = True

#if args.verbose:
verbose = True

if kir:
    load_loci = hla_loci + kir_loci
else:
    load_loci = hla_loci

#if args.debug:
#    logging.info("Running in debug mode")
#    load_loci = ["HLA-A"]
#    kir = False
#    debug = True
#    verbose = True
#    verbosity = 2
#    release_n = 1

gfe_e = []
seq_e = []
seq_n = []
cds_n = []
grp_e = []
trs_e = []
allele_n = []

# Get last five IMGT/HLA releases
if releases:
    dbversions = [db for db in releases.split(",")]
else:
    dbversions = pd.read_html(imgt_hla)[0]['Release'][0:release_n].tolist()

# Get latest IMGT/KIR release
kir_release = pd.read_html(imgt_kir)[0]['Release'][0]

from seqann import gfe
gfe_maker = gfe.GFE(verbose=verbose, verbosity=verbosity,
                    load_features=False, store_features=True,
                    loci=load_loci)

## KIR process

Not working due to error:
```python
ValueError: Problem with 'CDS' feature:
join(269..302,1267..1302,3751..4049,5579..5872,9027..9077,
13337..13438,13901..13953,14052..14228)
/codon_start=1
/gene="KIR2DL1"
/allele="KIR2DL1*0450101N"
/product="KIR2DL1 Killer-cell Immunoglobulin-like Receptor"
/translation="MSLLFVSMACVGFFLLQGAWPHEGVHRNLPSWPTQVPWX
```

In [20]:
# KIR process (broken)
if kir:
    if verbose:
        logging.info("Adding KIR to GFE DB")

    kir_file = data_dir + 'KIR.dat'

    if align:
        aligned = kir_alignments()

    # Downloading KIR
    if not os.path.isfile(kir_file):
        if verbose:
            logging.info("Downloading KIR dat file from " + kir_url)
        urllib.request.urlretrieve(kir_url, kir_file)

    kir_gen = SeqIO.parse(kir_file, "imgt")
    if verbose:
        logging.info("Finished parsing KIR dat file")

    i = 0
    for idx, allele in enumerate(kir_gen):
    
        # Breakpoint for development testing
        if idx == 1:
                break
        
        if hasattr(allele, 'seq'):
            loc = allele.description.split(",")[0].split("*")[0]
            if loc in kir_loci and len(str(allele.seq)) > 5:
                if debug:
                    logging.info(
                        "KIR = " + allele.description.split(",")[0] + " " + kir_release)

                groups = []
                complete_annotation = get_features(allele)
                ambigs = [
                    a for a in complete_annotation if re.search("/", a)]

                aligned_seq = ''
                if align:
                    if allele.description.split(",")[0] in aligned[loc]:
                        aligned_seq = aligned[loc][allele.description.split(",")[
                            0]]

                if ambigs:
                    logging.info(
                        "AMBIGS " + allele.description.split(",")[0] + " " + kir_release)
                    annotations = []
                    for ambig in ambigs:
                        if debug:
                            logging.info("AMBIG = " + ambig)
                        aterm = ambig.split("/")[0].split("_")[0]
                        anno = {
                            a: complete_annotation[a] for a in complete_annotation if a not in ambigs}
                        anno.update(
                            {ambig.split("/")[0]: complete_annotation[ambig]})
                        annotations.append(anno)

                        anno2 = {
                            a: complete_annotation[a] for a in complete_annotation if a not in ambigs}
                        anno2.update(
                            {aterm + "_" + ambig.split("/")[1]: complete_annotation[ambig]})
                        annotations.append(anno2)

                    for annotation in annotations:
                        ann = Annotation(annotation=annotation,
                                         method='match',
                                         complete_annotation=True)

                        features, gfe = gfe_maker.get_gfe(ann, loc)

01/30/2021 02:48:52 PM - root - INFO - Adding KIR to GFE DB
01/30/2021 02:48:53 PM - root - INFO - Finished parsing KIR dat file


ValueError: Problem with 'CDS' feature:
join(269..302,1267..1302,3751..4049,5579..5872,9027..9077,
13337..13438,13901..13953,14052..14228)
/codon_start=1
/gene="KIR2DL1"
/allele="KIR2DL1*0450101N"
/product="KIR2DL1 Killer-cell Immunoglobulin-like Receptor"
/translation="MSLLFVSMACVGFFLLQGAWPHEGVHRNLPSWPTQVPWX

## HLA process

Code in this process contains variables that define nodes.

In [123]:
#dbversions = ["3410", "3420", "3430"]
dbversions = ["3360"]

In [130]:
import ast

# Loop through DB versions
for dbversion in dbversions:

    db_striped = ''.join(dbversion.split("."))

    if align:
        gen_aln, nuc_aln, prot_aln = hla_alignments(db_striped)

    ard = ARD(db_striped)

    # The github URL changed from 3350 to media
    if int(db_striped) < 3350:
        dat_url = imgt_hla_raw_url + db_striped + '/hla.dat'
    else:
        dat_url = imgt_hla_media_url + db_striped + '/hla.dat'

    dat_file = data_dir + 'hla.' + db_striped + ".dat"

    # Downloading DAT file
    if not os.path.isfile(dat_file):
        if verbose:
            logging.info("Downloading dat file from " + dat_url)
        urllib.request.urlretrieve(dat_url, dat_file)

    # Parse DAT file
    a_gen = SeqIO.parse(dat_file, "imgt")

    if verbose:
        logging.info("Finished parsing dat file")

    i = 0

    ### Initialize lists for CSV output (input to LOAD CSV in Neo4j)
    # Lists contain unique dicts and are converted to dataframes, then output to CSV for Neo4j import
    gfe_sequences = []
    gen_alignments = []
    nuc_alignments = []
    prot_alignments = []
    all_features = []
    ###
    
    for idx, allele in enumerate(a_gen):

        if hasattr(allele, 'seq'):
            hla_name = allele.description.split(",")[0]
            loc = allele.description.split(",")[0].split("*")[0]

            if hla_name in skip_alleles:
                logging.info(
                    "SKIPPING = " + allele.description.split(",")[0] + " " + dbversion)
                continue

            if debug and (loc != "HLA-A" and i > 20):
                continue

            if (loc in hla_loci or loc == "DRB5") and (len(str(allele.seq)) > 5):
                if debug:
                    logging.info(
                        "HLA = " + allele.description.split(",")[0] + " " + dbversion)

                a_name = allele.description.split(",")[0].split("-")[1]
                groups = [["HLA-" + ard.redux(a_name, grp), grp] if ard.redux(a_name, grp) != a_name else None for
                          grp in ard_groups]
                seco = [[to_second(a_name), "2nd_FIELD"]]
                groups = list(filter(None, groups)) + seco
                complete_annotation = get_features(allele)
                ann = Annotation(annotation=complete_annotation,
                                 method='match',
                                 complete_annotation=True)

                # This process takes a long time
                features, gfe = gfe_maker.get_gfe(ann, loc)

                # gen_aln, nuc_aln, prot_aln
                aligned_gen = ''
                aligned_nuc = ''
                aligned_prot = ''

                if align:
                    if allele.description.split(",")[0] in gen_aln[loc]:
                        aligned_gen = gen_aln[loc][allele.description.split(",")[
                            0]]

                    if allele.description.split(",")[0] in nuc_aln[loc]:
                        aligned_nuc = nuc_aln[loc][allele.description.split(",")[
                            0]]

                    if allele.description.split(",")[0] in prot_aln[loc]:
                        aligned_prot = prot_aln[loc][allele.description.split(",")[
                            0]]
            
            ### Build dicts describing nodes and edges for each allele
            
            # Notes:
            # all edges are joined using foreign keys:
            #  - GFE --> SEQUENCE on alleleId
            #  - GFE --> [GEN_ALIGN, NUC_ALIGN, PROT_ALIGN] on a_name
            #  - GFE --> FEATURE on alleleId (or hla_name)
            # "alleleId" is assigned allele.id value
            # "sequenceId" is replaced with alleleId
            # feature key "name" is now "term"
            
            # Questions:
            # - Should GFE name be assigned to each node?

            # Separate CSV file
            gfe_sequence = {
                "alleleId": allele.id,
                "GFE_name": gfe,
                "locus": loc,
                "HLA_name": hla_name,
                "A_name": a_name, # hla_name.split("-")[1]
                "sequence": str(allele.seq),
                "length": len(str(allele.seq))
            }

            # Separate CSV file, GFE foreign key: a_name
            gen_alignment = {
                "label": "GEN_ALIGN",
                "A_name": a_name, # hla_name.split("-")[1]
                "length": len(aligned_gen),
                "rank": "0", # TO DO: confirm how this value is derived
                "sequence": aligned_gen,
            }

            # Separate CSV file, GFE foreign key: a_name
            nuc_alignment = {
                "label": "NUC_ALIGN",
                "A_name": a_name, # hla_name.split("-")[1]
                "length": len(aligned_nuc),
                "rank": "0", # TO DO: confirm how this value is derived
                "sequence": aligned_nuc
            }

            # Separate CSV file, GFE foreign key: a_name
            prot_alignment = {
                "label": "PROT_ALIGN",
                "A_name": a_name, # hla_name.split("-")[1]
                "length": len(aligned_prot),
                "rank": "0", # TO DO: confirm how this value is derived
                "sequence": aligned_prot
            }

            # features preprocessing steps
            # 1) Convert seqann type to python dict using literal_eval
            # 2) add GFE foreign keys: alleleId, hla_name
            # 3) add columns: length

            # features contains list of seqann objects, converts to dict, destructive step
            features = [ast.literal_eval(str(feature).replace('\'', '"').replace('\n', '')) for feature in features]

            # Append allele id's
            # Note: Some alleles may have the same feature, but it may not be the same rank, 
            # so a feature should be identified with its allele by alleleId or HLA name
            for feature in features:
                feature["term"] = feature["term"].upper()
                feature["alleleId"] = allele.id 
                feature["hla_name"] = hla_name

            # Last step: append each dict to its respective list, convert to dataframe, drop duplicates (especially in features, or convert to set) and output to csv
            ### Append to list
            
            # Pre-process gfe, gen/nuc/prot alignments
            data = zip(
                [gfe_sequences, gen_alignments, nuc_alignments, prot_alignments],
                [gfe_sequence, gen_alignment, nuc_alignment, prot_alignment]
            )
            
            for _list, _dict in data:
                _list.append(_dict)
                        
            ### Pre-process features
            # Join
            all_features = all_features + features
            
            
            ### VALIDATE
                
        if idx == 10:
                break
    
    # Remove duplicates
    tables = [gfe_sequences, gen_alignments, nuc_alignments, prot_alignments, all_features]
    files = ["gfe_sequences", "gen_alignments", "nuc_alignments", "prot_alignments", "all_features"]

    dataframes = []

    path = "../../data/csv/update/"

    # Output to CSV, include dbversion in name
    for table, file in zip(tables, files):
        dataframe = pd.DataFrame(table)
        file_name = path + file + f".{dbversion}.csv"
        dataframe.to_csv(file_name, index=False)
    
    

02/03/2021 09:14:08 PM - root - INFO - Loading ../bin/../../data/3360/A_gen.msf
02/03/2021 09:14:08 PM - root - INFO - Loaded 1771 genomic HLA-A sequences
02/03/2021 09:14:08 PM - root - INFO - Loading ../bin/../../data/3360/A_nuc.msf
02/03/2021 09:14:09 PM - root - INFO - Loaded 5016 nuc HLA-A sequences
02/03/2021 09:14:09 PM - root - INFO - Loading ../bin/../../data/3360/A_prot.msf
02/03/2021 09:14:18 PM - root - INFO - Loaded 5016 prot HLA-A sequences
02/03/2021 09:14:18 PM - root - INFO - Loading ../bin/../../data/3360/B_gen.msf
02/03/2021 09:14:19 PM - root - INFO - Loaded 2149 genomic HLA-B sequences
02/03/2021 09:14:19 PM - root - INFO - Loading ../bin/../../data/3360/B_nuc.msf
02/03/2021 09:14:20 PM - root - INFO - Loaded 6094 nuc HLA-B sequences
02/03/2021 09:14:20 PM - root - INFO - Loading ../bin/../../data/3360/B_prot.msf
02/03/2021 09:14:21 PM - root - INFO - Loaded 6094 prot HLA-B sequences
02/03/2021 09:14:21 PM - root - INFO - Loading ../bin/../../data/3360/C_gen.msf
02

## Validate data

In [131]:
gfe_sequences

[{'alleleId': 'HLA00001.1',
  'GFE_name': 'HLA-Aw2-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-4',
  'locus': 'HLA-A',
  'HLA_name': 'HLA-A*01:01:01:01',
  'A_name': 'A*01:01:01:01',
  'sequence': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGGATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAGGCTCCCACTCCATGAGGTATTTCTTCACATCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAAGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATATGAAGGCCCACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACGGTGAGTGACCCCGGCCCGGGGCGCAGGTCACGACCCCTCATCC

In [127]:
gen_alignments

[{'label': 'GEN_ALIGN',
  'A_name': 'A*01:01:01:01',
  'length': 3848,
  'rank': '0',
  'sequence': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCC-AGACGCCGAGGATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAGGCTCCC-ACTCCATGAGGTATTTCTTCACATCCGTGTCCC-----GGCCCGGCCGCGGGGAGCC----CCGCTTCATCGCCGTGGGC-----------------------TACGTGGAC----GACACG-CAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGA--AGATGGAGCCG--------------------CGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGG--------------AGACACGGA-ATATGAAGGCCC-ACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACGGTGAGTGACCCCGGCCCGGGG-CGCAGGTCACGACCCCTCATCCCCC-A

In [128]:
nuc_alignments

[{'label': 'NUC_ALIGN',
  'A_name': 'A*01:01:01:01',
  'length': 1427,
  'rank': '0',
  'sequence': 'ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGCTCCC-ACTCCATGAGGTATTTCTTCACATCCGTGTCCC-----GGCCCGGCCGCGGGGAGCC----CCGCTTCATCGCCGTGGGC-----------------------TACGTGGAC----GACACG-CAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGA--AGATGGAGCCG--------------------CGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGG--------------AGACACGGA-ATATGAAGGCCC-ACTCACAGACTGACCGAGCGAACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACGGTTCTCACACC-ATCCAGATAATGTATGGCTGCGACG--------------TGGGGCCGG-ACGGGCGCTTCCTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGA-----------------------AC-GAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCAG--------CTCAGATCACCAAGCGC-AAGTGGGA--------------------------------------G--GCGGTCCATGCGGC-GG------------------------------------------AGCAGCGGAGAGTCTACCTGGAGGGCCGG--------TGCGTG----GACGGG--------------CTCCGCAGATA-CCTGGAGAACGGGAAGGAGACGCTGCAGC-----------------GCACGGACCCCCCC--AAGACACATATGACCCAC

In [129]:
prot_alignments

[{'label': 'PROT_ALIGN',
  'A_name': 'A*01:01:01:01',
  'length': 379,
  'rank': '0',
  'sequence': 'MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAE--------------QRRVYLEGRCVDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSLTACKV'},
 {'label': 'PROT_ALIGN',
  'A_name': 'A*01:01:01:02N',
  'length': 379,
  'rank': '0',
  'sequence': 'MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRNMKAHSQTDRANLGTLRGYYNQSEDGDPGPGRRSRPLIPHGRARSPTVSGSEIHPEAAGLRDPCPGRGPGAFTRFHFQFRPKIPPGWSG--------------RGGARGTGLTAGSGPGSHTIQX---------------------------------------------------------------------------------------------------------------------------------------------------------------------'},
 {'label': 'PROT_ALIGN',
  'A_n

## Examine variables

The variables returned by the HLA process above correspond to nodes in the graph:
```python
# GFE related nodes
gfe # --> GFE
features # --> FEATURE
aligned_gen # --> ALIGN_GEN
aligned_nuc # --> ALIGN_NUC
aligned_prot # --> ALIGN_PROT

# HLA related nodes
hla_name
groups
loc
```

### GFE related nodes

Cross-referenced with Neo4j

In [75]:
# Validate GFE sequence from Neo4j
'CAGGAGCAGAGGGG' in str(allele.seq)

True

In [76]:
# GFE label
gfe

'HLA-Aw2-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-4'

In [56]:
# Contains list of seqann objects, converts to dict, destructive step
#features = [ast.literal_eval(str(feature).replace('\'', '"').replace('\n', '')) for feature in features]

In [57]:
type(features[0])

dict

### HLA related nodes

Cross-referenced with Neo4j

In [25]:
hla_name

'HLA-A*01:01:01:01'

In [26]:
groups

[['HLA-A*01:01:01G', 'G'],
 ['HLA-A*01:01g', 'lg'],
 ['HLA-A*01:01', 'lgx'],
 ['A*01:01', '2nd_FIELD']]

In [27]:
loc

'HLA-A'

### Build dictionaries

Each node and it's properties is represented as a dictionary.

Foreign keys:
- `GFE` --> `SEQUENCE` on `alleleId`
- `GFE` --> `[GEN_ALIGN, NUC_ALIGN, PROT_ALIGN]` on `a_name`
- `GFE` --> `FEATURE` on `alleleId` (or `hla_name`)

In [45]:
allele.id

'HLA00001.1'

In [53]:
hla_name

'HLA-A*01:01:01:01'

In [62]:
# Notes:
# "alleleId" is assigned allele.id value
# "sequenceId" is replaced with alleleId
# feature key "name" is now "term"

# Step: initialize a list for each respective dictionary before the for loop

# Separate CSV file
gfe_dict = {
    "gfe_alleleId": allele.id,
    "gfe_name": gfe,
    "gfe_locus": loc,
    "hla_name": hla_name,
    "a_name": a_name, # hla_name.split("-")[1]
    "sequence_alleleId": allele.id,
    "sequence_name": "SEQUENCE",
    "sequence_sequence": str(allele.seq),
    "sequence_length": len(str(allele.seq))
}

# Separate CSV file, GFE foreign key: a_name
gen_align_dict = {
    "gen_align_name": "GEN_ALIGN",
    "a_name": a_name, # hla_name.split("-")[1]
    "gen_align_length": len(aligned_gen),
    "gen_align_rank": "0", # TO DO: confirm how this value is derived
    "gen_align_seq": aligned_gen,
}

# Separate CSV file, GFE foreign key: a_name
nuc_align_dict = {
    "nuc_align_name": "NUC_ALIGN",
    "a_name": a_name, # hla_name.split("-")[1]
    "nuc_align_length": len(aligned_nuc),
    "nuc_align_rank": "0", # TO DO: confirm how this value is derived
    "nuc_align_seq": aligned_nuc
}

# Separate CSV file, GFE foreign key: a_name
prot_align_dict = {
    "prot_align_name": "PROT_ALIGN",
    "a_name": a_name, # hla_name.split("-")[1]
    "prot_align_length": len(aligned_prot),
    "prot_align_rank": "0", # TO DO: confirm how this value is derived
    "prot_align_seq": aligned_prot
}

# features preprocessing
# 1) add GFE foreign keys: alleleId, hla_name
# 2) add columns: length

# Contains list of seqann objects, converts to dict, destructive step
#features = [ast.literal_eval(str(feature).replace('\'', '"').replace('\n', '')) for feature in features]

for feature in features:
    feature["alleleId"] = allele.id
    feature["hla_name"] = hla_name
    
# Last step: append each dict to its respective list, (convert to set?), convert to dataframe, drop duplicates (especially in features, or convert to set) and output to csv


In [60]:
features

[{'accession': 2,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGG',
  'term': 'five_prime_UTR',
  'alleleId': 'HLA00001.1',
  'hla_name': 'HLA-A*01:01:01:01'},
 {'accession': 1,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGG',
  'term': 'exon',
  'alleleId': 'HLA00001.1',
  'hla_name': 'HLA-A*01:01:01:01'},
 {'accession': 1,
  'hash_code': None,
  'locus': 'HLA-A',
  'rank': 1,
  'sequence': 'GTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAG',
  'term': 'intron',
  'alleleId': 'HLA00001.1',
  'hl

In [42]:
a_name

'A*01:01:01:01'

In [43]:
gfe_dict

{'gfe_alleleId': 'HLA00001.1',
 'gfe_name': 'HLA-Aw2-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-4',
 'gfe_locus': 'HLA-A',
 'gen_align_name': 'GEN_ALIGN',
 'gen_align_length': 3848,
 'gen_align_rank': '0',
 'gen_align_seq': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCC-AGACGCCGAGGATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGTGAGTGCGGGGTCGGGAGGGAAACCGCCTCTGCGGGGAGAAGCAAGGGGCCCTCCTGGCGGGGGCGCAGGACCGGGGGAGCCGCGCCGGGAGGAGGGTCGGGCAGGTCTCAGCCACTGCTCGCCCCCAGGCTCCC-ACTCCATGAGGTATTTCTTCACATCCGTGTCCC-----GGCCCGGCCGCGGGGAGCC----CCGCTTCATCGCCGTGGGC-----------------------TACGTGGAC----GACACG-CAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGA--AGATGGAGCCG--------------------CGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGG--------------AGACACGGA-ATATGAAGGCCC-A

In [30]:
vars(features[0])

{'swagger_types': {'locus': 'str',
  'term': 'str',
  'rank': 'int',
  'accession': 'int',
  'sequence': 'str',
  'hash_code': 'int'},
 'attribute_map': {'locus': 'locus',
  'term': 'term',
  'rank': 'rank',
  'accession': 'accession',
  'sequence': 'sequence',
  'hash_code': 'hashCode'},
 '_locus': 'HLA-A',
 '_term': 'five_prime_UTR',
 '_rank': 1,
 '_accession': 2,
 '_sequence': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGG',
 '_hash_code': None}

In [31]:
str(features[0])

"{'accession': 2,\n 'hash_code': None,\n 'locus': 'HLA-A',\n 'rank': 1,\n 'sequence': 'CAGGAGCAGAGGGGTCAGGGCGAAGTCCCAGGGCCCCAGGCGTGGCTCTCAGGGTCTCAGGCCCCGAAGGCGGTGTATGGATTGGGGAGTCCCAGCCTTGGGGATTCCCCAACTCCGCAGTTTCTTTTCTCCCTCTCCCAACCTACGTAGGGTCCTTCATCCTGGATACTCACGACGCGGACCCAGTTCTCACTCCCATTGGGTGTCGGGTTTCCAGAGAAGCCAATCAGTGTCGTCGCGGTCGCTGTTCTAAAGTCCGCACGCACCCACCGGGACTCAGATTCTCCCCAGACGCCGAGG',\n 'term': 'five_prime_UTR'}"

In [33]:
import ast
feature_dict = ast.literal_eval(feature)

NameError: name 'feature' is not defined

In [34]:
feature_dict

{'feature_alleleId': 'HLA00001.1',
 'feature_sequence': '',
 'feature_name': '',
 'feature_length': '',
 'feature_rank': '',
 'feature_sequenceId': ''}

In [35]:
# build dict
hla_dict = {
    "label": "IMGT_HLA",
    "name": hla_name,
    "locus": loc
}

g_dict = {
    "label": "G",
    "name": groups[0][0],
    "locus": loc
}


lg_dict = {
    "label": "lg",
    "name": groups[1][0],
    "locus": loc
}

lgx_dict = {
    "label": "lgx",
    "name": groups[2][0],
    "locus": loc
}

second_field_dict = {
    "label": "2nd_FIELD",
    "name": groups[3][0],
    "locus": loc
}

hla_data = [
    hla_dict,
    g_dict,
    lg_dict,
    lgx_dict,
    second_field_dict
]

In [36]:
hla_data

[{'label': 'IMGT_HLA', 'name': 'HLA-A*01:01:01:01', 'locus': 'HLA-A'},
 {'label': 'G', 'name': 'HLA-A*01:01:01G', 'locus': 'HLA-A'},
 {'label': 'lg', 'name': 'HLA-A*01:01g', 'locus': 'HLA-A'},
 {'label': 'lgx', 'name': 'HLA-A*01:01', 'locus': 'HLA-A'},
 {'label': '2nd_FIELD', 'name': 'A*01:01', 'locus': 'HLA-A'}]

# ---> PICK UP HERE

In [37]:
from pandas.io.json import json_normalize

json_normalize(features)

  This is separate from the ipykernel package so we can avoid doing imports until


AttributeError: 'Feature' object has no attribute 'values'