# Create conversion maps from human genome files

Human gene info can be downloaded from ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz and gunzipped to a text file. This notebook parses that text file to create Python dict objects that map any entrez_id to its gene_symbol, or vice versa.

Some of these functions are copied from PyGEST, and cleaned up for more general use.

In [17]:
""" Change this to True to download and unpack a fresh genome to the local directory. """
# Leave this False to reuse a gene_info file that is already on disk.
download_new_file = False
# Location of the unpacked gene_info text file; an alternative path is kept below for reference.
# human_genome_file = '/data/sourcedata/Homo_sapiens.gene_info'
human_genome_file = "./Homo_sapiens.gene_info"

In [18]:
""" Download a fresh version of the human genome annotation, if desired. """
if download_new_file:
    import gzip
    import shutil
    import urllib.request as request

    baseURL = "ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia"
    filename = "Homo_sapiens.gene_info.gz"

    # Stream the archive through gzip straight to disk, so neither the compressed download
    # nor the unpacked text has to fit in memory at once. The context managers close the
    # connection and the output file even if the transfer or the decompression fails.
    with request.urlopen("/".join([baseURL, filename, ])) as response:
        with gzip.GzipFile(fileobj=response) as unpacked, open(human_genome_file, "wb") as f:
            shutil.copyfileobj(unpacked, f)


In [19]:
""" For this notebook, load the gene_info file and keep it in memory. """

import pandas as pd

gene_info_file = human_genome_file

# Parse the tab-separated gene_info table and index it by GeneID in a single step.
# NOTE(review): the table displayed further down includes rows whose #tax_id is not 9606;
# confirm whether those rows should be filtered out before the maps are built.
human_genome_info = pd.read_csv(gene_info_file, sep='\t', index_col='GeneID')


In [22]:
# Show the loaded table (a notebook renders the value of a cell's last expression).
human_genome_info

Unnamed: 0_level_0,#tax_id,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
GeneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,9606,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20191012,-
2,9606,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20191012,-
3,9606,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20191012,-
9,9606,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20191103,-
10,9606,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20191014,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8923215,741158,trnD,-,-,-,MT,-,tRNA,tRNA,-,-,-,-,20180406,-
8923216,741158,trnP,-,-,-,MT,-,tRNA,tRNA,-,-,-,-,20180406,-
8923217,741158,trnA,-,-,-,MT,-,tRNA,tRNA,-,-,-,-,20180406,-
8923218,741158,COX1,-,-,-,MT,-,cytochrome c oxidase subunit I,protein-coding,-,-,-,cytochrome c oxidase subunit I,20180406,-


In [3]:
def id_to_symbol_map(gene_info=None):
    """
    Build a dictionary allowing rapid gene symbol lookup from entrez ids.

    :param gene_info: Optional dataframe indexed by GeneID, with a 'Symbol' column.
                      By default, the notebook-level human_genome_info is used.
    :return: dictionary mapping each entrez id to its gene symbol
    """

    if gene_info is None:
        gene_info = human_genome_info
    return gene_info['Symbol'].to_dict()


In [5]:
def _record_entrez_id(mapping, symbol, entrez_id, dupe_message, print_dupes):
    """
    Store entrez_id under symbol in mapping.

    The first id seen for a symbol is stored bare. Any further, different id turns the entry
    into a list of every id seen, which later marks the symbol as ambiguous.
    """

    existing = mapping.get(symbol)
    if existing is None or (not isinstance(existing, list) and existing == entrez_id):
        mapping[symbol] = entrez_id
        return
    if print_dupes:
        print(dupe_message.format(entrez_id, symbol, existing))
    if isinstance(existing, list):
        existing.append(entrez_id)
    else:
        mapping[symbol] = [existing, entrez_id]


def _apply_hits(record, hits, id_key, hits_key):
    """ Copy one map entry (a bare id, or a list of competing ids) into a per-symbol record. """

    if isinstance(hits, list):
        # Several ids compete for this symbol; 0 flags it as ambiguous.
        record['entrez_id'] = 0
        record[id_key] = None
        record[hits_key] = len(hits)
    else:
        record['entrez_id'] = int(hits)
        record[id_key] = int(hits)
        record[hits_key] = 1


def symbol_to_id_map(use_synonyms=True, print_dupes=False, gene_info=None):
    """
    Convert the gene info table to a dictionary allowing rapid entrez_id lookup from symbols

    Primary symbols (the 'Symbol' column) take priority over synonyms (the 'Synonyms' column).
    A symbol claimed by more than one entrez_id is ambiguous and is mapped to 0.
    Summary statistics are printed as a side effect.

    :param use_synonyms: Set to False to only use gene symbols from the symbol column. By default, synonyms match too.
    :param print_dupes: Set to True to print out each time an entrez_id is overwritten during map creation.
    :param gene_info: Optional dataframe indexed by GeneID, with 'Symbol' and 'Synonyms' columns.
                      By default, the notebook-level human_genome_info is used.
    :return: dictionary mapping symbols to entrez ids
    """

    if gene_info is None:
        gene_info = human_genome_info

    # Sort once and walk plain python lists; this avoids building a Series per row.
    ordered = gene_info.sort_index(ascending=False)
    entrez_ids = ordered.index.tolist()

    syn_map = {}
    sid_map = {}
    symbols = set()

    # Map synonyms first. They can later be overwritten by primary symbols
    if use_synonyms:
        for entrez_id, synonyms in zip(entrez_ids, ordered['Synonyms'].tolist()):
            # A missing value is not a string; there is nothing to split.
            if not isinstance(synonyms, str):
                continue
            for symbol in synonyms.split("|"):
                symbols.add(symbol)
                _record_entrez_id(
                    syn_map, symbol, entrez_id, "  appending synonym '{}' to {{{}:{}}}", print_dupes
                )

    # Reverse-map Entrez IDs and Symbols
    for entrez_id, symbol in zip(entrez_ids, ordered['Symbol'].tolist()):
        # A missing symbol cannot serve as a key, and would break the sort below.
        if not isinstance(symbol, str):
            continue
        symbols.add(symbol)
        _record_entrez_id(sid_map, symbol, entrez_id, "  appending id '{}' to {{{}:{}}}", print_dupes)

    # Remove the empty symbol
    symbols.discard("-")

    # Determine the appropriate entrez_id to use for each gene symbol.
    symbol_list = []
    for gene in sorted(symbols):
        this_gene = {'gene': gene, 'syn_hits': 0, 'id_hits': 0}
        # Try synonyms first, then they can be overwritten if necessary.
        if gene in syn_map:
            _apply_hits(this_gene, syn_map[gene], 'syn_id', 'syn_hits')
        # IDs are priority. If we find one, overwrite a synonym.
        if gene in sid_map:
            _apply_hits(this_gene, sid_map[gene], 'sid_id', 'id_hits')
        symbol_list.append(this_gene)

    # Manually add a few that are in AHBA, but not in the NCBI file.
    symbol_list.append({'gene': 'FLJ23867', 'entrez_id': 200058})
    symbol_list.append({'gene': 'FLJ37035', 'entrez_id': 399821})
    symbol_list.append({'gene': 'FLJ21408', 'entrez_id': 400512})
    symbol_list.append({'gene': 'PP14571', 'entrez_id': 100130449})

    # Report stats straight from the records. Reading keys with .get() keeps the report
    # working even when no record carries a 'syn_id' or 'sid_id' at all.
    combined_syns = sum(1 for record in symbol_list if record.get('syn_id') is not None)
    combined_ids = sum(1 for record in symbol_list if record.get('sid_id') is not None)
    if use_synonyms:
        print("Individually, {} synonyms, {} ids".format(len(syn_map), len(sid_map)))
        print("Combined, {} synonyms, {} ids".format(combined_syns, combined_ids))
    else:
        print("Individually, {} ids".format(len(sid_map)))
        print("Combined, {} ids".format(combined_ids))
    resolved = [record.get('entrez_id') for record in symbol_list]
    print("{} good keys. {} have ambiguous (multiple) mappings, {} nulls".format(
        sum(1 for entrez_id in resolved if entrez_id is not None and entrez_id > 0),
        sum(1 for entrez_id in resolved if entrez_id == 0),
        sum(1 for entrez_id in resolved if entrez_id is None),
    ))

    # Later records win, so the manual additions override any earlier entry for the same symbol.
    return {record['gene']: record.get('entrez_id') for record in symbol_list}




In [10]:
""" Create an id to symbol map and save it to disk. """

import pickle

# Build the map from the loaded gene table, then serialize it into the working directory.
i_s = id_to_symbol_map()
with open("./id_to_symbol_map.dict", "wb") as f:
    pickle.Pickler(f).dump(i_s)

In [11]:
""" Create a symbol to id map and save it to disk. """

import pickle

# Build the map with the default settings (synonyms included), then serialize it
# into the working directory. Building the map also prints the summary statistics.
s_i = symbol_to_id_map()
with open("./symbol_to_id_map.dict", "wb") as f:
    pickle.Pickler(f).dump(s_i)

Individually, 64297 synonyms, 61287 ids
Combined, 61056 synonyms, 61236 ids
120953 good keys. 3101 have ambiguous (multiple) mappings, 0 nulls
