# Generating Gene Mapping Dictionary from HGNC

## HGNC (HUGO Gene Nomenclature Committee)
- website https://www.genenames.org/
- archive
  - directory structure and data file description https://www.genenames.org/download/archive/ 
  - download (monthly) https://www.genenames.org/download/archive/monthly/tsv/
  - download (quarterly) https://www.genenames.org/download/archive/quarterly/tsv/

In [None]:

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn3



In [None]:
# Read the archived HGNC complete set directly from its URL (takes about 10 seconds).
# The 2024-08-23 snapshot is used because the 2024-10-01 version of HGNC seems to
# have missing data.
hgnc_url = 'https://storage.googleapis.com/public-download-files/hgnc/archive/archive/monthly/tsv/hgnc_complete_set_2024-08-23.tsv'
hgnc = pd.read_csv(hgnc_url, sep="\t")

### The HGNC table holds multiple name entries for each gene as shown below:
See the HGNC archive page for a description of the data file: https://www.genenames.org/download/archive/

In [None]:
# Show the first rows transposed, so each column of the table is listed as a row
hgnc.head().T

In [None]:
# Count entries for every combination of locus group (rows) and status (columns)
hgnc_crosstab = pd.crosstab(index=hgnc['locus_group'], columns=hgnc['status'])
hgnc_crosstab

## Use "protein-coding genes"

In [None]:
# Keep only the rows whose locus group is 'protein-coding gene'
is_protein_coding = hgnc['locus_group'] == 'protein-coding gene'
hgnc_pc = hgnc.loc[is_protein_coding]
hgnc_pc

In [None]:
# Sanity check: every protein-coding row has a distinct symbol, i.e. the number
# of distinct symbols equals the number of rows.
distinct_symbols = set(hgnc_pc['symbol'])
assert len(distinct_symbols) == len(hgnc_pc)

In [None]:
def add_alias_prev_to_dict(original: pd.DataFrame):
    """Build a lookup from every known gene symbol to its standard symbol(s).

    For each row, the standard symbol (column 'symbol') is registered under
    itself, then under each '|'-separated entry of 'alias_symbol', then under
    each '|'-separated entry of 'prev_symbol'. Missing (NaN) cells and empty
    pieces are skipped.

    Args:
        original: Table with 'symbol', 'alias_symbol' and 'prev_symbol' columns.

    Returns:
        A dict mapping each symbol to the list of standard symbols recorded
        for it, in the order the rows were visited. A symbol claimed by
        several rows (or twice by the same row) gets one list entry per claim.
    """
    lookup = {}

    for _, record in original.iterrows():
        approved = record['symbol']

        # The standard symbol is always registered under its own name first.
        names = [approved]

        # Aliases are collected before previous symbols.
        for column in ('alias_symbol', 'prev_symbol'):
            cell = record[column]
            if pd.isna(cell):
                continue
            names.extend(piece for piece in str(cell).split('|') if piece)

        for name in names:
            lookup.setdefault(name, []).append(approved)

    return lookup

# Build the symbol -> list-of-standard-symbols lookup from the protein-coding rows
mapping_dic = add_alias_prev_to_dict(hgnc_pc)

In [None]:
# Example lookup: the standard symbol(s) recorded for the symbol "NG2"
mapping_dic["NG2"]

In [None]:
# Display the full lookup
mapping_dic

In [None]:
from collections import Counter

# Tally how many standard symbols each key maps to
# (list length -> number of keys with that length)
length_distribution = Counter(map(len, mapping_dic.values()))

# Print the tally
print(length_distribution)

In [None]:
def plot_alias_prev_standard(original: pd.DataFrame):
    """Draw a Venn diagram of standard, alias and previous gene symbols.

    Collects three sets from the table: the values of 'symbol', the
    '|'-separated entries of 'alias_symbol', and the '|'-separated entries of
    'prev_symbol'. The overlap between the sets is shown with a three-circle
    Venn diagram.

    Args:
        original: Table with 'symbol', 'alias_symbol' and 'prev_symbol' columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    standard = set()
    alias = set()
    prev = set()

    for _, row in original.iterrows():
        standard.add(row['symbol'])

        for column, bucket in (('alias_symbol', alias), ('prev_symbol', prev)):
            cell = row[column]
            # Missing cells are NaN. Without this guard str(NaN) would add the
            # literal string 'nan' to the set and distort the diagram counts
            # (add_alias_prev_to_dict applies the same guard).
            if pd.isna(cell):
                continue
            bucket.update(piece for piece in str(cell).split('|') if piece)

    # Create a Venn diagram to compare the three symbol sets
    plt.figure(figsize=(6, 6))
    venn3([standard, alias, prev], set_labels=('standard', 'alias', 'prev'))

    # Show the Venn diagram
    plt.show()

# Plot the overlap of standard, alias and previous symbols for the protein-coding rows
plot_alias_prev_standard(hgnc_pc)

In [None]:
import json

# Collapse each candidate list to a single standard symbol.
# A key appears in its own candidate list only when it is itself a standard
# symbol; such a key must map to itself even if an earlier row listed it as an
# alias or previous symbol of another gene. Every other key falls back to the
# first candidate recorded for it.
modified_dict = {}
for key, value in mapping_dic.items():
    if isinstance(value, list) and value:
        modified_dict[key] = key if key in value else value[0]
    else:
        modified_dict[key] = value

# Save the modified dictionary to a JSON file
with open('protein_coding_gene_mapping_uppercase_hgnc_2024_08_23.1.json', 'w') as json_file:
    json.dump(modified_dict, json_file, indent=4)

print("Dictionary saved to JSON file.")