### Make a mapping database for human genes and their corresponding mouse orthologs

Download Website: https://www.informatics.jax.org/downloads/reports

File: HOM_MouseHumanSequence.rpt (saved locally as HOM_MouseHumanSequence.rpt.txt)


In [1]:
# Download the mouse/human homology report over HTTPS and store it under the
# local .txt filename that the pandas loading step reads.
!wget -O HOM_MouseHumanSequence.rpt.txt https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt

--2024-08-14 08:31:43--  http://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
Resolving www.informatics.jax.org (www.informatics.jax.org)... 64.147.54.32
Connecting to www.informatics.jax.org (www.informatics.jax.org)|64.147.54.32|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15095260 (14M)
Saving to: ‘HOM_MouseHumanSequence.rpt.txt’


2024-08-14 08:31:44 (7.84 MB/s) - ‘HOM_MouseHumanSequence.rpt.txt’ saved [15095260/15095260]



In [7]:
import pandas as pd

# Local copy of the downloaded homology report, and the mapping table derived from it.
REPORT_PATH = "HOM_MouseHumanSequence.rpt.txt"
MAPPINGS_PATH = "human_mouse_gene_mappings.tsv"

# Organism labels used to split the report into mouse rows and human rows.
MOUSE_LABEL = "mouse, laboratory"
HUMAN_LABEL = "human"

# Short snake_case replacements for the report's verbose column headers.
COLUMN_RENAMES = {
    "DB Class Key": "key",
    "Common Organism Name": "organism",
    "NCBI Taxon ID": "taxid",
    "Symbol": "symbol",
    "EntrezGene ID": "entrez_id",
    "Mouse MGI ID": "mgi_id",
    "HGNC ID": "hgnc_id",
    "OMIM Gene ID": "omim_gene_id",
    "Genetic Location": "location",
    "Genome Coordinates (mouse: GRCm39 human: GRCh38)": "genome_coordinates",
    "Nucleotide RefSeq IDs": "nucleotide_refseq_id",
    "Protein RefSeq IDs": "protein_refseq_id",
    "SWISS_PROT IDs": "swiss_prot_id",
}


def normalize_columns(report: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the report headers shortened via ``COLUMN_RENAMES``.

    Headers not listed in ``COLUMN_RENAMES`` are kept as they are, and the
    input frame itself is not modified.
    """
    return report.rename(columns=COLUMN_RENAMES)


def format_entrez_id(value):
    """Render a single Entrez gene ID as ``"ENTREZ:<integer>"``.

    Missing values are returned unchanged so gaps stay gaps in the output.
    Going through ``int`` drops the ``.0`` a float-typed ID would otherwise
    carry into the string (e.g. ``7157.0`` -> ``"ENTREZ:7157"``).
    """
    if pd.isna(value):
        return value
    return f"ENTREZ:{int(value)}"


def pair_orthologs(mouse_df: pd.DataFrame, human_df: pd.DataFrame) -> pd.DataFrame:
    """Join mouse rows and human rows that share the same ``key``.

    The join is pandas' default inner merge, so keys present on only one side
    are dropped, and every mouse row is paired with every human row carrying
    the same key. Columns common to both inputs get ``_mouse`` / ``_human``
    suffixes, and both Entrez ID columns are rewritten with
    ``format_entrez_id``.

    Raises:
        KeyError: if either input lacks the ``key`` or ``entrez_id`` column.
    """
    merged = pd.merge(mouse_df, human_df, on="key", suffixes=("_mouse", "_human"))
    # assign() replaces the two columns in place position-wise, so the column
    # order produced by the merge is preserved.
    return merged.assign(
        entrez_id_human=merged["entrez_id_human"].map(format_entrez_id),
        entrez_id_mouse=merged["entrez_id_mouse"].map(format_entrez_id),
    )


# A notebook kernel executes cells with __name__ == "__main__", so the pipeline
# below runs as usual; the guard only keeps the helpers importable without
# reading or writing any files.
if __name__ == "__main__":
    genes = normalize_columns(pd.read_csv(REPORT_PATH, sep="\t"))
    mouse_df = genes[genes["organism"] == MOUSE_LABEL]
    human_df = genes[genes["organism"] == HUMAN_LABEL]

    merged_df = pair_orthologs(mouse_df, human_df)
    merged_df.to_csv(MAPPINGS_PATH, sep="\t", index=False)