# Get Gene Info
This notebook retrieves information from the NIH database and adds it to the **"Secretory Pathway Recon" Google Sheet**.

In [1]:
import pandas as pd
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet

Entrez.email = "a.antonakoudis@sartorius.com"

In [2]:
##### ----- Generate datasets from Google Sheet ----- #####

#Credential file
KEY_FILE_PATH = 'credentials.json'

#CHO Network Reconstruction + Recon3D_v3 Google Sheet ID
SPREADSHEET_ID = '1DaAdZlvMYDqb7g31I5dw-ZCZH52Xj_W3FnQMFUzqmiQ'

# Initialize the GoogleSheet object
gsheet_file = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sec_recon_sheet = 'SecRecon'
sec_recon = gsheet_file.read_google_sheet(sec_recon_sheet)
# Create a copy of the dataset
sec_recon_dc = sec_recon.copy()

## 1. Retrieve Human Entrez IDs based on Gene Symbol

In [3]:
# Update Human Entrez IDs
for i,row in sec_recon_dc.iterrows():
    if pd.isnull(row['HUMAN ENTREZID']) or row['HUMAN ENTREZID'] == '':
        human_entrez = Request_Utilis.get_entrez_id(row['GENE SYMBOL'])
        sec_recon_dc.at[i, 'HUMAN ENTREZID'] = human_entrez

if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human Entrez IDs are up-to-date')

Human Entrez IDs are up-to-date


## 2. CHO and Mouse Orthologs
Here we used the Human Entrez IDs from the step before to get the CHO and Mouse orthologs

In [4]:
# Map Human IDs to CHO IDs from the "cho2human_mapping" dataset

cho2human_mapping = pd.read_csv("../Orthologs/cho2human_mapping.tsv", sep='\t')
cho2human_mapping2 = pd.read_excel("../Orthologs/orthologs.xlsx", index_col=0)
cho2human_mapping2['Human GeneID'] = pd.to_numeric(cho2human_mapping2['Human GeneID'], errors='coerce')
cho2human_mapping2['Human GeneID'] = cho2human_mapping2['Human GeneID'].astype('Int64')

cho_id_lookup = dict(zip(cho2human_mapping['HUMAN_ID'], cho2human_mapping['CHO_ID'])) #convert to dict for mapping
cho_id_lookup2 = dict(zip(cho2human_mapping2['Human GeneID'], cho2human_mapping2['CHO GeneID'])) #convert to dict for mapping

for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['CHO ENTREZID']) or row['CHO ENTREZID'] == '':
        try:
            human_id = int(row['HUMAN ENTREZID'])
            cho_id = cho_id_lookup.get(human_id)
            if cho_id is not None:
                sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_id
            else:
                try:
                    cho_id = cho_id_lookup2.get(human_id)
                    if cho_id is not None:
                        sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_id
                except ValueError:
                    print(f'{human_id} is not a valid Human Entrez ID')      
        except ValueError:
            print(f'{human_id} is not a valid Human Entrez ID')
            continue        

if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from cho2human dataset")
else:
    print('CHO Entrez IDs from "cho2human_mapping" dataset are up-to-date')

2528 is not a valid Human Entrez ID
8677 is not a valid Human Entrez ID
Google Sheet updated on CHO Entrez IDs from cho2human dataset


In [5]:
## -- CHO Entrez IDs -- ##

for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['CHO ENTREZID']) or row['CHO ENTREZID'] == '':
        human_id = row['HUMAN ENTREZID']
        cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(human_id, '10029')
        if cho_ortholog_EntrezID is not None:
            sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_ortholog_EntrezID
            
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from NIH database")
else:
    print('CHO Entrez IDs from NIH database are up-to-date')

No accession for gene No ID found for given gene symbol
No accession for gene No ID found for given gene symbol
Google Sheet updated on CHO Entrez IDs from NIH database


In [6]:
## -- Mouse Entrez IDs -- ##

loop_counter = 0
update_threshold = 50

for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['MOUSE ENTREZID']) or row['MOUSE ENTREZID'] == '':
        human_id = row['HUMAN ENTREZID']
        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(human_id, '10090')
        if mouse_ortholog_EntrezID is not None:
            sec_recon_dc.at[index, 'MOUSE ENTREZID'] = mouse_ortholog_EntrezID
            loop_counter += 1

        if loop_counter >= update_threshold:
            if not sec_recon_dc.equals(sec_recon):
                gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
                print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")
            else:
                print('Mouse Entrez IDs from NIH database are up-to-date')
            loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")


No accession for gene No ID found for given gene symbol
No accession for gene 6729
No accession for gene No ID found for given gene symbol
No accession for gene 7009
No accession for gene 23473
No accession for gene 29097
No accession for gene 9382
No accession for gene 51138
Google Sheet updated on Mouse Entrez IDs from NIH database after 50 updates
No accession for gene No ID found for given gene symbol
No accession for gene 51303
No accession for gene No ID found for given gene symbol
No accession for gene No ID found for given gene symbol
No accession for gene No ID found for given gene symbol
No accession for gene 5045
No accession for gene 90411
Google Sheet updated on Mouse Entrez IDs from NIH database after 50 updates
No accession for gene 57147
No accession for gene 10206
No accession for gene 140823
No accession for gene No ID found for given gene symbol
No accession for gene No ID found for given gene symbol
No accession for gene No ID found for given gene symbol
No accessio

## 3. Ensembl IDs

In [7]:
# Collect missing information from NIH database
updates = []
for i, gene in sec_recon_dc.iterrows():
    human_entrezID = gene['HUMAN ENTREZID']
    gene_symbol = gene['GENE SYMBOL']
    if gene['ALIAS'] == '' or gene['GENENAME'] == '' or gene['HUMAN ENSEMBL'] == '':
        print(gene_symbol)
        try:
            org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(human_entrezID)
            updates.append((i, gene_synonyms, gene_name, gene_ensemble))
        except ValueError:
            print(f'No valid Entrez ID for gene {gene_symbol}')

# Apply the updates outside the loop
for i, gene_synonyms, gene_name, gene_ensemble in updates:
    sec_recon_dc.at[i, 'ALIAS'] = gene_synonyms
    sec_recon_dc.at[i, 'GENENAME'] = gene_name
    sec_recon_dc.at[i, 'HUMAN ENSEMBL'] = gene_ensemble
    
sec_recon_dc['ALIAS'] = sec_recon_dc['ALIAS'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['GENENAME'] = sec_recon_dc['GENENAME'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['HUMAN ENSEMBL'] = sec_recon_dc['HUMAN ENSEMBL'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
 
    
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human identifiers are up-to-date')
    
'''
        org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)

    # Get CHO and Mouse Orthologs
    try:
        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10090')
        cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10029')
        if mouse_ortholog_EntrezID == '':
            mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10090')
    except:
    
    # print Human Info
    print(gene_EntrezID)
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    
    # print Mouse Info
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(mouse_ortholog_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    
    # print CHO Info
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(cho_ortholog_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    print("---------------------------------")

'''

GALNT19
No valid Entrez ID for gene GALNT19
HSP90AA4P
Gene HSP90AA4P has no products
Gene HSP90AA4P has no products
Gene HSP90AA4P has no products
HSP90AA5P
Gene HSP90AA5P has no products
Gene HSP90AA5P has no products
HSP90AB4P
NAT8B
RNF126
SLC35A4
SLC35A5
TMED11
No valid Entrez ID for gene TMED11
ATG5
ATG7
AUP1
BAG3
BAP29
BECLIN-1
TMBIM6
BICD1
BICD2
BIM
BTK
CAPN10
CAPN11
CAPN12
CAPN13
CAPN3
CAPN5
CAPN6
CAPN7
CAPN8
CAPN9
CASP12
CASQ1
CASQ2
CLGN
CLPB
CNIH
CNIH2
CNIH3
CNIH4
COG1
COG2
COG3
COG5
COG7
COG8
COPG2
COPS1
COPS2
COPS3
COPS4
COPS5
COPS6
COPS7A
COPS7B
COPS8
CALR3
PPIE
PPIF
PPIG
PPIC
PPID
DNAJA4
DNAJB13
DNAJB14
DNAJB3
DNAJB4
DNAJB5
DNAJB6
DNAJB7
DNAJB8
DNAJC14
DNAJC7
DNAJC8
DNAJC9
DR5
EGASYN
No valid Entrez ID for gene EGASYN
ERFAD
EXOC3L1
EXOC3L4
FBXO1
FKBP11
FKBP14
FKBP15
FKBP1A
FKBP1B
FKBP2/FKBP13
No valid Entrez ID for gene FKBP2/FKBP13
FKBP23/FKBP7
No valid Entrez ID for gene FKBP23/FKBP7
FKBP3
FKBP4
FKBP5
FKBP6
FKBP65/FKBP10
No valid Entrez ID for gene FKBP65/FKBP10
FKBP8
FK

'\n        org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)\n\n    # Get CHO and Mouse Orthologs\n    try:\n        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, \'10090\')\n        cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, \'10029\')\n        if mouse_ortholog_EntrezID == \'\':\n            mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, \'10090\')\n    except:\n    \n    # print Human Info\n    print(gene_EntrezID)\n    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)\n    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)\n    \n    # print Mouse Info\n    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(mouse_ortholog_EntrezID)\n    print(org, gene_symbol, gene_nam

In [8]:
## -- CHO Ensembl IDs and Gene Symbol -- ##

loop_counter = 0
update_threshold = 50

# Collect missing information for CHO identifiers
for i, gene in sec_recon_dc.iterrows():
    cho_entrezID = str(gene['CHO ENTREZID'])
    if cho_entrezID != '':
        if (pd.isna(gene['CHO ENSEMBL']) or gene['CHO ENSEMBL'] == '') or (pd.isna(gene['CHO GENE SYMBOL']) or gene['CHO GENE SYMBOL'] == ''):
            try:
                org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(cho_entrezID)
                if (pd.isna(gene['CHO GENE SYMBOL']) or gene['CHO GENE SYMBOL'] == ''):
                    sec_recon_dc.at[i, 'CHO GENE SYMBOL'] = gene_symbol
                if (pd.isna(gene['CHO ENSEMBL']) or gene['CHO ENSEMBL'] == ''):
                    sec_recon_dc.at[i, 'CHO ENSEMBL'] = gene_ensemble
            except ValueError:
                print(f'No valid Entrez ID for gene {gene_symbol}')
            loop_counter += 1

            if loop_counter >= update_threshold:
                if not sec_recon_dc.equals(sec_recon):
                    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
                    print(f"Google Sheet updated on CHO Ensembl IDs after {loop_counter} updates")
                else:
                    print('CHO Ensembl IDs are up-to-date')
                loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on CHO Ensembl IDs after {loop_counter} updates")

No ENSEMBL ID for gene A4gnt
No ENSEMBL ID for gene Abo
No ENSEMBL ID for gene Adrm1
No ENSEMBL ID for gene Agap3
No valid Entrez ID for gene Agap3
No ENSEMBL ID for gene LOC103160092
No ENSEMBL ID for gene Arpc2
No ENSEMBL ID for gene LOC103161689
No ENSEMBL ID for gene B4galt4
No ENSEMBL ID for gene Bag1
No ENSEMBL ID for gene Bet1l
No ENSEMBL ID for gene LOC100760153
No ENSEMBL ID for gene Chpf
No ENSEMBL ID for gene Chpf2
No ENSEMBL ID for gene Chst10
No ENSEMBL ID for gene Chst14
No ENSEMBL ID for gene LOC103162159
No ENSEMBL ID for gene Copz2
No ENSEMBL ID for gene Dnajc5
No ENSEMBL ID for gene Dnajc5g
No ENSEMBL ID for gene Dpm2
No ENSEMBL ID for gene LOC103163675
No ENSEMBL ID for gene Fut7
No ENSEMBL ID for gene Galnt16
No ENSEMBL ID for gene Gbgt1
No ENSEMBL ID for gene LOC100764057
Gene ID 100754848; An error occurred: HTTP Error 400: Bad Request
No ENSEMBL ID for gene Get1
No ENSEMBL ID for gene Get4
No ENSEMBL ID for gene LOC100761532
No ENSEMBL ID for gene LOC103162307
No

In [9]:
## -- Mouse Ensembl IDs and Gene Symbol-- ##

loop_counter = 0
update_threshold = 50

# Collect missing information for CHO identifiers
for i, gene in sec_recon_dc.iterrows():
    mouse_entrezID = str(gene['MOUSE ENTREZID'])
    if mouse_entrezID != '':
        if (pd.isna(gene['MOUSE ENSEMBL']) or gene['MOUSE ENSEMBL'] == '') or (pd.isna(gene['MOUSE GENE SYMBOL']) or gene['MOUSE GENE SYMBOL'] == ''):
            try:
                org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(mouse_entrezID)
                if (pd.isna(gene['MOUSE GENE SYMBOL']) or gene['MOUSE GENE SYMBOL'] == ''):
                    sec_recon_dc.at[i, 'MOUSE GENE SYMBOL'] = gene_symbol
                if (pd.isna(gene['MOUSE ENSEMBL']) or gene['MOUSE ENSEMBL'] == ''):
                    sec_recon_dc.at[i, 'MOUSE ENSEMBL'] = gene_ensemble
            except ValueError:
                print(f'No valid Entrez ID for gene {gene_symbol}')
            loop_counter += 1

            if loop_counter >= update_threshold:
                if not sec_recon_dc.equals(sec_recon):
                    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
                    print(f"Google Sheet updated on Mouse Ensembl IDs after {loop_counter} updates")
                else:
                    print('Mouse Ensembl IDs are up-to-date')
                loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on Mouse Ensembl IDs after {loop_counter} updates")

No valid Entrez ID for gene Vps51
Gene ID 12337; An error occurred: HTTP Error 400: Bad Request
Google Sheet updated on Mouse Ensembl IDs after 50 updates
Gene ID 70604; An error occurred: HTTP Error 400: Bad Request
Gene ID 26918; An error occurred: HTTP Error 400: Bad Request
Google Sheet updated on Mouse Ensembl IDs after 50 updates
Google Sheet updated on Mouse Ensembl IDs after 50 updates
Gene ID 66622; An error occurred: HTTP Error 400: Bad Request
Google Sheet updated on Mouse Ensembl IDs after 27 updates


## 4. Gene Products