# Get Gene Info
This notebook retrieves information from the NIH database and adds it to the **"Secretory Pathway Recon" Google Sheet**.

In [None]:
import pandas as pd
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet

# Contact e-mail registered on the Bio.Entrez module before any lookup is run.
# NOTE(review): presumably sent to NCBI with each E-utilities request so they can
# reach the caller — confirm against the Biopython documentation.
Entrez.email = "a.antonakoudis@sartorius.com"

In [None]:
##### ----- Generate datasets from Google Sheet ----- #####

# Credential file handed to GoogleSheet; the path is relative, so it is resolved
# against the directory the notebook is run from.
KEY_FILE_PATH = 'credentials.json'

# CHO Network Reconstruction + Recon3D_v3 Google Sheet ID
# NOTE(review): the notebook title names the "Secretory Pathway Recon" sheet —
# confirm that this label and ID refer to the same document.
SPREADSHEET_ID = '1DaAdZlvMYDqb7g31I5dw-ZCZH52Xj_W3FnQMFUzqmiQ'

# Initialize the GoogleSheet object used for every read and update below
gsheet_file = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read the 'SecRecon' tab; `sec_recon` is kept untouched as the baseline that
# later cells compare against with `.equals` to decide whether to upload.
sec_recon_sheet = 'SecRecon'
sec_recon = gsheet_file.read_google_sheet(sec_recon_sheet)
# Working copy: all edits in this notebook are applied to `sec_recon_dc`
sec_recon_dc = sec_recon.copy()

## 1. Retrieve Human Entrez IDs based on Gene Symbol

In [None]:
# Fill in missing Human Entrez IDs by looking up each row's gene symbol
for row_label, record in sec_recon_dc.iterrows():
    current_id = record['HUMAN ENTREZID']
    # Rows that already carry an ID are left untouched
    if not (pd.isnull(current_id) or current_id == ''):
        continue
    sec_recon_dc.at[row_label, 'HUMAN ENTREZID'] = Request_Utilis.get_entrez_id(record['GENE SYMBOL'])

# Upload only when the working copy differs from the frame read from the sheet
if sec_recon_dc.equals(sec_recon):
    print('Human Entrez IDs are up-to-date')
else:
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")

## 2. CHO and Mouse Orthologs
Here we use the Human Entrez IDs from the previous step to look up the CHO and Mouse orthologs.

In [None]:
# Map Human IDs to CHO IDs from the "cho2human_mapping" dataset, falling back
# to the "orthologs.xlsx" table when the first dataset has no entry.

cho2human_mapping = pd.read_csv("../Orthologs/cho2human_mapping.tsv", sep='\t')
cho2human_mapping2 = pd.read_excel("../Orthologs/orthologs.xlsx", index_col=0)
# Coerce the Human GeneID column to nullable integers so that its values can be
# matched against the int keys built from the sheet; unparsable entries become <NA>.
cho2human_mapping2['Human GeneID'] = pd.to_numeric(cho2human_mapping2['Human GeneID'], errors='coerce')
cho2human_mapping2['Human GeneID'] = cho2human_mapping2['Human GeneID'].astype('Int64')

cho_id_lookup = dict(zip(cho2human_mapping['HUMAN_ID'], cho2human_mapping['CHO_ID'])) #convert to dict for mapping
cho_id_lookup2 = dict(zip(cho2human_mapping2['Human GeneID'], cho2human_mapping2['CHO GeneID'])) #convert to dict for mapping

for index, row in sec_recon_dc.iterrows():
    # Only rows that still lack a CHO Entrez ID are processed
    if not (pd.isna(row['CHO ENTREZID']) or row['CHO ENTREZID'] == ''):
        continue

    raw_human_id = row['HUMAN ENTREZID']
    try:
        human_id = int(raw_human_id)
    except (TypeError, ValueError):
        # Report the value that actually failed to parse. The conversion is the
        # only step that can fail here, so nothing else is wrapped in the try.
        print(f'{raw_human_id} is not a valid Human Entrez ID')
        continue

    # Primary dataset first, secondary dataset as fallback
    cho_id = cho_id_lookup.get(human_id)
    if cho_id is None:
        cho_id = cho_id_lookup2.get(human_id)
    if cho_id is not None:
        sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_id

# Upload only when the working copy differs from the frame read from the sheet
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from cho2human dataset")
else:
    print('CHO Entrez IDs from "cho2human_mapping" dataset are up-to-date')

In [None]:
## -- CHO Entrez IDs -- ##

# Ask Request_Utilis for an ortholog of every row that still lacks a CHO ID.
# NOTE(review): '10029' is presumably the NCBI taxonomy ID for Chinese hamster — confirm.
for row_label, record in sec_recon_dc.iterrows():
    existing_cho_id = record['CHO ENTREZID']
    if not (pd.isna(existing_cho_id) or existing_cho_id == ''):
        continue
    ortholog_id = Request_Utilis.get_gene_ids(record['HUMAN ENTREZID'], '10029')
    if ortholog_id is None:
        # Nothing returned for this gene; leave the cell empty
        continue
    sec_recon_dc.at[row_label, 'CHO ENTREZID'] = ortholog_id

# Upload only when the working copy differs from the frame read from the sheet
if sec_recon_dc.equals(sec_recon):
    print('CHO Entrez IDs from NIH database are up-to-date')
else:
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from NIH database")

In [None]:
## -- Mouse Entrez IDs -- ##

# The sheet is uploaded in batches: every `update_threshold` successful lookups
# trigger an upload, so progress is saved while the loop is still running.
loop_counter = 0
update_threshold = 50

# NOTE(review): '10090' is presumably the NCBI taxonomy ID for mouse — confirm.
for row_label, record in sec_recon_dc.iterrows():
    existing_mouse_id = record['MOUSE ENTREZID']
    if not (pd.isna(existing_mouse_id) or existing_mouse_id == ''):
        continue

    ortholog_id = Request_Utilis.get_gene_ids(record['HUMAN ENTREZID'], '10090')
    if ortholog_id is not None:
        sec_recon_dc.at[row_label, 'MOUSE ENTREZID'] = ortholog_id
        loop_counter += 1

    # Batch not full yet: move on to the next row
    if loop_counter < update_threshold:
        continue

    if sec_recon_dc.equals(sec_recon):
        print('Mouse Entrez IDs from NIH database are up-to-date')
    else:
        gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
        print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")
    loop_counter = 0

# Flush whatever is left of the last, partially filled batch
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")


## 3. Ensembl IDs

In [None]:
# Collect missing information from NIH database
# Columns filled from the lookup result: synonyms, gene name and Ensembl ID.
human_columns = ('ALIAS', 'GENENAME', 'HUMAN ENSEMBL')

updates = []
for i, gene in sec_recon_dc.iterrows():
    human_entrezID = gene['HUMAN ENTREZID']
    gene_symbol = gene['GENE SYMBOL']
    # A field counts as missing when it is NaN or an empty string — the same
    # test the other cells of this notebook apply to their columns.
    if any(pd.isna(gene[col]) or gene[col] == '' for col in human_columns):
        print(gene_symbol)
        try:
            org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(human_entrezID)
            updates.append((i, gene_synonyms, gene_name, gene_ensemble))
        except ValueError:
            # The call raised before the unpacking, so gene_symbol is still the
            # symbol read from this row.
            print(f'No valid Entrez ID for gene {gene_symbol}')

# Apply the updates outside the loop
for i, gene_synonyms, gene_name, gene_ensemble in updates:
    sec_recon_dc.at[i, 'ALIAS'] = gene_synonyms
    sec_recon_dc.at[i, 'GENENAME'] = gene_name
    sec_recon_dc.at[i, 'HUMAN ENSEMBL'] = gene_ensemble

# Values that arrived as lists are flattened to comma-separated text; anything
# else is left as it is.
for col in human_columns:
    sec_recon_dc[col] = sec_recon_dc[col].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Upload only when the working copy differs from the frame read from the sheet
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human identifiers are up-to-date')

In [None]:
## -- CHO Ensembl IDs and Gene Symbol -- ##

# The sheet is uploaded in batches of `update_threshold` processed rows so that
# progress is saved while the loop is still running.
loop_counter = 0
update_threshold = 50

# Collect missing information for CHO identifiers
for i, gene in sec_recon_dc.iterrows():
    raw_cho_id = gene['CHO ENTREZID']
    # Skip rows without a CHO Entrez ID. NaN has to be tested before str(),
    # otherwise it turns into the non-empty string 'nan' and is sent as a query.
    if pd.isna(raw_cho_id) or str(raw_cho_id) == '':
        continue
    cho_entrezID = str(raw_cho_id)

    missing_symbol = pd.isna(gene['CHO GENE SYMBOL']) or gene['CHO GENE SYMBOL'] == ''
    missing_ensembl = pd.isna(gene['CHO ENSEMBL']) or gene['CHO ENSEMBL'] == ''
    if not (missing_symbol or missing_ensembl):
        continue

    # Symbol of the current row, used for the failure message. The name
    # `gene_symbol` is only assigned when the lookup succeeds, so it cannot be
    # relied on inside the except branch.
    row_symbol = gene['GENE SYMBOL']
    try:
        org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(cho_entrezID)
        # Only empty cells are filled; existing values are never overwritten
        if missing_symbol:
            sec_recon_dc.at[i, 'CHO GENE SYMBOL'] = gene_symbol
        if missing_ensembl:
            sec_recon_dc.at[i, 'CHO ENSEMBL'] = gene_ensemble
    except ValueError:
        print(f'No valid Entrez ID for gene {row_symbol}')
    loop_counter += 1

    if loop_counter >= update_threshold:
        if not sec_recon_dc.equals(sec_recon):
            gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
            print(f"Google Sheet updated on CHO Ensembl IDs after {loop_counter} updates")
        else:
            print('CHO Ensembl IDs are up-to-date')
        loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on CHO Ensembl IDs after {loop_counter} updates")

In [None]:
## -- Mouse Ensembl IDs and Gene Symbol-- ##

# The sheet is uploaded in batches of `update_threshold` processed rows so that
# progress is saved while the loop is still running.
loop_counter = 0
update_threshold = 50

# Collect missing information for Mouse identifiers
for i, gene in sec_recon_dc.iterrows():
    raw_mouse_id = gene['MOUSE ENTREZID']
    # Skip rows without a Mouse Entrez ID. NaN has to be tested before str(),
    # otherwise it turns into the non-empty string 'nan' and is sent as a query.
    if pd.isna(raw_mouse_id) or str(raw_mouse_id) == '':
        continue
    mouse_entrezID = str(raw_mouse_id)

    missing_symbol = pd.isna(gene['MOUSE GENE SYMBOL']) or gene['MOUSE GENE SYMBOL'] == ''
    missing_ensembl = pd.isna(gene['MOUSE ENSEMBL']) or gene['MOUSE ENSEMBL'] == ''
    if not (missing_symbol or missing_ensembl):
        continue

    # Symbol of the current row, used for the failure message. The name
    # `gene_symbol` is only assigned when the lookup succeeds, so it cannot be
    # relied on inside the except branch.
    row_symbol = gene['GENE SYMBOL']
    try:
        org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(mouse_entrezID)
        # Only empty cells are filled; existing values are never overwritten
        if missing_symbol:
            sec_recon_dc.at[i, 'MOUSE GENE SYMBOL'] = gene_symbol
        if missing_ensembl:
            sec_recon_dc.at[i, 'MOUSE ENSEMBL'] = gene_ensemble
    except ValueError:
        print(f'No valid Entrez ID for gene {row_symbol}')
    loop_counter += 1

    if loop_counter >= update_threshold:
        if not sec_recon_dc.equals(sec_recon):
            gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
            print(f"Google Sheet updated on Mouse Ensembl IDs after {loop_counter} updates")
        else:
            print('Mouse Ensembl IDs are up-to-date')
        loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on Mouse Ensembl IDs after {loop_counter} updates")

## 4. Gene Products