### Get Gene Info

In [1]:
import pandas as pd
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet

Entrez.email = "a.antonakoudis@sartorius.com"

In [2]:
##### ----- Generate datasets from Google Sheet ----- #####

#Credential file
KEY_FILE_PATH = 'credentials.json'

#CHO Network Reconstruction + Recon3D_v3 Google Sheet ID
SPREADSHEET_ID = '1DaAdZlvMYDqb7g31I5dw-ZCZH52Xj_W3FnQMFUzqmiQ'

# Initialize the GoogleSheet object
gsheet_file = GoogleSheet(SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from the Google Sheet
sec_recon_sheet = 'Sheet1'
sec_recon = gsheet_file.read_google_sheet(sec_recon_sheet)
# Create a copy of the dataset
sec_recon_dc = sec_recon.copy()

In [3]:
# Update Human Entrez IDs
for i,row in sec_recon_dc.iterrows():
    if pd.isnull(row['HUMAN ENTREZID']) or row['HUMAN ENTREZID'] == '':
        human_entrez = Request_Utilis.get_entrez_id(row['GENE SYMBOL'])
        sec_recon_dc.at[i, 'HUMAN ENTREZID'] = human_entrez

if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human Entrez IDs are up-to-date')

Human Entrez IDs are up-to-date


In [4]:
# Map Human IDs to CHO IDs from the "cho2human_mapping" dataset

cho2human_mapping = pd.read_csv("../cho2human_mapping.tsv", sep='\t')

cho_id_lookup = dict(zip(cho2human_mapping['HUMAN_ID'], cho2human_mapping['CHO_ID'])) #convert to dict for mapping

for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['CHO ENTREZID']) or row['CHO ENTREZID'] == '':
        try:
            human_id = int(row['HUMAN ENTREZID'])
            cho_id = cho_id_lookup.get(human_id)
            if cho_id is not None:
                sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_id
        except ValueError:
            print(f'{human_id} is not a valid Human Entrez ID')
            continue        

if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from cho2human dataset")
else:
    print('CHO Entrez IDs from "cho2human_mapping" dataset are up-to-date')

2528 is not a valid Human Entrez ID
8677 is not a valid Human Entrez ID
CHO Entrez IDs from "cho2human_mapping" dataset are up-to-date


In [5]:
for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['CHO ENTREZID']) or row['CHO ENTREZID'] == '':
        human_id = row['HUMAN ENTREZID']
        cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(human_id, '10029')
        if cho_ortholog_EntrezID is not None:
            sec_recon_dc.at[index, 'CHO ENTREZID'] = cho_ortholog_EntrezID
            
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on CHO Entrez IDs from NIH database")
else:
    print('CHO Entrez IDs from NIH database are up-to-date')

No accesion for gene No ID found for given gene symbol
No accesion for gene No ID found for given gene symbol
CHO Entrez IDs from NIH database are up-to-date


In [None]:
for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['MOUSE ENTREZID']) or row['MOUSE ENTREZID'] == '':
        human_id = row['HUMAN ENTREZID']
        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(human_id, '10090')
        if mouse_ortholog_EntrezID is not None:
            sec_recon_dc.at[index, 'MOUSE ENTREZID'] = mouse_ortholog_EntrezID
        
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated on Mouse Entrez IDs from NIH database")
else:
    print('Mouse Entrez IDs from NIH database are up-to-date')

In [6]:
loop_counter = 0
update_threshold = 50

for index, row in sec_recon_dc.iterrows():
    if pd.isna(row['MOUSE ENTREZID']) or row['MOUSE ENTREZID'] == '':
        human_id = row['HUMAN ENTREZID']
        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(human_id, '10090')
        if mouse_ortholog_EntrezID is not None:
            sec_recon_dc.at[index, 'MOUSE ENTREZID'] = mouse_ortholog_EntrezID
            loop_counter += 1

        if loop_counter >= update_threshold:
            if not sec_recon_dc.equals(sec_recon):
                gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
                print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")
            else:
                print('Mouse Entrez IDs from NIH database are up-to-date')
            loop_counter = 0

# Check if there are any remaining updates after exiting the loop
if loop_counter > 0 and not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print(f"Google Sheet updated on Mouse Entrez IDs from NIH database after {loop_counter} updates")


No accesion for gene 27
No accesion for gene 10690
No accesion for gene 79695
No accesion for gene No ID found for given gene symbol
Google Sheet updated on Mouse Entrez IDs from NIH database after 50 updates


JSONDecodeError: Unterminated string starting at: line 1 column 4176 (char 4175)

In [None]:
from Bio import Entrez
import time
import requests
from bs4 import BeautifulSoup
import json
target_tax_id = '10090'
url = f'https://www.ncbi.nlm.nih.gov/gene/23527/ortholog/'
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find('script', string=lambda text: 'var appData =' in text)
    script_content = script_tag.text if script_tag else ""
    gene_data_start = script_content.find('appData.genes =') + len('appData.genes =')
    gene_data_end = script_content.find(';', gene_data_start)
    gene_data_json = script_content[gene_data_start:gene_data_end]
    genes_data = json.loads(gene_data_json)
    if not genes_data:
        gene_id_orth = None
    else:
        for gene_info in genes_data:
            if gene_info.get('tax_id') == int(target_tax_id):
                gene_id_orth = (gene_info.get('gene_id'))
            else:
                gene_id_orth = None
elif response.status_code != 200:
    print(f'No accesion for gene {gene_entrezID}')
    gene_id_orth = None

In [None]:
for gene_info in genes_data:
    if gene_info.get('tax_id') == int(target_tax_id):
        gene_id_orth = (gene_info.get('gene_id'))

In [None]:
gene_id_orth

In [None]:
# Collect missing information from NIH database
updates = []
for i, gene in sec_recon_dc.iterrows():
    human_entrezID = gene['HUMAN ENTREZID']
    gene_symbol = gene['GENE SYMBOL']
    if gene['ALIAS'] == '' or gene['GENENAME'] == '' or gene['HUMAN ENSEMBL'] == '':
        print(gene_symbol)
        try:
            org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(human_entrezID)
            updates.append((i, gene_synonyms, gene_name, gene_ensemble))
        except ValueError:
            print(f'No valid Entrez ID for gene {gene_symbol}')

# Apply the updates outside the loop
for i, gene_synonyms, gene_name, gene_ensemble in updates:
    sec_recon_dc.at[i, 'ALIAS'] = gene_synonyms
    sec_recon_dc.at[i, 'GENENAME'] = gene_name
    sec_recon_dc.at[i, 'HUMAN ENSEMBL'] = gene_ensemble
    
sec_recon_dc['ALIAS'] = sec_recon_dc['ALIAS'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['GENENAME'] = sec_recon_dc['GENENAME'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['HUMAN ENSEMBL'] = sec_recon_dc['HUMAN ENSEMBL'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
 
    
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human identifiers are up-to-date')
    
'''
        org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)

    # Get CHO and Mouse Orthologs
    try:
        mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10090')
        cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10029')
        if mouse_ortholog_EntrezID == '':
            mouse_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10090')
    except:
    
    # print Human Info
    print(gene_EntrezID)
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(gene_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    
    # print Mouse Info
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(mouse_ortholog_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    
    # print CHO Info
    org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(cho_ortholog_EntrezID)
    print(org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products)
    print("---------------------------------")

'''

In [None]:
# Collect missing information for CHO identifiers
updates = []
for i, gene in sec_recon_dc.iterrows():
    human_entrezID = gene['HUMAN ENTREZID']
    gene_symbol = gene['GENE SYMBOL']
    if gene['ALIAS'] == '' or gene['GENENAME'] == '' or gene['HUMAN ENSEMBL'] == '':
        print(gene_symbol)
        try:
            org, gene_symbol, gene_name, gene_synonyms, gene_ensemble, gene_products = Request_Utilis.Gene_Info_from_EntrezID(human_entrezID)
            updates.append((i, gene_synonyms, gene_name, gene_ensemble))
        except ValueError:
            print(f'No valid Entrez ID for gene {gene_symbol}')

# Apply the updates outside the loop
for i, gene_synonyms, gene_name, gene_ensemble in updates:
    sec_recon_dc.at[i, 'ALIAS'] = gene_synonyms
    sec_recon_dc.at[i, 'GENENAME'] = gene_name
    sec_recon_dc.at[i, 'HUMAN ENSEMBL'] = gene_ensemble
    
sec_recon_dc['ALIAS'] = sec_recon_dc['ALIAS'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['GENENAME'] = sec_recon_dc['GENENAME'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
sec_recon_dc['HUMAN ENSEMBL'] = sec_recon_dc['HUMAN ENSEMBL'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
 
    
if not sec_recon_dc.equals(sec_recon):
    gsheet_file.update_google_sheet(sec_recon_sheet, sec_recon_dc)
    print("Google Sheet updated.")
else:
    print('Human identifiers are up-to-date')

cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(gene_EntrezID, '10029')

In [None]:
cho_ortholog_EntrezID = Request_Utilis.get_gene_ids(116983, '10029') 
cho_ortholog_EntrezID

In [None]:
def read_gene_ID(gene_EntrezID):
    gene_fetch_tries = 0
    gene_fetch_found = True
    record = []
    while gene_fetch_tries < 3 and gene_fetch_found:
        try:
            search_results = Entrez.esearch(db="gene", term = gene_EntrezID)            
            record = Entrez.read(search_results)
            gene_fetch_found = False
        except Exception as e:
            time.sleep(3)
            gene_fetch_tries += 1
    if gene_fetch_tries > 2:        
        print(f"EntrezID of {gene_EntrezID} could not be read; An error occurred")

    if int(record["Count"]) > 0:

        # Get the Gene ID of the first result
        gene_id = record["IdList"][0]

        # Fetch the gene information, maximum 3 tries
        fetch_tries = 0
        fetch_found = True
        while fetch_tries < 3 and fetch_found:
            gene_fetch_tries = 0
            gene_fetch_found = True
            gene_info = []
            while gene_fetch_tries < 3 and gene_fetch_found:
                try:
                    gene_info = Entrez.efetch(db="gene", id=gene_id, retmode="xml")    
                    gene_fetch_found = False
                except Exception as e:
                    time.sleep(3)
                    gene_fetch_tries += 1
                    print(f"Gene ID {gene_id}; An error occurred: {e}")
            if gene_fetch_tries > 2:
                print(f"{gene_EntrezID}. Gene ID {gene_id}; An error occurred")

            if gene_info == []:
                KeyError("sda")
            
            try:
                gene_record = Entrez.read(gene_info)
                fetch_found = False
            except Exception as e:
                time.sleep(3)
                fetch_tries += 1                
    return gene_record

In [None]:
EntrezID = 3323
gene_record = read_gene_ID(int(EntrezID))
org = gene_record[0]['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']

In [None]:
gene_products = []
org = gene_record[0]['Entrezgene_source']['BioSource']['BioSource_org']['Org-ref']['Org-ref_taxname']
if 'Gene-ref_syn' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
    gene_synonyms = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_syn']
else:
    gene_synonyms = []

# Get the gene name and gene NCBI ID
if 'Gene-ref_locus' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
    gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus']                        
else:
    gene_symbol = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_locus-tag']

# Get the Gene description
if 'Gene-ref_desc' in gene_record[0]['Entrezgene_gene']['Gene-ref']:
    gene_name = gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_desc']
else:
    gene_name = gene_record[0]['Entrezgene_locus'][0]['Gene-commentary_products'][0]['Gene-commentary_label']

# Get Ensemble ID
for gene_db_refs in gene_record[0]['Entrezgene_gene']['Gene-ref']['Gene-ref_db']:
    if gene_db_refs['Dbtag_db'] == 'Ensembl':
        gene_ensemble = (gene_db_refs['Dbtag_tag']['Object-id']['Object-id_str'])

#print("Organism: ", org, "|Entrez ID: ", EntrezID, "|Gene Symbol: ", gene_symbol, "|Gene Name: ", gene_name, "|Gene Synonyms: ", gene_synonyms, "| Gene Ensemble: ", gene_ensemble)

for assembly_specific_info in gene_record[0]['Entrezgene_locus']:
    print(assembly_specific_info)
    if 'Gene-commentary_heading' in assembly_specific_info:
        
        for assembly_specific_transcript in assembly_specific_info['Gene-commentary_products']:
            if 'Gene-commentary_products' in assembly_specific_transcript:
                transcript_sequence_ID = assembly_specific_transcript['Gene-commentary_accession']

                for Entrez_Comments in gene_record[0]['Entrezgene_comments']:
                    if 'Gene-commentary_comment' in Entrez_Comments:
                        for comments in Entrez_Comments['Gene-commentary_comment']:
                            if 'Gene-commentary_products' in comments:
                                for product_per_assembly in comments['Gene-commentary_products']:                                
                                    if product_per_assembly['Gene-commentary_heading'] == 'mRNA Sequence':
                                        mRNA = product_per_assembly['Gene-commentary_accession']
                                        if mRNA == transcript_sequence_ID:  
                                            for protein_per_transcript_per_assemlby in product_per_assembly['Gene-commentary_products']:
                                                protein_sequence_ID = protein_per_transcript_per_assemlby['Gene-commentary_accession']
                                                
                                                uniprotID_list = []
                                                for protein_comments in protein_per_transcript_per_assemlby['Gene-commentary_comment']:
                                                    if protein_comments['Gene-commentary_heading'] == 'UniProtKB':
                                                        for uniprotID_protein_per_assembly in protein_comments['Gene-commentary_comment'][0]['Gene-commentary_source']:
                                                            protein_uniprot_id = uniprotID_protein_per_assembly['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_str']
                                                            uniprotID_list.append(protein_uniprot_id)
                                                gene_products.append(((transcript_sequence_ID, protein_sequence_ID, uniprotID_list)))                                       
                                    else:   
                                        if 'Gene-commentary_products' in product_per_assembly:
                                            for transcript_per_assembly in product_per_assembly['Gene-commentary_products']:      
                                                mRNA = transcript_per_assembly['Gene-commentary_accession']          
                                                if mRNA == transcript_sequence_ID:                                                  
                                                    for protein_per_transcript_per_assemlby in transcript_per_assembly['Gene-commentary_products']:                                                                
                                                        protein_sequence_ID = protein_per_transcript_per_assemlby['Gene-commentary_accession']
                                                        uniprotID_list = []
                                                        if 'Gene-commentary_comment' in protein_per_transcript_per_assemlby:
                                                            for protein_comments in protein_per_transcript_per_assemlby['Gene-commentary_comment']:                                                         
                                                                if protein_comments['Gene-commentary_heading'] == 'UniProtKB':
                                                                    for uniprotID_protein_per_assembly in protein_comments['Gene-commentary_comment'][0]['Gene-commentary_source']:
                                                                        protein_uniprot_id = uniprotID_protein_per_assembly['Other-source_src']['Dbtag']['Dbtag_tag']['Object-id']['Object-id_str']
                                                                        uniprotID_list.append(protein_uniprot_id)
                                                        else:
                                                            uniprotID_list = []
                                                        gene_products.append(((transcript_sequence_ID, protein_sequence_ID, uniprotID_list)))
print(org)
print(EntrezID)
print(gene_symbol)
print(gene_name)
print(gene_synonyms)
print(gene_ensemble)
print(gene_products)

In [None]:
gene_record