In [2]:
from bioservices import *
import pandas as pd
from biomart import BiomartServer
from tqdm import tqdm

In [5]:
# Ensembl gene IDs: the "EID" column of the ground-truth-and-baseline gene list.
ensembl_ids = (
    pd.read_csv(
        "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/top3000_hv_genes_gr_truth_and_baseline_across_datasets.csv"
    )
    .loc[:, "EID"]
    .tolist()
)

In [3]:
# Ensembl gene IDs: the "EID" column of the baseline-only gene list.
new_ensembl_ids = (
    pd.read_csv(
        "/d/hpc/projects/FRI/DL/mo6643/MSC/data/data_update_slack/data_splits/data_splits_train_merge/top3000_hv_genes_just_baseline_across_datasets.csv"
    )
    .loc[:, "EID"]
    .tolist()
)

In [20]:
def get_gene_NCBI_Entrez(gene_ids, chunk_size=100):
    """Look up gene name, NCBI Entrez ID and description for Ensembl gene IDs.

    The IDs are sent to the ``mmusculus_gene_ensembl`` BioMart dataset in
    batches of ``chunk_size``, with a one-second pause after every request.

    Parameters
    ----------
    gene_ids : iterable of str
        Ensembl gene IDs to look up.
    chunk_size : int, default 100
        Number of IDs sent per request. Must be positive.

    Returns
    -------
    results : list of dict
        One dict per response row that has at least four tab-separated
        fields, with keys ``'Ensembl Gene ID'``, ``'External Gene Name'``,
        ``'NCBI Entrez ID'`` and ``'Description'``.
    failed_chunks : list of list
        Every chunk whose request or parsing raised, so the caller can retry
        them. No rows from a failed chunk are included in ``results``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    from requests.exceptions import HTTPError
    import time

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Accept any iterable; slicing below needs a sequence.
    gene_ids = list(gene_ids)

    print("Setting up Ensembl server")
    # Connect to the Ensembl BioMart server
    server = BiomartServer("http://www.ensembl.org/biomart")
    ensembl = server.datasets['mmusculus_gene_ensembl']  # Use 'hsapiens_gene_ensembl' for human genes

    batches = [
        gene_ids[start:start + chunk_size]
        for start in range(0, len(gene_ids), chunk_size)
    ]
    total_chunks = len(batches)

    results = []  # Rows from chunks that were processed completely
    failed_chunks = []  # Chunks that raised, for a later retry

    print(f"Total chunks to process: {total_chunks}")

    for i, ids_chunk in tqdm(enumerate(batches), total = total_chunks):
        # Rows are staged per chunk and only committed when the whole chunk
        # succeeds, so retrying a failed chunk cannot produce duplicate rows.
        chunk_rows = []
        try:
            response = ensembl.search({
                'filters': {'ensembl_gene_id': ids_chunk},
                'attributes': ['ensembl_gene_id', 'external_gene_name', 'entrezgene_id', 'description']
            })

            for line in response.iter_lines():
                row = line.decode('utf-8').split("\t")
                if len(row) >= 4:  # Skip blank or malformed lines
                    chunk_rows.append({
                        'Ensembl Gene ID': row[0],
                        'External Gene Name': row[1],
                        'NCBI Entrez ID': row[2],
                        'Description': row[3]
                    })
        except HTTPError as e:
            print(f"Failed to process chunk {i + 1}/{total_chunks}: {e}")
            failed_chunks.append(ids_chunk)
        except Exception as e:
            # Any other failure (connection reset, decode error, ...) also
            # marks the chunk as failed instead of silently dropping it.
            print(f"An error occurred: {e}")
            failed_chunks.append(ids_chunk)
        else:
            results.extend(chunk_rows)
        time.sleep(1)  # Sleep to avoid hitting server too quickly

    print("Done")
    return results, failed_chunks


In [9]:
# Single-column table of the Ensembl IDs, so they can be filtered with pandas.
df = pd.DataFrame({"EID": ensembl_ids})

In [13]:
# Keep only the IDs that do not occur in new_ensembl_ids.
not_in_new = ~df["EID"].isin(new_ensembl_ids)
ids = df.loc[not_in_new, "EID"].tolist()

In [16]:
# Number of Ensembl IDs that will be sent to BioMart.
len(ids)

647

In [21]:
# The function returns (rows, failed chunks). Despite the names, gene_info_df
# is a list of dicts (it is wrapped in a DataFrame later) and failed_ids is a
# list of ID chunks, not of single IDs.
# NOTE(review): failed_ids is never inspected in the visible notebook; confirm
# it was empty for this run.
gene_info_df, failed_ids = get_gene_NCBI_Entrez(ids)

Setting up Ensembl server
Total chunks to process: 7


100%|██████████| 7/7 [01:28<00:00, 12.58s/it]

Done





In [31]:
# Tabulate the BioMart rows; this rebinds `df`, which earlier held the EID table.
# NOTE(review): the following cell builds the same frame again and sorts it,
# so this cell looks redundant.
df = pd.DataFrame(gene_info_df)

In [32]:
# BioMart rows as a table, ordered by Ensembl gene ID.
df = (
    pd.DataFrame(gene_info_df)
    .sort_values(by="Ensembl Gene ID")
)

In [39]:
# Show every Ensembl gene ID that occurs on more than one row, together with
# all of its rows. Grouping prints each ID exactly once, even when it occurs
# three or more times.
repeated = df["Ensembl Gene ID"].duplicated(keep=False)
for _, gene_rows in df[repeated].groupby("Ensembl Gene ID", sort=False):
    print("----------------------------------------------")
    print(gene_rows[["Ensembl Gene ID", "External Gene Name", "NCBI Entrez ID"]])

In [38]:
import numpy as np
# Add an empty (all-NaN) pathway column and save a snapshot of the table.
df["associated_pathways"] = np.nan
# NOTE(review): the index is written too (no index=False), so reading this
# file back yields an extra unnamed first column — confirm that is intended.
df.to_csv("KEGG_intermediate_new.csv")

In [34]:
# Entrez IDs removed from the table by hand, kept in one list instead of one
# copy-pasted cell per ID.
# NOTE(review): presumably these are the unwanted entries for Ensembl genes
# that map to several Entrez IDs (see the duplicate listing earlier) — confirm.
# The IDs are strings, so the column must still hold strings for any row to match.
ENTREZ_IDS_TO_DROP = [
    "624367",
    "320463",
    "16429",
    "78178",
    "18213",
    "19702",
    "100328588",
    "654309",
]
df = df[~df["NCBI Entrez ID"].isin(ENTREZ_IDS_TO_DROP)]

In [120]:
# Rows whose Entrez ID is an empty string, i.e. genes still lacking an ID.
df[df["NCBI Entrez ID"] == ""]

Unnamed: 0,Ensembl Gene ID,External Gene Name,NCBI Entrez ID,Description


In [116]:
# Entrez IDs filled in by hand, keyed by external gene name. One mapping
# replaces the copy-pasted assignment cells; add further genes here.
MANUAL_ENTREZ_IDS = {
    "Fate1": "77905",
    "Tmsb10b": "100043712",
    "Gm6525": "624713",
}
for gene_name, entrez_id in MANUAL_ENTREZ_IDS.items():
    df.loc[df["External Gene Name"] == gene_name, "NCBI Entrez ID"] = entrez_id

In [119]:
# Display the gene table for a visual check.
df

Unnamed: 0,Ensembl Gene ID,External Gene Name,NCBI Entrez ID,Description
0,ENSMUSG00000000049,Apoh,11818,apolipoprotein H [Source:MGI Symbol;Acc:MGI:88...
1,ENSMUSG00000000058,Cav2,12390,caveolin 2 [Source:MGI Symbol;Acc:MGI:107571]
2,ENSMUSG00000000078,Klf6,23849,Kruppel-like factor 6 [Source:MGI Symbol;Acc:M...
3,ENSMUSG00000000093,Tbx2,21385,T-box 2 [Source:MGI Symbol;Acc:MGI:98494]
4,ENSMUSG00000000120,Ngfr,18053,nerve growth factor receptor (TNFR superfamily...
...,...,...,...,...
3003,ENSMUSG00000104043,Gm6525,624713,predicted pseudogene 6525 [Source:MGI Symbol;A...
3004,ENSMUSG00000105504,Gbp5,229898,guanylate binding protein 5 [Source:MGI Symbol...
3005,ENSMUSG00000110195,Pde2a,207728,"phosphodiesterase 2A, cGMP-stimulated [Source:..."
3006,ENSMUSG00000113902,Ndufb1,102631912,NADH:ubiquinone oxidoreductase subunit B1 [Sou...


In [121]:
# Save the curated gene table.
# NOTE(review): the index is written as well (no index=False), unlike the
# KEGG_final.csv export; reading the file back adds an unnamed first column.
df.to_csv("mouse_genes_Entrez_IDs.csv")

In [None]:
# Reload the saved gene table.
# NOTE(review): without an explicit dtype, read_csv will likely parse
# "NCBI Entrez ID" as numbers, so string comparisons such as == "624367"
# would no longer match — verify before re-running the curation cells.
df = pd.read_csv("mouse_genes_Entrez_IDs.csv")

### Pathways

In [40]:
from bioservices import KEGG

In [None]:
# Client object for the KEGG service (from bioservices).
k = KEGG()

In [None]:
# "mmu" is the organism code also used as the prefix of the gene lookups
# (mmu:<Entrez ID>) in this notebook.
k.organism = "mmu"

In [None]:
import numpy as np

In [None]:
# Empty pathway column. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
df["associated_pathways"] = np.nan

In [44]:
# Save the table that the pathway lookup reads back in.
# NOTE(review): the index is written as well (no index=False), so the reloaded
# frame gains an unnamed first column — confirm that is intended.
df.to_csv("KEGG_intermediate.csv")

In [None]:
import time
import numpy as np
from bioservices import KEGG
import pandas as pd


def fetch_kegg_entry(kegg, gene_id, max_retries=3, delay=1.0):
    """Fetch and parse the KEGG entry ``mmu:<gene_id>``.

    Parameters
    ----------
    kegg : KEGG
        bioservices KEGG client.
    gene_id : str
        NCBI Entrez gene ID.
    max_retries : int
        Maximum number of attempts.
    delay : float
        Seconds to wait between attempts.

    Returns
    -------
    dict or None
        The parsed entry, or None when no parsable entry text was obtained.
    """
    for attempt in range(1, max_retries + 1):
        try:
            raw = kegg.get(f"mmu:{gene_id}")
        except Exception as exc:
            print(f"Request for gene {gene_id} failed (attempt {attempt}/{max_retries}): {exc}")
            raw = None

        if isinstance(raw, str):
            parsed = kegg.parse(raw)
            return parsed if isinstance(parsed, dict) else None
        # NOTE(review): bioservices appears to hand back the HTTP status code
        # as an int when a request fails (this notebook recorded type(data) as
        # int) — confirm. A 404 means the entry does not exist, so retrying
        # would only waste time.
        if raw == 404:
            return None
        if attempt < max_retries:
            time.sleep(delay)
    return None


# Read the Entrez IDs as text so they are never turned into floats
# (which would produce lookups such as "mmu:11818.0").
df = pd.read_csv("KEGG_intermediate.csv", dtype={"NCBI Entrez ID": str})

k = KEGG()
k.organism = "mmu"

# Counters
genes_without_pathways = 0
total_genes = len(df["NCBI Entrez ID"])
all_pathways = set()
warnings_count = 0  # genes for which no parsed KEGG entry was obtained
max_retries = 3  # Maximum number of attempts per gene

# One value per row, in row order: a list of pathway names or NaN. Collecting
# by position (instead of looking the gene up again) also fills every row of
# a gene ID that occurs more than once.
row_pathways = []

for processed_genes, gene in enumerate(df["NCBI Entrez ID"], start=1):
    entry = None if pd.isna(gene) else fetch_kegg_entry(k, gene, max_retries=max_retries)
    if entry is None:
        warnings_count += 1
        entry = {}

    pathway_map = entry.get("PATHWAY")
    if isinstance(pathway_map, dict):
        pathways = list(pathway_map.values())
        row_pathways.append(pathways)
        all_pathways.update(pathways)
    else:
        row_pathways.append(np.nan)
        genes_without_pathways += 1

    percent_without_pathways = (genes_without_pathways / processed_genes) * 100
    print(f"Processed gene {processed_genes} and {percent_without_pathways:.2f}% genes without pathways")

# An object-dtype column is required to hold Python lists in individual cells.
df["associated_pathways"] = pd.Series(row_pathways, index=df.index, dtype=object)

df.to_csv("KEGG_final.csv", index = False)

In [33]:
import logging

# Raise the "bioservices" logger to ERROR so that its warning-level records
# are no longer emitted.
logging.getLogger("bioservices").setLevel(logging.ERROR)

# (Leftover template placeholder: this cell contains no further code.)

In [40]:
# Scratch check of a single lookup; relies on `k`, the KEGG client created in
# another cell.
gene = 917
data = k.get(f"mmu:{gene}")
# NOTE(review): a nearby cell records type(data) as int, i.e. k.get() did not
# return entry text in that run — confirm k.parse() copes with such a value.
dict_data = k.parse(data)



In [23]:
# Inspect what k.get() returned; the recorded output is `int`, not text.
type(data)

int

In [35]:
 # `warnings` is not imported anywhere else in the notebook, so import it
 # here instead of relying on a star-import to provide the name.
 import warnings
 # Hide DeprecationWarning messages for the rest of the session.
 warnings.filterwarnings("ignore", category=DeprecationWarning)

In [39]:
from bioservices import *

# Create a custom filter to suppress specific warnings
class SuppressBioservicesWarnings(logging.Filter):
    """Logging filter that drops two specific bioservices messages.

    A record is rejected when its formatted message contains either
    'status is not ok with Not Found' or 'Could not parse the entry
    correctly'; every other record passes through.
    """

    def filter(self, record):
        """Return False for the suppressed messages, True otherwise."""
        message = record.getMessage()
        suppressed_fragments = (
            'status is not ok with Not Found',
            'Could not parse the entry correctly',
        )
        return not any(fragment in message for fragment in suppressed_fragments)

# Logger named 'bioservices'; relies on `logging` having been imported in
# another cell.
bioservices_logger = logging.getLogger('bioservices')

# Install the filter on that logger.
# NOTE(review): re-running this cell adds another filter instance each time.
bioservices_logger.addFilter(SuppressBioservicesWarnings())

### Attempt 2: pulling KEGG entries with `kegg_pull`

In [6]:
import kegg_pull.pull as p
# Try to pull one KEGG entry with kegg_pull, writing the result under
# pull-entries/.
# NOTE(review): the recorded output lists br:br08902 under "Failed Entry Ids",
# so this attempt did not retrieve the entry.
single_pull = p.SinglePull()
entry_ids = ['br:br08902']
pull_result = single_pull.pull(entry_ids=entry_ids, output='pull-entries/')
print(pull_result)

Successful Entry Ids: none
Failed Entry Ids: br:br08902
Timed Out Entry Ids: none
