In [3]:
taxid = 332648

- Where are the other gene IDs
- Do this by leiden cluster to see biological underpinning

In [1]:
import pandas as pd
import scanpy as sc
import pickle


adata = sc.read("lyon_transcriptomics_scanpy_normalised.h5")
df_meta = adata.obs
gene_names = adata.var.index
adata = None

gene_mapping = pickle.load(open("gene_mapping_botrytis.pkl", "rb"))

In [19]:
import requests
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to query UniProt for accession IDs based on gene names
def get_uniprot_data(gene_name, retries=3):
    url = f"https://rest.uniprot.org/uniprotkb/search?query={gene_name}"
    
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)  # Set timeout
            response.raise_for_status()  # Raise an error for bad responses
            return response.json()  # Return the JSON response
        except requests.Timeout:
            print(f"Timeout occurred for {gene_name}. Retrying... ({attempt + 1}/{retries})")
        except requests.RequestException as e:
            print(f"Request failed for {gene_name}: {e}")
            return None
    print(f"Failed to retrieve data for {gene_name} after {retries} attempts.")
    return None

# Function to process each gene's UniProt data
def process_gene(gene):
    uniprot_data = get_uniprot_data(gene)
    results = {}

    if uniprot_data and "results" in uniprot_data and uniprot_data["results"]:
        for entry in uniprot_data["results"]:
            # Extract UniProt ID
            uniprot_id = entry["primaryAccession"]
            
            # Extract gene symbols
            
            if "geneName" in entry["genes"][0]:
                gene_symbol = entry["genes"][0]["geneName"]["value"]
            else:
                gene_symbol = gene

            # Extract GO IDs and descriptions
            go_terms = []
            for go in entry.get("uniProtKBCrossReferences", []):
                if go["database"] == "GO":
                    # Extract the GO ID
                    go_id = go["id"]

                    # Extract the description from properties
                    description = "No description available"
                    for prop in go.get("properties", []):
                        if prop["key"] == "GoTerm":
                            description = prop["value"]

                    go_terms.append((go_id, description))

            # Store results
            results[gene] = {
                "uniprot_id": uniprot_id,
                "go_terms": go_terms
            }
    else:
        results[gene] = "No data found"
        gene_symbol = gene

    return results, gene_symbol

def download_gos_from_uniprot(gene_names):
    results = {}
    ensemble2symbol = {}

    # Use ThreadPoolExecutor to make concurrent requests
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_gene = {executor.submit(process_gene, gene): gene for gene in gene_names}
        
        for future in as_completed(future_to_gene):
            gene = future_to_gene[future]
            #try
            data, gene_symbol = future.result()
            results.update(data)
            if gene_symbol != gene:
                ensemble2symbol[gene] = gene_symbol
                
            #except Exception as e:
            #     print(f"Error processing {gene}: {e}")
    
    ensemble2go_term = {}
    for gene_id, values in results.items():
        if values == "No data found":
            ensemble2go_term[gene_id] = {
                "go_terms": set()
            }
        else:
            ensemble2go_term[gene_id] = {
                "go_terms": set([x[0] for x in values["go_terms"]])
            }
    return ensemble2go_term, ensemble2symbol

In [20]:
#ensemble2go_term, ensemble2symbol = download_gos_from_uniprot(gene_names)

In [21]:
#ensemble2go_term = download_gos_from_uniprot(all_gene_names)
#pickle.dump(ensemble2go_term, open("botrytis_ensemble2go_term.pkl", "wb"))

In [22]:
ensemble2go_term = pickle.load(open("botrytis_ensemble2go_term.pkl", "rb"))

In [23]:
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader

obodag = GODag("/home/ec2-user/picassoxbridgedata_coupled/go.obo" )

/home/ec2-user/picassoxbridgedata_coupled/go.obo: fmt(1.2) rel(2024-11-03) 43,983 Terms


In [24]:
category_mapping = {
    "biological_process": "BP",
    "molecular_function": "MF",
    "cellular_component": "CC"
}
ensemble2go_term_by_category = {category: {} for category in category_mapping.values()}

for gene, go_terms in ensemble2go_term.items():
    for go_term in go_terms:
        if go_term not in obodag:
            print(f"GO term {go_term} not found in OBO DAG")
            continue
        category = category_mapping[obodag[go_term].namespace]
        if gene not in ensemble2go_term_by_category[category]:
            ensemble2go_term_by_category[category][gene] = set()
        ensemble2go_term_by_category[category][gene].add(go_term)

GO term GO:0045226 not found in OBO DAG
GO term GO:0061483 not found in OBO DAG
GO term GO:0061780 not found in OBO DAG


In [28]:
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

# Perform GO enrichment analysis

goeaobj = GOEnrichmentStudyNS(
    gene_names,  # List of population gene IDs
    ensemble2go_term_by_category,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=["fdr_bh"],  # default multipletest correction method
)


Load BP Ontology Enrichment Analysis ...
 39%  4,493 of 11,388 population items found in association

Load CC Ontology Enrichment Analysis ...
 53%  6,067 of 11,388 population items found in association

Load MF Ontology Enrichment Analysis ...
 53%  6,033 of 11,388 population items found in association


In [43]:
import re
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
import os

def go_per_target(target):

    sig_table = f"differential_expression_tables/{target}_differential_expression.csv"
    if not os.path.isfile(sig_table):
        print(f"Missing differential expression table for {target}")
        return
    sig_table = pd.read_csv(sig_table, index_col=0)
    
    target = re.sub(r'\W+', '_', target)

    if os.path.isfile(f"go_plots/{target}_GO_enrichment_terms.png"):
        return
    
    sig_hits = sig_table[
        (sig_table["padj"] < 0.05) & (sig_table["log2FoldChange"].abs() > 1)
    ]
    goea_results_all = goeaobj.run_study( sig_hits.index, prt=None)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
    goeaobj.wr_xlsx(f"go_plots/{target}.xlsx", goea_results_sig)
    plot_results(f"go_plots/{target}_GO_enrichment.png", goea_results_sig)
    go_ids = [x.__dict__["GO"] for x in goea_results_sig] 
    plot_gos(
        f"go_plots/{target}_GO_enrichment_terms.png", 
        go_ids, # Source GO ids
        obodag,
        goea_results=goea_results_sig, # use pvals for coloring
        # We can further configure the plot...
        id2symbol=gene_mapping, # Print study gene Symbols, not Entrez GeneIDs
        study_items=6, # Only only 6 gene Symbols max on GO terms
        items_p_line=5, # Print 3 genes per line
        title = f"{target} GO enrichment",
    )
    


In [30]:
leiden_dict = pickle.load(open("leiden_dict_lyon.pkl", "rb"))

In [31]:
df_meta["Metadata_leiden"] = df_meta["Metadata_treatments"].map(leiden_dict)

In [33]:
all_targets = df_meta[df_meta["Metadata_target"] != "DMSO"]["Metadata_leiden"].unique()

In [36]:
# sig_table = pd.read_csv(f"differential_expression_tables/{target}_differential_expression.csv", index_col=0)
# target = re.sub(r'\W+', '_', target)
# sig_hits = sig_table[
#     (sig_table["padj"] < 0.05) & (sig_table["log2FoldChange"].abs() > 1)
# ]

# gene_symbols = [ensemble2symbol[x] for x in sig_hits.index if x in ensemble2symbol]
# all_gene_symbols = [ensemble2symbol[x] for x in gene_names if x in ensemble2symbol]

In [44]:
list(map(go_per_target, all_targets))

Missing differential expression table for Leiden cluster 7
Missing differential expression table for Leiden cluster 4
     25 items WROTE: go_plots/Leiden_cluster_12.xlsx
   25 usr  95 GOs  WROTE: go_plots/Leiden_cluster_12_GO_enrichment.png
   25 usr  95 GOs  WROTE: go_plots/Leiden_cluster_12_GO_enrichment_terms.png
      8 items WROTE: go_plots/Leiden_cluster_13.xlsx
    8 usr  28 GOs  WROTE: go_plots/Leiden_cluster_13_GO_enrichment.png
    8 usr  28 GOs  WROTE: go_plots/Leiden_cluster_13_GO_enrichment_terms.png
    0 usr   0 GOs  WROTE: go_plots/Leiden_cluster_6_GO_enrichment.png
    0 usr   0 GOs  WROTE: go_plots/Leiden_cluster_6_GO_enrichment_terms.png
      7 items WROTE: go_plots/Leiden_cluster_8.xlsx
    7 usr  29 GOs  WROTE: go_plots/Leiden_cluster_8_GO_enrichment.png
    7 usr  29 GOs  WROTE: go_plots/Leiden_cluster_8_GO_enrichment_terms.png
      3 items WROTE: go_plots/Leiden_cluster_5.xlsx
    3 usr  13 GOs  WROTE: go_plots/Leiden_cluster_5_GO_enrichment.png
    3 usr  13 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [45]:
os.system("pwd")

/home/ec2-user/picassoxbridgedata_coupled


0