<a href="https://colab.research.google.com/github/malcolmfisher103/Bioinformatic-Scripts/blob/main/Xenopus_GO_enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies. graphviz and its headers must be present before
# pygraphviz is built, so the order of these lines matters.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install goenrich
# -y answers apt's confirmation prompt so the cell never waits for input.
!apt-get install -y graphviz
!apt-get install -y graphviz-dev
%pip install pygraphviz
# -p makes the cell re-runnable: no error if db/ already exists.
!mkdir -p db
# Ontology
!wget http://purl.obolibrary.org/obo/go/go-basic.obo -O db/go-basic.obo
# UniprotACC
#!wget http://geneontology.org/gene-associations/goa_human.gaf.gz -O db/gene_association.goa_human.gaf.gz
# Yeast SGD
#!wget http://downloads.yeastgenome.org/curation/literature/gene_association.sgd.gz -O db/gene_association.sgd.gz
# Entrez GeneID
#!wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz -O db/gene2go.gz
# Xenopus Xenbase
!wget https://download.xenbase.org/xenbase/GenePageReports/xenbase.gaf.gz -O db/xenbase.gaf.gz
# Uncomment to preview the first lines of the downloaded annotation file.
#!cat db/xenbase.gaf.gz | gunzip|head -n45

In [23]:
# @title Query gene set
# Gene symbols to analyse; this list is used as the enrichment query further down.
# NOTE(review): 'mix1', 'wnt8a' and 'tbxt' each appear twice. Presumably harmless
# if the enrichment step treats the query as a set -- confirm.
gene_set = ['mix1','LOC121398783','mix1','bix1.3','has2','wnt8a','tbxt','kcnk6','LOC121395429','foxa4','eomes','wnt8a','LOC121399130','tbxt','bix1.2','vegt','hes7.2','bix1.1','mespb'] # @param {type:"raw"}
print(gene_set)

['mix1', 'LOC121398783', 'mix1', 'bix1.3', 'has2', 'wnt8a', 'tbxt', 'kcnk6', 'LOC121395429', 'foxa4', 'eomes', 'wnt8a', 'LOC121399130', 'tbxt', 'bix1.2', 'vegt', 'hes7.2', 'bix1.1', 'mespb']


In [25]:
import gzip
import pandas as pd
import goenrich

# Build the ontology from the OBO file downloaded into db/ by the setup cell.
# `O` is the object that is later annotated with the background gene sets and
# passed to the enrichment analysis.
O = goenrich.obo.ontology('db/go-basic.obo')

# Path to the gzip-compressed GAF annotation file used as the background.
# Swap in the commented-out line below to use the human GAF instead; the
# matching wget line in the setup cell must then be enabled as well.
xenbase_gaf_path = 'db/xenbase.gaf.gz'
#xenbase_gaf_path = 'db/gene_association.goa_human.gaf.gz'

# Read the Xenbase GAF file
def read_xenbase_gaf(file_path, allowed_taxon_ids=('8355', '8364', '9606')):
    """Stream annotation rows from a gzip-compressed GAF file, keeping selected taxa.

    Lines starting with '!' (header/comment lines) are skipped. Blank or
    truncated lines that have no taxon column are skipped as well instead of
    raising ``IndexError``.

    The taxon column (0-based index 12, i.e. the 13th tab-separated field) is
    rewritten to contain only the bare IDs listed in ``allowed_taxon_ids``
    (for example 'taxon:8364' becomes '8364'), joined with '|'. Rows in which
    none of the taxon IDs is allowed are dropped.

    Args:
        file_path: Path to the gzip-compressed GAF file.
        allowed_taxon_ids: Taxon IDs to keep, as strings without the 'taxon:'
            prefix. A single string is treated as one ID. The default keeps
            the three IDs this notebook refers to ('8355' laevis,
            '8364' trop, '9606' human).

    Yields:
        str: The row re-joined with tabs, with the taxon column rewritten.
    """
    taxon_index = 12

    # A bare string would otherwise be exploded into a set of single characters.
    if isinstance(allowed_taxon_ids, str):
        allowed = {allowed_taxon_ids}
    else:
        allowed = set(allowed_taxon_ids)

    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('!'):
                continue
            fields = line.strip().split('\t')
            if len(fields) <= taxon_index:
                # Blank or malformed line: there is no taxon column to inspect.
                continue
            # Each entry looks like 'taxon:8364'; keep only the part after the colon.
            taxon_ids = [item.split(':')[-1] for item in fields[taxon_index].split('|')]
            kept_ids = [taxon_id for taxon_id in taxon_ids if taxon_id in allowed]
            if kept_ids:
                fields[taxon_index] = '|'.join(kept_ids)
                yield '\t'.join(fields)

# Extract taxon information from the file
def extract_taxon(file_path, taxon_column=12):
    """Collect the taxon IDs that occur in a gzip-compressed GAF file.

    Lines starting with '!' are skipped. Blank or truncated lines that do not
    reach ``taxon_column`` are skipped too, rather than raising ``IndexError``.
    When the taxon field lists several IDs separated by '|', only the first
    one is recorded. The 'taxon:' prefix is removed when present.

    Args:
        file_path: Path to the gzip-compressed GAF file.
        taxon_column: 0-based index of the taxon field. The default, 12, is
            the 13th tab-separated column.

    Returns:
        set[str]: The distinct taxon IDs found, without the 'taxon:' prefix.
    """
    taxon_ids = set()
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.startswith('!'):
                continue
            fields = line.strip().split('\t')
            if len(fields) <= taxon_column:
                # Blank or malformed line: nothing to extract.
                continue
            first_entry = fields[taxon_column].split('|')[0]
            # [-1] also copes with a field that lacks the 'taxon:' prefix.
            taxon_id = first_entry.split('taxon:')[-1]
            if taxon_id:
                taxon_ids.add(taxon_id)
    return taxon_ids

# Taxon to use as the enrichment background: '8355' for laevis, '8364' for trop.
# Run extract_taxon(xenbase_gaf_path) to see which taxon IDs the file contains.
desired_taxon_ids = '8364'
#desired_taxon_ids = '9606' #This should be used if the human gaf is being used.

# Normalise the setting to a set of IDs. Testing `x in '8364'` against the raw
# string would be a substring check (so '836' or '' would match); a set gives
# exact matching and also allows several taxa to be supplied as a list.
if isinstance(desired_taxon_ids, str):
    wanted_taxa = {desired_taxon_ids}
else:
    wanted_taxa = set(desired_taxon_ids)

# Read the Xenbase GAF file: one list of tab-separated fields per annotation row
annot = [line.strip().split('\t') for line in read_xenbase_gaf(xenbase_gaf_path)]

# Filter the rows on the taxon column (index 12). The column may hold several
# '|'-separated IDs; the row is kept when the first one is a wanted taxon.
annot_filtered = [entry for entry in annot if entry[12].split('|')[0] in wanted_taxa]
print("Taxon IDs in entries:", set(entry[12] for entry in annot))  # Debug print statement
print("Taxon IDs extracted from file:", desired_taxon_ids)  # Debug print statement
print("Number of entries before filtering:", len(annot))  # Debug print statement
print("Number of entries after filtering:", len(annot_filtered))  # Debug print statement

# An empty background cannot yield a meaningful enrichment; stop with a clear message.
if not annot_filtered:
    raise ValueError(
        "No annotations left after filtering for taxon ID(s) %s in %s"
        % (sorted(wanted_taxa), xenbase_gaf_path)
    )

# Convert the filtered annotations to a DataFrame
annot_df = pd.DataFrame(annot_filtered, columns=['DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID', 'DB_Reference', 'Evidence_Code', 'With', 'Aspect', 'DB_Object_Name', 'DB_Object_Synonym', 'DB_Object_Type', 'Taxon', 'Date', 'Assigned_By', 'Annotation_Extension'])

# Map each GO term to the set of gene symbols annotated with it
values = {k: set(v) for k, v in annot_df.groupby('GO_ID')['DB_Object_Symbol']}

# Propagate the background through the ontology
background_attribute = 'annot'
goenrich.enrich.propagate(O, values, background_attribute)

# Extract some list of entries as example query
# query = annot_df['DB_Object_Symbol'].unique()[:100]
query = gene_set
#query = [item.upper() for item in query] #this should be used if the human gaf set is the background.
# For additional export to Graphviz, specify the gvfile argument
# The show argument keeps the graph reasonably small
df = goenrich.enrich.analyze(O, query, background_attribute, gvfile='test.dot')

# Generate HTML: the first rows of the result table that contain no missing values
df.dropna().head().to_html('example2.html')

# Call Graphviz to render the exported graph; check_call raises if `dot` fails
import subprocess
subprocess.check_call(['dot', '-Tpng', 'test.dot', '-o', 'test.png'])


Taxon IDs in entries: {'8355', '8364'}
Taxon IDs extracted from file: 8364
Number of entries before filtering: 346733
Number of entries after filtering: 159607


0