In [80]:
from goatools import obo_parser
import os
import wget
import pandas as pd
import re

In [81]:
# Location of the Gene Ontology "basic" OBO release and the current working directory.
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
folder = os.getcwd()

# Reuse a local copy of the ontology when one exists; otherwise download it
# into the working directory and keep the filename wget reports.
if os.path.isfile('./go-basic.obo'):
    go_obo = 'go-basic.obo'
else:
    go_obo = wget.download(go_obo_url, 'go-basic.obo')

In [82]:
# Parse the OBO file into a GODag. Later cells use it as a mapping:
# go.values() yields term objects and go[term_id] looks one up by GO ID.
go = obo_parser.GODag(go_obo)

go-basic.obo: fmt(1.2) rel(2021-06-16) 47,230 GO Terms


In [83]:
# Accumulator for the GO term IDs of interest; filled by the keyword match
# and then extended with the descendants of every matched term.
relevant_go_terms = set()

In [84]:
# Case-insensitive, whole-word match for the ageing/stress-related keywords
# that are searched for in GO term names.
regex = re.compile(
    r"\b(aging|longevity|lifespan|senescence|stress|ageing|cell death|age)\b",
    flags=re.IGNORECASE,
)

In [85]:
# Seed the selection with every GO term whose name contains one of the keywords.
relevant_go_terms.update(
    candidate.id for candidate in go.values() if regex.search(candidate.name)
)

In [86]:
# Holds one get_all_children() result per keyword-matched GO term
# (populated by the following cell).
all_children = []

In [87]:
# Gather the full descendant collection of every keyword-matched term.
all_children.extend(
    go[matched_id].get_all_children() for matched_id in relevant_go_terms
)

In [88]:
# Fold every collected descendant ID into the selection of relevant GO terms.
for descendant_ids in all_children:
    relevant_go_terms.update(descendant_ids)

In [89]:
from Bio.UniProt.GOA import gafiterator
import gzip

filename = './data/datastore/wb.gaf.gz'

# Collect the WormBase gene IDs that are annotated with at least one of the
# selected GO terms in the gzipped GAF annotation file.
relevant_genes = set()
with gzip.open(filename, 'rt') as fp:
    for annotation in gafiterator(fp):
        gene_id = annotation['DB_Object_ID']
        # Keep only identifiers that start with the "WBGene" prefix.
        if not gene_id.startswith("WBGene"):
            continue
        # A "NOT" qualifier states that the gene product is *not* associated
        # with the GO term, so such records must not count as a positive hit.
        if "NOT" in annotation['Qualifier']:
            continue
        if annotation['GO_ID'] in relevant_go_terms:
            relevant_genes.add(gene_id)

In [90]:
# Load the raw expression table that the GO-based gene filter is applied to.
gene_expression_data = pd.read_csv(
    filepath_or_buffer="../common_datastore/raw_gene_expression_no_outliers_and_singles_for_GO_filtering.csv"
)

In [91]:
# Name the first column "Sample" and use it as the row index.
# The guard makes the cell safe to re-run: without it, a second execution
# would rename the first remaining data column to "Sample" and consume it as
# the index, silently dropping the real row labels.
if gene_expression_data.index.name != "Sample":
    gene_expression_data.columns = ['Sample'] + list(gene_expression_data.columns[1:])
    gene_expression_data = gene_expression_data.set_index("Sample")

In [92]:
# Keep only the selected genes that are present in the expression table.
# The IDs are sorted because iterating a set of strings yields an order that
# can change between interpreter runs (string hash randomisation), which made
# the row order of the filtered output non-reproducible.
relevant_genes = sorted(
    gene for gene in relevant_genes if gene in gene_expression_data.index
)

In [93]:
# Select the rows of the selected genes, keeping every column.
filtered = gene_expression_data.loc[relevant_genes]

In [94]:
# Persist the GO-filtered expression table for downstream notebooks.
filtered.to_csv(
    path_or_buf="../common_datastore/GO_filtered_raw_gene_expression_data.csv"
)