In [1]:
# requirements
# !pip install pandas tqdm

In [2]:
import pandas as pd
import requests as req
import zipfile


from tqdm import tqdm
from pathlib import Path

pd.options.display.min_rows = 100

In [3]:
# download file
url = "https://www.arabidopsis.org/download_files/GO_and_PO_Annotations/Gene_Ontology_Annotations/ATH_GO_GOSLIM.txt.gz"
fname = "ATH_GO_GOSLIM.txt.gz"

# only download if not already done
if not Path(f"./{fname}").is_file():
    chunk_size = 1024
    resp = req.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))

    with open (fname, "wb+") as f, tqdm(desc=fname, total=total, unit="iB", unit_scale=True, unit_divisor=1024) as bar:
        for data in resp.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)



In [4]:
# extract txt, file extension is erroneously gz, but its actually a zip file :)
with zipfile.ZipFile(fname, 'r') as z:
    z.extractall(".")

In [5]:
# from ATH_GO.README.txt
columns = [
    "locus_name",
    "tair_acc",
    "obj_name",
    "rel_type",
    "go_term",
    "go_id",
    "tair_id",
    "aspect",
    "go_slim",
    "evidence_code",
    "evidence_desc",
    "evidence_with",
    "reference",
    "annotator",
    "date"
]

fname_txt = "ATH_GO_GOSLIM.txt"

# read into dataframe
df0 = pd.read_csv(fname_txt, sep="\t", names=columns, skiprows=[0,1,2,3], index_col=False, header=0)
    


In [6]:
df0

Unnamed: 0,locus_name,tair_acc,obj_name,rel_type,go_term,go_id,tair_id,aspect,go_slim,evidence_code,evidence_desc,evidence_with,reference,annotator,date
0,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to oxidative stress,GO:0006979,6625,P,response to stress,IEA,traceable computational prediction,AGI_LocusCode:AT5G19875,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
1,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to abscisic acid,GO:0009737,11395,P,response to chemical,IEA,traceable computational prediction,AGI_LocusCode:AT4G27410,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
2,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to lipid,GO:0033993,28865,P,response to chemical,IEA,traceable computational prediction,AGI_LocusCode:AT4G27410|AGI_LocusCode:AT2G0299...,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
3,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,oxoacid metabolic process,GO:0043436,21524,P,other cellular processes,IEA,traceable computational prediction,AGI_LocusCode:AT5G63790,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
4,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,defense response to other organism,GO:0098542,46569,P,response to external stimulus,IEA,traceable computational prediction,AGI_LocusCode:AT2G43510|AGI_LocusCode:AT4G1473...,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
5,AT1G01010,locus:2200935,AT1G01010,involved in,regulation of DNA-templated transcription,GO:0006355,7461,P,other cellular processes,IEA,none,InterPro:IPR003441|InterPro:IPR036093,AnalysisReference:501756966,,2023-09-23
6,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to inorganic substance,GO:0010035,14767,P,response to chemical,IEA,traceable computational prediction,AGI_LocusCode:AT3G16857|AGI_LocusCode:AT4G2741...,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
7,AT1G01010,locus:2200935,AT1G01010,involved in,regulation of DNA-templated transcription,GO:0006355,7461,P,biosynthetic process,IEA,none,InterPro:IPR003441|InterPro:IPR036093,AnalysisReference:501756966,,2023-09-23
8,AT1G01010,locus:2200935,AT1G01010,involved in,regulation of DNA-templated transcription,GO:0006355,7461,P,other metabolic processes,IEA,none,InterPro:IPR003441|InterPro:IPR036093,AnalysisReference:501756966,,2023-09-23
9,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,regulation of DNA-templated transcription,GO:0006355,7461,P,biosynthetic process,ISS,none,,Publication:1345963|PMID:11118137,,2021-04-01


In [7]:
select_columns = ["locus_name", "go_term"]
genes = [
    "AT1G49570",
    "AT3G00400",
    "AT2G14610",
    "AT2G44240",
    "AT3G28510",
    "AT1G28590",
    "AT1G14880",
    "AT5G58180",
    "AT1G28005",
    "AT3G22238",
    "AT3G22234",
    "AT1G65483",
    "AT4G10860",
    "AT2G29250",
    "AT4G05250"
]



df = df0[select_columns] # select gene name and function
df = df.drop_duplicates() # drop duplicates
df = df.query("locus_name in @genes") # select the genes
df = df.reset_index(drop=True)

df

Unnamed: 0,locus_name,go_term
0,AT1G14880,defense response to fungus
1,AT1G14880,response to wounding
2,AT1G14880,response to salicylic acid
3,AT1G14880,defense response to bacterium
4,AT1G14880,mRNA binding
5,AT1G14880,regulation of defense response
6,AT1G14880,plant-type vacuole
7,AT1G14880,response to inorganic substance
8,AT1G14880,extracellular region
9,AT1G14880,plant organ senescence


In [8]:
dfg = df.groupby("go_term", as_index=False).apply(lambda x: x)
dfg

Unnamed: 0,Unnamed: 1,locus_name,go_term
0,70,AT5G58180,Golgi apparatus
1,69,AT5G58180,SNAP receptor activity
2,67,AT5G58180,SNARE complex
3,39,AT2G14610,activation of immune response
4,13,AT1G28005,biological_process
4,56,AT3G28510,biological_process
4,66,AT4G10860,biological_process
5,20,AT1G49570,cell wall
6,65,AT4G10860,cellular_component
7,47,AT2G29250,chloroplast


In [12]:
# query by go_term
list(dfg[dfg["go_term"] == "defense response"]["locus_name"])

['AT1G28590', 'AT2G14610']