In [1]:
import pandas as pd
import zipfile

# When a long frame is truncated for display, still show 100 rows of it.
pd.set_option("display.min_rows", 100)

In [3]:
# The download is named "*.gz", but the payload is actually a zip archive,
# so it is unpacked with zipfile rather than gzip.
fname = "ATH_GO_GOSLIM.txt"
fname_gz = fname + ".gz"

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile(fname_gz) as archive:
    archive.extractall()

In [4]:
# Column names, in file order, as listed in ATH_GO.README.txt.
columns = [
    "locus_name", "tair_acc", "obj_name",
    "rel_type", "go_term", "go_id", "tair_id",
    "aspect", "go_slim",
    "evidence_code", "evidence_desc", "evidence_with",
    "reference", "annotator", "date",
]

# Load the tab-separated annotation table into a dataframe.
# The first four physical lines are skipped, and because `names` is combined
# with `header=0`, the next line is consumed as a header row and replaced by
# `columns`.
# NOTE(review): that means five lines are discarded in total -- confirm that
# line 5 of the file is a column-header row and not the first annotation.
df0 = pd.read_csv(
    fname,
    sep="\t",
    skiprows=4,
    header=0,
    names=columns,
    index_col=False,  # never promote the first column to the index
)
df0.head()
    


Unnamed: 0,locus_name,tair_acc,obj_name,rel_type,go_term,go_id,tair_id,aspect,go_slim,evidence_code,evidence_desc,evidence_with,reference,annotator,date
0,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to oxidative stress,GO:0006979,6625,P,response to stress,IEA,traceable computational prediction,AGI_LocusCode:AT5G19875,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
1,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to abscisic acid,GO:0009737,11395,P,response to chemical,IEA,traceable computational prediction,AGI_LocusCode:AT4G27410,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
2,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,response to lipid,GO:0033993,28865,P,response to chemical,IEA,traceable computational prediction,AGI_LocusCode:AT4G27410|AGI_LocusCode:AT2G0299...,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
3,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,oxoacid metabolic process,GO:0043436,21524,P,other cellular processes,IEA,traceable computational prediction,AGI_LocusCode:AT5G63790,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14
4,AT1G01010,locus:2200935,AT1G01010,acts upstream of or within,defense response to other organism,GO:0098542,46569,P,response to external stimulus,IEA,traceable computational prediction,AGI_LocusCode:AT2G43510|AGI_LocusCode:AT4G1473...,Publication:501796011|PMID:34562334,klaasvdp,2022-11-14


In [5]:
# Only the locus identifier and its GO term are needed from here on.
select_columns = ["locus_name", "go_term"]

# Loci of interest.
genes = [
    "AT1G49570", "AT3G00400", "AT2G14610", "AT2G44240", "AT3G28510",
    "AT1G28590", "AT1G14880", "AT5G58180", "AT1G28005", "AT3G22238",
    "AT3G22234", "AT1G65483", "AT4G10860", "AT2G29250", "AT4G05250",
]

df = (
    df0.loc[:, select_columns]  # keep gene name and function
    .drop_duplicates()  # one row per distinct (locus, GO term) pair
    .loc[lambda frame: frame["locus_name"].isin(genes)]  # restrict to the loci above
)

df

Unnamed: 0,locus_name,go_term
26632,AT1G14880,defense response to fungus
26633,AT1G14880,response to wounding
26635,AT1G14880,response to salicylic acid
26636,AT1G14880,defense response to bacterium
26637,AT1G14880,mRNA binding
26638,AT1G14880,regulation of defense response
26639,AT1G14880,plant-type vacuole
26642,AT1G14880,response to inorganic substance
26644,AT1G14880,extracellular region
26645,AT1G14880,plant organ senescence


In [6]:
# Arrange the annotations so that rows sharing a GO term sit together.
#
# This used to be written as an identity `groupby("go_term").apply(lambda x: x)`.
# That form operates on the grouping column inside `apply`, which pandas 2.2
# deprecates; from pandas 3.0 the grouping column is excluded, so `go_term`
# would disappear from `dfg`. An explicit sort produces the same table without
# relying on that behaviour.
dfg = (
    df.dropna(subset=["go_term"])  # groupby dropped rows with a missing key
    .sort_values("go_term", kind="stable")  # stable: original order kept within a term
)

# Rebuild the two-level index the groupby/apply form produced:
# (running group number in sorted go_term order, original row label).
dfg.index = pd.MultiIndex.from_arrays(
    [dfg.groupby("go_term").ngroup(), dfg.index]
)
dfg

Unnamed: 0,Unnamed: 1,locus_name,go_term
0,427399,AT5G58180,Golgi apparatus
1,427398,AT5G58180,SNAP receptor activity
2,427395,AT5G58180,SNARE complex
3,127262,AT2G14610,activation of immune response
4,48046,AT1G28005,biological_process
4,236361,AT3G28510,biological_process
4,287775,AT4G10860,biological_process
5,65692,AT1G49570,cell wall
6,287774,AT4G10860,cellular_component
7,150849,AT2G29250,chloroplast


In [7]:
# Loci annotated with one specific GO term.
dfg.loc[dfg["go_term"].eq("defense response"), "locus_name"].tolist()

['AT1G28590', 'AT2G14610']