# Find genes with a 1-star UniProt annotation score that have experimental evidence in GO

In [1]:
import pandas as pd

In [3]:
# UniProt table export for proteome UP000000625 (obtained by searching UniProt
# for that proteome ID and exporting with a custom set of columns).
uniprot = (
    pd.read_table(
        '../sources/uniprot/proteome_UP000000625_110618.tsv.gz',
        compression='gzip',
    )
    # Give the two columns used by the merge below snake_case names.
    .rename(columns={'Entry': 'uniprot_id', 'Annotation': 'annotation_score'})
)

In [6]:
# GO annotation export from QuickGO (https://www.ebi.ac.uk/QuickGO), filtered by:
# - taxonId=83333 (taxonUsage=descendants)
# - geneProductSubset=Swiss-Prot, geneProductType=protein
# - evidenceCode=ECO:0000315,ECO:0007005,ECO:0006056,ECO:0000314,ECO:0000269
#   (evidenceCodeUsage=descendants, so descendant ECO terms are returned as well)
# - qualifier=enables
# Query URL:
# https://www.ebi.ac.uk/QuickGO/annotations?taxonId=83333&taxonUsage=descendants&geneProductSubset=Swiss-Prot&geneProductType=protein&evidenceCode=ECO:0000315,ECO:0007005,ECO:0006056,ECO:0000314,ECO:0000269&evidenceCodeUsage=descendants&qualifier=enables=
eco = (
    pd.read_table(
        '../sources/uniprot/go-evidence-20181106.tsv.gz',
        compression='gzip',
    )
    # Map the QuickGO export headers onto snake_case names; 'uniprot_id'
    # matches the key column name used for the UniProt table.
    .rename(
        columns={
            'GENE PRODUCT ID': 'uniprot_id',
            'SYMBOL': 'primary_name',
            'GO NAME': 'go_name',
            'ECO ID': 'eco_id',
        }
    )
)

In [7]:
# Attach each UniProt entry's annotation score to its GO evidence rows.
# 'uniprot_id' is the only column the two selections share, so naming it as
# the key (with an inner join) spells out what the defaults already do.
df = uniprot[['uniprot_id', 'annotation_score']].merge(
    eco[['uniprot_id', 'go_name', 'eco_id', 'primary_name']],
    on='uniprot_id',
    how='inner',
)

In [8]:
# Show the merged rows whose annotation score is 1 (scores are text such as
# "1 out of 5"). The match is anchored at the start of the string rather than
# searching for the digit anywhere: every score ends in "out of 5", so an
# unanchored search is only correct by luck of which digit is asked for.
# na=False leaves rows with a missing score out of the result instead of
# producing a NaN mask, which cannot be used as a boolean index.
df[df.annotation_score.str.startswith('1 out of', na=False)]

Unnamed: 0,uniprot_id,annotation_score,go_name,eco_id,primary_name
5224,Q46906,1 out of 5,protein binding,ECO:0000353,ygcP
5225,Q46906,1 out of 5,protein binding,ECO:0000353,ygcP
5275,P64455,1 out of 5,protein binding,ECO:0000353,ydcY
5323,P0AFT8,1 out of 5,protein binding,ECO:0000353,yeiW
5337,P0ADX5,1 out of 5,protein binding,ECO:0000353,yhfG
5338,P39336,1 out of 5,protein binding,ECO:0000353,yjgL
5339,P39336,1 out of 5,protein binding,ECO:0000353,yjgL
5340,P39336,1 out of 5,protein binding,ECO:0000353,yjgL
5353,P0A8L7,1 out of 5,protein binding,ECO:0000353,yciU
5370,P0ADX7,1 out of 5,protein binding,ECO:0000353,yhhA


P38394 / ydaE is a good example of the challenges of using just ECO for determining y-ome membership.

In [9]:
# Show the merged rows whose annotation score is 2 (scores are text such as
# "2 out of 5"). The match is anchored at the start of the string rather than
# searching for the digit anywhere: every score ends in "out of 5", so an
# unanchored search is only correct by luck of which digit is asked for.
# na=False leaves rows with a missing score out of the result instead of
# producing a NaN mask, which cannot be used as a boolean index.
df[df.annotation_score.str.startswith('2 out of', na=False)]

Unnamed: 0,uniprot_id,annotation_score,go_name,eco_id,primary_name
171,P77188,2 out of 5,protein binding,ECO:0000353,ecpB
172,P77188,2 out of 5,protein binding,ECO:0000353,ecpB
173,P77802,2 out of 5,protein binding,ECO:0000353,ecpC
276,P0AC92,2 out of 5,identical protein binding,ECO:0000353,gnsA
370,P0AAN1,2 out of 5,protein binding,ECO:0000353,hybE
371,P0AAN1,2 out of 5,protein binding,ECO:0000353,hybE
372,P0AAN1,2 out of 5,protein binding,ECO:0000353,hybE
373,P0AAN1,2 out of 5,protein binding,ECO:0000353,hybE
374,P0AAN1,2 out of 5,preprotein binding,ECO:0000353,hybE
375,P0AAN1,2 out of 5,preprotein binding,ECO:0000353,hybE
