## Return all GO* annotations for BRCA2

In [66]:
import pandas as pd
import requests
from wikidataintegrator import wdi_core
pd.set_option('display.max_colwidth', -1)

### Get results merging MF, CC, BP together

In [67]:
query = """SELECT ?hgnc ?protein ?go ?goLabel ?goId
WHERE
{
  values ?hgnc {"FANCA" "FANCB" "FANCC" "FANCE" "FANCF" "FANCG" "FANCL" "FANCM" "FANCD2" "FANCI" "UBE2T" "FANCD1" "(BRCA2)" "FANCJ" "FANCN" "FANCO" "FANCP" "FANCQ" "FANCR" "FANCS" "FANCV" "FANCU" "FAAP100" "FAAP24" "FAAP20" "FAAP16" "(MHF1)" "FAAP10" "(MHF2)"}
  ?gene wdt:P353 ?hgnc .  # get gene items with these HGNC symbols
  ?gene wdt:P688 ?protein . # get the protein
  ?protein wdt:P680|wdt:P681|wdt:P682 ?go . # get GO terms
  ?go wdt:P686 ?goId
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}"""
d = wdi_core.WDItemEngine.execute_sparql_query(query)

In [68]:
df = pd.DataFrame([{k:v['value'] for k,v in x.items()} for x in d['results']['bindings']])

In [69]:
df

Unnamed: 0,go,goId,goLabel,hgnc,protein
0,http://www.wikidata.org/entity/Q29548,GO:0005886,plasma membrane,FANCG,http://www.wikidata.org/entity/Q21101237
1,http://www.wikidata.org/entity/Q30869,GO:0005730,nucleolus,FANCD2,http://www.wikidata.org/entity/Q21100488
2,http://www.wikidata.org/entity/Q30869,GO:0005730,nucleolus,FANCG,http://www.wikidata.org/entity/Q21101237
3,http://www.wikidata.org/entity/Q30869,GO:0005730,nucleolus,UBE2T,http://www.wikidata.org/entity/Q21135169
4,http://www.wikidata.org/entity/Q30869,GO:0005730,nucleolus,UBE2T,http://www.wikidata.org/entity/Q21150902
5,http://www.wikidata.org/entity/Q37748,GO:0005694,chromosome,FAAP20,http://www.wikidata.org/entity/Q21101278
6,http://www.wikidata.org/entity/Q39572,GO:0005739,mitochondrion,FANCG,http://www.wikidata.org/entity/Q21101237
7,http://www.wikidata.org/entity/Q40260,GO:0005634,nucleus,FANCD2,http://www.wikidata.org/entity/Q21100488
8,http://www.wikidata.org/entity/Q40260,GO:0005634,nucleus,FANCI,http://www.wikidata.org/entity/Q21101224
9,http://www.wikidata.org/entity/Q40260,GO:0005634,nucleus,FANCG,http://www.wikidata.org/entity/Q21101237


In [71]:
def f(x):
     return pd.Series(dict(goLabel = list(x['goLabel'])[0], 
                        hgnc = ','.join(x['hgnc']),
                          count = len(x)))

In [72]:
df2 = df.groupby("goId").apply(f).sort_values("count", ascending=False)

In [73]:
df2

Unnamed: 0_level_0,count,goLabel,hgnc
goId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GO:0006281,17,DNA repair,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T,FANCL,FANCD2,FANCI"
GO:0005634,15,nucleus,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T,UBE2T"
GO:0005654,14,nucleoplasm,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T"
GO:0006974,14,cellular response to DNA damage stimulus,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T"
GO:0036297,14,interstrand cross-link repair,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T"
GO:0005515,12,protein binding,"FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCF,FANCL"
GO:0043240,12,Fanconi anaemia nuclear complex,"FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,FANCL"
GO:0005737,5,cytoplasm,"FANCG,FANCA,FANCC,FANCL,UBE2T"
GO:0005829,5,cytosol,"FANCD2,FANCI,FANCG,FANCC,FAAP100"
GO:0016567,4,protein ubiquitination,"FANCF,FANCL,UBE2T,FANCL"
