## Return all GO* annotations for FA genes

In [2]:
import pandas as pd
import requests
from wikidataintegrator import wdi_core
pd.set_option('display.max_colwidth', -1)

### Get results merging MF, CC, BP together

In [3]:
query = """SELECT ?hgnc ?protein ?go ?goLabel ?goId
WHERE
{
  values ?hgnc {"FANCA" "FANCB" "FANCC" "FANCE" "FANCF" "FANCG" "FANCL" "FANCM" "FANCD2" "FANCI" "UBE2T" "FANCD1" "BRCA2" "FANCJ" "FANCN" "FANCO" "FANCP" "FANCQ" "FANCR" "FANCS" "FANCV" "FANCU" "FAAP100" "FAAP24" "FAAP20" "FAAP16" "MHF1" "FAAP10" "MHF2"}
  ?gene wdt:P353 ?hgnc .  # get gene items with these HGNC symbols
  ?gene wdt:P688 ?protein . # get the protein
  ?protein wdt:P680|wdt:P681|wdt:P682 ?go . # get GO terms
  ?go wdt:P686 ?goId
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}"""
d = wdi_core.WDItemEngine.execute_sparql_query(query)

In [4]:
df = pd.DataFrame([{k:v['value'] for k,v in x.items()} for x in d['results']['bindings']])

In [5]:
df

Unnamed: 0,go,goId,goLabel,hgnc,protein
0,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,BRCA2,http://www.wikidata.org/entity/Q421651
1,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCD2,http://www.wikidata.org/entity/Q21100488
2,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCI,http://www.wikidata.org/entity/Q21101224
3,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCG,http://www.wikidata.org/entity/Q21101237
4,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCA,http://www.wikidata.org/entity/Q21101242
5,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCC,http://www.wikidata.org/entity/Q21101263
6,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FAAP100,http://www.wikidata.org/entity/Q21101264
7,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FAAP20,http://www.wikidata.org/entity/Q21101278
8,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCM,http://www.wikidata.org/entity/Q21101297
9,http://www.wikidata.org/entity/Q167149,GO:0005515,protein binding,FANCB,http://www.wikidata.org/entity/Q21101299


In [6]:
def f(x):
     return pd.Series(dict(goLabel = list(x['goLabel'])[0], 
                        hgnc = ','.join(x['hgnc']),
                          count = len(x)))

In [7]:
df2 = df.groupby("goId").apply(f).sort_values("count", ascending=False)

In [8]:
df2

Unnamed: 0_level_0,count,goLabel,hgnc
goId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GO:0006281,18,DNA repair,"FAAP24,FANCE,FANCF,FANCL,UBE2T,FANCL,FANCD2,FANCI,BRCA2,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB"
GO:0005634,16,nucleus,"BRCA2,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T,UBE2T"
GO:0006974,15,cellular response to DNA damage stimulus,"FAAP24,FANCE,FANCF,FANCL,UBE2T,BRCA2,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB"
GO:0005654,15,nucleoplasm,"BRCA2,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T"
GO:0036297,14,interstrand cross-link repair,"FANCB,FAAP24,FANCE,FANCF,FANCL,UBE2T,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM"
GO:0005515,13,protein binding,"BRCA2,FANCD2,FANCI,FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCF,FANCL"
GO:0043240,12,Fanconi anaemia nuclear complex,"FANCG,FANCA,FANCC,FAAP100,FAAP20,FANCM,FANCB,FAAP24,FANCE,FANCF,FANCL,FANCL"
GO:0005737,6,cytoplasm,"BRCA2,FANCG,FANCA,FANCC,FANCL,UBE2T"
GO:0005829,6,cytosol,"BRCA2,FANCD2,FANCI,FANCG,FANCC,FAAP100"
GO:0003677,5,DNA binding,"BRCA2,FANCI,FAAP100,FANCM,FAAP24"
