# Objective

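- Take a random sample of 100 entities per class (500 in the case of government buildings)
- Label each sampled entity by hand to check whether it is actually an example of the desired class
- Measure precision: the fraction of sampled entities that really belong to the class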

# Results
- 500/500 entities are actually government buildings.
- 18/100 entities are actually occupations.
- 100/100 entities are actually cities.
- 13/100 entities are actually sports.
- 99/100 entities are actually houses.

In [1]:
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import datetime
import func_lib

# SPARQL endpoint that every query in this notebook is sent to
# (read by Relation.query, Relation.search_property_for_verbose and get_name).
endpoint_url = "https://query.wikidata.org/sparql"
# NOTE(review): not referenced anywhere in the visible code — confirm it is unused before removing.
item = "item"


class Relation:
    """
    The class returned when createRelation is called.

    A Relation is rooted at one Wikidata entity and is grown one step at a
    time with extend(). Every step adds a SPARQL variable named
    "<name>_<property_id>"; the text of the combined query is kept in
    query_str and is only sent to the endpoint when query() is called.
    (Verbose steps are the exception: adding one immediately runs a helper
    query to discover qualifier and reference names.)

    Instance state:
        entity_id      Q-number of the root entity (string, without prefix).
        query_str      Current SPARQL text, "" until the first extend().
        dic            Step number (1-based) -> dict of that step's settings.
        count          Number of steps added so far.
        focus          Variable (or "Entity ID") that new steps attach to.
        result_dic/df  Result of the last query() as dict / DataFrame.
        time_property, time, limit, subclass
                       Last values supplied for the matching options.
    """

    def __init__(self, entity_id: str, property_id: str, isSubject: bool, rowVerbose: bool,
                 colVerbose: bool, time_property: str, time: str, name: str, label: bool, limit=10000, subclass=False):
        """
        Create a relation rooted at entity_id and, if property_id is given,
        add the first step right away (see extend() for the step options).
        """
        self.entity_id = entity_id
        self.query_str = ""
        self.dic = {}
        self.result_dic = {"Entity ID": []}
        self.df = pd.DataFrame()
        self.count = 0
        self.time_property = time_property
        self.time = time
        self.limit = limit
        self.subclass = subclass
        self.focus = "Entity ID"
        if property_id:
            # Keyword arguments on purpose: extend() has a `search` parameter
            # between `time` and `label`, so passing these positionally would
            # shift `label` into `search` and `subclass` into `label`.
            self.extend(property_id, isSubject, name, rowVerbose=rowVerbose, colVerbose=colVerbose,
                        limit=limit, time_property=time_property, time=time, label=label,
                        subclass=subclass)

    def generate_html(self, name: str):
        """Write the current result table as HTML to the file path `name` (UTF-8)."""
        html = self.df.to_html()
        # Context manager so the handle is closed even if the write fails.
        with open(name, "w", encoding='utf-8') as text_file:
            text_file.write(html)

    def query(self, require=None):
        """
        Run the accumulated query and store the result in result_dic / df.

        Returns the result DataFrame. If no step has been added yet, no
        request is made and a dict holding only the root entity URI is
        returned instead. `require` is accepted but currently unused.
        """
        entity_uri = 'http://www.wikidata.org/entity/' + str(self.entity_id)
        if self.query_str == "":
            self.result_dic = {"Entity ID": [entity_uri]}
            return self.result_dic
        results = get_results(endpoint_url, self.query_str)

        # One (initially empty) column per variable the query selects.
        result_dict = {"Entity ID": [entity_uri]}
        for i in range(1, self.count + 1):
            step = self.dic[i]
            name, pid = step["name"], step['property_id']
            result_dict[name + '_' + pid] = []
            if step["colVerbose"]:
                result_dict[name + '_rank_' + pid + '_rank'] = []
                for key, value in step["property_name_dic"].items():
                    result_dict[name + "_" + value + '_' + pid + '_' + str(key)] = []
                for key in step["ref_dic"]:
                    result_dict[name + "_ref_" + pid + '_' + str(key)] = []
            if step["label"]:
                result_dict[name + '_' + pid + 'Label'] = []

        # Unbound variables are recorded as the string 'NA'.
        for result in results['results']['bindings']:
            for key in result_dict:
                if key in result:
                    result_dict[key].append(result[key]['value'])
                else:
                    result_dict[key].append('NA')

        # The root entity is the same for every row; size the column after
        # the last step's value column.
        last = self.dic[self.count]
        result_dict["Entity ID"] = [entity_uri] * len(result_dict[last["name"] + '_' + last["property_id"]])
        self.result_dic = result_dict
        self.df = pd.DataFrame.from_dict(self.result_dic)

        # colVerbose without rowVerbose: keep one rank only — preferred-rank
        # rows if there are any, otherwise normal-rank rows.
        for i in range(1, self.count + 1):
            step = self.dic[i]
            if step["colVerbose"] and not step["rowVerbose"]:
                col = step['name'] + '_rank_' + step['property_id'] + '_rank'
                if any(self.df[col] == 'http://wikiba.se/ontology#PreferredRank'):
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#PreferredRank']
                else:
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#NormalRank']
        self.df = pd.DataFrame(data=self.df)
        return self.df

    def extend(self, property_id: str, isSubject: bool, name: str, rowVerbose=False, colVerbose=False, limit=None,
               time_property=None, time=None, search=None, label=False, subclass=False):
        """
        Add one step to the query and rebuild query_str.

        property_id  Wikidata property (e.g. 'P31') linking the focus to the new variable.
        isSubject    True: the new variable is the subject and the focus the object;
                     False: the focus is the subject (such steps are wrapped in
                     OPTIONAL unless `search` is given).
        name         Prefix of the new variable / column: "<name>_<property_id>".
        rowVerbose, colVerbose
                     Query full statements (qualifiers, references, rank) instead
                     of the direct value; this triggers a lookup request now.
        limit        Overrides the LIMIT of the whole query when truthy.
        time_property, time
                     Override the year filter when both are truthy.
        search       None, '!NA', a Q-number string (equality filter) or a
                     (low, high) tuple (range filter; string bounds filter on YEAR).
        label        Also select "<name>_<property_id>Label".
        subclass     Only honoured for P31 with isSubject=True: match instances
                     of any transitive subclass (P279+) of the focus.
        """
        self.count += 1
        self.dic[self.count] = {}
        self.dic[self.count]["name"] = name
        self.dic[self.count]["focus"] = self.focus
        self.dic[self.count]["property_id"] = property_id
        self.dic[self.count]["isSubject"] = isSubject
        self.dic[self.count]["limit"] = limit
        self.dic[self.count]["rowVerbose"] = rowVerbose
        self.dic[self.count]["colVerbose"] = colVerbose
        self.dic[self.count]['time_property'] = time_property
        self.dic[self.count]['time'] = time
        self.dic[self.count]['search'] = search
        self.dic[self.count]['label'] = label
        self.dic[self.count]['subclass'] = subclass
        # subclass is specific to P31, when subclass is true, we get instance of all subclasses of the entity
        if rowVerbose or colVerbose:
            self.dic[self.count]["property_name_dic"], self.dic[self.count][
                "ref_dic"] = self.search_property_for_verbose()
        if time_property and time:
            self.time_property = time_property
            self.time = time
        if limit:
            self.limit = limit
        self.query_str = self.define_query_relation()

    def changeFocus(self, name="Entity ID"):
        """Attach subsequent extend() steps to variable `name` (default: the root entity)."""
        self.focus = name

    def extendWithFunction(self, objcolumn, func, name):
        """
        Add column `name` to df by applying `func` to existing column(s).

        objcolumn  A column name (func receives each value) or a list of
                   column names (func receives one positional argument per column).
        func       A callable, or a string id 'Fn' selecting func_lib.func_list[n].

        Raises ValueError if a string id is not of the form 'Fn'. An id that
        is out of range only prints "Not available." and leaves df unchanged.
        Errors raised by the applied function itself propagate unchanged.
        """
        if isinstance(func, str):
            invalid_id = "Not a valid function id, a valid function id should be 'Fn', n is an integer."
            if not func.startswith('F'):
                raise ValueError(invalid_id)
            # Only the id parsing is guarded, so failures inside the applied
            # function are not misreported as an invalid id.
            try:
                func_id = int(func[1:])
            except ValueError as err:
                raise ValueError(invalid_id) from err
            # Negative ids would silently index func_list from the end.
            if func_id < 0 or func_id >= func_lib.func_num():
                print("Not available.")
                return
            func = func_lib.func_list[func_id]
        if isinstance(objcolumn, list):
            self.df[name] = self.df[objcolumn].apply(lambda row: func(*row), axis=1)
        else:
            self.df[name] = self.df[objcolumn].apply(func)

    def define_query_relation(self):
        """
        Build and return the SPARQL SELECT text for all steps added so far.

        Returns None when there are no steps. The WHERE clause is assembled in
        this order: graph patterns for each step, FILTERs derived from each
        step's `search`, the optional year filter, and the label service.
        """
        rdf_triple, time_filter, limit_statement = "", "", ""
        if self.count < 1:
            return None
        # Number of OPTIONAL blocks left open because the next step hangs off
        # a different focus; they are closed after all steps are emitted.
        focusChanges = 0
        for i in range(1, self.count + 1):
            step = self.dic[i]
            pid = step['property_id']
            var = step['name'] + "_" + pid
            # Node this step is attached to: the root entity or an earlier variable.
            if step["focus"] == "Entity ID":
                node = "wd:" + self.entity_id
            else:
                node = "?" + step["focus"]

            if step["rowVerbose"] or step["colVerbose"]:
                # verbose version: go through the statement node to reach
                # qualifiers, references and rank
                stmt = "?statement_" + str(i)
                if step["search"] is None and not step["isSubject"]:
                    rdf_triple += "OPTIONAL {"
                rdf_triple += node + " p:" + pid + " " + stmt + ". " + stmt + " ps:" + pid + " ?" + var + ". "
                for key, value in step["property_name_dic"].items():
                    rdf_triple += "OPTIONAL { " + stmt + " pq:" + str(key) + " ?" + step['name'] + "_" + value \
                                  + "_" + pid + "_" + str(key) + ".} "
                refnode = "?refnode_" + str(i)
                for key in step["ref_dic"]:
                    rdf_triple += "OPTIONAL { " + stmt + " prov:wasDerivedFrom " + refnode + ". " + refnode \
                                  + " pr:" + str(key) + " ?" + step['name'] + "_ref_" + pid + "_" + str(key) + ".} "
                rdf_triple += "OPTIONAL { " + stmt + " wikibase:rank ?" + step['name'] + "_rank_" + pid + "_rank. } "
            # none-verbose version
            elif step["isSubject"]:
                if not step['subclass']:
                    rdf_triple += "?" + var + " wdt:" + pid + " " + node + ". "
                elif pid == "P31":
                    # Instances of any transitive subclass of the node.
                    rdf_triple += "?" + var + " wdt:" + pid + " ?subclasses. "
                    rdf_triple += "?subclasses wdt:P279+ " + node + ". "
                # subclass=True with any other property adds no pattern.
            else:
                if step["search"] is None:
                    rdf_triple += "OPTIONAL {"
                rdf_triple += node + " wdt:" + pid + " ?" + var + ". "

            if not step["isSubject"]:
                if i < self.count and step["focus"] != self.dic[i + 1]["focus"] and step["search"] is None:
                    # Leave the OPTIONAL open so the next step nests inside it.
                    focusChanges += 1
                elif step["search"] is None:
                    rdf_triple += "} "

        rdf_triple += "} " * focusChanges

        for i in range(1, self.count + 1):
            step = self.dic[i]
            search = step['search']
            if search is None or search == '!NA':
                continue
            var = step['name'] + "_" + step['property_id']
            if isinstance(search, tuple):
                if isinstance(search[0], str):
                    # String bounds are compared against the year of the value.
                    rdf_triple += "FILTER (YEAR(?" + var + ") >= " + search[0] + " && YEAR(?" + var + ") <= " \
                                  + search[1] + ") "
                else:
                    rdf_triple += "FILTER (?" + var + " >= " + str(search[0]) + " && ?" + var + " <= " \
                                  + str(search[1]) + ") "
            else:
                rdf_triple += "FILTER (?" + var + " = " + "wd:" + search + ") "

        if self.time_property is not None:
            # NOTE(review): this uses ?<name> of the first step, while the patterns
            # above bind ?<name>_<property_id> — confirm the intended variable.
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                          ?pubdateStatement ps:""" + self.time_property + """ ?date	
                          FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = "LIMIT " + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""

        query = "SELECT DISTINCT"
        for i in range(1, self.count + 1):
            step = self.dic[i]
            name, pid = step["name"], step['property_id']
            query += " ?" + name + "_" + pid
            if step["label"]:
                query += " ?" + name + "_" + pid + "Label"
            if step["rowVerbose"] or step["colVerbose"]:
                for key, value in step["property_name_dic"].items():
                    query += " ?" + name + "_" + value + "_" + pid + "_" + str(key)
                for key in step["ref_dic"]:
                    query += " ?" + name + "_ref_" + pid + "_" + str(key)
                query += " ?" + name + "_rank_" + pid + "_rank"
        query += " WHERE {" + rdf_triple + time_filter + label_statement + "} " + limit_statement
        return query

    def search_property_for_verbose(self):
        """
        For the step added last, discover which qualifier and reference
        properties occur on its statements.

        Sends a request to the endpoint when that step is verbose. Returns
        (property_to_name, ref_to_name): dicts mapping a property id (last
        path segment of its URI) to its label with spaces replaced by '_'.
        Both are empty when the step is not verbose.
        """
        property_to_name = {}
        ref_to_name = {}
        rdf_triple, time_filter, limit_statement = "", "", ""
        current = self.dic[self.count]
        verbose = current["rowVerbose"] or current["colVerbose"]

        if verbose:
            # Re-state the earlier steps so the statements are looked up in context.
            # Variables here are plain ?<name>, so a focus "<name>_<pid>" is cut
            # back to "<name>" at its last underscore.
            for i in range(1, self.count):
                step = self.dic[i]
                if step["focus"] == "Entity ID":
                    node = "wd:" + self.entity_id
                else:
                    node = "?" + step["focus"][:step["focus"].rfind('_')]
                if step["isSubject"]:
                    rdf_triple += "?" + step["name"] + " wdt:" + step["property_id"] + " " + node + " ."
                else:
                    rdf_triple += node + " wdt:" + step["property_id"] + " ?" + step["name"] + " ."
            if current["focus"] == "Entity ID":
                node = "wd:" + self.entity_id
            else:
                node = "?" + current["focus"][:current["focus"].rfind('_')]
            rdf_triple += node + " p:" + current['property_id'] + " ?statement." + \
                          "?statement " + "ps:" + current['property_id'] + " ?item." + \
                          "?statement " + "?pq " + "?obj." + \
                          "?qual wikibase:qualifier ?pq." + \
                          "OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}"

        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                                  ?pubdateStatement ps:""" + self.time_property + """ ?date	
                                  FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = "LIMIT " + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""

        if verbose:
            query = "SELECT DISTINCT " + "?item ?qual ?qualLabel ?obj ?pr ?prLabel"
            query += " WHERE {" + rdf_triple + time_filter + label_statement + "} " + limit_statement
            query_result = get_results(endpoint_url, query)
            for result in query_result['results']['bindings']:
                if 'qual' in result:
                    property_to_name[result['qual']['value'].split('/')[-1]] = \
                        result['qualLabel']['value'].replace(' ', '_')
                if 'pr' in result:
                    ref_to_name[result['pr']['value'].split('/')[-1]] = result['prLabel']['value'].replace(' ', '_')
        return property_to_name, ref_to_name

    def __str__(self):
        """Render the relation as its result table."""
        return str(self.df)

    def __getattr__(self, col_name):
        """
        Expose result columns as attributes, e.g. relation.Country_P17.

        Called only when normal attribute lookup fails. A name that is not a
        column prints a notice and yields None. AttributeError is raised for
        dunder names and whenever `df` itself is not set yet, so that
        hasattr(), copy and pickle see a normal "missing attribute" instead
        of recursing through this method.
        """
        frame = self.__dict__.get("df")
        is_dunder = col_name.startswith("__") and col_name.endswith("__")
        if frame is None or is_dunder:
            raise AttributeError(col_name)
        if col_name in frame.columns:
            return frame[col_name]
        print(col_name + " has not been found.")
        return None


def createRelation(entity_id: str, property_id=None, isSubject=None, rowVerbose=None, colVerbose=None,
                   time_property=None, time=None, name=None, label=False, limit=None, subclass=False):
    """
    Build a Relation rooted at entity_id.

    When a property_id is supplied a column name is mandatory; without one a
    notice is printed and None is returned instead of a Relation.
    """
    name_missing = bool(property_id) and not name
    if name_missing:
        print("Please specify the name of the first column")
        return None
    return Relation(
        entity_id=entity_id,
        property_id=property_id,
        isSubject=isSubject,
        rowVerbose=rowVerbose,
        colVerbose=colVerbose,
        time_property=time_property,
        time=time,
        name=name,
        label=label,
        limit=limit,
        subclass=subclass,
    )

def get_Firstname(name: str):
    """Return the part of `name` before its first space (all of it if there is none)."""
    first, _, _ = name.partition(' ')
    return first

def get_Lastname(name: str):
    """Return the part of `name` after its last space (all of it if there is none)."""
    _, _, last = name.rpartition(' ')
    return last

def remove_prefix(text, prefix):
    """Return `text` with a leading `prefix` cut off; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


def get_results(endpoint_url, query):
    """
    Send `query` to the SPARQL endpoint at `endpoint_url` and return the
    response converted from the JSON return format.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    agent_string = "WDQS-example Python/{}.{}".format(sys.version_info[0], sys.version_info[1])
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


def get_name(id: str):
    """
    Look up the English label of the Wikidata entity `id` (e.g. 'Q5').

    Returns the label text, or '' when the endpoint returns no binding.
    """
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 	
                PREFIX wd: <http://www.wikidata.org/entity/> 	
                select  *	
                where {	
                wd:""" + id + """ rdfs:label ?label .	
                FILTER (langMatches( lang(?label), "EN" ) )	
                } 	
                LIMIT 1"""
    bindings = get_results(endpoint_url, query)["results"]["bindings"]
    labels = [binding['label']['value'] for binding in bindings]
    # The last binding wins; with no bindings the label is empty.
    return labels[-1] if labels else ''




In [2]:
import random

# Government buildings in the US

In [9]:
# Column-name prefix for the variable added below (the column becomes 'Gov_Building_P279').
gov_building_class = 'Gov_Building'
qnum = 'Q16831714' # qnum of government building
# No property is given here, so this only creates an empty relation rooted at qnum.
r = createRelation(qnum, label=True)
# isSubject=True: select the entities whose P279 (subclass of) value is the root,
# i.e. the direct subclasses of "government building", together with their labels.
r.extend('P279', True, gov_building_class, label=True) # extend via property P279 = is subclass of
# Send the query; the result table is stored in r.df.
r.query()
# Display the table of subclasses.
r.df

Unnamed: 0,Entity ID,Gov_Building_P279,Gov_Building_P279Label
0,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q481289,official residence
1,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1407236,weigh house
2,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1137809,courthouse
3,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q757292,border checkpoint
4,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q218653,custom house
5,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q10369806,Q10369806
6,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q100705206,Prefectural Office Building
7,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q11625075,domain office
8,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q861951,police station
9,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q5469110,Forest Service Guard Station


In [57]:
# Columns placed first in the combined table; they exist even if every query comes back empty.
column_names = [
        'Entity ID', 'gov_building_subclass_P31',
       'gov_building_subclass_P31Label', 'Country_P17', 'Country_P17Label',
       'State_P131', 'State_P131Label', 'Lon_Lat_P625'
]

# Empty seed frame that fixes the leading column order of the final table.
df_total = pd.DataFrame(columns=column_names)

# Per-subclass result tables are collected here and concatenated once after the
# loop, instead of re-copying the growing table on every iteration.
frames = [df_total]

# One query per direct subclass of "government building" found in r.df.
# Iterating over the column values (rather than indexing by position) also
# works when r.df does not carry a default 0..n-1 index.
for gov_building_subclass, subclass_uri in zip(r.df.Gov_Building_P279Label, r.df.Gov_Building_P279):
    qnum = subclass_uri.split('/')[-1]
    r2 = createRelation(qnum, label=True)
    r2.extend('P31', True, 'gov_building_subclass', label=True, subclass=True) # extend via property P31 = is instance of

    # Attach all following properties to the building, not to the subclass entity.
    r2.changeFocus('gov_building_subclass_P31')
    r2.extend('P17', False, 'Country',label=True, search="Q30") # extend via property P17 = is in country, filter on Q30 = USA
    r2.extend('P131', False, 'State', label=True)
    r2.extend('P625', False, 'Lon_Lat')
    r2.extend('P84', False, 'Architect', label=True)
    r2.extend('P149', False, 'Architectural_Style', label=True)
    r2.extend('P571', False, 'Inception')
    r2.extend('P137', False, 'Operator', label=True)
    r2.extend('P1083', False, 'Maximum_Capacity', label=True)
    r2.extend('P1435', False, 'Heritage_Designation', label=True)
    r2.extend('P6375', False, 'Street_Address')
    r2.query()

    df2 = r2.df
    # A scalar is broadcast to every row (and yields an empty column for an empty table).
    df2['building_type_label'] = gov_building_subclass

    print('There are %s instances of %s in the US.' % (str(len(df2)), gov_building_subclass))
    frames.append(df2)

df_total = pd.concat(frames, axis=0, ignore_index=True)
    

There are 4 instances of official residence in the US.
There are 0 instances of weigh house in the US.
There are 1493 instances of courthouse in the US.
There are 59 instances of border checkpoint in the US.
There are 0 instances of custom house in the US.
There are 0 instances of Q10369806 in the US.
There are 0 instances of Prefectural Office Building in the US.
There are 0 instances of domain office in the US.
There are 0 instances of police station in the US.
There are 0 instances of Forest Service Guard Station in the US.
There are 673 instances of seat of local government in the US.
There are 0 instances of civilian internment enclosure in the US.
There are 0 instances of province building in the US.
There are 0 instances of congressional office building in the US.
There are 138 instances of legislative building in the US.
There are 0 instances of federal building in the US.
There are 0 instances of Q65270763 in the US.
There are 272 instances of prison in the US.
There are 0 ins

In [48]:
# Query-generated column name -> readable column name.
# Keys that are not present in df_total are ignored by rename().
column_renames = {
    "Entity ID": "building_type",
    "gov_building_subclass_P31": "building",
    "gov_building_subclass_P31Label": "building_label",
    "Country_P17": "country",
    "Country_P17Label": "country_label",
    "State_P131": "administrative_entity",
    "State_P131Label": "administrative_entity_label",
    "Lon_Lat_P625": "lon_lat",
    "Architect_P84": "architect",
    "Architect_P84Label": "architect_label",
    "Architectural_Style_P149": "architectural_style",
    "Architectural_Style_P149Label": "architectural_style_label",
    "Inception_P571": "inception",
    "Inception_P571Label": "inception_label",
    "Operator_P137": "operator",
    "Operator_P137Label": "operator_label",
    "Maximum_Capacity_P1083": "maximum_capacity",
    "Maximum_Capacity_P1083Label": "maximum_capacity_label",
    "Heritage_Designation_P1435": "heritage_designation",
    "Heritage_Designation_P1435Label": "heritage_designation_label",
    "Street_Address_P6375": "address",
}
df_result = df_total.rename(columns=column_renames)

In [49]:
# Display the renamed table (the notebook elides the middle rows and columns).
df_result

Unnamed: 0,building_type,building,building_label,country,country_label,administrative_entity,administrative_entity_label,lon_lat,architect,architect_label,...,architectural_style_label,inception,operator,operator_label,maximum_capacity,maximum_capacity_label,heritage_designation,heritage_designation_label,address,building_type_label
0,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q7315412,Residence of the Ambassador of the Netherlands...,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0523 38.9143),,,...,,,,,,,,,"2347 S Street NW, Washington DC, 20008",official residence
1,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q5501970,"French ambassador's residence in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.050715 38.91821),http://www.wikidata.org/entity/Q3188660,Jules Henri de Sibour,...,Tudor Revival architecture,,,,,,,,,official residence
2,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q7382168,"Russian ambassador's residence in Washington, ...",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.035833 38.904444),,,...,Beaux-Arts,,,,,,,,,official residence
3,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q4969437,"British Ambassador's residence in Washington, ...",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.062997 38.921089),http://www.wikidata.org/entity/Q378157,Edwin Lutyens,...,Queen Anne style architecture,,,,,,,,,official residence
4,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q6935179,Multnomah County Courthouse,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q6106,Portland,Point(-122.678056 45.516111),http://www.wikidata.org/entity/Q7993577,Whidden and Lewis,...,Neoclassical architecture,,,,,,http://www.wikidata.org/entity/Q19558910,NRHP listed place,,courthouse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,http://www.wikidata.org/entity/Q40357,http://www.wikidata.org/entity/Q7682587,Tanforan Racetrack,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q99,California,Point(-122.418 37.6364),,,...,,,,,,,,,,prison
2635,http://www.wikidata.org/entity/Q40357,http://www.wikidata.org/entity/Q14679096,"Federal Correctional Institution, Aliceville",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q173,Alabama,Point(-88.1931 33.1759),,,...,,2013-01-01T00:00:00Z,http://www.wikidata.org/entity/Q1400015,Federal Bureau of Prisons,,,,,,prison
2636,http://www.wikidata.org/entity/Q40357,http://www.wikidata.org/entity/Q14686466,"United States Penitentiary, Coleman II",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q812,Florida,Point(-82.0142 28.7628),,,...,,,http://www.wikidata.org/entity/Q1400015,Federal Bureau of Prisons,,,,,,prison
2637,http://www.wikidata.org/entity/Q40357,http://www.wikidata.org/entity/Q106420515,Minidoka War Relocation Center,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1518127,Jerome,Point(-114.245593683 42.678267673),,,...,,1942-01-01T00:00:00Z,,,,,http://www.wikidata.org/entity/Q19558910,NRHP listed place,"Jerome, ID 83338",prison


In [50]:
# Persist the renamed table; the row index is written too, as an unnamed first CSV column.
df_result.to_csv('sparql_index_subclass_trawl_2021_05_05.csv')

In [51]:
from collections import Counter

In [52]:
# Number of result rows per building type label.
Counter(df_result.building_type_label)

Counter({'official residence': 4,
         'courthouse': 1493,
         'border checkpoint': 59,
         'seat of local government': 673,
         'legislative building': 138,
         'prison': 272})

In [53]:
# List the column names of the renamed table.
df_result.columns

Index(['building_type', 'building', 'building_label', 'country',
       'country_label', 'administrative_entity', 'administrative_entity_label',
       'lon_lat', 'architect', 'architect_label', 'architectural_style',
       'architectural_style_label', 'inception', 'operator', 'operator_label',
       'maximum_capacity', 'maximum_capacity_label', 'heritage_designation',
       'heritage_designation_label', 'address', 'building_type_label'],
      dtype='object')