# Objective

- Take a random sample of 100 (or 500, in the case of government buildings)
- Label them by hand to see if they're actually examples of the desired class
- Measure precision

# Results
- 500/500 entities are actually government buildings.
- 18/100 entities are actually occupations.
- 100/100 entities are actually cities.
- 13/100 entities are actually sports.
- 99/100 entities are actually houses.

In [1]:
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import datetime
import func_lib

endpoint_url = "https://query.wikidata.org/sparql"
item = "item"


class Relation:
    """
    The class returned when createRelation is called.
    It contains string field with query.
    We call Relation.query when we need to do the query.
    """

    def __init__(self, entity_id: str, property_id: str, isSubject: bool, rowVerbose: bool,
                 colVerbose: bool, time_property: str, time: str, name: str, label: bool, limit=10000, subclass=False):
        self.entity_id = entity_id
        self.query_str = ""
        self.dic = {}
        self.result_dic = {"Entity ID": []}
        self.df = pd.DataFrame()
        self.count = 0
        self.time_property = time_property
        self.time = time
        self.limit = limit
        self.subclass = subclass
        self.focus = "Entity ID"
        if property_id:
            self.extend(property_id, isSubject, name, rowVerbose, colVerbose, limit, time_property, time, label, subclass)

    def generate_html(self, name: str):
        html = (self.df).to_html()
        text_file = open(name, "w", encoding='utf-8')
        text_file.write(html)
        text_file.close()

    def query(self, require=None):
        if self.query_str == "":
            self.result_dic = {"Entity ID": ['http://www.wikidata.org/entity/' + str(self.entity_id)]}
            return self.result_dic
        results = get_results(endpoint_url, self.query_str)
        result_dict = {"Entity ID": ['http://www.wikidata.org/entity/' + str(self.entity_id)]}
        for i in range(1, self.count + 1):
            result_dict[self.dic[i]["name"] + '_' + self.dic[i]['property_id']] = []
            if self.dic[i]["colVerbose"]:
                result_dict[self.dic[i]["name"] + '_rank_' + self.dic[i]['property_id'] + '_rank'] = []
                for key, value in self.dic[i]["property_name_dic"].items():
                    result_dict[
                        self.dic[i]["name"] + "_" + value + '_' + self.dic[i]['property_id'] + '_' + str(key)] = []
                for key, value in self.dic[i]["ref_dic"].items():
                    result_dict[self.dic[i]["name"] + "_ref_" + self.dic[i]['property_id'] + '_' + str(key)] = []

            if self.dic[i]["label"]:
                result_dict[self.dic[i]["name"] + '_' + self.dic[i]['property_id'] + 'Label'] = []

        for result in results['results']['bindings']:
            for key, value in result_dict.items():
                if key in result.keys():
                    result_dict[key].append(result[key]['value'])
                else:
                    result_dict[key].append('NA')
        result_dict["Entity ID"] = ['http://www.wikidata.org/entity/' + str(self.entity_id)] * len(
            result_dict[self.dic[self.count]["name"] + '_' + self.dic[self.count]["property_id"]])
        self.result_dic = result_dict
        self.df = pd.DataFrame.from_dict(self.result_dic)
        for i in range(1, self.count + 1):
            if self.dic[i]["colVerbose"] and not self.dic[i]["rowVerbose"]:
                col = self.dic[i]['name'] + '_rank_' + self.dic[i]['property_id'] + '_rank'
                if any(self.df[col] == 'http://wikiba.se/ontology#PreferredRank'):
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#PreferredRank']
                else:
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#NormalRank']
#         if require is not None:
#             for r in require:
#                 self.df = self.df.loc[self.df[r] != 'NA']
        self.df = pd.DataFrame(data=self.df)
#         if self.df.shape[0] >= 10000:
#             print("Warning: Your query leads to too many results. Only 10,000 returned.")
        return self.df

    def extend(self, property_id: str, isSubject: bool, name: str, rowVerbose=False, colVerbose=False, limit=None,
               time_property=None, time=None, search=None, label=False, subclass=False):
        self.count += 1
        self.dic[self.count] = {}
        self.dic[self.count]["name"] = name
        self.dic[self.count]["focus"] = self.focus
        self.dic[self.count]["property_id"] = property_id
        self.dic[self.count]["isSubject"] = isSubject
        self.dic[self.count]["limit"] = limit
        self.dic[self.count]["rowVerbose"] = rowVerbose
        self.dic[self.count]["colVerbose"] = colVerbose
        self.dic[self.count]['time_property'] = time_property
        self.dic[self.count]['time'] = time
        self.dic[self.count]['search'] = search
        self.dic[self.count]['label'] = label
        self.dic[self.count]['subclass'] = subclass
        # subclass is specific to P31, when subclass is true, we get intance of all subclasses of the entity
        if rowVerbose or colVerbose:
            self.dic[self.count]["property_name_dic"], self.dic[self.count][
                "ref_dic"] = self.search_property_for_verbose()
        if time_property and time:
            self.time_property = time_property
            self.time = time
        if limit:
            self.limit = limit
        self.query_str = self.define_query_relation()

    def changeFocus(self, name="Entity ID"):
        self.focus = name
        
    def extendWithFunction(self, objcolumn, func, name):
        if type(func) == str:
            if func.startswith('F'):
                try:
                    func_id = int(func[1:])
                    if func_id >= func_lib.func_num():
                        print("Not available.")
                    else:
                        if isinstance(objcolumn, list):
                            self.df[name] = self.df[objcolumn].apply(lambda x: func_lib.func_list[func_id](*x), axis=1)
                        else:
                            self.df[name] = self.df[objcolumn].apply(func_lib.func_list[func_id])
                except:
                    raise Exception("Not a valid function id, a valid function id should be 'Fn', n is an integer.")
            else:
                raise Exception("Not a valid function id, a valid function id should be 'Fn', n is an integer.")
        else:
            if isinstance(objcolumn, list):
                self.df[name] = self.df[objcolumn].apply(lambda x: func(*x), axis=1)
            else:
                self.df[name] = self.df[objcolumn].apply(func)

    def define_query_relation(self):
        rdf_triple, time_filter, limit_statement = """""", """""", """"""
        if self.count < 1:
            return None
        focusChanges = 0
        for i in range(1, self.count + 1):
            if self.dic[i]["rowVerbose"] or self.dic[i]["colVerbose"]:
                if self.dic[i]["search"] is None and not self.dic[i]["isSubject"]:
                        rdf_triple += """OPTIONAL {"""
                if self.dic[i]["focus"] == "Entity ID":
#                     if self.dic[i]["search"] is None:
#                         rdf_triple += """OPTIONAL {"""
                    rdf_triple += """wd:""" + self.entity_id + """ p:""" + self.dic[i][
                        'property_id'] + """ ?statement_""" + str(i) + """. """ \
                                  + """?statement_""" + str(i) + """ ps:""" + self.dic[i][
                                      'property_id'] + """ ?""" + \
                                  self.dic[i]['name'] \
                                  + """_""" + self.dic[i]['property_id'] + """. """
                else:
                    rdf_triple += """?""" + self.dic[i]["focus"] + """ p:""" + self.dic[i][
                        'property_id'] + """ ?statement_""" + str(i) + """. """ \
                                  + """?statement_""" + str(i) + """ ps:""" + self.dic[i][
                                      'property_id'] + """ ?""" + \
                                  self.dic[i]['name'] \
                                  + """_""" + self.dic[i]['property_id'] + """. """
                for key, value in self.dic[i]["property_name_dic"].items():
                    rdf_triple += """OPTIONAL { """ + """?statement_""" + str(i) + """ pq:""" + str(key) \
                                  + """ ?""" + self.dic[i]['name'] + """_""" + value + """_""" + self.dic[i][
                                      'property_id'] + """_""" + str(key) + """.} """
                for key, value in self.dic[i]["ref_dic"].items():
                    rdf_triple += """OPTIONAL { ?statement_""" + str(
                        i) + """ prov:wasDerivedFrom ?refnode_""" + str(
                        i) + """. ?refnode_""" + str(i) \
                                  + """ pr:""" + str(key) + """ ?""" + self.dic[i]['name'] + """_ref_""" + \
                                  self.dic[i][
                                      'property_id'] + """_""" + str(key) + """.} """
                rdf_triple += """OPTIONAL { ?statement_""" + str(i) + """ wikibase:rank ?""" + self.dic[i][
                    'name'] + """_rank_""" + self.dic[i]['property_id'] + """_rank. } """
            # none-verbose version
            else:
                if self.dic[i]["focus"] == "Entity ID":
                    if self.dic[i]["isSubject"]:
                        if not self.dic[i]['subclass']:
                            rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                'property_id'] + """ wdt:""" + self.dic[i][
                                              "property_id"] + """ wd:""" + self.entity_id + """. """
                        else:
                            if self.dic[i]['property_id'] == "P31":
                                rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                    'property_id'] + """ wdt:""" + self.dic[i][
                                                  "property_id"] + """ ?subclasses. """
                                rdf_triple += """?subclasses wdt:P279+ wd:""" + self.entity_id + """. """
                    else:
                        if self.dic[i]["search"] is None:
                            rdf_triple += """OPTIONAL {"""
                        rdf_triple += """wd:""" + self.entity_id + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + \
                                      self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """. """
                else:
                    if self.dic[i]["isSubject"]:
                        if not self.dic[i]['subclass']:
                            rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                'property_id'] + """ wdt:""" + self.dic[i]["property_id"] + """ ?""" + self.dic[i][
                                              'focus'] + """. """
                        else:
                            if self.dic[i]['property_id'] == "P31":
                                rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                'property_id'] + """ wdt:""" + self.dic[i]["property_id"] + """ ?subclasses. """
                                rdf_triple += """?subclasses wdt:P279+ ?""" + self.dic[i]['focus'] + """. """
                    else:
                        if self.dic[i]["search"] is None:
                            rdf_triple += """OPTIONAL {"""
                        rdf_triple += """?""" + self.dic[i]['focus'] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                          'property_id'] + """. """
            if not self.dic[i]["isSubject"]:
                if i < self.count and self.dic[i]["focus"] != self.dic[i + 1]["focus"] and self.dic[i]["search"] is None:
                    focusChanges += 1
                elif self.dic[i]["search"] is None:
                    rdf_triple += """} """
                
        for i in range(focusChanges):
            rdf_triple += """} """
        for i in range(1, self.count + 1):
            if self.dic[i]['search'] is not None and self.dic[i]["search"] != '!NA':
                if isinstance(self.dic[i]['search'], tuple):
                    if isinstance(self.dic[i]['search'][0], str):
                        rdf_triple += """FILTER (YEAR(?""" + self.dic[i]['name'] + """_""" + self.dic[i][
                            'property_id'] + """) >= """ + \
                                      self.dic[i]['search'][0] + """ && YEAR(?""" + self.dic[i]['name'] + \
                                      """_""" + self.dic[i]['property_id'] + """) <= """ + self.dic[i]['search'][
                                          1] + """) """
                    else:
                        rdf_triple += """FILTER (?""" + self.dic[i]['name'] + """_""" + self.dic[i]['property_id'] + \
                                      """ >= """ + str(self.dic[i]['search'][0]) + """ && ?""" + self.dic[i]['name'] + \
                                      """_""" + self.dic[i]['property_id'] + """ <= """ + str(
                            self.dic[i]['search'][1]) + """) """
                else:
                    rdf_triple += """FILTER (?""" + self.dic[i]['name'] + """_""" + self.dic[i][
                        'property_id'] + """ = """ + \
                                  """wd:""" + self.dic[i]['search'] + """) """
        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                          ?pubdateStatement ps:""" + self.time_property + """ ?date	
                          FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""
        query = """SELECT DISTINCT"""
        for i in range(1, self.count + 1):
            if self.dic[i]["rowVerbose"] or self.dic[i]["colVerbose"]:
                query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id']
                if self.dic[i]["label"]:
                    query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """Label"""
                for key, value in self.dic[i]["property_name_dic"].items():
                    query += """ ?""" + self.dic[i]["name"] + """_""" + value + """_""" + self.dic[i][
                        'property_id'] + """_""" + str(key)
                for key, value in self.dic[i]["ref_dic"].items():
                    query += """ ?""" + self.dic[i]["name"] + """_ref_""" + self.dic[i]['property_id'] + """_""" + str(
                        key)
                query += """ ?""" + self.dic[i]["name"] + """_rank_""" + self.dic[i]['property_id'] + """_rank"""
            else:
                query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id']
                if self.dic[i]["label"]:
                    query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """Label"""
        query += """ WHERE {""" + rdf_triple + time_filter + label_statement + """} """ + limit_statement
        return query

    def search_property_for_verbose(self):
        property_to_name = {}
        ref_to_name = {}
        rdf_triple, time_filter, limit_statement = """""", """""", """"""
        if self.dic[self.count]["rowVerbose"] or self.dic[self.count]["colVerbose"]:
            for i in range(1, self.count):
                if self.dic[i]["focus"] == "Entity ID":
                    if self.dic[i]["isSubject"]:
                        rdf_triple += """?""" + self.dic[i]["name"] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ wd:""" + self.entity_id + """ ."""
                    else:
                        rdf_triple += """wd:""" + self.entity_id + """ wdt:""" + self.dic[i]["property_id"] + """ ?""" + \
                                      self.dic[i]["name"] + """ ."""
                else:
                    last = self.dic[i]["focus"].rfind('_')
                    focus = self.dic[i]["focus"][:last]
                    if self.dic[i]["isSubject"]:
                        rdf_triple += """?""" + self.dic[i]["name"] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + focus + """ ."""
                    else:
                        rdf_triple += """?""" + focus + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + self.dic[i]["name"] + """ ."""
            if self.dic[self.count]["focus"] == "Entity ID":
                rdf_triple += """wd:""" + self.entity_id + """ p:""" + self.dic[self.count][
                    'property_id'] + """ ?statement.""" + \
                              """?statement """ + """ps:""" + self.dic[self.count]['property_id'] + """ ?item.""" + \
                              """?statement """ + """?pq """ + """?obj.""" + \
                              """?qual wikibase:qualifier ?pq.""" + \
                              """OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}"""
            else:
                last = self.dic[self.count]["focus"].rfind('_')
                focus = self.dic[self.count]["focus"][:last]
                rdf_triple += """?""" + focus + """ p:""" + self.dic[self.count][
                    'property_id'] + """ ?statement.""" + \
                              """?statement """ + """ps:""" + self.dic[self.count]['property_id'] + """ ?item.""" + \
                              """?statement """ + """?pq """ + """?obj.""" + \
                              """?qual wikibase:qualifier ?pq.""" + \
                              """OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}"""
        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                                  ?pubdateStatement ps:""" + self.time_property + """ ?date	
                                  FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""
        query = """SELECT DISTINCT """
        if self.dic[self.count]["rowVerbose"] or self.dic[self.count]["colVerbose"]:
            query += """?item""" + """ ?qual""" + """ ?qualLabel""" + """ ?obj """ + """?pr ?prLabel"""
            query += """ WHERE {""" + rdf_triple + time_filter + label_statement + """} """ + limit_statement
            query_result = get_results(endpoint_url, query)
            for result in query_result['results']['bindings']:
                if 'qual' in result:
                    property_to_name[result['qual']['value'].split('/')[-1]] = result['qualLabel']['value'].replace(' ',
                                                                                                                    '_')
                if 'pr' in result:
                    ref_to_name[result['pr']['value'].split('/')[-1]] = result['prLabel']['value'].replace(' ', '_')
        else:
            query += """?""" + self.dic[self.count]["name"] + """ """
        return property_to_name, ref_to_name

    def __str__(self):
        return str(self.df)

    def __getattr__(self, col_name):
        if col_name in self.df.columns:
            return self.df[col_name]
        else:
            print(col_name + " has not been found.")
            return None


def createRelation(entity_id: str, property_id=None, isSubject=None, rowVerbose=None, colVerbose=None,
                   time_property=None, time=None, name=None, label=False, limit=None, subclass=False):
    if property_id and not name:
        print("Please specify the name of the first column")
        return None
    return Relation(entity_id, property_id, isSubject, rowVerbose, colVerbose, time_property, time, name, label, limit, subclass)

def get_Firstname(name: str):
    return name.split(' ')[0]

def get_Lastname(name: str):
    return name.split(' ')[-1]

def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def get_name(id: str):
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 	
                PREFIX wd: <http://www.wikidata.org/entity/> 	
                select  *	
                where {	
                wd:""" + id + """ rdfs:label ?label .	
                FILTER (langMatches( lang(?label), "EN" ) )	
                } 	
                LIMIT 1"""
    results = get_results(endpoint_url, query)
    result = ''
    for res in results["results"]["bindings"]:
        result = res['label']['value']
    return result


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import random

In [5]:
r = createRelation("Q1137809")
r.extend('P31',True, 'Courthouse', limit=10, label=True)
r.changeFocus('Courthouse_P31')
r.extend('P17',False, 'Country')
r.query()
r.df

Unnamed: 0,Entity ID,Courthouse_P31,Courthouse_P31Label,Country_P17
0,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7863296,U.S. Post Office and Courthouse,http://www.wikidata.org/entity/Q30
1,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7863297,U.S. Post Office and Courthouse,http://www.wikidata.org/entity/Q30
2,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889666,United States Court House,http://www.wikidata.org/entity/Q30
3,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889667,United States Court House,http://www.wikidata.org/entity/Q30
4,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889682,United States Courthouse,http://www.wikidata.org/entity/Q30
5,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889683,United States Courthouse,http://www.wikidata.org/entity/Q30
6,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889685,United States Courthouse,http://www.wikidata.org/entity/Q30
7,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889703,United States Customhouse,http://www.wikidata.org/entity/Q30
8,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7889714,United States Customhouse,http://www.wikidata.org/entity/Q30
9,http://www.wikidata.org/entity/Q1137809,http://www.wikidata.org/entity/Q7891082,United States Post Office and Courthouse,http://www.wikidata.org/entity/Q30


In [6]:
pip install beautifulsoup4

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


# Government buildings in the US

In [6]:
gov_building_class = 'Gov_Building'
qnum = 'Q16831714' # qnum of government building
r = createRelation(qnum, label=True)
r.extend('P31', True, gov_building_class, label=True, subclass=True) # extend via property P279 = is subclass of
r.changeFocus('Gov_Building_P31')
r.extend('P17', False, 'Country',label=True, search="Q30")
r.query()
r.df

Unnamed: 0,Entity ID,Gov_Building_P31,Gov_Building_P31Label,Country_P17,Country_P17Label
0,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q16891643,Gib Lewis Unit,http://www.wikidata.org/entity/Q30,United States of America
1,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q16892091,Hancock State Prison,http://www.wikidata.org/entity/Q30,United States of America
2,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q16892539,Huerfano County Courthouse and Jail,http://www.wikidata.org/entity/Q30,United States of America
3,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q16894060,Lea County Correctional Center,http://www.wikidata.org/entity/Q30,United States of America
4,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q16895259,Moberly Correctional Center,http://www.wikidata.org/entity/Q30,United States of America
...,...,...,...,...,...
3866,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q5488071,"Frank M. Johnson, Jr., Federal Building and Un...",http://www.wikidata.org/entity/Q30,United States of America
3867,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q6825074,"Metropolitan Detention Center, Los Angeles",http://www.wikidata.org/entity/Q30,United States of America
3868,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q4325144,New Russia,http://www.wikidata.org/entity/Q30,United States of America
3869,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q91468230,Matagorda County Courthouse,http://www.wikidata.org/entity/Q30,United States of America


In [8]:
random.seed(100)
random_indices = random.sample(list(r.df.index), 500)
for ind in random_indices:
    print(r.df.Gov_Building_P31[ind], '\t', r.df.Gov_Building_P31Label[ind])

http://www.wikidata.org/entity/Q28446385 	 Big Muddy River Correctional Center
http://www.wikidata.org/entity/Q30624657 	 Grey Eagle Village Hall
http://www.wikidata.org/entity/Q96694097 	 Houghton Masonic Temple Building
http://www.wikidata.org/entity/Q14689588 	 Hodgeman County Courthouse
http://www.wikidata.org/entity/Q7338733 	 Riverview Correctional Facility
http://www.wikidata.org/entity/Q7443606 	 Second Tompkins County Courthouse
http://www.wikidata.org/entity/Q1340072 	 Metropolitan Correctional Center, San Diego
http://www.wikidata.org/entity/Q6163932 	 Jasper County Courthouse
http://www.wikidata.org/entity/Q4746049 	 Amherst Town Hall
http://www.wikidata.org/entity/Q7797658 	 Three Nations Crossing
http://www.wikidata.org/entity/Q7891079 	 United States Post Office and Court House
http://www.wikidata.org/entity/Q16199079 	 Allendale County Courthouse
http://www.wikidata.org/entity/Q28206745 	 Howard R. Young Correctional Institution
http://www.wikidata.org/entity/Q56278651 

http://www.wikidata.org/entity/Q14689375 	 Atchison County Courthouse
http://www.wikidata.org/entity/Q16903207 	 Whiteville Correctional Facility
http://www.wikidata.org/entity/Q93092543 	 Q93092543
http://www.wikidata.org/entity/Q7891102 	 United States Post Office and Courthouse
http://www.wikidata.org/entity/Q7230711 	 Port Isabel Detention Center
http://www.wikidata.org/entity/Q65120623 	 Hibbing City Hall
http://www.wikidata.org/entity/Q18205736 	 C.F. Haynsworth Federal Building and United States Courthouse
http://www.wikidata.org/entity/Q5003136 	 Butts County Courthouse
http://www.wikidata.org/entity/Q5562496 	 Gilmer County Courthouse
http://www.wikidata.org/entity/Q5026776 	 Camino Nuevo Correctional Center
http://www.wikidata.org/entity/Q85788351 	 Nevada County Courthouse
http://www.wikidata.org/entity/Q48816196 	 Brown County Courthouse
http://www.wikidata.org/entity/Q2985180 	 Montana State Capitol
http://www.wikidata.org/entity/Q16889777 	 White County Jail
http://www.wi

### Precision: 500/500 entities are actually government buildings.

# Occupations

In [10]:
qnum = 'Q12737077' # qnum of occupation
r = createRelation(qnum, label=True)
r.extend('P31', True, 'Occupation', label=True, subclass=True)
r.query()
df_occupation = r.df
df_occupation

Unnamed: 0,Entity ID,Occupation_P31,Occupation_P31Label
0,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q201788,historian
1,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q250566,Q250566
2,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q212980,psychologist
3,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q268190,Q268190
4,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q245068,comedian
...,...,...,...
362249,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681058,Q97681058
362250,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681196,Q97681196
362251,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681207,Q97681207
362252,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681221,Q97681221


In [17]:
'engineer' in df_occupation.Occupation_P31Label.values

True

In [18]:
'data scientist' in df_occupation.Occupation_P31Label.values

True

In [19]:
'tax collector' in df_occupation.Occupation_P31Label.values

True

In [20]:
'doctor' in df_occupation.Occupation_P31Label.values

False

In [22]:
'physician' in df_occupation.Occupation_P31Label.values

True

In [35]:
random.seed(100)
random_indices = random.sample(list(df_occupation.index), 50)
for ind in random_indices:
    print(df_occupation.Occupation_P31[ind], '\t', df_occupation.Occupation_P31Label[ind])

http://www.wikidata.org/entity/Q63597861 	 A Study to Assess Bioquivalence Between a Novel Naproxen Sodium 275 mg Film-coated Tablet and Nalgesin Naproxen Sodium 275 mg Film-coated Tablet in Healthy Adult Volunteers
http://www.wikidata.org/entity/Q65470361 	 CRAIL - Controlled Reperfusion of the Acutely Ischemic Limb
http://www.wikidata.org/entity/Q65399668 	 Ameparomo Capsules 250 mg Drug Use Investigation
http://www.wikidata.org/entity/Q63834343 	 Circulating Endothelial Compartment During Normal and Pathological Aging
http://www.wikidata.org/entity/Q65348618 	 Study Evaluating Pyrotinib in Combination With Capecitabine In Patients With HER2 Positive Metastatic Breast Cancer
http://www.wikidata.org/entity/Q64790812 	 Contrast-Enhanced Subharmonic Ultrasound Imaging in Improving Characterization of Adnexal Masses in Patients Undergoing Surgery
http://www.wikidata.org/entity/Q65380811 	 Efficacy of Coreg CR and Lisinopril on Markers for Cardiovascular Functional and Structural Disease


In [33]:
df_occupation_filtered = df_occupation[df_occupation.Occupation_P31Label.str.len() < 40]
df_occupation_filtered

Unnamed: 0,Entity ID,Occupation_P31,Occupation_P31Label
0,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q201788,historian
1,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q250566,Q250566
2,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q212980,psychologist
3,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q268190,Q268190
4,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q245068,comedian
...,...,...,...
362249,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681058,Q97681058
362250,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681196,Q97681196
362251,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681207,Q97681207
362252,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q97681221,Q97681221


In [36]:
df_occupation_filtered = df_occupation_filtered[df_occupation_filtered.Occupation_P31Label.str[0] != 'Q']
df_occupation_filtered

Unnamed: 0,Entity ID,Occupation_P31,Occupation_P31Label
0,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q201788,historian
2,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q212980,psychologist
4,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q245068,comedian
5,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q201948,sniper
6,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q215536,merchant
...,...,...,...
362226,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q65548935,Dual Frequency Synthetic Aperture Radar
362227,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q67226868,Prince Edward station attack
362228,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q62024396,Radartoren Tweede Maasvlakte
362232,http://www.wikidata.org/entity/Q12737077,http://www.wikidata.org/entity/Q49819,Charles Simonyi


In [38]:
random.seed(100)
random_indices = random.sample(list(df_occupation_filtered.index), 100)
for ind in random_indices:
    print(df_occupation_filtered.Occupation_P31[ind], '\t', df_occupation_filtered.Occupation_P31Label[ind])

http://www.wikidata.org/entity/Q60244908 	 plant physiologist
http://www.wikidata.org/entity/Q64622931 	 Amphetamine-Enhanced Stroke Recovery
http://www.wikidata.org/entity/Q64611437 	 Role of BCG as Booster Vaccination
http://www.wikidata.org/entity/Q66545898 	 Ocular Imaging Study Using Advanced OCT
http://www.wikidata.org/entity/Q876535 	 Federal Criminal Police Office
http://www.wikidata.org/entity/Q66068417 	 Monitoring Physical Activity (Fitbit)
http://www.wikidata.org/entity/Q64186642 	 AcoArt III / AVF China
http://www.wikidata.org/entity/Q66083962 	 Extending the hCG OPU Interval
http://www.wikidata.org/entity/Q63816058 	 VIBration Training in EpicondylitiS
http://www.wikidata.org/entity/Q64376017 	 Oxytocin and the Social Brain
http://www.wikidata.org/entity/Q64680854 	 Chromium and Insulin Resistance
http://www.wikidata.org/entity/Q76980528 	 Alpha-1 Research Registry
http://www.wikidata.org/entity/Q16736386 	 Corregedor
http://www.wikidata.org/entity/Q64789629 	 BCAA Supple

### Precision: 18/100 entities are actually occupations.

# Cities in the US

In [40]:
qnum = 'Q515' # qnum of city
r = createRelation(qnum, label=True)
r.extend('P31', True, 'city', label=True, subclass=True) # extend via property P279 = is subclass of
r.changeFocus('city_P31')
r.extend('P17', False, 'Country',label=True, search="Q30")
r.query()
df_US_cities = r.df
df_US_cities

Unnamed: 0,Entity ID,city_P31,city_P31Label,Country_P17,Country_P17Label
0,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q975,San Antonio,http://www.wikidata.org/entity/Q30,United States of America
1,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q16568,Jacksonville,http://www.wikidata.org/entity/Q30,United States of America
2,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q16739,Long Beach,http://www.wikidata.org/entity/Q30,United States of America
3,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q17042,Oakland,http://www.wikidata.org/entity/Q30,United States of America
4,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q28218,Baton Rouge,http://www.wikidata.org/entity/Q30,United States of America
...,...,...,...,...,...
10338,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q25395,Newark,http://www.wikidata.org/entity/Q30,United States of America
10339,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q2056387,Taylor Mill,http://www.wikidata.org/entity/Q30,United States of America
10340,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q544890,Plainfield,http://www.wikidata.org/entity/Q30,United States of America
10341,http://www.wikidata.org/entity/Q515,http://www.wikidata.org/entity/Q2055351,Badger,http://www.wikidata.org/entity/Q30,United States of America


This is actually not off by orders of magnitude: As of 2018, there are 19,502 incorporated cities, towns and villages in the United States. (https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-cities-and-towns.html)

In [42]:
random.seed(100)
random_indices = random.sample(list(df_US_cities.index), 100)
for ind in random_indices:
    print(df_US_cities.city_P31[ind], '\t', df_US_cities.city_P31Label[ind])

http://www.wikidata.org/entity/Q950926 	 Pleasant Hill
http://www.wikidata.org/entity/Q1890310 	 Webster
http://www.wikidata.org/entity/Q1931777 	 Dayton
http://www.wikidata.org/entity/Q771422 	 Viburnum
http://www.wikidata.org/entity/Q1925833 	 Schaller
http://www.wikidata.org/entity/Q1762858 	 Eufaula
http://www.wikidata.org/entity/Q1928324 	 Missouri Valley
http://www.wikidata.org/entity/Q2251642 	 Bock
http://www.wikidata.org/entity/Q483349 	 North Ogden
http://www.wikidata.org/entity/Q2374507 	 Redfield
http://www.wikidata.org/entity/Q661006 	 Bluff City
http://www.wikidata.org/entity/Q482523 	 Cottonwood Heights
http://www.wikidata.org/entity/Q1928512 	 Leighton
http://www.wikidata.org/entity/Q963694 	 Arbyrd
http://www.wikidata.org/entity/Q575097 	 Winnebago
http://www.wikidata.org/entity/Q807830 	 Barberton
http://www.wikidata.org/entity/Q1055861 	 Monroe
http://www.wikidata.org/entity/Q846426 	 Brentwood
http://www.wikidata.org/entity/Q1022748 	 Eastpointe
http://www.wikidata.

### Precision: 100/100 entities are actually cities.

# Sports

In [4]:
qnum = 'Q349' # qnum of sport
r = createRelation(qnum, label=True)
r.extend('P31', True, 'sport', label=True, subclass=True) # extend via property P279 = is subclass of
# r.changeFocus('sport_P31')
# r.extend('P17', False, 'Country',label=True, search="Q30")
r.query()
df_sports = r.df
df_sports

Unnamed: 0,Entity ID,sport_P31,sport_P31Label
0,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q280513,Rallye International du Valais
1,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q1414198,2012 Catalunya GP2 Series round
2,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q1415126,2005 Catalunya GP2 Series round
3,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q1486401,2011 GP2 Final
4,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q3418204,Q3418204
...,...,...,...
13735,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q4583876,1987 Rugby World Cup Final
13736,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q4600642,2002 Winter Olympics torch relay
13737,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q4601780,2003 Rugby World Cup Final
13738,http://www.wikidata.org/entity/Q349,http://www.wikidata.org/entity/Q4618095,2010 Olympics torch relay


In [5]:
random.seed(100)
random_indices = random.sample(list(df_sports.index), 100)
for ind in random_indices:
    print(df_sports.sport_P31[ind], '\t', df_sports.sport_P31Label[ind])

http://www.wikidata.org/entity/Q7579428 	 sport in Chennai
http://www.wikidata.org/entity/Q2937998 	 Q2937998
http://www.wikidata.org/entity/Q2838442 	 Q2838442
http://www.wikidata.org/entity/Q2748030 	 1985 London Marathon
http://www.wikidata.org/entity/Q18433701 	 Rajd Wisły 2014
http://www.wikidata.org/entity/Q33832436 	 Q33832436
http://www.wikidata.org/entity/Q2327670 	 Q2327670
http://www.wikidata.org/entity/Q56011146 	 Q56011146
http://www.wikidata.org/entity/Q2065062 	 Q2065062
http://www.wikidata.org/entity/Q2642519 	 Q2642519
http://www.wikidata.org/entity/Q13758784 	 Q13758784
http://www.wikidata.org/entity/Q3947239 	 Q3947239
http://www.wikidata.org/entity/Q5089139 	 Checkless chess
http://www.wikidata.org/entity/Q13761443 	 Q13761443
http://www.wikidata.org/entity/Q5717479 	 basketball in Slovenia
http://www.wikidata.org/entity/Q2397681 	 golf in France
http://www.wikidata.org/entity/Q1341108 	 Rotterdam Marathon
http://www.wikidata.org/entity/Q2860712 	 Q2860712
http://ww

### Precision: 13/100 entities are actually sports.

# Houses

In [7]:
qnum = 'Q3947' # qnum of house
r = createRelation(qnum, label=True)
r.extend('P31', True, 'house', label=True, subclass=True) # extend via property P279 = is subclass of
r.changeFocus('house_P31')
r.extend('P17', False, 'Country',label=True, search="Q30")
r.query()
df_houses = r.df
df_houses

Unnamed: 0,Entity ID,house_P31,house_P31Label,Country_P17,Country_P17Label
0,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q4165429,Hasbrouck House,http://www.wikidata.org/entity/Q30,United States of America
1,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q4165373,Plumbush,http://www.wikidata.org/entity/Q30,United States of America
2,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q4668163,Abner Woodworth House,http://www.wikidata.org/entity/Q30,United States of America
3,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q817711,Benjamin Ten Broeck House,http://www.wikidata.org/entity/Q30,United States of America
4,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q1433592,Walter Hand House,http://www.wikidata.org/entity/Q30,United States of America
...,...,...,...,...,...
5293,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q85770685,James M. Fisher House,http://www.wikidata.org/entity/Q30,United States of America
5294,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q68829356,Governor Seay Mansion,http://www.wikidata.org/entity/Q30,United States of America
5295,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q7350903,Robert Waugh House,http://www.wikidata.org/entity/Q30,United States of America
5296,http://www.wikidata.org/entity/Q3947,http://www.wikidata.org/entity/Q47106956,Trube Castle,http://www.wikidata.org/entity/Q30,United States of America


In [8]:
random.seed(100)
random_indices = random.sample(list(df_houses.index), 100)
for ind in random_indices:
    print(df_houses.house_P31[ind], '\t', df_houses.house_P31Label[ind])

http://www.wikidata.org/entity/Q6816445 	 Menands Manor
http://www.wikidata.org/entity/Q5232715 	 David Cummins Octagon House
http://www.wikidata.org/entity/Q5304363 	 Dr. Samuel Catlin House
http://www.wikidata.org/entity/Q8006330 	 William C. Jayne House
http://www.wikidata.org/entity/Q24041329 	 Aldridge H. Vann House
http://www.wikidata.org/entity/Q23016860 	 Basil Doerhoefer House
http://www.wikidata.org/entity/Q25203571 	 Hudson House
http://www.wikidata.org/entity/Q7913732 	 Van Vleck House and Barn
http://www.wikidata.org/entity/Q5914194 	 House at 285 Sea Cliff Avenue
http://www.wikidata.org/entity/Q6140363 	 James Nicholson House
http://www.wikidata.org/entity/Q5902266 	 Horace and Grace Bush House
http://www.wikidata.org/entity/Q4887856 	 Benham House
http://www.wikidata.org/entity/Q5322214 	 E. W. Marland Mansion
http://www.wikidata.org/entity/Q34831176 	 Barbeau House
http://www.wikidata.org/entity/Q7856928 	 Tuthill-Lapham House
http://www.wikidata.org/entity/Q6206703 	 J

### Precision: 99/100 entities are actually houses.