# Welcome to Jupyter!

This repo contains an introduction to [Jupyter](https://jupyter.org) and [IPython](https://ipython.org).

Outline of some basics:

* [Notebook Basics](../examples/Notebook/Notebook%20Basics.ipynb)
* [IPython - beyond plain python](../examples/IPython%20Kernel/Beyond%20Plain%20Python.ipynb)
* [Markdown Cells](../examples/Notebook/Working%20With%20Markdown%20Cells.ipynb)
* [Rich Display System](../examples/IPython%20Kernel/Rich%20Output.ipynb)
* [Custom Display logic](../examples/IPython%20Kernel/Custom%20Display%20Logic.ipynb)
* [Running a Secure Public Notebook Server](../examples/Notebook/Running%20the%20Notebook%20Server.ipynb#Securing-the-notebook-server)
* [How Jupyter works](../examples/Notebook/Multiple%20Languages%2C%20Frontends.ipynb) to run code in different languages.

You can also get this tutorial and run it on your laptop:

    git clone https://github.com/ipython/ipython-in-depth

Install IPython and Jupyter:

with [conda](https://www.anaconda.com/download):

    conda install ipython jupyter

with pip:

    # first, always upgrade pip!
    pip install --upgrade pip
    pip install --upgrade ipython jupyter

Start the notebook in the tutorial directory:

    cd ipython-in-depth
    jupyter notebook

In [3]:
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import datetime
# import func_lib

endpoint_url = "https://query.wikidata.org/sparql"
item = "item"


class Relation:
    """
    The class returned when createRelation is called.
    It contains string field with query.
    We call Relation.query when we need to do the query.
    """

    def __init__(self, entity_id: str, property_id: str, isSubject: bool, rowVerbose: bool,
                 colVerbose: bool, time_property: str, time: str, name: str, label: bool, limit=10000):
        self.entity_id = entity_id
        self.query_str = ""
        self.dic = {}
        self.result_dic = {"Entity ID": []}
        self.df = pd.DataFrame()
        self.count = 0
        self.time_property = time_property
        self.time = time
        self.limit = limit
        self.focus = "Entity ID"
        if property_id:
            self.extend(property_id, isSubject, name, rowVerbose, colVerbose, limit, time_property, time, label)

    def generate_html(self, name: str):
        html = (self.df).to_html()
        text_file = open(name, "w", encoding='utf-8')
        text_file.write(html)
        text_file.close()

    def query(self, require=None):
        if self.query_str == "":
            self.result_dic = {"Entity ID": ['http://www.wikidata.org/entity/' + str(self.entity_id)]}
            return self.result_dic
        results = get_results(endpoint_url, self.query_str)
        result_dict = {"Entity ID": ['http://www.wikidata.org/entity/' + str(self.entity_id)]}
        for i in range(1, self.count + 1):
            result_dict[self.dic[i]["name"] + '_' + self.dic[i]['property_id']] = []
            if self.dic[i]["colVerbose"]:
                result_dict[self.dic[i]["name"] + '_rank_' + self.dic[i]['property_id'] + '_rank'] = []
                for key, value in self.dic[i]["property_name_dic"].items():
                    result_dict[
                        self.dic[i]["name"] + "_" + value + '_' + self.dic[i]['property_id'] + '_' + str(key)] = []
                for key, value in self.dic[i]["ref_dic"].items():
                    result_dict[self.dic[i]["name"] + "_ref_" + self.dic[i]['property_id'] + '_' + str(key)] = []

            if self.dic[i]["label"]:
                result_dict[self.dic[i]["name"] + '_' + self.dic[i]['property_id'] + 'Label'] = []

        for result in results['results']['bindings']:
            for key, value in result_dict.items():
                if key in result.keys():
                    result_dict[key].append(result[key]['value'])
                else:
                    result_dict[key].append('NA')
        result_dict["Entity ID"] = ['http://www.wikidata.org/entity/' + str(self.entity_id)] * len(
            result_dict[self.dic[self.count]["name"] + '_' + self.dic[self.count]["property_id"]])
        self.result_dic = result_dict
        self.df = pd.DataFrame.from_dict(self.result_dic)
        for i in range(1, self.count + 1):
            if self.dic[i]["colVerbose"] and not self.dic[i]["rowVerbose"]:
                col = self.dic[i]['name'] + '_rank_' + self.dic[i]['property_id'] + '_rank'
                if any(self.df[col] == 'http://wikiba.se/ontology#PreferredRank'):
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#PreferredRank']
                else:
                    self.df = self.df.loc[self.df[col] == 'http://wikiba.se/ontology#NormalRank']
#         if require is not None:
#             for r in require:
#                 self.df = self.df.loc[self.df[r] != 'NA']
        self.df = pd.DataFrame(data=self.df)
#         if self.df.shape[0] >= 10000:
#             print("Warning: Your query leads to too many results. Only 10,000 returned.")
        return self.df

    def extend(self, property_id: str, isSubject: bool, name: str, rowVerbose=False, colVerbose=False, limit=None,
               time_property=None, time=None, search=None, label=False):
        self.count += 1
        self.dic[self.count] = {}
        self.dic[self.count]["name"] = name
        self.dic[self.count]["focus"] = self.focus
        self.dic[self.count]["property_id"] = property_id
        self.dic[self.count]["isSubject"] = isSubject
        self.dic[self.count]["limit"] = limit
        self.dic[self.count]["rowVerbose"] = rowVerbose
        self.dic[self.count]["colVerbose"] = colVerbose
        self.dic[self.count]['time_property'] = time_property
        self.dic[self.count]['time'] = time
        self.dic[self.count]['search'] = search
        self.dic[self.count]['label'] = label
        if rowVerbose or colVerbose:
            self.dic[self.count]["property_name_dic"], self.dic[self.count][
                "ref_dic"] = self.search_property_for_verbose()
        if time_property and time:
            self.time_property = time_property
            self.time = time
        if limit:
            self.limit = limit
        self.query_str = self.define_query_relation()

    def changeFocus(self, name="Entity ID"):
        self.focus = name
        
    def applyFunction(self, objcolumn, func, name):
        if type(func) == str:
            if func.startswith('F'):
                try:
                    func_id = int(func[1:])
                    if func_id == 0:
                        self.df[name] = self.df[objcolumn]
                    else:
                        if func_id >= func_lib.func_num():
                            print("Not available.")
                        else:
                            self.df[name] = self.df[objcolumn].apply(func_lib.func_list[func_id])
                except:
                    raise Exception("Not a valid function id, a valid function id should be 'Fn', n is an integer.")
            else:
                raise Exception("Not a valid function id, a valid function id should be 'Fn', n is an integer.")
        else:
            self.df[name] = self.df[objcolumn].apply(func)

    def define_query_relation(self):
        rdf_triple, time_filter, limit_statement = """""", """""", """"""
        if self.count < 1:
            return None
        focusChanges = 0
        for i in range(1, self.count + 1):
            if self.dic[i]["rowVerbose"] or self.dic[i]["colVerbose"]:
                if self.dic[i]["search"] is None and not self.dic[i]["isSubject"]:
                        rdf_triple += """OPTIONAL {"""
                if self.dic[i]["focus"] == "Entity ID":
#                     if self.dic[i]["search"] is None:
#                         rdf_triple += """OPTIONAL {"""
                    rdf_triple += """wd:""" + self.entity_id + """ p:""" + self.dic[i][
                        'property_id'] + """ ?statement_""" + str(i) + """. """ \
                                  + """?statement_""" + str(i) + """ ps:""" + self.dic[i][
                                      'property_id'] + """ ?""" + \
                                  self.dic[i]['name'] \
                                  + """_""" + self.dic[i]['property_id'] + """. """
                else:
                    rdf_triple += """?""" + self.dic[i]["focus"] + """ p:""" + self.dic[i][
                        'property_id'] + """ ?statement_""" + str(i) + """. """ \
                                  + """?statement_""" + str(i) + """ ps:""" + self.dic[i][
                                      'property_id'] + """ ?""" + \
                                  self.dic[i]['name'] \
                                  + """_""" + self.dic[i]['property_id'] + """. """
                for key, value in self.dic[i]["property_name_dic"].items():
                    rdf_triple += """OPTIONAL { """ + """?statement_""" + str(i) + """ pq:""" + str(key) \
                                  + """ ?""" + self.dic[i]['name'] + """_""" + value + """_""" + self.dic[i][
                                      'property_id'] + """_""" + str(key) + """.} """
                for key, value in self.dic[i]["ref_dic"].items():
                    rdf_triple += """OPTIONAL { ?statement_""" + str(
                        i) + """ prov:wasDerivedFrom ?refnode_""" + str(
                        i) + """. ?refnode_""" + str(i) \
                                  + """ pr:""" + str(key) + """ ?""" + self.dic[i]['name'] + """_ref_""" + \
                                  self.dic[i][
                                      'property_id'] + """_""" + str(key) + """.} """
                rdf_triple += """OPTIONAL { ?statement_""" + str(i) + """ wikibase:rank ?""" + self.dic[i][
                    'name'] + """_rank_""" + self.dic[i]['property_id'] + """_rank. } """
            # none-verbose version
            else:
                if self.dic[i]["focus"] == "Entity ID":
                    if self.dic[i]["isSubject"]:
#                         if self.dic[i]["search"] is None:
#                             rdf_triple += """OPTIONAL {"""
                        rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                            'property_id'] + """ wdt:""" + self.dic[i][
                                          "property_id"] + """ wd:""" + self.entity_id + """. """
                    else:
                        if self.dic[i]["search"] is None:
                            rdf_triple += """OPTIONAL {"""
                        rdf_triple += """wd:""" + self.entity_id + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + \
                                      self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """. """
                else:
                    if self.dic[i]["isSubject"]:
#                         if self.dic[i]["search"] is None:
#                             rdf_triple += """OPTIONAL {"""
                        rdf_triple += """?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                            'property_id'] + """ wdt:""" + self.dic[i]["property_id"] + """ ?""" + self.dic[i][
                                          'focus'] + """. """
                    else:
                        if self.dic[i]["search"] is None:
                            rdf_triple += """OPTIONAL {"""
                        rdf_triple += """?""" + self.dic[i]['focus'] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i][
                                          'property_id'] + """. """
            if not self.dic[i]["isSubject"]:
                if i < self.count and self.dic[i]["focus"] != self.dic[i + 1]["focus"] and self.dic[i]["search"] is None:
                    focusChanges += 1
                elif self.dic[i]["search"] is None:
                    rdf_triple += """} """
        for i in range(focusChanges):
            rdf_triple += """} """
        for i in range(1, self.count + 1):
            if self.dic[i]['search'] is not None and self.dic[i]["search"] != '!NA':
                if isinstance(self.dic[i]['search'], tuple):
                    if isinstance(self.dic[i]['search'][0], str):
                        rdf_triple += """FILTER (YEAR(?""" + self.dic[i]['name'] + """_""" + self.dic[i][
                            'property_id'] + """) >= """ + \
                                      self.dic[i]['search'][0] + """ && YEAR(?""" + self.dic[i]['name'] + \
                                      """_""" + self.dic[i]['property_id'] + """) <= """ + self.dic[i]['search'][
                                          1] + """) """
                    else:
                        rdf_triple += """FILTER (?""" + self.dic[i]['name'] + """_""" + self.dic[i]['property_id'] + \
                                      """ >= """ + str(self.dic[i]['search'][0]) + """ && ?""" + self.dic[i]['name'] + \
                                      """_""" + self.dic[i]['property_id'] + """ <= """ + str(
                            self.dic[i]['search'][1]) + """) """
                else:
                    rdf_triple += """FILTER (?""" + self.dic[i]['name'] + """_""" + self.dic[i][
                        'property_id'] + """ = """ + \
                                  """wd:""" + self.dic[i]['search'] + """) """
        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                          ?pubdateStatement ps:""" + self.time_property + """ ?date	
                          FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""
        query = """SELECT DISTINCT"""
        for i in range(1, self.count + 1):
            if self.dic[i]["rowVerbose"] or self.dic[i]["colVerbose"]:
                query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id']
                if self.dic[i]["label"]:
                    query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """Label"""
                for key, value in self.dic[i]["property_name_dic"].items():
                    query += """ ?""" + self.dic[i]["name"] + """_""" + value + """_""" + self.dic[i][
                        'property_id'] + """_""" + str(key)
                for key, value in self.dic[i]["ref_dic"].items():
                    query += """ ?""" + self.dic[i]["name"] + """_ref_""" + self.dic[i]['property_id'] + """_""" + str(
                        key)
                query += """ ?""" + self.dic[i]["name"] + """_rank_""" + self.dic[i]['property_id'] + """_rank"""
            else:
                query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id']
                if self.dic[i]["label"]:
                    query += """ ?""" + self.dic[i]["name"] + """_""" + self.dic[i]['property_id'] + """Label"""
        query += """ WHERE {""" + rdf_triple + time_filter + label_statement + """} """ + limit_statement
        return query

    def search_property_for_verbose(self):
        property_to_name = {}
        ref_to_name = {}
        rdf_triple, time_filter, limit_statement = """""", """""", """"""
        if self.dic[self.count]["rowVerbose"] or self.dic[self.count]["colVerbose"]:
            for i in range(1, self.count):
                if self.dic[i]["focus"] == "Entity ID":
                    if self.dic[i]["isSubject"]:
                        rdf_triple += """?""" + self.dic[i]["name"] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ wd:""" + self.entity_id + """ ."""
                    else:
                        rdf_triple += """wd:""" + self.entity_id + """ wdt:""" + self.dic[i]["property_id"] + """ ?""" + \
                                      self.dic[i]["name"] + """ ."""
                else:
                    last = self.dic[i]["focus"].rfind('_')
                    focus = self.dic[i]["focus"][:last]
                    if self.dic[i]["isSubject"]:
                        rdf_triple += """?""" + self.dic[i]["name"] + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + focus + """ ."""
                    else:
                        rdf_triple += """?""" + focus + """ wdt:""" + self.dic[i][
                            "property_id"] + """ ?""" + self.dic[i]["name"] + """ ."""
            if self.dic[self.count]["focus"] == "Entity ID":
                rdf_triple += """wd:""" + self.entity_id + """ p:""" + self.dic[self.count][
                    'property_id'] + """ ?statement.""" + \
                              """?statement """ + """ps:""" + self.dic[self.count]['property_id'] + """ ?item.""" + \
                              """?statement """ + """?pq """ + """?obj.""" + \
                              """?qual wikibase:qualifier ?pq.""" + \
                              """OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}"""
            else:
                last = self.dic[self.count]["focus"].rfind('_')
                focus = self.dic[self.count]["focus"][:last]
                rdf_triple += """?""" + focus + """ p:""" + self.dic[self.count][
                    'property_id'] + """ ?statement.""" + \
                              """?statement """ + """ps:""" + self.dic[self.count]['property_id'] + """ ?item.""" + \
                              """?statement """ + """?pq """ + """?obj.""" + \
                              """?qual wikibase:qualifier ?pq.""" + \
                              """OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}"""
        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                                  ?pubdateStatement ps:""" + self.time_property + """ ?date	
                                  FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""
        query = """SELECT DISTINCT """
        if self.dic[self.count]["rowVerbose"] or self.dic[self.count]["colVerbose"]:
            query += """?item""" + """ ?qual""" + """ ?qualLabel""" + """ ?obj """ + """?pr ?prLabel"""
            query += """ WHERE {""" + rdf_triple + time_filter + label_statement + """} """ + limit_statement
            query_result = get_results(endpoint_url, query)
            for result in query_result['results']['bindings']:
                if 'qual' in result:
                    property_to_name[result['qual']['value'].split('/')[-1]] = result['qualLabel']['value'].replace(' ',
                                                                                                                    '_')
                if 'pr' in result:
                    ref_to_name[result['pr']['value'].split('/')[-1]] = result['prLabel']['value'].replace(' ', '_')
        else:
            query += """?""" + self.dic[self.count]["name"] + """ """
        return property_to_name, ref_to_name

    def __str__(self):
        return str(self.df)

    def __getattr__(self, col_name):
        if col_name in self.df.columns:
            return self.df[col_name]
        else:
            print(col_name + " has not been found.")
            return None


def createRelation(entity_id: str, property_id=None, isSubject=None, rowVerbose=None, colVerbose=None,
                   time_property=None, time=None, name=None, label=False, limit=None):
    if property_id and not name:
        print("Please specify the name of the first column")
        return None
    return Relation(entity_id, property_id, isSubject, rowVerbose, colVerbose, time_property, time, name, label, limit)

def get_Firstname(name: str):
    return name.split(' ')[0]

def get_Lastname(name: str):
    return name.split(' ')[-1]

def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def get_name(id: str):
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 	
                PREFIX wd: <http://www.wikidata.org/entity/> 	
                select  *	
                where {	
                wd:""" + id + """ rdfs:label ?label .	
                FILTER (langMatches( lang(?label), "EN" ) )	
                } 	
                LIMIT 1"""
    results = get_results(endpoint_url, query)
    result = ''
    for res in results["results"]["bindings"]:
        result = res['label']['value']
    return result


You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m


# Query US Gov Buildings

## Experiment with just 10 types of government buildings

### Find every direct subclass of `government building` in the world.

In [86]:
gov_building_class = 'Gov_Building'
qnum = 'Q16831714' # qnum of government building
r = createRelation(qnum, label=True)
r.extend('P279', True, gov_building_class, label=True) # extend via property P279 = is subclass of
r.query()
r.df

Unnamed: 0,Entity ID,Gov_Building_P279,Gov_Building_P279Label
0,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q481289,official residence
1,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1407236,weigh house
2,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1137809,courthouse
3,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q757292,border checkpoint
4,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q218653,custom house
5,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q5469110,Forest Service Guard Station
6,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q100705206,Prefectural Office Building
7,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q3250715,province building
8,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q861951,police station
9,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q30124446,legislative building


### Grab 10 direct subclasses of `government building`.

In [87]:
df = r.df[:10]
df

Unnamed: 0,Entity ID,Gov_Building_P279,Gov_Building_P279Label
0,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q481289,official residence
1,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1407236,weigh house
2,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q1137809,courthouse
3,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q757292,border checkpoint
4,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q218653,custom house
5,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q5469110,Forest Service Guard Station
6,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q100705206,Prefectural Office Building
7,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q3250715,province building
8,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q861951,police station
9,http://www.wikidata.org/entity/Q16831714,http://www.wikidata.org/entity/Q30124446,legislative building


### Find every instance of each of these 10 subclasses of government buildings.

I don't actually need to do the operations to check whether lon and lat are in the US because these instances tend to have country as an attribute, so I can simply:
- change focus
- get country (and state and lon_lat)
- filter

### Concatenate all these dataframes together.

Maybe when this is working for real, it is likely we would publish each individual subclass's dataframe to the KNP individually and then union them as another step later.

In [94]:
column_names = [
        'Entity ID', 'gov_building_subclass_P31',
       'gov_building_subclass_P31Label', 'Country_P17', 'Country_P17Label',
       'State_P131', 'State_P131Label', 'Lon_Lat_P625'
]

df_total = pd.DataFrame(columns=column_names)

for ind in range(len(df)):
    gov_building_subclass = r.df.Gov_Building_P279Label[ind]
#     gov_building_subclass = gov_building_subclass.replace(' ', '_')
    qnum = r.df.Gov_Building_P279[ind].split('/')[-1]
    r2 = createRelation(qnum, label=True)
#     r2.extend('P31', True, gov_building_subclass, label=True) # extend via property P31 = is instance of
    r2.extend('P31', True, 'gov_building_subclass', label=True) # extend via property P31 = is instance of

#     r2.changeFocus('%s_P31' % gov_building_subclass)
    r2.changeFocus('gov_building_subclass_P31')
    r2.extend('P17', False, 'Country', label=True) # extend via property P17 = is in country
    r2.extend('P131', False, 'State', label=True)
    r2.extend('P625', False, 'Lon_Lat')
    r2.query()

    # filter r2's dataframe to only include entities in the US
    df2 = r2.df
    df2 = df2[df2.Country_P17Label=='United States of America']
    
    # add column for name of subclass of gov building
    df2['gov_building_subclass_name'] = [gov_building_subclass for _ in range(len(df2))]

    print('There are %s instances of %s in the US.' % (str(len(df2)), gov_building_subclass))
    df_total = pd.concat([df_total, df2], axis=0, ignore_index=True)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['gov_building_subclass_name'] = [gov_building_subclass for _ in range(len(df2))]


There are 14 instances of official residence in the US.
There are 0 instances of weigh house in the US.
There are 412 instances of courthouse in the US.
There are 165 instances of border checkpoint in the US.
There are 40 instances of custom house in the US.
There are 3 instances of Forest Service Guard Station in the US.
There are 0 instances of Prefectural Office Building in the US.
There are 0 instances of province building in the US.
There are 23 instances of police station in the US.
There are 6 instances of legislative building in the US.


In [95]:
df_total

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,gov_building_subclass_name
0,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),official residence
1,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q4165416,Thomas T. Gaff House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.045186111 38.910952777),official residence
2,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q5070762,Chancellor's Residence,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1342,Pittsburgh,Point(-79.9446 40.4489),official residence
3,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q6689127,Louisiana Governor's Mansion,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1588,Louisiana,Point(-91.180547 30.458981),official residence
4,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q5554707,Getty House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q99,California,Point(-118.319 34.0633),official residence
...,...,...,...,...,...,...,...,...,...
658,http://www.wikidata.org/entity/Q30124446,http://www.wikidata.org/entity/Q2281225,Ohio Statehouse,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q16567,Columbus,Point(-82.99911 39.96131),legislative building
659,http://www.wikidata.org/entity/Q30124446,http://www.wikidata.org/entity/Q390028,Independence Hall,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1345,Philadelphia,Point(-75.15 39.948888888),legislative building
660,http://www.wikidata.org/entity/Q30124446,http://www.wikidata.org/entity/Q7054610,North Carolina State Legislative Building,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q41087,Raleigh,Point(-78.639 35.7832),legislative building
661,http://www.wikidata.org/entity/Q30124446,http://www.wikidata.org/entity/Q7085084,Old State House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q33486,Hartford,Point(-72.672778 41.766111),legislative building


In [96]:
df_total = df_total[[
    'Entity ID', 'gov_building_subclass_name', 'gov_building_subclass_P31',
       'gov_building_subclass_P31Label', 'Country_P17', 'Country_P17Label',
       'State_P131', 'State_P131Label', 'Lon_Lat_P625'
]]

In [97]:
df_total

Unnamed: 0,Entity ID,gov_building_subclass_name,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625
0,http://www.wikidata.org/entity/Q481289,official residence,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128)
1,http://www.wikidata.org/entity/Q481289,official residence,http://www.wikidata.org/entity/Q4165416,Thomas T. Gaff House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.045186111 38.910952777)
2,http://www.wikidata.org/entity/Q481289,official residence,http://www.wikidata.org/entity/Q5070762,Chancellor's Residence,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1342,Pittsburgh,Point(-79.9446 40.4489)
3,http://www.wikidata.org/entity/Q481289,official residence,http://www.wikidata.org/entity/Q6689127,Louisiana Governor's Mansion,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1588,Louisiana,Point(-91.180547 30.458981)
4,http://www.wikidata.org/entity/Q481289,official residence,http://www.wikidata.org/entity/Q5554707,Getty House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q99,California,Point(-118.319 34.0633)
...,...,...,...,...,...,...,...,...,...
658,http://www.wikidata.org/entity/Q30124446,legislative building,http://www.wikidata.org/entity/Q2281225,Ohio Statehouse,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q16567,Columbus,Point(-82.99911 39.96131)
659,http://www.wikidata.org/entity/Q30124446,legislative building,http://www.wikidata.org/entity/Q390028,Independence Hall,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q1345,Philadelphia,Point(-75.15 39.948888888)
660,http://www.wikidata.org/entity/Q30124446,legislative building,http://www.wikidata.org/entity/Q7054610,North Carolina State Legislative Building,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q41087,Raleigh,Point(-78.639 35.7832)
661,http://www.wikidata.org/entity/Q30124446,legislative building,http://www.wikidata.org/entity/Q7085084,Old State House,http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q33486,Hartford,Point(-72.672778 41.766111)


### Something inconvenient (but not a big deal)

It looks like filtering for entities in the US can only happen as the final step...

These steps:
- extend r2 via property P17 = is in country
- filter r2's dataframe to only include entities in the US
- extend r2 via property P131 = is in state / administrative territory
- extend r2 via property P625 = has lon/lat

Results in r2 having a df that contains entities in other countries.

I'm not sure there's currently any way of getting around this, but it's also not a huge deal. Just a bit inefficient but everything is really fast anyway.

#### Example of this issue:

In [57]:
ind = 0

gov_building_subclass = r.df.Gov_Building_P279Label[ind]
gov_building_subclass = gov_building_subclass.replace(' ', '_')
qnum = r.df.Gov_Building_P279[ind].split('/')[-1]
r2 = createRelation(qnum, label=True)
r2.extend('P31', True, gov_building_subclass, label=True) # extend via property P31 = is instance of

r2.changeFocus('%s_P31' % gov_building_subclass)
r2.extend('P17', False, 'Country', label=True) # extend via property P17 = is in country
r2.query()

# filter r2's dataframe to only include entities in the US
df2 = r2.df
r2.df = df2[df2.Country_P17Label=='United States of America']

r2.extend('P131', False, 'State', label=True)
r2.extend('P625', False, 'Lon_Lat')

r2.query()

r2.df

Unnamed: 0,Entity ID,official_residence_P31,official_residence_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625
0,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q55599780,official residence in Railway Station Park,http://www.wikidata.org/entity/Q33,Finland,http://www.wikidata.org/entity/Q429864,Riihimäki,Point(24.77505868 60.73506062)
1,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q55599767,president's residence in Kultaranta,http://www.wikidata.org/entity/Q33,Finland,http://www.wikidata.org/entity/Q503862,Naantali,Point(21.99911982 60.46759485)
2,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q96820594,Amtshaus Burgellern,http://www.wikidata.org/entity/Q183,Germany,http://www.wikidata.org/entity/Q238951,Scheßlitz,Point(11.04425 49.987583333)
3,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q55599758,senior physician's residence in Takaharju sana...,http://www.wikidata.org/entity/Q33,Finland,http://www.wikidata.org/entity/Q683512,Savonlinna,Point(29.30527685 61.78619362)
4,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q65081205,Palace of the Golden Gate,http://www.wikidata.org/entity/Q12536,Abbasid Caliphate,http://www.wikidata.org/entity/Q1530,Baghdad,
...,...,...,...,...,...,...,...,...
115,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q24233913,Genadendal Residence,http://www.wikidata.org/entity/Q258,South Africa,,,
116,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q3305377,National Palace,http://www.wikidata.org/entity/Q786,Dominican Republic,http://www.wikidata.org/entity/Q34820,Santo Domingo,Point(-69.89777778 18.47416667)
117,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q2356722,Quinta Vigia,http://www.wikidata.org/entity/Q45,Portugal,http://www.wikidata.org/entity/Q2190091,Sé,Point(-16.915725708 32.644916534)
118,http://www.wikidata.org/entity/Q481289,http://www.wikidata.org/entity/Q55023463,The former doctor's house,http://www.wikidata.org/entity/Q33,Finland,http://www.wikidata.org/entity/Q1001783,Eurajoki,Point(21.74149409 61.20275648)
