# Objective

Test functions for relations with entities that have coordinates.

# Updates 12/9

### parse_lon_lat()

I updated parse_lon_lat() so that now you can input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

So it is input-agnostic: either way it outputs a (longitude, latitude) tuple ready to be used by to_fips() (which replaces lon_lat_to_fips()), get_fips(), and other functions.

### to_fips()

lon_lat_to_fips() has been replaced by to_fips(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

so it's agnostic, and either way it'll output a FIPS code.

### to_county_name()

fips_to_county() has been replaced by to_county_name(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple
- OR a 5-character string FIPS code

so it's agnostic, and either way it'll output a county name.

### to_urban_area()

lon_lat_to_urban_area() has been replaced by to_urban_area(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

so it's agnostic, and either way it'll output an urban area name.

In [1]:
import func_lib
import geopandas as gpd
import numpy as np
import pandas as pd
import shapefile as shp
from shapely.geometry import Point

In [2]:
import sys
!{sys.executable} -m pip install SPARQLWrapper
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import datetime
import func_lib

endpoint_url = "https://query.wikidata.org/sparql"
item = "item"

class Relation:
    """
    Incrementally built Wikidata SPARQL query plus its tabular result.

    Instances are normally created through createRelation(). Every call to
    extend() records one more property hop in ``self.dic`` and regenerates
    ``self.query_str``; query() sends that string to the endpoint and stores
    the answer in ``self.df``. Result columns are named
    ``<name>_<property_id>`` (plus ``Label``/rank/qualifier/reference
    variants), and missing values are stored as the string 'NA'.
    """

    def __init__(self, entity_id: str, property_id: str, isSubject: bool, rowVerbose: bool,
                 colVerbose: bool, time_property: str, time: str, name: str, label: bool, limit=10000, subclass=False):
        """
        Set up an empty relation for ``entity_id`` and optionally add a first hop.

        When ``property_id`` is truthy the remaining arguments are forwarded
        to extend() to create the first column.
        """
        self.entity_id = entity_id
        self.query_str = ""
        self.dic = {}  # hop number (1-based) -> settings recorded by extend()
        self.result_dic = {"Entity ID": []}
        self.df = pd.DataFrame()
        self.count = 0  # number of hops recorded so far
        self.time_property = time_property
        self.time = time
        self.limit = limit
        self.subclass = subclass
        self.focus = "Entity ID"  # column that the next extend() starts from
        if property_id:
            # Forward by keyword: extend() has a `search` parameter between
            # `time` and `label`, so positional forwarding would shift `label`
            # into `search` and `subclass` into `label`.
            self.extend(property_id, isSubject, name, rowVerbose=rowVerbose, colVerbose=colVerbose,
                        limit=limit, time_property=time_property, time=time, label=label,
                        subclass=subclass)

    def generate_html(self, name: str):
        """Write the current result table as HTML to the file path ``name`` (UTF-8)."""
        html = self.df.to_html()
        # The context manager closes the file even if the write fails.
        with open(name, "w", encoding='utf-8') as text_file:
            text_file.write(html)

    def query(self, require=None):
        """
        Run the accumulated query and store the result in ``self.df``.

        Returns the resulting DataFrame. If no hop has been added yet, nothing
        is sent to the endpoint and a dict holding only the entity URI is
        returned instead. ``require`` is accepted but currently not used.
        """
        entity_uri = 'http://www.wikidata.org/entity/' + str(self.entity_id)
        if self.query_str == "":
            self.result_dic = {"Entity ID": [entity_uri]}
            return self.result_dic
        results = get_results(endpoint_url, self.query_str)

        # Declare every expected output column up front so that variables
        # absent from a binding can be filled with 'NA'.
        result_dict = {"Entity ID": [entity_uri]}
        for i in range(1, self.count + 1):
            spec = self.dic[i]
            name, pid = spec["name"], spec["property_id"]
            result_dict[name + '_' + pid] = []
            if spec["colVerbose"]:
                result_dict[name + '_rank_' + pid + '_rank'] = []
                for key, value in spec["property_name_dic"].items():
                    result_dict[name + "_" + value + '_' + pid + '_' + str(key)] = []
                for key in spec["ref_dic"]:
                    result_dict[name + "_ref_" + pid + '_' + str(key)] = []
            if spec["label"]:
                result_dict[name + '_' + pid + 'Label'] = []

        for binding in results['results']['bindings']:
            for column, values in result_dict.items():
                values.append(binding[column]['value'] if column in binding else 'NA')

        # The entity column is rebuilt to match the row count of the last hop.
        last = self.dic[self.count]
        row_count = len(result_dict[last["name"] + '_' + last["property_id"]])
        result_dict["Entity ID"] = [entity_uri] * row_count
        self.result_dic = result_dict
        self.df = pd.DataFrame.from_dict(self.result_dic)

        # Column-verbose (but not row-verbose) hops keep one rank only:
        # preferred-rank rows if any exist, otherwise normal-rank rows.
        preferred = 'http://wikiba.se/ontology#PreferredRank'
        normal = 'http://wikiba.se/ontology#NormalRank'
        for i in range(1, self.count + 1):
            spec = self.dic[i]
            if spec["colVerbose"] and not spec["rowVerbose"]:
                col = spec['name'] + '_rank_' + spec['property_id'] + '_rank'
                wanted = preferred if (self.df[col] == preferred).any() else normal
                self.df = self.df.loc[self.df[col] == wanted]
        self.df = pd.DataFrame(data=self.df)
        return self.df

    def extend(self, property_id: str, isSubject: bool, name: str, rowVerbose=False, colVerbose=False, limit=None,
               time_property=None, time=None, search=None, label=False, subclass=False):
        """
        Add one property hop starting from the current focus and rebuild the query.

        property_id: Wikidata property id, e.g. 'P31'.
        isSubject: if true, the new column holds items that point *to* the
            focus via the property; otherwise values the focus points to.
        name: prefix of the new column (``<name>_<property_id>``).
        rowVerbose / colVerbose: also fetch rank, qualifiers and references;
            this triggers an extra endpoint query to discover them.
        limit: if truthy, replaces the relation-wide LIMIT.
        time_property / time: if both are truthy, replace the relation-wide
            year filter.
        search: None (hop is wrapped in OPTIONAL), '!NA' (required, no
            filter), a ``(low, high)`` tuple (range filter; YEAR() is applied
            when the bounds are strings), or an item id such as 'Q30'.
        label: also select the ``...Label`` column.
        subclass: only honoured for P31 with isSubject; matches instances of
            any subclass of the focus.
        """
        self.count += 1
        self.dic[self.count] = {
            "name": name,
            "focus": self.focus,
            "property_id": property_id,
            "isSubject": isSubject,
            "limit": limit,
            "rowVerbose": rowVerbose,
            "colVerbose": colVerbose,
            "time_property": time_property,
            "time": time,
            "search": search,
            "label": label,
            "subclass": subclass,
        }
        if rowVerbose or colVerbose:
            qualifiers, references = self.search_property_for_verbose()
            self.dic[self.count]["property_name_dic"] = qualifiers
            self.dic[self.count]["ref_dic"] = references
        if time_property and time:
            self.time_property = time_property
            self.time = time
        if limit:
            self.limit = limit
        self.query_str = self.define_query_relation()

    def changeFocus(self, name="Entity ID"):
        """Make the column ``name`` the starting point of subsequent extend() calls."""
        self.focus = name

    def extendWithFunction(self, objcolumn, func, name):
        """
        Add column ``name`` by applying ``func`` to existing column(s) of ``self.df``.

        objcolumn: one column label (``func`` receives each value) or a list
            of labels (``func`` receives the row's values as positional
            arguments).
        func: a callable, or a library id of the form 'F<n>' that selects
            ``func_lib.func_list[n]``.

        Raises Exception when a string ``func`` is not of the form 'F<n>'.
        An id beyond the library size prints "Not available." and leaves the
        table unchanged. Errors raised by the applied function propagate
        unchanged instead of being reported as an invalid function id.
        """
        if isinstance(func, str):
            invalid_id = "Not a valid function id, a valid function id should be 'Fn', n is an integer."
            if not func.startswith('F'):
                raise Exception(invalid_id)
            try:
                func_id = int(func[1:])
            except ValueError:
                raise Exception(invalid_id) from None
            if func_id >= func_lib.func_num():
                print("Not available.")
                return
            func = func_lib.func_list[func_id]

        if isinstance(objcolumn, list):
            self.df[name] = self.df[objcolumn].apply(lambda row: func(*row), axis=1)
        else:
            self.df[name] = self.df[objcolumn].apply(func)

    def define_query_relation(self):
        """
        Build the SPARQL SELECT string for all recorded hops.

        Returns None when no hop has been recorded.
        """
        if self.count < 1:
            return None
        rdf_triple, time_filter, limit_statement = "", "", ""
        # OPTIONAL groups left open because the following hop starts from a
        # different focus; they are closed after the last hop.
        focusChanges = 0
        for i in range(1, self.count + 1):
            spec = self.dic[i]
            name, pid = spec["name"], spec["property_id"]
            var = "?" + name + "_" + pid
            if spec["focus"] == "Entity ID":
                anchor = "wd:" + self.entity_id
            else:
                anchor = "?" + spec["focus"]
            # An outgoing hop without a search constraint must not drop rows.
            optional = spec["search"] is None and not spec["isSubject"]
            if optional:
                rdf_triple += "OPTIONAL {"

            if spec["rowVerbose"] or spec["colVerbose"]:
                # Go through the statement node to reach qualifiers,
                # references and rank.
                stmt = f"?statement_{i}"
                rdf_triple += f"{anchor} p:{pid} {stmt}. {stmt} ps:{pid} {var}. "
                for key, value in spec["property_name_dic"].items():
                    rdf_triple += f"OPTIONAL {{ {stmt} pq:{key} ?{name}_{value}_{pid}_{key}.}} "
                for key in spec["ref_dic"]:
                    rdf_triple += (f"OPTIONAL {{ {stmt} prov:wasDerivedFrom ?refnode_{i}. "
                                   f"?refnode_{i} pr:{key} ?{name}_ref_{pid}_{key}.}} ")
                rdf_triple += f"OPTIONAL {{ {stmt} wikibase:rank ?{name}_rank_{pid}_rank. }} "
            elif spec["isSubject"]:
                if not spec["subclass"]:
                    rdf_triple += f"{var} wdt:{pid} {anchor}. "
                elif pid == "P31":
                    # subclass is specific to P31: instances of any (transitive)
                    # subclass of the anchor.
                    rdf_triple += f"{var} wdt:{pid} ?subclasses. "
                    rdf_triple += f"?subclasses wdt:P279+ {anchor}. "
            else:
                rdf_triple += f"{anchor} wdt:{pid} {var}. "

            if optional:
                if i < self.count and spec["focus"] != self.dic[i + 1]["focus"]:
                    focusChanges += 1
                else:
                    rdf_triple += "} "

        rdf_triple += "} " * focusChanges

        for i in range(1, self.count + 1):
            spec = self.dic[i]
            search = spec["search"]
            if search is None or search == '!NA':
                continue
            var = "?" + spec["name"] + "_" + spec["property_id"]
            if isinstance(search, tuple):
                if isinstance(search[0], str):
                    # String bounds are treated as years.
                    rdf_triple += f"FILTER (YEAR({var}) >= {search[0]} && YEAR({var}) <= {search[1]}) "
                else:
                    rdf_triple += f"FILTER ({var} >= {search[0]} && {var} <= {search[1]}) "
            else:
                rdf_triple += f"FILTER ({var} = wd:{search}) "

        # NOTE(review): the filter variable is ?<name> without the
        # _<property_id> suffix used by the selected columns - confirm intent.
        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                          ?pubdateStatement ps:""" + self.time_property + """ ?date	
                          FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""

        query = "SELECT DISTINCT"
        for i in range(1, self.count + 1):
            spec = self.dic[i]
            name, pid = spec["name"], spec["property_id"]
            query += f" ?{name}_{pid}"
            if spec["label"]:
                query += f" ?{name}_{pid}Label"
            if spec["rowVerbose"] or spec["colVerbose"]:
                for key, value in spec["property_name_dic"].items():
                    query += f" ?{name}_{value}_{pid}_{key}"
                for key in spec["ref_dic"]:
                    query += f" ?{name}_ref_{pid}_{key}"
                query += f" ?{name}_rank_{pid}_rank"
        query += " WHERE {" + rdf_triple + time_filter + label_statement + "} " + limit_statement
        return query

    def search_property_for_verbose(self):
        """
        Discover qualifier and reference properties used on the newest hop.

        Queries the endpoint for the statements of the most recently recorded
        hop and returns two dicts: qualifier property id -> label, and
        reference property id -> label, with spaces in labels replaced by
        underscores. Both are empty when the hop is not verbose.
        """
        property_to_name = {}
        ref_to_name = {}
        newest = self.dic[self.count]
        if not (newest["rowVerbose"] or newest["colVerbose"]):
            return property_to_name, ref_to_name

        def anchor_for(spec):
            # The focus is a column name '<name>_<property_id>'; this query
            # binds earlier hops as plain '?<name>', so the suffix is cut off.
            if spec["focus"] == "Entity ID":
                return "wd:" + self.entity_id
            focus = spec["focus"]
            return "?" + focus[:focus.rfind('_')]

        rdf_triple, time_filter, limit_statement = "", "", ""
        # Re-create the path of all earlier hops so the focus is bound.
        for i in range(1, self.count):
            spec = self.dic[i]
            anchor = anchor_for(spec)
            if spec["isSubject"]:
                rdf_triple += f"?{spec['name']} wdt:{spec['property_id']} {anchor} ."
            else:
                rdf_triple += f"{anchor} wdt:{spec['property_id']} ?{spec['name']} ."
        pid = newest["property_id"]
        rdf_triple += (f"{anchor_for(newest)} p:{pid} ?statement."
                       f"?statement ps:{pid} ?item."
                       "?statement ?pq ?obj."
                       "?qual wikibase:qualifier ?pq."
                       "OPTIONAL{ ?statement prov:wasDerivedFrom ?refnode. ?refnode ?pr ?r.}")

        if self.time_property is not None:
            time_filter = """?""" + self.dic[1]["name"] + """ p:""" + self.time_property + """ ?pubdateStatement.	
                                  ?pubdateStatement ps:""" + self.time_property + """ ?date	
                                  FILTER (YEAR(?date) = """ + self.time + """)"""
        if self.limit is not None:
            limit_statement = """LIMIT """ + str(self.limit)
        label_statement = """Service wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }"""

        query = "SELECT DISTINCT ?item ?qual ?qualLabel ?obj ?pr ?prLabel"
        query += " WHERE {" + rdf_triple + time_filter + label_statement + "} " + limit_statement
        query_result = get_results(endpoint_url, query)
        for result in query_result['results']['bindings']:
            if 'qual' in result:
                qualifier_id = result['qual']['value'].split('/')[-1]
                property_to_name[qualifier_id] = result['qualLabel']['value'].replace(' ', '_')
            if 'pr' in result:
                reference_id = result['pr']['value'].split('/')[-1]
                ref_to_name[reference_id] = result['prLabel']['value'].replace(' ', '_')
        return property_to_name, ref_to_name

    def __str__(self):
        """Return the string form of the result table."""
        return str(self.df)

    def __getattr__(self, col_name):
        """
        Expose result columns as attributes, e.g. ``relation.Country_P17``.

        Only called when normal attribute lookup fails. An unknown public
        name prints a message and yields None. AttributeError is raised when
        the table itself is not set yet (avoids infinite recursion, e.g.
        while unpickling) and for unknown underscore-prefixed names, so that
        protocol probes such as ``hasattr(obj, '_repr_html_')`` behave
        normally.
        """
        frame = self.__dict__.get('df')
        if frame is None:
            raise AttributeError(col_name)
        if col_name in frame.columns:
            return frame[col_name]
        if col_name.startswith('_'):
            raise AttributeError(col_name)
        print(col_name + " has not been found.")
        return None


def createRelation(entity_id: str, property_id=None, isSubject=None, rowVerbose=None, colVerbose=None,
                   time_property=None, time=None, name=None, label=False, limit=None, subclass=False):
    """
    Build a Relation for ``entity_id``.

    When a first property is given, a column name is mandatory; without one
    a hint is printed and None is returned. All arguments are handed to the
    Relation constructor unchanged.
    """
    needs_name = bool(property_id) and not name
    if needs_name:
        print("Please specify the name of the first column")
        return None
    return Relation(
        entity_id=entity_id,
        property_id=property_id,
        isSubject=isSubject,
        rowVerbose=rowVerbose,
        colVerbose=colVerbose,
        time_property=time_property,
        time=time,
        name=name,
        label=label,
        limit=limit,
        subclass=subclass,
    )

def get_Firstname(name: str):
    """Return the part of ``name`` before the first space (all of it if there is none)."""
    first, _separator, _rest = name.partition(' ')
    return first

def get_Lastname(name: str):
    """Return the part of ``name`` after the last space (all of it if there is none)."""
    _rest, _separator, last = name.rpartition(' ')
    return last

def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint at ``endpoint_url`` and return the converted JSON response."""
    major, minor = sys.version_info[0], sys.version_info[1]
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(endpoint_url, agent=f"WDQS-example Python/{major}.{minor}")
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


def get_name(id: str):
    """
    Look up the English label of the Wikidata entity ``id`` (e.g. 'Q30').

    Returns an empty string when the endpoint reports no English label.
    """
    query = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 	
                PREFIX wd: <http://www.wikidata.org/entity/> 	
                select  *	
                where {	
                wd:""" + id + """ rdfs:label ?label .	
                FILTER (langMatches( lang(?label), "EN" ) )	
                } 	
                LIMIT 1"""
    bindings = get_results(endpoint_url, query)["results"]["bindings"]
    labels = [binding['label']['value'] for binding in bindings]
    # The last binding wins; the query's LIMIT 1 means there is at most one.
    return labels[-1] if labels else ''

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m


### Lon/lat to FIPS data

From the Census Bureau's shapefiles:
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

In [26]:
# Load the Census Bureau county boundary shapefile and preview it.
path_shp = './cb_2018_us_county_20m/cb_2018_us_county_20m.shp'
data = gpd.read_file(path_shp)
print(f"{len(data)} counties")
data.head()

3220 counties


Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,37,17,1026336,0500000US37017,37017,Bladen,6,2265887723,33010866,"POLYGON ((-78.90200 34.83527, -78.79960 34.850..."
1,37,167,1025844,0500000US37167,37167,Stanly,6,1023370459,25242751,"POLYGON ((-80.49737 35.20210, -80.29542 35.502..."
2,39,153,1074088,0500000US39153,39153,Summit,6,1069181981,18958267,"POLYGON ((-81.68699 41.13596, -81.68495 41.277..."
3,42,113,1213687,0500000US42113,42113,Sullivan,6,1165338428,6617028,"POLYGON ((-76.81373 41.59003, -76.22014 41.541..."
4,48,459,1384015,0500000US48459,48459,Upshur,6,1509910100,24878888,"POLYGON ((-95.15274 32.66095, -95.15211 32.902..."


# Functions for coordinates
- parse_lon_lat()
- to_fips()
- to_county_name()
- to_urban_area()

In [34]:
# Build a relation of embassies (instances of Q3917681) located in the US,
# together with their administrative unit and coordinates.
gov_building_subclass = 'embassy'
qnum = 'Q3917681'
r = createRelation(qnum, label=True)
# extend via property P31 = is instance of
r.extend(property_id='P31', isSubject=True, name='gov_building_subclass', label=True)

# Subsequent hops start from each embassy rather than from the class item.
r.changeFocus('gov_building_subclass_P31')
# extend via property P17 = is in country
r.extend(property_id='P17', isSubject=False, name='Country', label=True, search="Q30")
r.extend(property_id='P131', isSubject=False, name='State', label=True)
r.extend(property_id='P625', isSubject=False, name='Lon_Lat')
r.query()

df = r.df
df['building_type_label'] = [gov_building_subclass] * len(df)

print(f'There are {len(df)} instances of {gov_building_subclass} in the US.')


There are 163 instances of embassy in the US.


Confirm that missing lon/lat values are represented as 'NA' in the dataframe from the relation:

In [35]:
# Preview the first rows of the relation's result table.
df.head()

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy


## Update 12/9

I updated parse_lon_lat() so that now you can input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

So it is input-agnostic: either way it outputs a (longitude, latitude) tuple ready to be used by to_fips() (which replaces lon_lat_to_fips()), get_fips(), and other functions.

In [36]:
def parse_lon_lat(lon_lat):
    """
    Normalize a coordinate value to a ``(longitude, latitude)`` tuple of floats.

    Input:
    - a string containing longitude, latitude, and other extraneous
      characters, e.g. the Wikidata form 'Point(-77.0494 38.9128)'; the first
      two numbers found are taken as longitude and latitude, in that order
    - OR a (longitude, latitude) tuple of real numbers

    Output: (float longitude, float latitude) tuple, or the string 'NA' when
    the input is 'NA', cannot be interpreted, or is outside the valid range
    (longitude in [-180, 180], latitude in [-90, 90]).
    """
    import numbers
    import re

    if isinstance(lon_lat, str):
        if lon_lat == 'NA':
            return 'NA'
        # Signed decimal numbers, optionally in exponent notation.
        found = re.findall(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?', lon_lat)
        if len(found) < 2:
            return 'NA'
        lon, lat = float(found[0]), float(found[1])
    elif isinstance(lon_lat, tuple) and len(lon_lat) == 2:
        lon, lat = lon_lat
        for value in (lon, lat):
            # bool is a numeric type in Python but never a coordinate.
            if isinstance(value, bool) or not isinstance(value, numbers.Real):
                return 'NA'
        lon, lat = float(lon), float(lat)
    else:
        # None, NaN, lists, tuples of the wrong length, ...
        return 'NA'

    # NaN fails both comparisons and is therefore reported as 'NA' too.
    if -180 <= lon <= 180 and -90 <= lat <= 90:
        return (lon, lat)
    return 'NA'

# Add a 'lon_lat_parsed' column by applying parse_lon_lat to each value of 'Lon_Lat_P625'.
r.extendWithFunction('Lon_Lat_P625', parse_lon_lat, 'lon_lat_parsed')

In [37]:
# Preview the table with the new 'lon_lat_parsed' column.
r.df.head()

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy,"(-77.0494, 38.9128)"
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy,"(-77.0406, 38.9119)"
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy,"(-77.0506, 38.9136)"
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy,"(-77.0522, 38.9137)"
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy,"(-77.0481, 38.9122)"


## Update 12/9

lon_lat_to_fips() has been replaced by to_fips(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

so it's agnostic, and either way it'll output a FIPS code.

#### FIPS data from the Census Bureau's shapefiles:
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

In [39]:
_COUNTY_SHAPEFILE = './cb_2018_us_county_20m/cb_2018_us_county_20m.shp'
_county_shapes_cache = {}


def _load_county_shapes(path=_COUNTY_SHAPEFILE):
    """Read the county shapefile at ``path`` once and reuse it on later calls."""
    if path not in _county_shapes_cache:
        _county_shapes_cache[path] = gpd.read_file(path)
    return _county_shapes_cache[path]


def to_fips(input_val):
    """
    Return the FIPS code (GEOID) of the US county containing a coordinate.

    Input:
    - string containing longitude, latitude, and other extraneous characters
    - OR longitude latitude tuple
    Output:
    - FIPS code, or 'NA' when the input has no usable coordinates or the
      point lies in none of the counties of the shapefile

    The shapefile is loaded on first use and kept in memory, so applying this
    function to a whole column does not re-read it for every row. Re-run the
    cell to pick up a changed file.
    """
    lon_lat = parse_lon_lat(input_val)

    if lon_lat == 'NA':
        return 'NA'

    lon, lat = lon_lat
    point = Point(lon, lat)

    counties = _load_county_shapes()
    # Iterate values positionally; this does not depend on the index labels.
    for polygon, geoid in zip(counties.geometry, counties.GEOID):
        if point.within(polygon):
            return geoid
    # Same missing-value sentinel as the rest of the notebook.
    return 'NA'
# Derive a FIPS column from each coordinate representation: first the raw
# coordinate string column, then the already-parsed tuple column.
for source_col, fips_col in (
    ('Lon_Lat_P625', 'fips_from_coords_string'),
    ('lon_lat_parsed', 'fips_from_coords_parsed'),
):
    r.extendWithFunction(source_col, to_fips, fips_col)

In [40]:
# Preview the first rows of r.df, which now carry the two FIPS columns.
r.df.head()

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy,"(-77.0494, 38.9128)",11001,11001
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy,"(-77.0406, 38.9119)",11001,11001
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy,"(-77.0506, 38.9136)",11001,11001
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy,"(-77.0522, 38.9137)",11001,11001
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy,"(-77.0481, 38.9122)",11001,11001


Demonstrate that fips codes calculated with the string-format coordinates as input are identical to the fips codes calculated with the lon-lat-tuple-format coordinates as input. 

In [43]:
# Select the rows where the two FIPS columns differ.
df = r.df
fips_differs = df.fips_from_coords_string != df.fips_from_coords_parsed
df[fips_differs]

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed


## Update 12/9

fips_to_county() has been replaced by to_county_name(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple
- OR a 5-character string FIPS code

so it's agnostic, and either way it'll output a county name.

In [54]:
import functools


@functools.lru_cache(maxsize=None)
def _county_table():
    """Read the county boundary shapefile once and reuse it on later calls."""
    return gpd.read_file('./cb_2018_us_county_20m/cb_2018_us_county_20m.shp')


def to_county_name(input_value):
    """Return the county name for a coordinate string, lon/lat tuple or FIPS code.

    Input:
    - string starting with 'Point(' (the format of the wikidata coordinate
      column), which is converted to a FIPS code with to_fips()
    - OR (longitude, latitude) tuple of floats, also converted with to_fips()
    - OR string FIPS code, matched against the GEOID column of the county
      shapefile
    Output:
    - string county name (the NAME column of the first matching row)
    - 'NA' when the input is None or 'NA', is not a string or 2-tuple, is a
      tuple whose members are not both floats, or when no county matches
    """
    if input_value is None:
        return 'NA'

    if isinstance(input_value, str):
        if input_value == 'NA':
            return 'NA'
        if input_value.startswith('Point('):
            fips = to_fips(input_value)
        else:
            # Any other string is treated as a candidate FIPS code.
            fips = input_value
    elif isinstance(input_value, tuple) and len(input_value) == 2:
        lon, lat = input_value
        # Both members must be floats; the previous check only tested lon.
        if not (isinstance(lon, float) and isinstance(lat, float)):
            return 'NA'
        fips = to_fips(input_value)
    else:
        # Unsupported input (e.g. NaN from a missing cell, wrong-sized tuple).
        return 'NA'

    # Loaded only once the input has been validated, and cached because this
    # function is applied row by row.
    counties = _county_table()
    names = counties[counties.GEOID == fips].NAME.values
    if len(names) == 0:
        return 'NA'
    return names[0]

In [55]:
# Derive a county-name column from a FIPS column and from the raw coordinate
# string column, in that order.
for source_col, county_col in (
    ('fips_from_coords_parsed', 'county_name_from_fips_from_coords_parsed'),
    ('Lon_Lat_P625', 'county_name_from_coords_string'),
):
    r.extendWithFunction(source_col, to_county_name, county_col)

In [56]:
# Preview the first rows of r.df, which now carry the two county-name columns.
r.df.head()

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed,county_name_from_fips_from_coords_parsed,county_name_from_coords_string
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy,"(-77.0494, 38.9128)",11001,11001,District of Columbia,District of Columbia
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy,"(-77.0406, 38.9119)",11001,11001,District of Columbia,District of Columbia
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy,"(-77.0506, 38.9136)",11001,11001,District of Columbia,District of Columbia
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy,"(-77.0522, 38.9137)",11001,11001,District of Columbia,District of Columbia
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy,"(-77.0481, 38.9122)",11001,11001,District of Columbia,District of Columbia


Demonstrate that county names found with fips codes from parsed coordinates as input are identical to the county names found with string-format coordinates as input. 

In [57]:
# Select the rows where the two county-name columns differ.
df = r.df
county_differs = (
    df.county_name_from_fips_from_coords_parsed
    != df.county_name_from_coords_string
)
df[county_differs]

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed,county_name_from_fips_from_coords_parsed,county_name_from_coords_string


## Update 12/9

lon_lat_to_urban_area() has been replaced by to_urban_area(), which can take as input
- a string containing longitude, latitude, and other extraneous characters
- OR a longitude latitude tuple

so it's agnostic, and either way it'll output an urban area name.

#### Urban center data from the Census Bureau's shapefiles:
https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

In [59]:
import functools


@functools.lru_cache(maxsize=None)
def _urban_area_shapes():
    """Read the urban area shapefile once and reuse it on later calls."""
    return gpd.read_file('./cb_2018_us_ua10_500k/cb_2018_us_ua10_500k.shp')


def to_urban_area(input_val):
    """Return the name of the urban area polygon containing a point.

    Input:
    - a string containing longitude, latitude, and other extraneous characters
    - OR a longitude latitude tuple
    Output:
    - string name of urban area: the NAME10 value of the first urban area
      polygon the point lies within
    - 'NA' when parse_lon_lat() reports 'NA' for the input, or when no urban
      area polygon contains the point

    parse_lon_lat() is defined elsewhere in this notebook; it is expected to
    return either a (lon, lat) pair or the string 'NA'.
    """
    lon_lat = parse_lon_lat(input_val)
    if lon_lat == 'NA':
        return 'NA'

    lon, lat = lon_lat
    point = Point(lon, lat)

    # The shapefile is cached because this function is applied row by row.
    urban_areas = _urban_area_shapes()
    for polygon, name in zip(urban_areas.geometry, urban_areas.NAME10):
        if point.within(polygon):
            return name

    # The point is not inside any urban area polygon: report it with the same
    # sentinel used for unparseable input.
    return 'NA'
        
# Derive an urban-area column from each coordinate representation: first the
# raw coordinate string column, then the already-parsed tuple column.
for source_col, urban_col in (
    ('Lon_Lat_P625', 'urban_area_from_coords_string'),
    ('lon_lat_parsed', 'urban_area_from_coords_parsed'),
):
    r.extendWithFunction(source_col, to_urban_area, urban_col)

In [60]:
# Preview the first rows of r.df, which now carry the two urban-area columns.
r.df.head()

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed,county_name_from_fips_from_coords_parsed,county_name_from_coords_string,urban_area_from_coords_string,urban_area_from_coords_parsed
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy,"(-77.0494, 38.9128)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy,"(-77.0406, 38.9119)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy,"(-77.0506, 38.9136)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy,"(-77.0522, 38.9137)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy,"(-77.0481, 38.9122)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"


Demonstrate that urban area names found with parsed (tuple-format) coordinates as input are identical to the urban area names found with string-format coordinates as input. 

In [61]:
# Select the rows where the two urban-area columns differ.
df = r.df
urban_differs = df.urban_area_from_coords_string != df.urban_area_from_coords_parsed
df[urban_differs]

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed,county_name_from_fips_from_coords_parsed,county_name_from_coords_string,urban_area_from_coords_string,urban_area_from_coords_parsed


# Final dataframe
Contains for each row:
- wikidata url to "embassy"
- embassy name and wikidata url
- country name and wikidata url
- state name and wikidata url
- lon / lat as a raw string
- lon / lat as tuple(float, float)
- fips code from lon / lat as a raw string
- fips code from lon / lat as tuple(float, float)
- county name from fips code from lon / lat as tuple(float, float)
- county name from lon / lat as a raw string
- urban area name from lon / lat as a raw string
- urban area name from lon / lat as tuple(float, float)

These column names may be unwieldy, but I wanted to demonstrate the versatility of these functions with regard to input types.

In [62]:
# Display the final dataframe (df was bound to r.df in the previous cell).
df

Unnamed: 0,Entity ID,gov_building_subclass_P31,gov_building_subclass_P31Label,Country_P17,Country_P17Label,State_P131,State_P131Label,Lon_Lat_P625,building_type_label,lon_lat_parsed,fips_from_coords_string,fips_from_coords_parsed,county_name_from_fips_from_coords_parsed,county_name_from_coords_string,urban_area_from_coords_string,urban_area_from_coords_parsed
0,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369663,"Embassy of Cyprus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0494 38.9128),embassy,"(-77.0494, 38.9128)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
1,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369567,"Embassy of Belarus in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0406 38.9119),embassy,"(-77.0406, 38.9119)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
2,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369803,"Embassy of Myanmar in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0506 38.9136),embassy,"(-77.0506, 38.9136)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
3,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369594,"Embassy of Cameroon, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0522 38.9137),embassy,"(-77.0522, 38.9137)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
4,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369781,"Embassy of Mali, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0481 38.9122),embassy,"(-77.0481, 38.9122)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q15216574,"Embassy of Portugal, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0474 38.9098),embassy,"(-77.0474, 38.9098)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
159,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q29025001,"Embassy of Papua New Guinea in Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.04122778 38.90885556),embassy,"(-77.04122778, 38.90885556)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
160,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q2841817,"Embassy of Monaco, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.063693 38.941925),embassy,"(-77.063693, 38.941925)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
161,http://www.wikidata.org/entity/Q3917681,http://www.wikidata.org/entity/Q5369912,"Embassy of Uruguay, Washington, D.C.",http://www.wikidata.org/entity/Q30,United States of America,http://www.wikidata.org/entity/Q61,"Washington, D.C.",Point(-77.0441 38.9015),embassy,"(-77.0441, 38.9015)",11001,11001,District of Columbia,District of Columbia,"Washington, DC--VA--MD","Washington, DC--VA--MD"
