# Clustering and Scoring Job Relocation Opportunities - Playground Notebook

Austin Rainwater

---

# Initialization

In [127]:
!pip install --quiet --upgrade sqlalchemy pymysql

from urllib.parse import quote as url_encode

import pandas as pd
import numpy as np
import aiohttp
import asyncio
import requests
import xml.etree.ElementTree as xml

from concurrent.futures import ProcessPoolExecutor
executor = ProcessPoolExecutor()

from pandas import json_normalize
from itertools import product

from sqlalchemy import (
    create_engine,
    Table,
    Column,
    MetaData,
    String,
    Numeric,
    Integer
)

import yaml

# Load credentials (database connection string, Foursquare client id/secret) from a
# local YAML file so they never appear in the notebook itself.
with open('secrets.yaml', 'r') as secrets_file:
    secrets = yaml.safe_load(secrets_file)
    
# User-Agent identifying this notebook; passed as `headers=` on the `requests` calls below.
header = {"User-Agent": 
          'datascience jupyter notebook/0.0 '
          '(https://github.com/pacorain/datascience-certification-final-project; '
          'Austin Rainwater, paco@heckin.io)'}
# Sent as the `v` parameter on every Foursquare request (looks like a YYYYMMDD date).
v = '20201108'

---

# City Definition

Obviously, a good place for me to start is with some cities. Below is the table definition for the cities I will be exploring and their specific traits.

In [82]:
# echo=True makes SQLAlchemy log every SQL statement it issues (see the log lines
# printed under this and later cells).
engine = create_engine(secrets['db_connection_string'], echo=True)

meta = MetaData()

# One row per city under consideration. city_name is the primary key; every other
# column except metro_name is declared NOT NULL.
cities = Table(
    'city', meta,
    Column('city_name', String(50), primary_key=True, comment='Community Name'),
    Column('metro_name', String(50), comment='Metropolitan Area Name'),
    Column('state', String(2), nullable=False, comment='2-Letter abbreviation of State'),
    Column('lat', Numeric(10, 6), nullable=False, comment='Latitude of City'),
    Column('lng', Numeric(10, 6), nullable=False, comment='Longitude of City'),
    Column('area_val', Numeric(10, 4), nullable=False, comment='Area of city in square miles'),
    Column('total_pop', Integer, nullable=False, comment='Total population of city')
)

# Rebuild the schema from scratch each time this cell runs.
# NOTE: drop_all removes the existing `city` table, so previously inserted rows are lost.
meta.drop_all(engine)
meta.create_all(engine)

2020-11-23 23:25:19,348 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'sql_mode'
2020-11-23 23:25:19,350 INFO sqlalchemy.engine.base.Engine {}
2020-11-23 23:25:19,359 INFO sqlalchemy.engine.base.Engine SHOW VARIABLES LIKE 'lower_case_table_names'
2020-11-23 23:25:19,360 INFO sqlalchemy.engine.base.Engine {}
2020-11-23 23:25:19,368 INFO sqlalchemy.engine.base.Engine SELECT DATABASE()
2020-11-23 23:25:19,369 INFO sqlalchemy.engine.base.Engine {}
2020-11-23 23:25:19,381 INFO sqlalchemy.engine.base.Engine show collation where `Charset` = 'utf8mb4' and `Collation` = 'utf8mb4_bin'
2020-11-23 23:25:19,382 INFO sqlalchemy.engine.base.Engine {}
2020-11-23 23:25:19,388 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS CHAR(60)) AS anon_1
2020-11-23 23:25:19,389 INFO sqlalchemy.engine.base.Engine {}
2020-11-23 23:25:19,392 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS CHAR(60)) AS anon_1
2020-11-23 23:25:19,393 INFO sqlalchemy.engine.base.E

Let's start with my birthplace: Fort Wayne, Indiana.

In [83]:
# Reusable INSERT statement for the `city` table.
new_city = cities.insert()

# First attempt: supply only the name/state columns to see what the table requires.
# Catch `Exception` rather than using a bare `except:` so KeyboardInterrupt/SystemExit
# still propagate, and show the actual error instead of hiding it.
try:
    # engine.begin() opens a connection, commits on success and rolls back on error.
    # Engine.execute() is avoided because it was removed in SQLAlchemy 2.x, which the
    # unpinned `pip install --upgrade sqlalchemy` in the first cell may install.
    with engine.begin() as connection:
        connection.execute(new_city, [
            {'city_name': 'Fort Wayne, IN', 'metro_name': 'Fort Wayne, IN', 'state': 'IN'}
        ])
except Exception as error:
    print("Oops! That didn't work.")
    print(f"{type(error).__name__}: {error}")

2020-11-23 23:25:26,045 INFO sqlalchemy.engine.base.Engine INSERT INTO city (city_name, metro_name, state) VALUES (%(city_name)s, %(metro_name)s, %(state)s)
2020-11-23 23:25:26,046 INFO sqlalchemy.engine.base.Engine {'city_name': 'Fort Wayne', 'metro_name': 'Fort Wayne', 'state': 'IN'}
2020-11-23 23:25:26,050 INFO sqlalchemy.engine.base.Engine ROLLBACK
Oops! That didn't work.


Ah, the table requires some more data to be able to insert the record. I could use the geocoder library from before to get the latitude and longitude, but since I will be using Wikipedia anyway, let's see if I can grab it from there.

I did some experimenting with the [Wikipedia API Sandbox](https://en.wikipedia.org/wiki/Special:ApiSandbox#action=parse&format=json&page=Fort%20Wayne%2C%20Indiana&redirects=1&prop=wikitext), and oddly enough while there are multiple endpoints capable of getting the _names_ of the templates used in a page, I could not for the life of me find a way to get the _data inserted to_ the templates in an easy format such as JSON. So instead, I'm going to grab the `parsetree` and parse it with Python's XML libraries.

In [42]:
city_name = 'Fort Wayne'
state_name = 'IN'

# MediaWiki `parse` action: return the article's parse tree (XML) wrapped in JSON.
# "redirects" lets the short "<city>, <state abbreviation>" title resolve to the
# article it redirects to.
wikipedia_url = 'https://en.wikipedia.org/w/api.php'
params = {
    "action": "parse",
    "format": "json",
    "redirects": "1",
    "page": f"{city_name}, {state_name}",
    "prop": "parsetree"
}

# A timeout keeps a stalled connection from hanging the kernel, and raise_for_status
# turns an HTTP error into a clear exception instead of a confusing KeyError below.
wiki_reply = requests.get(wikipedia_url, params=params, headers=header, timeout=30)
wiki_reply.raise_for_status()
response = wiki_reply.json()['parse']['parsetree']['*']
# strip_text=True trims whitespace around text nodes, so element text can be
# compared exactly when the tree is queried.
response = xml.canonicalize(response, strip_text=True)

# Write XML data for local exploration (assumes the data/ directory already exists).
# Explicit UTF-8: article text contains non-ASCII characters, which would fail to
# encode on platforms whose default file encoding is not UTF-8.
with open('data/fort_wayne.xml', 'w', encoding='utf-8') as xml_file:
    xml_file.write(response)

Ah, going through the XML file, the map on the Wikipedia article is an SVG (i.e. an image, not something that contains computer-readable geographic data), so I will need to use a geocoder. 

I recall from the previous lab that when you grab data from Foursquare's API, it will geocode the 'near' parameter and return the latitude and longitude used.

I also want to include the total size of the city, so in order to enter data into the table, I need to grab data from Wikipedia _and_ Foursquare. Which is fine, because I need more data to explore possible features.

In [43]:
# Parse the canonicalized parsetree XML string into an ElementTree element for querying.
wiki_data = xml.fromstring(response)

In [34]:
foursquare_url = "https://api.foursquare.com/v2/venues/explore"

# 'near' is a free-text place name; the response's 'geocode' entry carries the
# coordinates Foursquare resolved it to.
params = {
    'client_id': secrets['4SQ_CLIENT_ID'],
    'client_secret': secrets['4SQ_CLIENT_SECRET'],
    'limit': '50',
    'v': v,
    'near': 'Fort Wayne, IN',
    'radius': 1000,
    'time': 'any', 
    'day': 'any',
    'sortByPopularity': '1'
}

# A timeout keeps a stalled connection from hanging the kernel; raise_for_status
# reports HTTP failures (bad credentials, quota, failed geocode) with their status
# code instead of as a KeyError on the missing payload later.
foursquare_reply = requests.get(foursquare_url, params=params, headers=header, timeout=30)
foursquare_reply.raise_for_status()
foursquare_response = foursquare_reply.json()['response']

In [80]:
def template_value(wiki_data, template_title, part_name):
    """
    Return the text of one named parameter of a template in a Wikipedia parsetree.

    Parameters
    ----------
    wiki_data : xml.etree.ElementTree.Element
        Parsed `parsetree` XML; its descendants are searched for `<template>` elements.
    template_title : str
        Exact text of the template's `<title>` child, e.g. "Infobox settlement".
    part_name : str
        Exact text of the `<name>` child of the wanted `<part>`.

    Returns
    -------
    str or None
        `.text` of the matching part's `<value>` element (None when the value
        element has no leading text).

    Raises
    ------
    LookupError
        If no template with that title exists, or it has no part with that name
        containing a `<value>`.
    """
    template = wiki_data.find(".//template[title='{}']".format(template_title))
    if template is None:
        raise LookupError("No template titled {!r} found".format(template_title))

    # Explicit child steps ("/") instead of relying on ElementPath's lenient
    # handling of "." between steps.
    value = template.find("./part[name='{}']/value".format(part_name))
    if value is None:
        raise LookupError(
            "Template {!r} has no part named {!r}".format(template_title, part_name)
        )
    return value.text

# Coordinates: the center of the geocode Foursquare returned for the 'near' query.
lat = float(foursquare_response['geocode']['center']['lat'])
lng = float(foursquare_response['geocode']['center']['lng'])
# Area and population: parameters of the article's "Infobox settlement" template.
# float()/int() assume the infobox values are plain numerals — TODO confirm this
# holds for other cities (thousands separators or nested templates would fail).
sq_mi = float(template_value(wiki_data, "Infobox settlement", "area_total_sq_mi"))
total_pop = int(template_value(wiki_data, "Infobox settlement", "population_est"))

41.1306

Alright, I've gotten the values I need initially for a city; now let's try inserting it.

In [84]:
# Insert the complete Fort Wayne row now that every NOT NULL column has a value.
# engine.begin() opens a connection and commits when the block exits cleanly (rolling
# back on error). Engine.execute() is avoided because it was removed in SQLAlchemy 2.x,
# which the unpinned `pip install --upgrade sqlalchemy` in the first cell may install.
# NOTE: city_name is the primary key, so re-running this cell without recreating the
# table raises an integrity error.
with engine.begin() as connection:
    connection.execute(new_city, [{
        'city_name': 'Fort Wayne', 
        'metro_name': 'Fort Wayne', 
        'state': 'IN', 
        'lat': lat,
        'lng': lng,
        'area_val': sq_mi,
        'total_pop': total_pop
    }])

2020-11-23 23:25:29,737 INFO sqlalchemy.engine.base.Engine INSERT INTO city (city_name, metro_name, state, lat, lng, area_val, total_pop) VALUES (%(city_name)s, %(metro_name)s, %(state)s, %(lat)s, %(lng)s, %(area_val)s, %(total_pop)s)
2020-11-23 23:25:29,738 INFO sqlalchemy.engine.base.Engine {'city_name': 'Fort Wayne', 'metro_name': 'Fort Wayne', 'state': 'IN', 'lat': 41.1306, 'lng': -85.12886, 'area_val': 110.79, 'total_pop': 270402}
2020-11-23 23:25:29,742 INFO sqlalchemy.engine.base.Engine COMMIT


<sqlalchemy.engine.result.ResultProxy at 0x7f69cc6e6f70>

In [89]:
# Read the whole `city` table back into a DataFrame to confirm the insert.
query = cities.select()

pd.read_sql(query, engine)

2020-11-23 23:39:42,785 INFO sqlalchemy.engine.base.OptionEngine SELECT city.city_name, city.metro_name, city.state, city.lat, city.lng, city.area_val, city.total_pop 
FROM city
2020-11-23 23:39:42,786 INFO sqlalchemy.engine.base.OptionEngine {}


Unnamed: 0,city_name,metro_name,state,lat,lng,area_val,total_pop
0,Fort Wayne,Fort Wayne,IN,41.1306,-85.12886,110.79,270402


Not bad. 

Next, I want to grab some data from Foursquare to build a feature based on what's popular within 1, 5, 25, and 100 km. I'll use the category hierarchy like I did in the week 3 lab. Given that the Foursquare API allows for 99,500 of these calls a day, and up to 5,000 per hour, I can also do this comfortably with each section defined in the `venues/explore` endpoint to see how much variety is in each section in an area.

In [37]:
# Fetch Foursquare's venue category tree (nested under each category's 'categories' key).
url = 'https://api.foursquare.com/v2/venues/categories'
params = {
    'client_id': secrets['4SQ_CLIENT_ID'],
    'client_secret': secrets['4SQ_CLIENT_SECRET'],
    'v': v
}
# Send the same User-Agent header as the other requests in this notebook, bound the
# wait with a timeout, and fail with the HTTP status on an error response.
categories_reply = requests.get(url, params=params, headers=header, timeout=30)
categories_reply.raise_for_status()
foursquare_categories = categories_reply.json()

def category_hier(categories, prefix=None):
    """
    Flatten a nested category tree into one row per category.

    Parameters
    ----------
    categories : iterable of dict
        Category nodes. Each must have 'id' and 'shortName'; child nodes, if any,
        are listed under 'categories'.
    prefix : list of str, optional
        shortNames of the ancestors of `categories`, outermost first. Leave unset
        for the top level; it is filled in by the recursive calls.

    Returns
    -------
    list of pandas.Series
        One Series per category in depth-first order (parent before its children).
        Each is named by the category id (as a string) and indexed by
        cat_level_1 .. cat_level_5, holding the ancestor shortNames followed by the
        category's own shortName, padded with NaN.

    Raises
    ------
    ValueError
        If a category is nested more than five levels deep.
    """
    level_names = [
        'cat_level_1',
        'cat_level_2',
        'cat_level_3',
        'cat_level_4',
        'cat_level_5'
    ]
    # None instead of a mutable [] default; copy so the caller's list is never shared.
    prefix = [] if prefix is None else list(prefix)
    result = []

    for category in categories:
        # Plain dict access: no need to build a DataFrame per node to read three keys.
        path = prefix + [category['shortName']]
        if len(path) > len(level_names):
            raise ValueError(
                "Category {!r} is nested {} levels deep; at most {} are supported".format(
                    category['id'], len(path), len(level_names)
                )
            )
        padding = [np.nan] * (len(level_names) - len(path))
        result.append(
            pd.Series(data=path + padding, name=str(category['id']), index=level_names)
        )
        # .get(): a leaf node may omit the 'categories' key entirely.
        if subcategories := category.get('categories'):
            result += category_hier(subcategories, path)

    return result

# Flatten the category tree into a DataFrame: one row per category, indexed by
# category id, with columns cat_level_1 .. cat_level_5.
categories = foursquare_categories['response']['categories']
category_df = pd.DataFrame(category_hier(categories))

In [159]:
# Search radii in meters (1, 5, 25 and 100 km) and the values sent as the
# venues/explore `section` parameter; every radius/section pair is queried once.
radii = [1000, 5000, 25000, 100000]
sections = ['food', 'drinks', 'coffee', 'shops', 'arts', 'outdoors', 'sights', 'trending', 'topPicks']

async def get_popular_spots(city):
    """
    Collect popular venues around `city` for every radius/section combination.

    Starts one `query_places` call per (radius, section) pair taken from the
    module-level `radii` and `sections` lists, awaits them together on a single
    shared HTTP session, and returns the results stacked into one DataFrame with
    a fresh integer index.
    """
    async with aiohttp.ClientSession() as http:
        pending = [
            query_places(http, city, distance, kind)
            for distance, kind in product(radii, sections)
        ]
        frames = await asyncio.gather(*pending)
    return pd.concat(frames, ignore_index=True)
    
    
async def query_places(session, location, radius, section):
    """
    Fetch one Foursquare `venues/explore` result and return it as a DataFrame.

    Requests up to 50 venues of type `section` within `radius` meters of `location`
    over the already-open HTTP `session`. The JSON payload is flattened by
    `normalize_foursquare_response`, which is run in the shared `executor` pool.

    Returns the venue DataFrame with `city`, `radius` and `section` columns added,
    or None when the normalizer found no venue groups in the response.
    """
    explore_params = {
        'client_id': secrets['4SQ_CLIENT_ID'],
        'client_secret': secrets['4SQ_CLIENT_SECRET'],
        'limit': '50',
        'v': v,
        'near': location,
        'radius': radius, 
        'section': section
    }
    async with session.get(
        "https://api.foursquare.com/v2/venues/explore", params=explore_params
    ) as reply:
        payload = await reply.json()

    frame = await asyncio.get_running_loop().run_in_executor(
        executor, normalize_foursquare_response, payload
    )
    if frame is None:
        return frame

    # Tag every row with the query that produced it.
    frame['city'] = location
    frame['radius'] = radius
    frame['section'] = section
    return frame
    
    
def normalize_foursquare_response(data):
    """
    Flatten a Foursquare `venues/explore` JSON payload into a DataFrame of venues.

    Each row is one entry of `response -> groups -> items`, with nested keys joined
    by `_`. The fields of `response -> geocode` are flattened as well, prefixed with
    `geo_`, and copied onto every row. Note that the geocode columns keep
    json_normalize's default `.` separator (e.g. `geo_geometry.bounds.sw.lng`),
    unlike the `_`-separated venue columns.

    Returns None when the payload's `response` has no `groups` key.
    """
    if 'groups' not in data['response']:
        return None
    venues = json_normalize(data, ['response', 'groups', 'items'], sep='_')
    geo = json_normalize(data['response']['geocode']).loc[0] # one-row frame -> Series of geocode fields
    geo.index = pd.Index(f'geo_{name}' for name in geo.index)
    # Add one column per geocode field, giving every venue row the same values.
    venues.loc[:, geo.index] = geo.values
    return venues


In [160]:
# Run all radius/section queries for Fort Wayne. Top-level `await` is an
# IPython/Jupyter feature; in a plain script this would need asyncio.run(...).
places_df = await get_popular_spots('Fort Wayne, IN')
places_df

Unnamed: 0,referralId,reasons_count,reasons_items,venue_id,venue_name,venue_location_address,venue_location_lat,venue_location_lng,venue_location_labeledLatLngs,venue_location_postalCode,...,geo_geometry.bounds.sw.lng,city,radius,section,venue_venuePage_id,flags_outsideRadius,venue_location_neighborhood,venue_events_count,venue_events_summary,venue_events_items
0,e-3-5081efe6e4b0d5064a98d8b8-0,0,"[{'summary': 'This spot is popular', 'type': '...",5081efe6e4b0d5064a98d8b8,Banh Mi Barista,5320 Coldwater Rd,41.127890,-85.135835,"[{'label': 'display', 'lat': 41.12789011966495...",46825,...,-85.303308,"Fort Wayne, IN",1000,food,,,,,,
1,e-3-4b2eb7bff964a52093e524e3-1,0,"[{'summary': 'This spot is popular', 'type': '...",4b2eb7bff964a52093e524e3,Jimmy John's,5412 Coldwater Rd,41.129067,-85.135295,"[{'label': 'display', 'lat': 41.12906702616283...",46825,...,-85.303308,"Fort Wayne, IN",1000,food,,,,,,
2,e-3-4b5f57bff964a5201db529e3-2,0,"[{'summary': 'This spot is popular', 'type': '...",4b5f57bff964a5201db529e3,Cork 'n Cleaver,221 Washington Ctr Rd,41.132858,-85.138249,"[{'label': 'display', 'lat': 41.132858, 'lng':...",46825,...,-85.303308,"Fort Wayne, IN",1000,food,,,,,,
3,e-3-4b5a3e80f964a5201ab728e3-3,0,"[{'summary': 'This spot is popular', 'type': '...",4b5a3e80f964a5201ab728e3,BakerStreet,4820 N Clinton St,41.122200,-85.125421,"[{'label': 'display', 'lat': 41.12219979566053...",46825,...,-85.303308,"Fort Wayne, IN",1000,food,,,,,,
4,e-3-4e3dd20bd22d102e8547c605-4,0,"[{'summary': 'This spot is popular', 'type': '...",4e3dd20bd22d102e8547c605,Koto Japanese Steakhouse & Sushi,301 E Washington Center Rd,41.133189,-85.137754,"[{'label': 'display', 'lat': 41.13318911813626...",46825,...,-85.303308,"Fort Wayne, IN",1000,food,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513,e-1-4c9020060b9e37040150655e-45,0,"[{'summary': 'This spot is popular', 'type': '...",4c9020060b9e37040150655e,Heres To You Pub And Grub,31-45 North St,41.922664,-84.632565,"[{'label': 'display', 'lat': 41.92266400000000...",49242,...,-85.303308,"Fort Wayne, IN",100000,topPicks,,,,,,
1514,e-1-4b5ba2faf964a520b70c29e3-46,0,"[{'summary': 'This spot is popular', 'type': '...",4b5ba2faf964a520b70c29e3,Ricky's Taqueria,57995 County Road 9,41.652302,-85.965392,"[{'label': 'display', 'lat': 41.652302, 'lng':...",46517,...,-85.303308,"Fort Wayne, IN",100000,topPicks,,,,,,
1515,e-1-4b59f626f964a520faa328e3-47,0,"[{'summary': 'This spot is popular', 'type': '...",4b59f626f964a520faa328e3,Ivanhoe's Drive In,979 S Main St,40.465849,-85.493654,"[{'label': 'display', 'lat': 40.46584893204182...",46989,...,-85.303308,"Fort Wayne, IN",100000,topPicks,,,,,,
1516,e-1-4b159b5ff964a5200ab123e3-48,0,"[{'summary': 'This spot is popular', 'type': '...",4b159b5ff964a5200ab123e3,Henry's,536 W Main St,41.079294,-85.147520,"[{'label': 'display', 'lat': 41.07929428873227...",46802,...,-85.303308,"Fort Wayne, IN",100000,topPicks,,,,,,


In [161]:
# The same venue can come back from several radius/section queries, so count
# distinct venue ids rather than rows.
print(f"{len(places_df.venue_id.unique())} unique venues")

727 unique venues


Cool, that will give me the ability to get an idea of what we can do on an evening or a weekend. 

Let's add in the venue category hierarchy.

In [153]:
# Only the first category of each venue is looked up below, so fail loudly if any
# venue carries zero or several categories.
assert all(places_df.venue_categories.apply(len) == 1)
# category_df is indexed by category id, so .loc with the venue's category id
# returns that category's cat_level_1..cat_level_5 row; apply(axis=1) stacks those
# rows into a frame aligned with places_df.
# The column must be addressed by the string key 'venue_categories'; the previous
# bare name `venue_category` was undefined and raised NameError.
places_df.apply(lambda row: category_df.loc[row['venue_categories'][0]['id']], axis=1)

In [143]:
# Is there any useful information in the reasons a venue was given?
# Keep only venues whose reasons_count is positive, i.e. that report at least one reason.

places_df[places_df.reasons_count > 0]

Unnamed: 0,referralId,reasons_count,reasons_items,venue_id,venue_name,venue_location_address,venue_location_lat,venue_location_lng,venue_location_labeledLatLngs,venue_location_postalCode,...,geo_geometry.bounds.sw.lng,city,radius,section,venue_venuePage_id,flags_outsideRadius,venue_location_neighborhood,venue_events_count,venue_events_summary,venue_events_items


In [None]:
columns = ['venue_id', 'venue_name', 'venue_location_lat', 'venue_location_lng']