# Clustering and Scoring Job Relocation Opportunities - Features and Models
Austin Rainwater

---

Now that I've obtained some data, I can start extracting some features and building some models.

In [2]:
!pip install --quiet --upgrade pymysql

import pandas as pd
import numpy as np

from sqlalchemy import (
    create_engine,
    MetaData,
    String,
    Integer,
    func as f,
    select
)
import yaml

# Read the database connection string from a local YAML file.
with open('secrets.yaml') as fh:
    secrets = yaml.safe_load(fh)

conn_str = secrets['db_connection_string']

# Build the engine, then reflect the existing schema into `meta` so tables
# can be looked up by name through `meta.tables[...]` in later cells.
engine = create_engine(conn_str)
meta = MetaData()
meta.reflect(bind=engine)

## Data

Before I do anything, I need to grab the data. For now, I'm going to pull from the database into DataFrames, and then pickle the DataFrames.

In [3]:
# Load `cities` from the local pickle cache when it exists; otherwise read the
# `cities` table from the database (indexed by city_name) and cache it.
try:
    cities = pd.read_pickle('data/cities.pkl')
except FileNotFoundError:
    # Only a missing cache file should trigger the database pull. A bare
    # `except:` would also swallow KeyboardInterrupt and genuine errors
    # (e.g. an unreadable pickle), hiding them behind a slow re-download.
    cities = pd.read_sql(
        meta.tables['cities'].select(),
        engine,
        index_col='city_name'
    )
    cities.to_pickle('data/cities.pkl')

cities

Unnamed: 0_level_0,metro_name,state_name,center_latitude,center_longitude,area_val,city_population,population_density
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"4S Ranch, California",San Diego,California,33.017500,-117.111667,,,
"Aberdeen Township, New Jersey","Freehold Borough, New Jersey",New Jersey,40.430068,-74.225075,7.77,19332.0,3343.0
"Abington Township, Montgomery County, Pennsylvania",Norristown,Pennsylvania,40.100000,-75.099722,15.52,55557.0,
"Aboite Township, Allen County, Indiana","Fort Wayne, Indiana",Indiana,41.052222,-85.285278,33.35,,1073.2
"Aboite, Indiana","Fort Wayne, Indiana",Indiana,41.000000,-85.318056,,,
...,...,...,...,...,...,...,...
"Zieglerville, Pennsylvania",Camden,Pennsylvania,40.276944,-75.480000,,,
"Zionhill, Pennsylvania",Media,Pennsylvania,40.484167,-75.393889,,,
"Zorn, Texas",Seguin,Texas,29.753056,-97.948056,,,
"Zuehl, Texas",Seguin,Texas,29.491667,-98.152778,,,


In [4]:
# Load `searches` from the local pickle cache when it exists; otherwise read
# the `city_foursquare_results` table from the database and cache it.
try:
    searches = pd.read_pickle('data/searches.pkl')
except FileNotFoundError:
    # Only a missing cache file should trigger the database pull. A bare
    # `except:` would also swallow KeyboardInterrupt and genuine errors
    # (e.g. an unreadable pickle), hiding them behind a slow re-download.
    searches = pd.read_sql(
        meta.tables['city_foursquare_results'].select(),
        engine
    )
    searches.to_pickle('data/searches.pkl')

searches

Unnamed: 0,venue_id,search_city,search_radius,query_type,query,search_popularity
0,4b66def6f964a5202f2e2be3,"Lyndhurst, New Jersey",1000,section,food,0
1,4b9a9f22f964a5201dc735e3,"Lyndhurst, New Jersey",1000,section,food,1
2,4b44d7e7f964a520e6fd25e3,"Lyndhurst, New Jersey",1000,section,food,2
3,4bbe1522c6a2ef3b8a43ddbd,"Lyndhurst, New Jersey",1000,section,food,3
4,4bdb4c9a63c5c9b623432768,"Lyndhurst, New Jersey",1000,section,food,4
...,...,...,...,...,...,...
4513629,4bae5435f964a52068a33be3,"York Center, Ohio",100000,query,ice cream shop,9
4513630,4c372dda04cbb7131a42ee0d,"York Center, Ohio",100000,query,ice cream shop,10
4513631,4b91a65af964a520d6cc33e3,"York Center, Ohio",100000,query,ice cream shop,11
4513632,4dd6e9f652b1a5c6443fd34f,"York Center, Ohio",100000,query,ice cream shop,12


In [5]:
# Load `venues` from the local pickle cache when it exists; otherwise read the
# `foursquare_venues` table from the database (indexed by venue_id) and cache it.
try:
    venues = pd.read_pickle('data/venues.pkl')
except FileNotFoundError:
    # Only a missing cache file should trigger the database pull. A bare
    # `except:` would also swallow KeyboardInterrupt and genuine errors
    # (e.g. an unreadable pickle), hiding them behind a slow re-download.
    venues = pd.read_sql(
        meta.tables['foursquare_venues'].select(),
        engine,
        index_col=['venue_id']
    )
    venues.to_pickle('data/venues.pkl')

venues

Unnamed: 0_level_0,venue_name,venue_location_lat,venue_location_lng,venue_location_crossStreet,venue_delivery_id,venue_location_city,venue_location_state,cat_level_1,cat_level_2,cat_level_3,cat_level_4
venue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3fd66200f964a52000e71ee3,Fat Cat,40.733665,-74.002950,btwn 7th Ave S & Bleecker St,,New York,NY,Arts & Entertainment,Music Venue,Jazz Club,
3fd66200f964a52000f11ee3,Melody Lanes,40.652726,-74.002993,at 5th Ave.,,Brooklyn,NY,Arts & Entertainment,Bowling Alley,,
3fd66200f964a52002ef1ee3,The Short Stop,34.075293,-118.253701,at Sutherland St,,Los Angeles,CA,Nightlife,Bar,,
3fd66200f964a52007f11ee3,Poquito Mas,34.159801,-118.331202,at Naomi St.,,Burbank,CA,Food,Mexican,,
3fd66200f964a52008e81ee3,Serendipity 3,40.761758,-73.965054,btwn 2nd & 3rd Ave,,New York,NY,Food,Desserts,,
...,...,...,...,...,...,...,...,...,...,...,...
600335afb50b926ccc98e77d,Cultura Coffee Shop,29.326754,-98.551674,,,San Antonio,TX,Food,Coffee Shop,,
600343989a58fc0b07106204,Redline Athletics,40.233030,-75.225066,,,Montgomeryville,PA,Outdoors & Recreation,Athletics & Sports,,
6004a774cf4c6c1531cabe42,Haddonfield Zoo Sculpture Park,39.900356,-75.027940,,,Haddonfield,NJ,Outdoors & Recreation,Park,,
6004b020c6f7954a4788243f,Garnet Stream,39.919204,-75.361948,,,Springfield,PA,Outdoors & Recreation,Park,,


## Control

I have had lots of ideas for features using the data I've grabbed, and I'm not quite sure which ones would be the most effective, or what might be too much. 

So before I start developing features, I want to use the one-shot features I used in the TorontoSegementation notebook to cluster and score the data. Then, I can re-run the model with new features to see how the scores have improved (or degraded).

In [6]:
# Search radius used for the control feature set.
# NOTE(review): units are presumably metres — confirm against the search API.
CONTROL_SEARCH_RADIUS = 25000

# Keep only the control-radius searches *before* joining, so the venue join
# runs on the rows that are actually kept instead of every search result.
# The filter only reads a `searches` column, so the resulting rows, their
# order and their index are the same as filtering after the join.
search_city_venues = (
    searches[searches.search_radius == CONTROL_SEARCH_RADIUS]
    .join(venues, on='venue_id', how='inner')
    # A venue counts as offering delivery when it has a delivery id.
    .assign(has_delivery=lambda joined: joined.venue_delivery_id.notna())
)
print(search_city_venues.shape)

# Fixed seed so the example row is the same on every run.
search_city_venues.sample(1, random_state=0).T

(1682136, 18)


Unnamed: 0,1503510
venue_id,4bbf3966006dc9b62b31fc3f
search_city,"McCook, Illinois"
search_radius,25000
query_type,query
query,hiking trail
search_popularity,31
venue_name,Mile Marker Zero - Great Western Trail
venue_location_lat,41.8896
venue_location_lng,-87.9759
venue_location_crossStreet,


In [None]:
# The four venue category columns, from broadest to most specific.
categories = [f'cat_level_{level}' for level in (1, 2, 3, 4)]
level_labels = ['Category', 'Subcategory', 'Type 1', 'Type 2']
group_keys = ['search_city', 'query']

# One-hot encode every category level as boolean indicator columns named
# "<label>: <value>", then average the indicators within each (city, query)
# pair: each cell is the fraction of that pair's venues with that category value.
byquery = (
    pd.get_dummies(
        search_city_venues[group_keys + categories],
        columns=categories,
        prefix=level_labels,
        prefix_sep=': ',
        dtype=bool,
    )
    .groupby(group_keys)
    .mean()
)

In [14]:
# Show the per-(search_city, query) category fractions held in `byquery`.
byquery

Unnamed: 0_level_0,Unnamed: 1_level_0,Category: Arts & Entertainment,Category: College & Education,Category: Food,Category: Nightlife,Category: Outdoors & Recreation,Category: Professional,Category: Residence,Category: Shops,Category: Travel,Subcategory: ATM,...,Type 2: Ramen,Type 2: Shabu-Shabu,Type 2: Shanghai,Type 2: Sushi,Type 2: Szechuan,Type 2: Taiwanese,Type 2: Track,Type 2: Udon,Type 2: Weight Loss Center,Type 2: Yoga Studio
search_city,query,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"Aberdeen Township, New Jersey",arcade,0.840000,0.0,0.04,0.000,0.060000,0.000,0.0,0.060,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Aberdeen Township, New Jersey",arts,1.000000,0.0,0.00,0.000,0.000000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Aberdeen Township, New Jersey",bbq,0.000000,0.0,0.92,0.060,0.000000,0.000,0.0,0.020,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Aberdeen Township, New Jersey",british food,0.000000,0.0,0.45,0.025,0.150000,0.025,0.0,0.225,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Aberdeen Township, New Jersey",coffee,0.000000,0.0,0.88,0.000,0.000000,0.000,0.0,0.120,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Zionhill, Pennsylvania",hiking trail,0.000000,0.0,0.00,0.000,1.000000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Zionhill, Pennsylvania",ice cream shop,0.000000,0.0,1.00,0.000,0.000000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Zionhill, Pennsylvania",park,0.111111,0.0,0.00,0.000,0.888889,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Zionhill, Pennsylvania",sights,0.000000,0.0,1.00,0.000,0.000000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# NOTE(review): `cat_dummies` is not defined in any cell visible in this
# notebook, and the recorded run of this cell ended in a KeyboardInterrupt.
# It looks like an earlier attempt at what `byquery` computes — confirm
# whether this cell is still needed.
#
# Index-align the search rows with their category dummies, then average the
# dummies per (search_city, query). `venue_id` is deliberately not selected:
# it is a string identifier, and averaging it raises TypeError on pandas >= 2.0
# (older versions silently dropped it), so the result keeps the same columns.
by_query = pd.merge(
    search_city_venues[['search_city', 'query']],
    cat_dummies,
    left_index=True,
    right_index=True
).groupby(
    ['search_city', 'query']
).mean()

KeyboardInterrupt: 

In [None]:
# NOTE(review): `features` is not assigned in any cell visible in this notebook,
# so this raises NameError on a fresh kernel unless it is defined elsewhere — TODO confirm.
features