In [3]:
import config
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import folium
import urllib
import regex as re
from sklearn.cluster import KMeans

## 1 Data Collection

### 1.1 Geographic Data Collection

In [122]:
# Start from a header-only CSV: every city cell below loads this file,
# adds its own rows and writes it back.
neigh_columns = [
    'Neighborhood',
    'City',
    'State',
    'Neighborhood Latitude',
    'Neighborhood Longitude',
]
neigh_df = pd.DataFrame(columns=neigh_columns)
neigh_df.to_csv('neigh_df.csv', index=False)

#### 1.1.1 Acquire Richmond, VA neighborhood data.

In [124]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Richmond.
neigh_df = neigh_df[neigh_df['City'] != 'Richmond']

# Scrape Wikipedia to generate a list of Richmond (RVA) neighborhood names.
rva_page = requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Richmond,_Virginia', timeout=30).text
rva_soup = BeautifulSoup(rva_page, 'html.parser')
neigh_search = rva_soup.find_all('li')
# NOTE(review): the 7:112 slice is tied to the page layout at scrape time; re-check it if the article changes.
rva_neigh = [item.text for item in neigh_search[7:112]]

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
rva_rows = []
for name in rva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Richmond, VA"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    rva_rows.append({'Neighborhood':name,'City':'Richmond','State':'VA','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if rva_rows:
    rva_rows_df = pd.DataFrame(rva_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame (the first city to be added).
    neigh_df = rva_rows_df if neigh_df.empty else pd.concat([neigh_df, rva_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

#### 1.1.2 Acquire Raleigh, NC neighborhood data

In [125]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Raleigh.
neigh_df = neigh_df[neigh_df['City'] != 'Raleigh']

# Scrape Wikipedia to generate a list of Raleigh neighborhood names.
rgh_page = requests.get('https://en.wikipedia.org/wiki/Raleigh,_North_Carolina_neighborhoods', timeout=30).text
rgh_soup = BeautifulSoup(rgh_page, 'html.parser')
neigh_search = rgh_soup.find_all('li')
# NOTE(review): the 4:109 slice is tied to the page layout at scrape time; re-check it if the article changes.
rgh_neigh = [item.text for item in neigh_search[4:109]]

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
rgh_rows = []
for name in rgh_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Raleigh, NC"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    rgh_rows.append({'Neighborhood':name,'City':'Raleigh','State':'NC','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if rgh_rows:
    rgh_rows_df = pd.DataFrame(rgh_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = rgh_rows_df if neigh_df.empty else pd.concat([neigh_df, rgh_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

No lat long data found for  Westlake


#### 1.1.3 Acquire Norfolk, VA neighborhood data

In [126]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Norfolk.
neigh_df = neigh_df[neigh_df['City'] != 'Norfolk']

# Scrape Wikipedia to generate a list of Norfolk neighborhood names.
norva_page = requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Norfolk,_Virginia', timeout=30).text
norva_soup = BeautifulSoup(norva_page, 'html.parser')
neigh_search = norva_soup.find_all('li')
# NOTE(review): the 0:49 slice and the two index-based overrides below are tied to
# the page layout at scrape time; re-check them if the article changes.
norva_neigh = [item.text for item in neigh_search[0:49]]
norva_neigh[19]='Huntersville'
norva_neigh[3]='Bowling Green'

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
norva_rows = []
for name in norva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Norfolk, VA"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    norva_rows.append({'Neighborhood':name,'City':'Norfolk','State':'VA','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if norva_rows:
    norva_rows_df = pd.DataFrame(norva_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = norva_rows_df if neigh_df.empty else pd.concat([neigh_df, norva_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

#### 1.1.4 Acquire Virginia Beach, VA neighborhood data

In [127]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Virginia Beach.
neigh_df = neigh_df[neigh_df['City'] != 'Virginia Beach']

# Hand-maintained list of Virginia Beach neighborhoods (no page is scraped for this city).
vab_neigh=['Alanton', 'Aragona Village', 'Bay Colony', 'Bayside', 'Cape Henry', 'Chesapeake Beach',
               'Croatan Beach', 'Great Neck Point', 'Green Run', 'Kempsville', 'Lago Mar', 'Larkspur',
               'London Bridge', 'Lynnhaven', 'Newtown', 'The North End', 'Oceana', 'Ocean Park', 'Pembroke Manor',
               'Princess Anne', 'Pungo', 'Red Mill Commons', 'Sandbridge', 'Thalia', 'Thoroughgood']

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
vab_rows = []
for name in vab_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Virginia Beach, VA"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    vab_rows.append({'Neighborhood':name,'City':'Virginia Beach','State':'VA','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if vab_rows:
    vab_rows_df = pd.DataFrame(vab_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = vab_rows_df if neigh_df.empty else pd.concat([neigh_df, vab_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

#### 1.1.5 Acquire Washington, DC neighborhood data

In [129]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Washington.
neigh_df = neigh_df[neigh_df['City'] != 'Washington']

# Scrape Wikipedia to generate a list of Washington, DC neighborhood names.
wdc_page = requests.get('https://en.wikipedia.org/wiki/Neighborhoods_in_Washington,_D.C.', timeout=30).text
wdc_soup = BeautifulSoup(wdc_page, 'html.parser')
wdc_neigh=[]
neigh_search = wdc_soup.find_all('li')
# NOTE(review): the 11:153 slice is tied to the page layout at scrape time; re-check it if the article changes.
for x in neigh_search[11:153]:
    # Strip parenthesised notes from the list entry, then keep first occurrences only.
    name= re.sub(r"(| )\([^()]*\)", "", x.text)
    if name not in wdc_neigh:
        wdc_neigh.append(name)

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
wdc_rows = []
for name in wdc_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Washington, DC"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    wdc_rows.append({'Neighborhood':name,'City':'Washington','State':'DC','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if wdc_rows:
    wdc_rows_df = pd.DataFrame(wdc_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = wdc_rows_df if neigh_df.empty else pd.concat([neigh_df, wdc_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

#### 1.1.6 Acquire Baltimore, MD neighborhood data

In [131]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Baltimore.
neigh_df = neigh_df[neigh_df['City'] != 'Baltimore']

# Scrape Wikipedia to generate a list of Baltimore neighborhood names.
bmd_page = requests.get('https://en.wikipedia.org/wiki/List_of_Baltimore_neighborhoods', timeout=30).text
bmd_soup = BeautifulSoup(bmd_page, 'html.parser')
bmd_neigh=[]
neigh_search = bmd_soup.find_all('li')
# NOTE(review): the 14:312 slice and the index-based override below are tied to
# the page layout at scrape time; re-check them if the article changes.
for x in neigh_search[14:312]:
    # Cut the entry at the first "(" or "[" note, then keep first occurrences only.
    name= re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", x.text)
    if name not in bmd_neigh:
        bmd_neigh.append(name)
bmd_neigh[261]='West Federal Hill'

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
bmd_rows = []
for name in bmd_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Baltimore, MD"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    bmd_rows.append({'Neighborhood':name,'City':'Baltimore','State':'MD','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if bmd_rows:
    bmd_rows_df = pd.DataFrame(bmd_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = bmd_rows_df if neigh_df.empty else pd.concat([neigh_df, bmd_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

No lat long data found for  West Hill Square


#### 1.1.7 Acquire Charlottesville, VA neighborhood data

In [132]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Charlottesville.
neigh_df = neigh_df[neigh_df['City'] != 'Charlottesville']

# Scrape Cvillepedia (not Wikipedia) to generate a list of Charlottesville neighborhood names.
cva_page = requests.get('https://www.cvillepedia.org/List_of_Charlottesville_neighborhoods', timeout=30).text
cva_soup = BeautifulSoup(cva_page, 'html.parser')
cva_neigh=[]
neigh_search = cva_soup.find_all('li')
# NOTE(review): the 5:23 slice is tied to the page layout at scrape time; re-check it if the page changes.
for x in neigh_search[5:23]:
    # Cut the entry at the first "(" or "[" note, then keep first occurrences only.
    name= re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", x.text)
    if name not in cva_neigh:
        cva_neigh.append(name)

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
cva_rows = []
for name in cva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Charlottesville, VA"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    cva_rows.append({'Neighborhood':name,'City':'Charlottesville','State':'VA','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if cva_rows:
    cva_rows_df = pd.DataFrame(cva_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = cva_rows_df if neigh_df.empty else pd.concat([neigh_df, cva_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

#### 1.1.8 Acquire Charlotte, NC neighborhood data

In [133]:
neigh_df = pd.read_csv('neigh_df.csv')
# Drop rows left by an earlier run of this cell so re-running it does not duplicate Charlotte.
neigh_df = neigh_df[neigh_df['City'] != 'Charlotte']

# Scrape the Wikipedia category page to generate a list of Charlotte neighborhood names.
cnc_page = requests.get('https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Charlotte,_North_Carolina', timeout=30).text
cnc_soup = BeautifulSoup(cnc_page, 'html.parser')
cnc_neigh=[]
neigh_search = cnc_soup.find_all('li')
# NOTE(review): the 1:34 slice is tied to the page layout at scrape time; re-check it if the page changes.
for x in neigh_search[1:34]:
    # Cut the entry at the first "(" or "[" note, then keep first occurrences only.
    name= re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", x.text)
    if name not in cnc_neigh:
        cnc_neigh.append(name)

# Geocode each neighborhood. Rows are collected in a list and added with a single
# concat because DataFrame.append was removed in pandas 2.0.
cnc_rows = []
for name in cnc_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Charlotte, NC"),config.google_api_key)
    results = requests.get(url, timeout=30).json()
    try:
        location = results['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError):
        # Empty or malformed geocoder response: report it and move on.
        print('No lat long data found for ',name)
        continue
    cnc_rows.append({'Neighborhood':name,'City':'Charlotte','State':'NC','Neighborhood Latitude':lat,'Neighborhood Longitude':lng})

if cnc_rows:
    cnc_rows_df = pd.DataFrame(cnc_rows, columns=neigh_df.columns)
    # Avoid concatenating onto an empty frame.
    neigh_df = cnc_rows_df if neigh_df.empty else pd.concat([neigh_df, cnc_rows_df], ignore_index=True)
neigh_df.to_csv('neigh_df.csv',index=False)

In [225]:
# Size check: (rows, columns) of the combined neighborhood table after all city cells have run.
neigh_df.shape

(747, 5)

### 1.2 Check our geographic data on a map
We will also visually determine an appropriate radius to represent the neighborhood area for each city.

In [135]:
# Reload the saved neighborhood table; map_neigh below reads this global frame.
neigh_df=pd.read_csv('neigh_df.csv')

In [144]:
def map_neigh(city,zoom, radius):
    """Build a folium map with one circle per neighborhood of ``city``.

    Rows are taken from the global ``neigh_df`` where ``City`` equals ``city``.
    The map is centred on the mean latitude/longitude of those rows.

    Parameters
    ----------
    city : value compared against ``neigh_df.City``
    zoom : passed to ``folium.Map`` as ``zoom_start``
    radius : passed to every ``folium.Circle`` as ``radius``

    Returns
    -------
    The ``folium.Map`` (attached to a 650x450 ``folium.Figure``).
    """
    in_city = neigh_df.City==city
    latitudes = neigh_df.loc[in_city,'Neighborhood Latitude']
    longitudes = neigh_df.loc[in_city,'Neighborhood Longitude']
    names = neigh_df.loc[in_city,'Neighborhood']

    figure = folium.Figure(width=650, height=450)
    city_map = folium.Map(location=[latitudes.mean(),longitudes.mean()], zoom_start=zoom)
    city_map.add_to(figure)

    # One circle per neighborhood; the popup shows the neighborhood name.
    for neigh_lat, neigh_lng, neigh_name in zip(latitudes, longitudes, names):
        popup = folium.Popup('{}'.format(neigh_name), parse_html=True)
        circle = folium.Circle(
            [neigh_lat,neigh_lng],
            radius=radius,
            popup=popup,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False)
        circle.add_to(city_map)

    return city_map

#### 1.2.1 Map of Baltimore, MD Neighborhoods

In [145]:
# Baltimore neighborhoods drawn as circles of radius 250 at zoom level 11.
map_neigh('Baltimore',zoom=11,radius=250)

#### 1.2.2 Map of Charlotte, NC neighborhoods

In [146]:
# Charlotte neighborhoods drawn as circles of radius 700 at zoom level 10.
map_neigh('Charlotte',zoom=10,radius=700)

#### 1.2.3 Map of Charlottesville, VA neighborhoods

In [147]:
# Charlottesville neighborhoods drawn as circles of radius 400 at zoom level 12.
map_neigh('Charlottesville',zoom=12,radius=400)

#### 1.2.4 Map of Norfolk, VA neighborhoods

In [148]:
# Norfolk neighborhoods drawn as circles of radius 400 at zoom level 11.
map_neigh('Norfolk',zoom=11,radius=400)

#### 1.2.5 Map of Raleigh, NC neighborhoods

In [149]:
# Raleigh neighborhoods drawn as circles of radius 400 at zoom level 11.
map_neigh('Raleigh',zoom=11,radius=400)

#### 1.2.6 Map of Richmond, VA neighborhoods

In [150]:
# Richmond neighborhoods drawn as circles of radius 400 at zoom level 11.
map_neigh('Richmond',zoom=11,radius=400)

#### 1.2.7 Map of Virginia Beach, VA neighborhoods

In [151]:
# Virginia Beach neighborhoods drawn as circles of radius 800 at zoom level 10.
map_neigh('Virginia Beach',zoom=10,radius=800)

#### 1.2.8 Map of Washington, DC neighborhoods

In [152]:
# Washington neighborhoods drawn as circles of radius 250 at zoom level 11.
map_neigh('Washington',zoom=11,radius=250)

### 1.3 Acquiring Retail Venue Data from Foursquare

We define a function that retrieves nearby retail venue data from the Foursquare API.

In [153]:
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

def getNearbyVenues(city_df, radius):
    """Query the Foursquare ``venues/explore`` endpoint around every neighborhood.

    Parameters
    ----------
    city_df : pandas.DataFrame
        One row per neighborhood. Must contain ``'Neighborhood'`` plus the
        coordinates, named either ``'Latitude'``/``'Longitude'`` or
        ``'Neighborhood Latitude'``/``'Neighborhood Longitude'`` (the names
        used by ``neigh_df.csv``).
    radius : number
        Sent to the API as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude``, ``Venue Category``. The frame
        is empty (but still has these columns) when no venue is returned.
        ``Venue Category`` is ``None`` for venues that carry no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    # Accept both coordinate naming schemes; the short names keep priority so
    # existing callers behave exactly as before.
    lat_col = 'Latitude' if 'Latitude' in city_df.columns else 'Neighborhood Latitude'
    lng_col = 'Longitude' if 'Longitude' in city_df.columns else 'Neighborhood Longitude'

    venue_rows = []
    for neigh, lat, lng in zip(city_df['Neighborhood'], city_df[lat_col], city_df[lng_col]):
        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': config.foursquare_id,
                'client_secret': config.foursquare_secret,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # Keep only the relevant fields of each nearby venue.
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                neigh,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues have no category; record None instead of failing.
                categories[0]['name'] if categories else None))

    # Passing the columns up front keeps the schema stable for an empty result.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

#### 1.3.1 Baltimore, MD retail venue data

In [8]:
# Fetch Foursquare venues around each Baltimore neighborhood (radius=250, the value used for the Baltimore map).
# NOTE(review): bmd_neigh_df is not defined in any visible cell — confirm where it is created.
bmd_venues_df=getNearbyVenues(bmd_neigh_df,radius=250)
# Save with the .csv extension: the data-preparation cell reloads 'bmd_venues_df.csv'.
bmd_venues_df.to_csv('bmd_venues_df.csv',index=False)
bmd_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ashburton,39.326771,-76.674478,Cantys Helping Hands Homecare Services LLC,39.327780,-76.673010,Home Service
1,Central Park Heights,39.344276,-76.671591,Yummy's Deli & Grocery,39.345295,-76.671749,Deli / Bodega
2,Dolfield,39.335541,-76.674972,ALDI,39.335328,-76.676008,Grocery Store
3,Dolfield,39.335541,-76.674972,BP,39.336090,-76.675866,Gas Station
4,Dolfield,39.335541,-76.674972,Legends Pizza And Wings,39.336214,-76.674231,Pizza Place
...,...,...,...,...,...,...,...
1261,Washington Hill,39.290200,-76.595778,Neima halal grocery,39.288419,-76.594037,Grocery Store
1262,Washington Hill,39.290200,-76.595778,Pearl Lounge,39.288544,-76.594049,Hookah Bar
1263,Washington Hill,39.290200,-76.595778,City Springs Pool,39.290512,-76.595752,Pool
1264,Washington Hill,39.290200,-76.595778,Gallery 1448,39.291515,-76.597484,Art Gallery


#### 1.3.2 Charlotte, NC retail venue data

In [9]:
# Fetch Foursquare venues around each Charlotte neighborhood (radius=700, the value used for the Charlotte map) and save them as CSV.
# NOTE(review): cnc_neigh_df is not defined in any visible cell — confirm where it is created.
cnc_venues_df=getNearbyVenues(cnc_neigh_df,radius=700)
cnc_venues_df.to_csv('cnc_venues_df.csv',index=False)
cnc_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ballantyne,35.046908,-80.845470,City Barbeque,35.043560,-80.847773,American Restaurant
1,Ballantyne,35.046908,-80.845470,Publix at Ballantyne Town Center,35.042551,-80.849596,Grocery Store
2,Ballantyne,35.046908,-80.845470,Walgreens,35.042595,-80.847734,Pharmacy
3,Ballantyne,35.046908,-80.845470,Bad Daddy's Burger Bar,35.052694,-80.847530,Burger Joint
4,Ballantyne,35.046908,-80.845470,Starbucks,35.042713,-80.849731,Coffee Shop
...,...,...,...,...,...,...,...
676,SouthPark,35.148448,-80.830896,SouthPark Food Court,35.153076,-80.832993,Food Court
677,Starmount,35.141693,-80.868220,Park South Station Pool and Fitness Club,35.145354,-80.864310,Club House
678,Starmount,35.141693,-80.868220,Big Daddy's Pork Palace,35.145195,-80.867126,Gastropub
679,Starmount,35.141693,-80.868220,On The Go Mart,35.143072,-80.875201,Gas Station


#### 1.3.3 Charlottesville, VA retail venue data

In [10]:
# Fetch Foursquare venues around each Charlottesville neighborhood (radius=400, the value used for the Charlottesville map) and save them as CSV.
# NOTE(review): cva_neigh_df is not defined in any visible cell — confirm where it is created.
cva_venues_df=getNearbyVenues(cva_neigh_df,radius=400)
cva_venues_df.to_csv('cva_venues_df.csv',index=False)
cva_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,10th & Page,38.035116,-78.49340,Peloton Station,38.033243,-78.494078,Sports Bar
1,10th & Page,38.035116,-78.49340,Sugar Shack Donuts & Coffee,38.032918,-78.494927,Donut Shop
2,10th & Page,38.035116,-78.49340,Red Hub Food Co.,38.033959,-78.493684,BBQ Joint
3,10th & Page,38.035116,-78.49340,Hardywood Pilot Brewery & Taproom,38.032496,-78.495093,Brewery
4,10th & Page,38.035116,-78.49340,"The Draftsman, Autograph Collection",38.032613,-78.496445,Hotel
...,...,...,...,...,...,...,...
108,Woolen Mills,38.020366,-78.45619,WillowTree,38.019429,-78.456396,IT Services
109,Woolen Mills,38.020366,-78.45619,Decipher Brewing,38.022002,-78.459889,Brewery
110,Woolen Mills,38.020366,-78.45619,Rivanna River - Woolen Mills Dam,38.020450,-78.454189,Dam
111,Woolen Mills,38.020366,-78.45619,Selvedge Brewing,38.019648,-78.455780,Brewery


#### 1.3.4 Norfolk, VA retail venue data

In [11]:
# Fetch Foursquare venues around each Norfolk neighborhood (radius=400, the value used for the Norfolk map) and save them as CSV.
# NOTE(review): norva_neigh_df is not defined in any visible cell — confirm where it is created.
norva_venues_df=getNearbyVenues(norva_neigh_df,radius=400)
norva_venues_df.to_csv('norva_venues_df.csv',index=False)
norva_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ballentine,36.859184,-76.249505,Feather-N-Fin Chicken & Seafood,36.860247,-76.249081,American Restaurant
1,Ballentine,36.859184,-76.249505,The Salvation Army Ray And Joan Kroc Corps Com...,36.858655,-76.250093,Community Center
2,Ballentine,36.859184,-76.249505,Spartan Market,36.860096,-76.250223,Convenience Store
3,Barraud Park,36.863149,-76.266739,Barraud Park,36.866360,-76.264877,Park
4,Berkley,36.831815,-76.283829,Family Dollar,36.831430,-76.282143,Discount Store
...,...,...,...,...,...,...,...
561,Young Terrace,36.856521,-76.283392,Enterprise Rent-A-Car,36.855722,-76.287148,Rental Car Location
562,Young Terrace,36.856521,-76.283392,Wyndham Garden Hotel,36.855269,-76.286246,Hotel
563,Young Terrace,36.856521,-76.283392,7-11 Monticello Ave,36.854585,-76.285464,Convenience Store
564,Young Terrace,36.856521,-76.283392,Black Tuna,36.855163,-76.286205,American Restaurant


#### 1.3.5 Raleigh, NC retail venue data

In [12]:
# Fetch Foursquare venues around each Raleigh neighborhood (radius=400, the value used for the Raleigh map) and save them as CSV.
# NOTE(review): rgh_neigh_df is not defined in any visible cell — confirm where it is created.
rgh_venues_df=getNearbyVenues(rgh_neigh_df,radius=400)
rgh_venues_df.to_csv('rgh_venues_df.csv',index=False)
rgh_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Avent West,35.765403,-78.703778,The Pool @ The Summit,35.765215,-78.702415,Pool
1,Avent West,35.765403,-78.703778,Tabacco & Gifts @ Avent Ferry Shopping Center,35.767270,-78.702763,Smoke Shop
2,Avent West,35.765403,-78.703778,Wilderness,35.763297,-78.702686,Clothing Store
3,Avent West,35.765403,-78.703778,Clarion Crossing Pool and Clubhouse,35.766051,-78.700481,Pool
4,Avent West,35.765403,-78.703778,Lake Johnson Waterfall,35.762103,-78.704353,Lake
...,...,...,...,...,...,...,...
995,Wilder's Grove,35.798804,-78.564528,Carolina Beauty,35.801453,-78.565484,Cosmetics Shop
996,Wilder's Grove,35.798804,-78.564528,New Bern Commons,35.798755,-78.560841,Plaza
997,Wilder's Grove,35.798804,-78.564528,New Bern & Corporation,35.798994,-78.568592,Intersection
998,Wilder's Grove,35.798804,-78.564528,Cato,35.798994,-78.568683,Women's Store


#### 1.3.6 Richmond, VA retail venue data

In [13]:
# Fetch Foursquare venues around each Richmond neighborhood (radius=400, the value used for the Richmond map) and save them as CSV.
# NOTE(review): rva_neigh_df is not defined in any visible cell — confirm where it is created.
rva_venues_df=getNearbyVenues(rva_neigh_df,radius=400)
rva_venues_df.to_csv('rva_venues_df.csv',index=False)
rva_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Arts District,37.543453,-77.438963,Pop's Market on Grace,37.542080,-77.438512,Café
1,Arts District,37.543453,-77.438963,Secret Sandwich Society,37.541787,-77.438228,Sandwich Place
2,Arts District,37.543453,-77.438963,Perly's,37.543848,-77.441436,Deli / Bodega
3,Arts District,37.543453,-77.438963,Rappahannock Restaurant,37.542810,-77.439207,Seafood Restaurant
4,Arts District,37.543453,-77.438963,Salt & Forge,37.545206,-77.440183,Sandwich Place
...,...,...,...,...,...,...,...
719,Willow Lawn,37.581870,-77.497587,Domino's Pizza,37.580737,-77.498975,Pizza Place
720,Willow Lawn,37.581870,-77.497587,Rack Room Shoes,37.582333,-77.497385,Shoe Store
721,Willow Lawn,37.581870,-77.497587,Sally Beauty,37.580840,-77.499068,Cosmetics Shop
722,Willow Lawn,37.581870,-77.497587,GNC,37.582648,-77.497115,Supplement Shop


#### 1.3.7 Virginia Beach, VA retail venue data

In [14]:
# Fetch Foursquare venues around each Virginia Beach neighborhood (radius=800, the value used for the Virginia Beach map) and save them as CSV.
# NOTE(review): vab_neigh_df is not defined in any visible cell — confirm where it is created.
vab_venues_df=getNearbyVenues(vab_neigh_df,radius=800)
vab_venues_df.to_csv('vab_venues_df.csv',index=False)
vab_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Alanton,36.880667,-76.024684,Mahanes C M Dr,36.877965,-76.022167,Food
1,Alanton,36.880667,-76.024684,Twisted Sisters Cupcakes Mobile,36.879453,-76.030936,Cupcake Shop
2,Alanton,36.880667,-76.024684,Alanton / Baycliff Pool,36.884724,-76.029839,Pool
3,Alanton,36.880667,-76.024684,Life is Goode,36.877602,-76.018264,Sports Bar
4,Alanton,36.880667,-76.024684,Field #2 at Cape Henry Collegiate,36.875805,-76.029456,Soccer Field
...,...,...,...,...,...,...,...
450,Thoroughgood,36.891814,-76.129102,Bayside Recreation Center,36.898481,-76.126418,Gym
451,Thoroughgood,36.891814,-76.129102,Tinkham's Rat Palace,36.892791,-76.128912,Asian Restaurant
452,Thoroughgood,36.891814,-76.129102,Thoroughgood Neighborhood,36.886225,-76.126302,Housing Development
453,Thoroughgood,36.891814,-76.129102,Church Point Longboard Loop,36.897946,-76.125478,Skate Park


#### 1.3.8 Washington, DC retail venue data

In [15]:
# Fetch Foursquare venues around each Washington neighborhood (radius=250, the value used for the Washington map) and save them as CSV.
# NOTE(review): wdc_neigh_df is not defined in any visible cell — confirm where it is created.
wdc_venues_df=getNearbyVenues(wdc_neigh_df,radius=250)
wdc_venues_df.to_csv('wdc_venues_df.csv',index=False)
wdc_venues_df

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Adams Morgan,38.921242,-77.043493,Lapis,38.921302,-77.043890,Afghan Restaurant
1,Adams Morgan,38.921242,-77.043493,Donburi,38.921673,-77.042385,Japanese Restaurant
2,Adams Morgan,38.921242,-77.043493,Tryst,38.921947,-77.042168,Coffee Shop
3,Adams Morgan,38.921242,-77.043493,Amsterdam Falafelshop,38.921162,-77.041959,Falafel Restaurant
4,Adams Morgan,38.921242,-77.043493,So's Your Mom,38.921671,-77.043753,Bagel Shop
...,...,...,...,...,...,...,...
985,Shipley Terrace,38.847335,-76.969141,Shipley Liquors,38.847601,-76.971422,Liquor Store
986,Shipley Terrace,38.847335,-76.969141,Roy's Steak 'N Subs,38.847601,-76.971422,American Restaurant
987,Shipley Terrace,38.847335,-76.969141,Capitol Sports Complex,38.848237,-76.971672,Athletics & Sports
988,Washington Highlands,38.830943,-76.994547,Ferebee Hope Recreation Center,38.832526,-76.995676,Basketball Court


## 2 Data Preparation

### 2.1 Dummy columns are created for each retail venue category.

In [170]:
# Load each city's saved venue table and tag its rows with a "City, ST" label.
bmd_venues_df=pd.read_csv('bmd_venues_df.csv')
bmd_venues_df['City']='Baltimore, MD'
cnc_venues_df=pd.read_csv('cnc_venues_df.csv')
cnc_venues_df['City']='Charlotte, NC'
cva_venues_df=pd.read_csv('cva_venues_df.csv')
cva_venues_df['City']='Charlottesville, VA'
norva_venues_df=pd.read_csv('norva_venues_df.csv')
norva_venues_df['City']='Norfolk, VA'
rgh_venues_df=pd.read_csv('rgh_venues_df.csv')
rgh_venues_df['City']='Raleigh, NC'
rva_venues_df=pd.read_csv('rva_venues_df.csv')
rva_venues_df['City']='Richmond, VA'
vab_venues_df=pd.read_csv('vab_venues_df.csv')
vab_venues_df['City']='Virginia Beach, VA'
wdc_venues_df=pd.read_csv('wdc_venues_df.csv')
wdc_venues_df['City']='Washington, DC'
venues_df=pd.concat([bmd_venues_df,cnc_venues_df,cva_venues_df,norva_venues_df,rgh_venues_df,rva_venues_df,vab_venues_df,wdc_venues_df])

# Remove venues whose category is literally 'Neighborhood'
# (presumably because its dummy column would collide with the 'Neighborhood' key column — confirm).
venues_df=venues_df[venues_df['Venue Category']!='Neighborhood'].copy()

# Remove neighborhoods with fewer than 10 retail venues.
# The count is taken per (City, Neighborhood): counting by name alone pooled
# same-named neighborhoods from different cities, which is inconsistent with the
# (City, Neighborhood) grouping used for the frequency table below.
# to_numpy() assigns by position; transform() keeps the original row order.
venues_df['Venue Count'] = venues_df.groupby(['City','Neighborhood'])['Venue'].transform('count').to_numpy()
venues_df = venues_df[venues_df['Venue Count']>=10]
venues_df.to_csv('venues_df.csv')

# Make one dummy column per venue category.
venues_onehot=pd.get_dummies(venues_df[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood key columns in front, then average the dummies per
# neighborhood: each category column becomes that category's share of the venues.
key_columns=['City','Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Count']
categories_df=(
    pd.concat([venues_df[key_columns],venues_onehot],axis=1)
    .groupby(key_columns)
    .mean()
    .reset_index()
)
categories_df.to_csv('categories_df.csv',index=False)
categories_df

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Count,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,...,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,"Baltimore, MD",Barre Circle,39.284705,-76.627914,13,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.076923,0.000000,0.0,0.0,0.0
1,"Baltimore, MD",Bromo Arts District,39.294943,-76.619813,22,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,"Baltimore, MD",Brooklyn,39.238196,-76.603722,75,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,"Baltimore, MD",Canton,39.282183,-76.576276,15,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,"Baltimore, MD",Cathedral Hill,39.299596,-76.617173,33,0.0,0.0,0.0,0.0,0.030303,...,0.0,0.0,0.0,0.0,0.060606,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,"Washington, DC",Southwest Federal Center,38.885619,-77.021912,21,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
146,"Washington, DC",Swampoodle,38.903519,-77.002291,12,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
147,"Washington, DC",Truxton Circle,38.909973,-77.011825,11,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.090909,0.0,0.0,0.0
148,"Washington, DC",U Street Corridor,38.917001,-77.025272,45,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.022222,0.000000,0.000000,0.0,0.0,0.0


### 2.2 Find top 10 retail venue categories for each Neighborhood

Since we will want to characterize each cluster and see what types of venues it contains, let's write a function that sorts the venue categories of any neighborhood from most to least frequent.

In [163]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first two entries of ``row`` are skipped (the caller passes rows that
    start with ``City`` and ``Neighborhood``). The remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    are returned as an array.
    """
    ranked = row.iloc[2:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [166]:
# Keep only the identifying columns (City, Neighborhood) and the per-category frequencies.
temp_df=categories_df.drop(['Neighborhood Latitude','Neighborhood Longitude','Venue Count'],axis=1)

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create the column names: 1st, 2nd, 3rd, then "th" for every later rank.
# An explicit length check replaces the former bare `except:` used as control flow.
columns = ['City','Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Rank the categories of every neighborhood, then build the result frame in one
# step instead of filling it row by row.
top_venue_rows = [
    return_most_common_venues(temp_df.iloc[pos, :], num_top_venues)
    for pos in range(temp_df.shape[0])
]
top_cats_by_neigh = pd.concat(
    [temp_df[['City','Neighborhood']],
     pd.DataFrame(top_venue_rows, columns=columns[2:], index=temp_df.index)],
    axis=1)

top_cats_by_neigh.to_csv('top_cats_by_neigh.csv', index=False)
top_cats_by_neigh

Unnamed: 0,City,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Baltimore, MD",Barre Circle,Ethiopian Restaurant,American Restaurant,Chinese Restaurant,Bar,Bakery,Coffee Shop,Bistro,Convenience Store,Restaurant,Bus Stop
1,"Baltimore, MD",Bromo Arts District,Chinese Restaurant,Lounge,Coffee Shop,Vegetarian / Vegan Restaurant,Noodle House,Seafood Restaurant,Taco Place,Library,Food Court,Dumpling Restaurant
2,"Baltimore, MD",Brooklyn,Chinese Restaurant,Fast Food Restaurant,Bus Stop,Pizza Place,Fish Market,Eye Doctor,Falafel Restaurant,Farmers Market,Filipino Restaurant,Film Studio
3,"Baltimore, MD",Canton,Pub,Bar,Irish Pub,Park,Café,Pharmacy,Dive Bar,Restaurant,Seafood Restaurant,Pizza Place
4,"Baltimore, MD",Cathedral Hill,Indian Restaurant,Wine Bar,Sandwich Place,Lounge,Gay Bar,Convenience Store,French Restaurant,Sushi Restaurant,Spa,Café
...,...,...,...,...,...,...,...,...,...,...,...,...
145,"Washington, DC",Southwest Federal Center,Food Truck,Coffee Shop,Donut Shop,French Restaurant,Deli / Bodega,Food & Drink Shop,Mexican Restaurant,Mediterranean Restaurant,Bike Rental / Bike Share,Bakery
146,"Washington, DC",Swampoodle,Gym,Performing Arts Venue,Sculpture Garden,Brewery,Indian Restaurant,Sporting Goods Shop,Office,Restaurant,Park,Coffee Shop
147,"Washington, DC",Truxton Circle,Art Gallery,Sandwich Place,Winery,Bus Line,Bar,Intersection,Cosmetics Shop,Seafood Restaurant,Bakery,Chinese Restaurant
148,"Washington, DC",U Street Corridor,Bar,Gay Bar,Ethiopian Restaurant,Pizza Place,Taco Place,New American Restaurant,Music Venue,Sandwich Place,Speakeasy,Thai Restaurant


In [59]:
# Spot-check: the ranked venue categories for every neighborhood named 'Church Hill'.
top_cats_by_neigh.loc[top_cats_by_neigh.Neighborhood=='Church Hill']

Unnamed: 0,City,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
87,"Richmond, VA",Church Hill,Bakery,New American Restaurant,Bridal Shop,BBQ Joint,Pub,Park,Sandwich Place,Southern / Soul Food Restaurant,Gift Shop,Thai Restaurant


## 3 Clustering

### 3.1 K-Means Clustering

In [171]:
categories_df=pd.read_csv('categories_df.csv')

# set number of clusters
kclusters = 26

# Cluster on the category-frequency columns only. 'Cluster Label' is listed too
# (with errors='ignore') so that re-running this cell, after it has already
# written the labels back to categories_df.csv, does not cluster on the old labels.
# Fixes the former `caetgories_df` typo, which raised NameError.
non_feature_columns = ['City','Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Count','Cluster Label']
cluster_df = categories_df.drop(columns=non_feature_columns, errors='ignore')

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn default,
# which differs between versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(cluster_df)

# Replace any label column left by a previous run; insert() raises if the column already exists.
if 'Cluster Label' in categories_df.columns:
    categories_df = categories_df.drop(columns='Cluster Label')
categories_df.insert(loc=5,column='Cluster Label',value=kmeans.labels_)
categories_df.to_csv('categories_df.csv',index=False)
categories_df

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Count,Cluster Label,ATM,Accessories Store,Adult Boutique,Advertising Agency,...,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,"Baltimore, MD",Barre Circle,39.284705,-76.627914,13,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.076923,0.000000,0.0,0.0,0.0
1,"Baltimore, MD",Bromo Arts District,39.294943,-76.619813,22,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
2,"Baltimore, MD",Brooklyn,39.238196,-76.603722,75,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,"Baltimore, MD",Canton,39.282183,-76.576276,15,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,"Baltimore, MD",Cathedral Hill,39.299596,-76.617173,33,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.060606,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,"Washington, DC",Southwest Federal Center,38.885619,-77.021912,21,20,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
146,"Washington, DC",Swampoodle,38.903519,-77.002291,12,13,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
147,"Washington, DC",Truxton Circle,38.909973,-77.011825,11,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.090909,0.0,0.0,0.0
148,"Washington, DC",U Street Corridor,38.917001,-77.025272,45,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.022222,0.000000,0.000000,0.0,0.0,0.0


#### We can see that Church Hill, Richmond receives cluster label 16.

In [178]:
# Spot-check: the row(s), including 'Cluster Label', for every neighborhood named 'Church Hill'.
categories_df.loc[categories_df.Neighborhood=='Church Hill']

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Count,Cluster Label,ATM,Accessories Store,Adult Boutique,Advertising Agency,...,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
87,"Richmond, VA",Church Hill,37.53284,-77.416252,16,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3.2 Finding the top 25 retail venue categories for each neighborhood cluster

In [179]:
def return_most_common_venues_by_cluster(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, highest first.

    Parameters
    ----------
    row : pandas.Series
        One row of per-cluster category totals. The first element is skipped
        positionally (the caller passes rows whose first entry is the
        'Cluster Label'); every remaining element is ranked by value.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top ``num_top_venues`` entries in descending
        order of value (fewer if the row has fewer entries).
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index
    return ranked_labels.values[:num_top_venues]

In [196]:
# Reload the per-neighborhood category frequencies (with cluster labels) from disk.
categories_df = pd.read_csv('categories_df.csv')

# Sum every category frequency within each cluster. numeric_only=True makes
# explicit that text columns (City, Neighborhood) must not take part in the sum;
# otherwise they could end up in the ranking below.
cluster_df = (
    categories_df
    .drop(['Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Count'], axis=1)
    .groupby('Cluster Label')
    .sum(numeric_only=True)
    .reset_index()
)
cluster_df.to_csv('cluster_df.csv', index=False)

num_top_venues = 25

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return ``rank`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    last_digit, last_two = rank % 10, rank % 100
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if last_digit in (1, 2, 3) and last_two not in (11, 12, 13):
        return '{}{}'.format(rank, indicators[last_digit - 1])
    return '{}th'.format(rank)


# create columns according to number of top venues
columns = ['Cluster Label'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
top_cats_by_cluster = pd.DataFrame(columns=columns)
top_cats_by_cluster[['Cluster Label']] = cluster_df[['Cluster Label']]

# Fill one row per cluster. The bound is cluster_df itself, so the cell does not
# depend on a variable left over from another cell.
for ind in range(cluster_df.shape[0]):
    top_cats_by_cluster.iloc[ind, 1:] = return_most_common_venues_by_cluster(cluster_df.iloc[ind, :], num_top_venues)

top_cats_by_cluster.to_csv('top_cats_by_cluster.csv', index=False)
top_cats_by_cluster

Unnamed: 0,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue,21th Most Common Venue,22th Most Common Venue,23th Most Common Venue,24th Most Common Venue,25th Most Common Venue
0,0,Cycle Studio,Arcade,Theater,Spanish Restaurant,Park,Laundry Service,Coffee Shop,Beer Bar,Gay Bar,...,Farmers Market,Food Court,Food & Drink Shop,Food,Fast Food Restaurant,Filipino Restaurant,Film Studio,Financial or Legal Service,Exhibit,Fish Market
1,1,Bar,Coffee Shop,American Restaurant,Italian Restaurant,Cocktail Bar,Pizza Place,Music Venue,Mexican Restaurant,Southern / Soul Food Restaurant,...,Café,Seafood Restaurant,Bakery,Gym,Gay Bar,Taco Place,Sushi Restaurant,Nightclub,Gastropub,Sandwich Place
2,2,Chinese Restaurant,Fast Food Restaurant,Bus Stop,Pizza Place,Fish Market,Eye Doctor,Falafel Restaurant,Farmers Market,Filipino Restaurant,...,Food,Food & Drink Shop,Food Court,Food Service,Food Truck,Flower Shop,Event Space,Fountain,Ethiopian Restaurant,Deli / Bodega
3,3,Convenience Store,Sandwich Place,Bar,Pharmacy,Grocery Store,Discount Store,Video Store,Pizza Place,Supermarket,...,Gas Station,American Restaurant,Thai Restaurant,Intersection,Fast Food Restaurant,Art Gallery,Asian Restaurant,Bakery,Korean Restaurant,Arts & Crafts Store
4,4,Park,Art Gallery,Food Truck,Eye Doctor,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Film Studio,...,Food,Food & Drink Shop,Food Court,Food Service,Football Stadium,Exhibit,Yoga Studio,French Restaurant,Ethiopian Restaurant,Deli / Bodega
5,5,Restaurant,American Restaurant,Bus Stop,Yoga Studio,Floating Market,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,...,Food,Food & Drink Shop,Food Court,Food Service,Food Truck,Football Stadium,Eye Doctor,Ethiopian Restaurant,Event Space,French Restaurant
6,6,Clothing Store,Cosmetics Shop,American Restaurant,Italian Restaurant,Coffee Shop,Shoe Store,Pizza Place,Hotel,Mexican Restaurant,...,Jewelry Store,Women's Store,Plaza,Shopping Mall,Bakery,Steakhouse,Mediterranean Restaurant,Sporting Goods Shop,Gym,Fast Food Restaurant
7,7,Seafood Restaurant,Event Space,Boat or Ferry,Fish Market,Bar,Donut Shop,Food & Drink Shop,Food Court,Food,...,Film Studio,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Football Stadium,Eye Doctor,Yoga Studio,Exhibit,French Restaurant
8,8,Beach,Trail,Spa,Surf Spot,Historic Site,Yoga Studio,Fish Market,Falafel Restaurant,Farmers Market,...,Exhibit,Fondue Restaurant,Food,Food & Drink Shop,Food Court,Food Service,Food Truck,Eye Doctor,Ethiopian Restaurant,Event Space
9,9,Chinese Restaurant,Spa,Fish Market,Exhibit,Eye Doctor,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,...,Food,Food & Drink Shop,Food Court,Food Service,Food Truck,Floating Market,Event Space,Ethiopian Restaurant,Escape Room,Dance Studio


### 3.3 Finding the top 10 retail venue categories missing from each neighborhood
We need to modify `categories_df` so that each category frequency equals the corresponding frequency from `cluster_df` minus the frequency from `categories_df`.

In [221]:
# Keep only the neighborhoods that were assigned cluster label 16.
in_cluster16 = categories_df['Cluster Label'] == 16
cluster16 = categories_df.loc[in_cluster16]
cluster16

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Count,Cluster Label,ATM,Accessories Store,Adult Boutique,Advertising Agency,...,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
1,"Baltimore, MD",Bromo Arts District,39.294943,-76.619813,22,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Baltimore, MD",Cathedral Hill,39.299596,-76.617173,33,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.0,0.0,0.0
6,"Baltimore, MD",Dolfield,39.335541,-76.674972,10,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Baltimore, MD",Downtown,39.292007,-76.616721,60,16,0.0,0.0,0.0,0.0,...,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Baltimore, MD",Federal Hill,39.279304,-76.611666,24,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,"Baltimore, MD",Midtown-Belvedere,39.304755,-76.616721,34,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,"Baltimore, MD",Mount Vernon,39.299319,-76.613833,21,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0
21,"Baltimore, MD",Preston Gardens,39.293513,-76.613923,18,16,0.0,0.0,0.0,0.0,...,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,"Baltimore, MD",South Balto.Imp.Comm./West Federal Hill,39.276312,-76.611404,50,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0
27,"Baltimore, MD",West Pratt,39.285653,-76.636544,10,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0


In [222]:
# Total the columns over the cluster-16 neighborhoods, skip the first three
# summed columns positionally (presumably latitude, longitude and venue count --
# verify against the column order), and turn the result into a one-column table.
cluster16_totals = cluster16.groupby("Cluster Label").sum()
cat16_rank = cluster16_totals.iloc[:, 3:].T
cat16_rank.columns = ['Category Frequency']

# Keep only categories that occur in the cluster, most frequent first.
occurs_in_cluster = cat16_rank['Category Frequency'] > 0
cat16_rank = cat16_rank[occurs_in_cluster].sort_values(by='Category Frequency', ascending=False)
cat16_rank_list = cat16_rank.index.tolist()
cat16_rank


Unnamed: 0,Category Frequency
Pizza Place,2.341457
Sandwich Place,1.706701
Chinese Restaurant,1.240079
American Restaurant,1.187954
Café,1.093852
...,...
College Theater,0.016667
Outdoor Supply Store,0.016667
Camera Store,0.016667
Comfort Food Restaurant,0.016667


In [223]:
# Take the Church Hill row, drop the first six columns positionally (everything
# before the category columns), and turn it into a one-column table.
is_church_hill = cluster16['Neighborhood'] == 'Church Hill'
church_hill_rank = cluster16.loc[is_church_hill].iloc[:, 6:].T
church_hill_rank.columns = ['Category Frequency']

# Keep only categories present in Church Hill, most frequent first.
present_in_church_hill = church_hill_rank['Category Frequency'] > 0
church_hill_rank = church_hill_rank[present_in_church_hill].sort_values(by='Category Frequency', ascending=False)
church_hill_rank_list = church_hill_rank.index.tolist()
church_hill_rank

Unnamed: 0,Category Frequency
Bakery,0.1875
New American Restaurant,0.125
American Restaurant,0.0625
BBQ Joint,0.0625
Bridal Shop,0.0625
Deli / Bodega,0.0625
Gift Shop,0.0625
Historic Site,0.0625
Park,0.0625
Pub,0.0625


In [224]:
# Categories found in cluster 16 but absent from Church Hill, kept in the
# cluster's frequency order.
missing_from_church_hill = [
    category
    for category in cat16_rank_list
    if category not in church_hill_rank_list
]
missing_from_church_hill

['Pizza Place',
 'Chinese Restaurant',
 'Café',
 'Mexican Restaurant',
 'Coffee Shop',
 'Gym',
 'Convenience Store',
 'Restaurant',
 'Grocery Store',
 'Seafood Restaurant',
 'Bar',
 'Indian Restaurant',
 'Liquor Store',
 'Spa',
 'Salon / Barbershop',
 'Italian Restaurant',
 'Fast Food Restaurant',
 'Discount Store',
 'Pharmacy',
 'Sushi Restaurant',
 'Gym / Fitness Center',
 'Ice Cream Shop',
 'Burger Joint',
 'Shipping Store',
 'Yoga Studio',
 'Market',
 'French Restaurant',
 'Donut Shop',
 'Shopping Mall',
 'Lounge',
 'Breakfast Spot',
 'Furniture / Home Store',
 'Middle Eastern Restaurant',
 'Thrift / Vintage Store',
 'Supermarket',
 'Pet Store',
 'Beer Garden',
 'Bank',
 'Latin American Restaurant',
 'Vietnamese Restaurant',
 'Hookah Bar',
 'Mediterranean Restaurant',
 'Fried Chicken Joint',
 'Wine Bar',
 'Juice Bar',
 'Video Store',
 'Caribbean Restaurant',
 'Peruvian Restaurant',
 'Gas Station',
 'Bookstore',
 'Smoke Shop',
 'Arts & Crafts Store',
 'Brewery',
 'Intersection',
 'F