In [2]:
import config
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup
import folium
import urllib
import regex as re

## 1 Data Collection

### 1.1 Geographic Data Collection

#### 1.1.1 Acquire Richmond, VA neighborhood data.

In [14]:
#Scrape wikipedia to generate a list of neighborhood names
rva_page = requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Richmond,_Virginia').text
rva_soup = BeautifulSoup(rva_page,'html.parser')
neigh_search = rva_soup.find_all('li')
# Only the <li> items in positions 7..111 are kept as neighborhood names
# (the slice is presumably tied to the page layout at scrape time - re-check if the page changes)
rva_neigh = [item.text for item in neigh_search[7:112]]

In [9]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
rva_rows=[]
for name in rva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Richmond, VA"),config.google_api_key)
    results = requests.get(url).json()
    row={'Neighborhood':name,'City':'Richmond','State':'VA'}
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: keep the neighborhood, coordinates stay missing (NaN)
        pass
    else:
        row['Latitude']=lat
        row['Longitude']=lng
    rva_rows.append(row)
# columns= fixes the column order and guarantees all five columns exist even if every lookup failed
rva_neigh_df=pd.DataFrame(rva_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [33]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
rva_neigh_df.to_csv('rva_neigh_df.csv',index=False)
#Show the first rows as a quick check
rva_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Arts District,Richmond,VA,37.543453,-77.438963
1,Biotech and MCV,Richmond,VA,37.543455,-77.430753
2,City Center,Richmond,VA,37.542916,-77.437926
3,Court End,Richmond,VA,37.544154,-77.432096
4,Gambles Hill,Richmond,VA,37.537863,-77.445815


#### 1.1.2 Acquire Raleigh, NC neighborhood data

In [14]:
#Scrape wikipedia to generate a list of neighborhood names
rgh_page = requests.get('https://en.wikipedia.org/wiki/Raleigh,_North_Carolina_neighborhoods').text
rgh_soup = BeautifulSoup(rgh_page,'html.parser')
neigh_search = rgh_soup.find_all('li')
# Only the <li> items in positions 4..108 are kept as neighborhood names
# (the slice is presumably tied to the page layout at scrape time - re-check if the page changes)
rgh_neigh = [item.text for item in neigh_search[4:109]]

In [15]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
rgh_rows=[]
for name in rgh_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Raleigh, NC"),config.google_api_key)
    results = requests.get(url).json()
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: report it and leave the neighborhood out of the table
        print('No lat long data found for ',name)
    else:
        rgh_rows.append({'Neighborhood':name,'City':'Raleigh','State':'NC','Latitude':lat,'Longitude':lng})
# columns= fixes the column order and keeps the frame well-formed even if no lookup succeeded
rgh_neigh_df=pd.DataFrame(rgh_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

No lat long data found for  Westlake


In [34]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
rgh_neigh_df.to_csv('rgh_neigh_df.csv',index=False)
#Show the first rows as a quick check
rgh_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Anderson Heights,Raleigh,NC,35.817114,-78.641292
1,Avent West,Raleigh,NC,35.765403,-78.703778
2,Belvidere Park,Raleigh,NC,35.798189,-78.619304
3,Battery Heights,Raleigh,NC,35.777857,-78.617885
4,Bloomsbury,Raleigh,NC,35.809108,-78.649093


#### 1.1.3 Acquire Norfolk, VA neighborhood data

In [44]:
#Scrape wikipedia to generate a list of neighborhood names
norva_page = requests.get('https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Norfolk,_Virginia').text
norva_soup = BeautifulSoup(norva_page,'html.parser')
neigh_search = norva_soup.find_all('li')
# The first 49 <li> items are kept as neighborhood names
norva_neigh = [item.text for item in neigh_search[:49]]
# Two scraped entries are overwritten by hand with plain neighborhood names
norva_neigh[3] = 'Bowling Green'
norva_neigh[19] = 'Huntersville'

In [None]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
norva_rows=[]
for name in norva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Norfolk, VA"),config.google_api_key)
    results = requests.get(url).json()
    row={'Neighborhood':name,'City':'Norfolk','State':'VA'}
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: keep the neighborhood, coordinates stay missing (NaN)
        pass
    else:
        row['Latitude']=lat
        row['Longitude']=lng
    norva_rows.append(row)
# columns= fixes the column order and guarantees all five columns exist even if every lookup failed
norva_neigh_df=pd.DataFrame(norva_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [35]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
norva_neigh_df.to_csv('norva_neigh_df.csv',index=False)
#Show the first rows as a quick check
norva_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Ballentine,Norfolk,VA,36.859184,-76.249505
1,Barraud Park,Norfolk,VA,36.863149,-76.266739
2,Berkley,Norfolk,VA,36.831815,-76.283829
3,Bowling Green,Norfolk,VA,36.854967,-76.250446
4,Calvert Square,Norfolk,VA,36.854353,-76.276151


#### 1.1.4 Acquire Virginia Beach, VA neighborhood data

In [49]:
# Hand-written list of Virginia Beach neighborhood names (this city is not scraped)
vab_neigh = [
    'Alanton', 'Aragona Village', 'Bay Colony', 'Bayside', 'Cape Henry',
    'Chesapeake Beach', 'Croatan Beach', 'Great Neck Point', 'Green Run', 'Kempsville',
    'Lago Mar', 'Larkspur', 'London Bridge', 'Lynnhaven', 'Newtown',
    'The North End', 'Oceana', 'Ocean Park', 'Pembroke Manor', 'Princess Anne',
    'Pungo', 'Red Mill Commons', 'Sandbridge', 'Thalia', 'Thoroughgood',
]

In [50]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
vab_rows=[]
for name in vab_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Virginia Beach, VA"),config.google_api_key)
    results = requests.get(url).json()
    row={'Neighborhood':name,'City':'Virginia Beach','State':'VA'}
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: keep the neighborhood, coordinates stay missing (NaN)
        pass
    else:
        row['Latitude']=lat
        row['Longitude']=lng
    vab_rows.append(row)
# columns= fixes the column order and guarantees all five columns exist even if every lookup failed
vab_neigh_df=pd.DataFrame(vab_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [36]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
vab_neigh_df.to_csv('vab_neigh_df.csv',index=False)
#Show the first rows as a quick check
vab_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Alanton,Virginia Beach,VA,36.880667,-76.024684
1,Aragona Village,Virginia Beach,VA,36.858903,-76.152288
2,Bay Colony,Virginia Beach,VA,36.874744,-75.995461
3,Bayside,Virginia Beach,VA,36.902925,-76.13438
4,Cape Henry,Virginia Beach,VA,36.931536,-76.019931


#### 1.1.5 Acquire Washington, DC neighborhood data

In [23]:
#Scrape wikipedia to generate a list of neighborhood names
wdc_page = requests.get('https://en.wikipedia.org/wiki/Neighborhoods_in_Washington,_D.C.').text
wdc_soup = BeautifulSoup(wdc_page,'html.parser')
neigh_search = wdc_soup.find_all('li')
# Remove parenthesised notes from each <li> text in positions 11..152, then
# drop repeated names; dict.fromkeys keeps the first occurrence in page order.
wdc_neigh = list(dict.fromkeys(
    re.sub(r"(| )\([^()]*\)", "", item.text) for item in neigh_search[11:153]
))

In [24]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
wdc_rows=[]
for name in wdc_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Washington, DC"),config.google_api_key)
    results = requests.get(url).json()
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: report it and leave the neighborhood out of the table
        print('No lat long data found for ',name)
    else:
        wdc_rows.append({'Neighborhood':name,'City':'Washington','State':'DC','Latitude':lat,'Longitude':lng})
# columns= fixes the column order and keeps the frame well-formed even if no lookup succeeded
wdc_neigh_df=pd.DataFrame(wdc_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [37]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
wdc_neigh_df.to_csv('wdc_neigh_df.csv',index=False)
#Show the first rows as a quick check
wdc_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Adams Morgan,Washington,DC,38.921242,-77.043493
1,Columbia Heights,Washington,DC,38.927641,-77.029718
2,Howard University,Washington,DC,38.922684,-77.019438
3,Kalorama,Washington,DC,38.914501,-77.046174
4,LeDroit Park,Washington,DC,38.917764,-77.018303


#### 1.1.6 Acquire Baltimore, MD neighborhood data

In [88]:
#Scrape wikipedia to generate a list of neighborhood names
bmd_page = requests.get('https://en.wikipedia.org/wiki/List_of_Baltimore_neighborhoods').text
bmd_soup = BeautifulSoup(bmd_page,'html.parser')
neigh_search = bmd_soup.find_all('li')
# Cut trailing "(...)" / "[...]" annotations from each <li> text in positions 14..311, then
# drop repeated names; dict.fromkeys keeps the first occurrence in page order.
bmd_neigh = list(dict.fromkeys(
    re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", item.text) for item in neigh_search[14:312]
))
# One entry of the de-duplicated list is overwritten by hand
bmd_neigh[261] = 'West Federal Hill'

In [115]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
bmd_rows=[]
for name in bmd_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Baltimore, MD"),config.google_api_key)
    results = requests.get(url).json()
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: report it and leave the neighborhood out of the table
        print('No lat long data found for ',name)
    else:
        bmd_rows.append({'Neighborhood':name,'City':'Baltimore','State':'MD','Latitude':lat,'Longitude':lng})
# columns= fixes the column order and keeps the frame well-formed even if no lookup succeeded
bmd_neigh_df=pd.DataFrame(bmd_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

No lat long data found for  West Hill Square
No lat long data found for  Hawkins Point


In [38]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
bmd_neigh_df.to_csv('bmd_neigh_df.csv',index=False)
#Show the first rows as a quick check
bmd_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Arlington,Baltimore,MD,39.348165,-76.68314
1,Ashburton,Baltimore,MD,39.326771,-76.674478
2,Burleigh-Leighton,Baltimore,MD,39.319664,-76.659813
3,Callaway-Garrison,Baltimore,MD,39.33254,-76.680253
4,Central Forest Park,Baltimore,MD,39.326451,-76.688192


#### 1.1.7 Acquire Charlottesville, VA neighborhood data

In [93]:
#Scrape cvillepedia to generate a list of neighborhood names
cva_page = requests.get('https://www.cvillepedia.org/List_of_Charlottesville_neighborhoods').text
cva_soup = BeautifulSoup(cva_page,'html.parser')
neigh_search = cva_soup.find_all('li')
# Cut trailing "(...)" / "[...]" annotations from each <li> text in positions 5..22, then
# drop repeated names; dict.fromkeys keeps the first occurrence in page order.
cva_neigh = list(dict.fromkeys(
    re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", item.text) for item in neigh_search[5:23]
))


In [95]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
cva_rows=[]
for name in cva_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Charlottesville, VA"),config.google_api_key)
    results = requests.get(url).json()
    row={'Neighborhood':name,'City':'Charlottesville','State':'VA'}
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: keep the neighborhood, coordinates stay missing (NaN)
        pass
    else:
        row['Latitude']=lat
        row['Longitude']=lng
    cva_rows.append(row)
# columns= fixes the column order and guarantees all five columns exist even if every lookup failed
cva_neigh_df=pd.DataFrame(cva_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [39]:
#Save dataframe to csv file (index=False leaves the row index out); section 1.2 reloads this file
cva_neigh_df.to_csv('cva_neigh_df.csv',index=False)
#Show the first rows as a quick check
cva_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,10th & Page,Charlottesville,VA,38.035116,-78.4934
1,Barracks/Rugby,Charlottesville,VA,38.047317,-78.489401
2,Belmont-Carlton,Charlottesville,VA,38.02211,-78.46886
3,Fifeville,Charlottesville,VA,38.026989,-78.495084
4,Fry's Spring,Charlottesville,VA,38.016778,-78.510708


#### 1.1.8 Acquire Charlotte, NC neighborhood data

In [100]:
#Scrape wikipedia to generate a list of neighborhood names
cnc_page = requests.get('https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Charlotte,_North_Carolina').text
cnc_soup = BeautifulSoup(cnc_page,'html.parser')
neigh_search = cnc_soup.find_all('li')
# Cut trailing "(...)" / "[...]" annotations from each <li> text in positions 1..33, then
# drop repeated names; dict.fromkeys keeps the first occurrence in page order.
cnc_neigh = list(dict.fromkeys(
    re.sub(r"((|, | )\([^(]*|(|, | )\[[^(]*)", "", item.text) for item in neigh_search[1:34]
))

In [101]:
#Make a dataframe containing latitude and longitude of each neighborhood
# Rows are gathered in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
cnc_rows=[]
for name in cnc_neigh:
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}'.format(urllib.parse.quote(name+", Charlotte, NC"),config.google_api_key)
    results = requests.get(url).json()
    row={'Neighborhood':name,'City':'Charlotte','State':'NC'}
    try:
        location=results['results'][0]['geometry']['location']
        lat=location['lat']
        lng=location['lng']
    except (KeyError,IndexError,TypeError):
        # No usable geocoding result: keep the neighborhood, coordinates stay missing (NaN)
        pass
    else:
        row['Latitude']=lat
        row['Longitude']=lng
    cnc_rows.append(row)
# columns= fixes the column order and guarantees all five columns exist even if every lookup failed
cnc_neigh_df=pd.DataFrame(cnc_rows,columns=['Neighborhood','City','State','Latitude','Longitude'])

In [40]:
#Drop the 'University City' row, then save dataframe to csv file
cnc_neigh_df = cnc_neigh_df.loc[cnc_neigh_df['Neighborhood'].ne('University City')]
cnc_neigh_df.to_csv('cnc_neigh_df.csv', index=False)
#Show the first rows as a quick check
cnc_neigh_df.head()

Unnamed: 0,Neighborhood,City,State,Latitude,Longitude
0,Ballantyne,Charlotte,NC,35.046908,-80.84547
1,Biddleville,Charlotte,NC,35.244594,-80.857691
2,Brooklyn,Charlotte,NC,35.220105,-80.841931
3,Chantilly,Charlotte,NC,35.212128,-80.80871
4,Uptown Charlotte,Charlotte,NC,35.22723,-80.846082


### 1.2 Check our geographic data on a map
We will also visually determine an appropriate radius to represent the neighborhood area for each city.

In [3]:
#Reload the per-city neighborhood tables from the csv files written in section 1.1,
#so the mapping and venue steps below do not need the scraping/geocoding cells to be re-run
bmd_neigh_df=pd.read_csv('bmd_neigh_df.csv')
cnc_neigh_df=pd.read_csv('cnc_neigh_df.csv')
cva_neigh_df=pd.read_csv('cva_neigh_df.csv')
norva_neigh_df=pd.read_csv('norva_neigh_df.csv')
rgh_neigh_df=pd.read_csv('rgh_neigh_df.csv')
rva_neigh_df=pd.read_csv('rva_neigh_df.csv')
vab_neigh_df=pd.read_csv('vab_neigh_df.csv')
wdc_neigh_df=pd.read_csv('wdc_neigh_df.csv')

In [119]:
def map_neigh(city_df,zoom, radius):
    """Draw every neighborhood of a city as a circle on a folium map.

    Parameters
    ----------
    city_df : DataFrame with 'Neighborhood', 'Latitude' and 'Longitude' columns.
    zoom : initial zoom level handed to folium.Map (zoom_start).
    radius : radius given to each folium.Circle (folium documents this in meters).

    Returns
    -------
    The folium.Map, centered on the mean latitude/longitude of the neighborhoods
    that have coordinates, embedded in a 650x450 figure.

    Neighborhoods whose latitude or longitude is missing are left off the map;
    the geocoding cells can store such rows, and folium rejects NaN locations.
    """
    # Keep only rows that can actually be placed on the map
    located = city_df.dropna(subset=['Latitude','Longitude'])

    lat_center = located['Latitude'].mean()
    lng_center = located['Longitude'].mean()

    f = folium.Figure(width=650, height=450)
    city_map = folium.Map(location=[lat_center,lng_center], zoom_start=zoom).add_to(f)

    # add one circle per neighborhood; clicking it shows the neighborhood name
    for lat, lng, name in zip(located['Latitude'],located['Longitude'],located['Neighborhood']):
        label = folium.Popup('{}'.format(name), parse_html=True)
        folium.Circle(
            [lat,lng],
            radius=radius,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(city_map)

    return city_map

#### 1.2.1 Map of Baltimore, MD neighborhoods

In [138]:
# Baltimore: plot each neighborhood as a circle of radius 250 to judge visually whether that radius fits
map_neigh(bmd_neigh_df,zoom=11,radius=250)

#### 1.2.2 Map of Charlotte, NC neighborhoods

In [114]:
# Charlotte: plot each neighborhood as a circle of radius 700 to judge visually whether that radius fits
map_neigh(cnc_neigh_df,zoom=10,radius=700)

#### 1.2.3 Map of Charlottesville, VA neighborhoods

In [88]:
# Charlottesville: plot each neighborhood as a circle of radius 400 to judge visually whether that radius fits
map_neigh(cva_neigh_df,zoom=12,radius=400)

#### 1.2.4 Map of Norfolk, VA neighborhoods

In [93]:
# Norfolk: plot each neighborhood as a circle of radius 400 to judge visually whether that radius fits
map_neigh(norva_neigh_df,zoom=11,radius=400)

#### 1.2.5 Map of Raleigh, NC neighborhoods

In [98]:
# Raleigh: plot each neighborhood as a circle of radius 400 to judge visually whether that radius fits
map_neigh(rgh_neigh_df,zoom=11,radius=400)

#### 1.2.6 Map of Richmond, VA neighborhoods

In [103]:
# Richmond: plot each neighborhood as a circle of radius 400 to judge visually whether that radius fits
map_neigh(rva_neigh_df,zoom=11,radius=400)

#### 1.2.7 Map of Virginia Beach, VA neighborhoods

In [107]:
# Virginia Beach: plot each neighborhood as a circle of radius 800 to judge visually whether that radius fits
map_neigh(vab_neigh_df,zoom=10,radius=800)

#### 1.2.8 Map of Washington, DC neighborhoods

In [113]:
# Washington: plot each neighborhood as a circle of radius 250 to judge visually whether that radius fits
map_neigh(wdc_neigh_df,zoom=11,radius=250)

### 1.3 Acquiring retail venue data from Foursquare

In [126]:
VERSION = '20180605' # Foursquare API version; getNearbyVenues sends it as the v= query parameter
LIMIT = 100 # A default Foursquare API limit value; getNearbyVenues sends it as the limit= query parameter

In [160]:
def getNearbyVenues(city_df, radius):
    """Collect Foursquare "explore" venues around every neighborhood of a city.

    Parameters
    ----------
    city_df : DataFrame with 'Neighborhood', 'Latitude' and 'Longitude' columns.
    radius : value sent as the Foursquare ``radius`` query parameter.

    Returns
    -------
    DataFrame with one row per (neighborhood, venue) pair and the columns
    'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
    'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
    The seven columns are present even when no venue was found.

    Notes
    -----
    * Neighborhoods with a missing latitude or longitude are skipped (with a
      printed note) instead of being sent to the API as ``ll=nan,nan``.
    * A venue without any category gets ``None`` as its 'Venue Category'.
    * Uses the notebook-level VERSION and LIMIT constants and the Foursquare
      credentials from ``config``.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows=[]
    for neigh, lat, lng in zip(city_df['Neighborhood'], city_df['Latitude'], city_df['Longitude']):

        # Rows stored without coordinates cannot be queried
        if pd.isna(lat) or pd.isna(lng):
            print('No lat long data found for ',neigh)
            continue

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            config.foursquare_id,
            config.foursquare_secret,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                neigh,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing columns= here (rather than assigning .columns afterwards) also works for zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [148]:
# Fetch venues around each Baltimore neighborhood, with the same radius (250) used for its map in section 1.2.1
bmd_venues_df=getNearbyVenues(bmd_neigh_df,radius=250)
# Display the resulting venue table
bmd_venues_df

In [146]:
# Fetch venues around each Charlotte neighborhood, with the same radius (700) used for its map in section 1.2.2
cnc_venues_df=getNearbyVenues(cnc_neigh_df,radius=700)
# Display the resulting venue table
cnc_venues_df

In [149]:
# Fetch venues around each Charlottesville neighborhood, with the same radius (400) used for its map in section 1.2.3
cva_venues_df=getNearbyVenues(cva_neigh_df,radius=400)
# Display the resulting venue table
cva_venues_df

In [None]:
# Fetch venues around each Norfolk neighborhood, with the same radius (400) used for its map in section 1.2.4
norva_venues_df=getNearbyVenues(norva_neigh_df,radius=400)
# Display the resulting venue table
norva_venues_df

In [None]:
# Fetch venues around each Raleigh neighborhood, with the same radius (400) used for its map in section 1.2.5
rgh_venues_df=getNearbyVenues(rgh_neigh_df,radius=400)
# Display the resulting venue table
rgh_venues_df

In [None]:
# Fetch venues around each Richmond neighborhood, with the same radius (400) used for its map in section 1.2.6
rva_venues_df=getNearbyVenues(rva_neigh_df,radius=400)
# Display the resulting venue table
rva_venues_df

In [None]:
# Fetch venues around each Virginia Beach neighborhood, with the same radius (800) used for its map in section 1.2.7
vab_venues_df=getNearbyVenues(vab_neigh_df,radius=800)
# Display the resulting venue table
vab_venues_df

In [None]:
# Fetch venues around each Washington neighborhood, with the same radius (250) used for its map in section 1.2.8
wdc_venues_df=getNearbyVenues(wdc_neigh_df,radius=250)
# Display the resulting venue table
wdc_venues_df

## 2 Data Preparation

### 2.1 Dummy variables are created for each venue category.

#### 2.2 We group each city's venues by neighborhood, taking the mean of each category dummy. This gives the frequency of occurrence of each category, from 0 to 1.

#### 2.3 We cluster to determine the most similar neighborhoods.