In [128]:
# The code was removed by Watson Studio for sharing.

# Data sources to answer what neighborhood is the most 'Minnesota Nice'?

## The hypothetical data science firm of MCG has researched the 'Minnesota Nice' business problem and determined that a variety of data needs to be gathered. In particular, geolocation data will be critical from the Foursquare API.

### A more detailed description of data requirements follows:

* Neighborhood names along with census data from the American Community Survey will be pulled from the Minnesota open data website:
 * https://www.mncompass.org/profiles/neighborhoods/minneapolis-saint-paul#!community-areas 
* Neighborhood names will be associated with central latitude/longitude coordinates using the methods described in the StackOverflow post:
 * https://stackoverflow.com/questions/44616592/search-google-geocoding-api-by-neighborhood
   * This will use the Google API searching for a combination of Neighborhood + City and then pulling the lat-long coordinates.
   
* Foursquare data will be obtained similar to the Toronto neighborhood analysis. We plan to look at restaurants, parks, schools, and spiritual centers.
 * https://developer.foursquare.com/docs/resources/categories 
 
* Walk scores for the neighborhoods will be obtained from the 'Walk Score' API:
 * https://www.walkscore.com/professional/api.php  

## First we import a couple of useful packages

In [91]:
import pandas as pd
import numpy as np
import googlemaps
import requests
import urllib
import uszipcode

## Now I import a couple of .csv files that were pulled from the mncompass.org website. We'll combine and pull just the neighborhood names.

In [92]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stand-in ``__iter__`` that simply returns the integer 0.

    It is bound onto streaming bodies that have no ``__iter__`` attribute so
    that the ``hasattr(body, "__iter__")`` check done before ``pd.read_csv``
    succeeds.
    """
    return 0

In [93]:
# The code was removed by Watson Studio for sharing.

In [94]:
# Stream the 2010 census export from project object storage.
# NOTE(review): the storage client is presumably created in the hidden credentials cell above.
body = client_a28f8de00eed48e5bb907b36c94b68c9.get_object(Bucket='minnesotanice-donotdelete-pr-m1b1j2ihuwlryd',Key='MSP Neighborhoods_2010.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# The first line of each export is skipped before the header row is parsed.
df_data_1 = pd.read_csv(body, skiprows = 1)

# Same procedure for the 2013-2017 ACS export.
body = client_a28f8de00eed48e5bb907b36c94b68c9.get_object(Bucket='minnesotanice-donotdelete-pr-m1b1j2ihuwlryd',Key='MSP Neighborhoods_2013-2017.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_2 = pd.read_csv(body, skiprows = 1)

# Keep only the columns of interest and give both extracts the same short column
# names so they can be stacked. index=str turns the row labels into strings.
df1 = df_data_1[['geography', 'City', 'Average household size - count (2010 Census)', 'Total population - Total - count (2010 Census)', 'Population under age 18 - share (2010 Census)']]
df1 = df1.rename(index=str, columns={'Average household size - count (2010 Census)': "household_size", 'Total population - Total - count (2010 Census)': "total_population", 'Population under age 18 - share (2010 Census)' : 'share_population_under18'})
# NOTE(review): the under-18 share taken from the 2013-2017 file is its
# "(2010 Census)" column, not an ACS column -- confirm this is intended.
df2 = df_data_2[['geography', 'City', 'Average household size - count (2013-2017 ACS)', 'Total population - Total population - count (2013-2017 ACS)', 'Population under age 18 - share (2010 Census)']]
df2 = df2.rename(index=str, columns={'Average household size - count (2013-2017 ACS)': "household_size", 'Total population - Total population - count (2013-2017 ACS)': "total_population", 'Population under age 18 - share (2010 Census)' : 'share_population_under18'})

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of
# df1 followed by df2 in the same way (row labels are kept, not renumbered).
TwinCityNeighborhoods = pd.concat([df1, df2])

### Just quick sanity check on the import

In [95]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879


In [96]:
TwinCityNeighborhoods.shape

(113, 5)

### We know there are 102 neighborhoods from the website listing, so let's drop any duplicates along with rows that have missing values.

In [97]:
# Remove repeated rows first, then discard every row that still has a missing value.
TwinCityNeighborhoods = TwinCityNeighborhoods.drop_duplicates()
TwinCityNeighborhoods = TwinCityNeighborhoods.dropna()

### Now since we need lat-longs, we'll make a list of the neighborhoods we want to search for on the google API.

In [98]:
# Search string for geocoding, in the form "<geography>, <City>".
TwinCityNeighborhoods['neighborhood'] = (
    TwinCityNeighborhoods['geography'] + ", " + TwinCityNeighborhoods['City']
)

In [99]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis"
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis"
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis"
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis"
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis"


In [100]:
# The code was removed by Watson Studio for sharing.

In [101]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _geocode_lat_lng(loc):
    """Geocode ``loc`` once and return ``(lat, lng)`` of the first result.

    The result is memoised per ``loc`` so that asking for the latitude and then
    the longitude of the same place issues a single request through the
    module-level ``gmaps`` client instead of two.

    Raises
    ------
    IndexError
        If the geocoder returns an empty result list for ``loc``.
    """
    geocode_result = gmaps.geocode(loc)
    location = geocode_result[0]["geometry"]["location"]
    return location["lat"], location["lng"]


def geocode_address_lat(loc):
    """Return the latitude of the first geocoding result for ``loc``.

    Parameters
    ----------
    loc : str
        Search text handed to ``gmaps.geocode``.

    Raises
    ------
    IndexError
        If the geocoder returns no result for ``loc``.
    """
    return _geocode_lat_lng(loc)[0]


def geocode_address_lon(loc):
    """Return the longitude of the first geocoding result for ``loc``.

    Parameters
    ----------
    loc : str
        Search text handed to ``gmaps.geocode``.

    Raises
    ------
    IndexError
        If the geocoder returns no result for ``loc``.
    """
    return _geocode_lat_lng(loc)[1]


In [102]:
# Geocode every "<geography>, <City>" search string into coordinate columns.
for coordinate_column, geocoder in (
    ('latitude', geocode_address_lat),
    ('longitude', geocode_address_lon),
):
    TwinCityNeighborhoods[coordinate_column] = TwinCityNeighborhoods['neighborhood'].apply(geocoder)


In [103]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood,latitude,longitude
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis",44.998862,-93.217771
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis",44.97399,-93.227728
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis",45.00312,-93.241263
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis",44.994943,-93.2416
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis",44.975911,-93.254587


## To use the walkscore API, we also need an address to associate with the lat-long. We'll now do the reverse geocode to associate a human recognizable address.

In [104]:
# The code was removed by Watson Studio for sharing.

In [105]:
def geocode_address(loc):
    """Return the ``formatted_address`` of the first geocoding result for ``loc``.

    ``loc`` is passed unchanged to ``gmaps.geocode``; indexing the first
    element raises ``IndexError`` when the result list is empty.
    """
    first_hit = gmaps.geocode(loc)[0]
    return first_hit['formatted_address']

In [106]:
# Build a "lat,lon" text key per row, then geocode that key into a street address.
TwinCityNeighborhoods['lat-lon'] = (
    TwinCityNeighborhoods['latitude'].map(str)
    + ","
    + TwinCityNeighborhoods['longitude'].map(str)
)
TwinCityNeighborhoods['address'] = TwinCityNeighborhoods['lat-lon'].apply(geocode_address)

In [107]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood,latitude,longitude,lat-lon,address
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis",44.998862,-93.217771,"44.9988622,-93.2177712","Broadway St NE & Hoover St, Minneapolis, MN 55..."
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis",44.97399,-93.227728,"44.97399,-93.2277285","Oak St SE & Washington Ave SE, Minneapolis, MN..."
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis",45.00312,-93.241263,"45.0031203,-93.2412634","1653 Fillmore St NE, Minneapolis, MN 55413, USA"
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis",44.994943,-93.2416,"44.994943,-93.2415998","453 Fillmore St NE, Minneapolis, MN 55413, USA"
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis",44.975911,-93.254587,"44.9759107,-93.25458719999999","1001 S Washington Ave, Minneapolis, MN 55415, USA"


In [108]:
def walkscore(address, latitude, longitude, walk_key = walk_key):
    """Query the Walk Score API for one location and return its walk score.

    Parameters
    ----------
    address : str
        Street address sent as the ``address`` query parameter.
    latitude, longitude : float or str
        Coordinates sent as ``lat`` / ``lon`` (converted with ``str``).
    walk_key : str
        API key sent as ``wsapikey``. The default is whatever the
        module-level ``walk_key`` held when this function was defined.

    Returns
    -------
    The value of the ``walkscore`` field of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    KeyError
        If the JSON response carries no ``walkscore`` field.
    """
    # HTTPS keeps the API key out of clear-text traffic.
    walk_base_url = 'https://api.walkscore.com/score'
    query = {
        'format': 'json',
        'address': str(address),
        'lat': str(latitude),
        'lon': str(longitude),
        'wsapikey': walk_key,
        'transit': 1
    }
    # A timeout stops one unresponsive request from stalling the whole row-wise apply.
    response = requests.get(walk_base_url, params=query, timeout=30)
    response.raise_for_status()
    results = response.json()
    if 'walkscore' not in results:
        # Surface the API's own status code instead of a bare KeyError.
        raise KeyError(
            "Walk Score response for {!r} has no 'walkscore' field (status={!r})".format(
                address, results.get('status')
            )
        )
    return results['walkscore']

In [109]:
# Row-wise lookup: every row supplies its own address and coordinates.
TwinCityNeighborhoods['walkscore'] = TwinCityNeighborhoods.apply(
    lambda row: walkscore(row['address'], row['latitude'], row['longitude']),
    axis=1,
)

In [110]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood,latitude,longitude,lat-lon,address,walkscore
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis",44.998862,-93.217771,"44.9988622,-93.2177712","Broadway St NE & Hoover St, Minneapolis, MN 55...",34
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis",44.97399,-93.227728,"44.97399,-93.2277285","Oak St SE & Washington Ave SE, Minneapolis, MN...",77
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis",45.00312,-93.241263,"45.0031203,-93.2412634","1653 Fillmore St NE, Minneapolis, MN 55413, USA",62
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis",44.994943,-93.2416,"44.994943,-93.2415998","453 Fillmore St NE, Minneapolis, MN 55413, USA",59
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis",44.975911,-93.254587,"44.9759107,-93.25458719999999","1001 S Washington Ave, Minneapolis, MN 55415, USA",90


## Now we grab the median home value and median household income for each neighborhood. Note that these figures are only as specific as the ZIP code the neighborhood falls in.

In [111]:
from uszipcode import Zipcode
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)

def zip_search_income(latitude, longitude, radius = 30, returns = 1):
    """Return the median household income of the first ZIP code found near a point.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates passed to ``search.by_coordinates``.
    radius : int, optional
        Search radius forwarded to ``search.by_coordinates``.
    returns : int, optional
        Maximum number of matches requested; only the first one is used.

    Returns
    -------
    The ``median_household_income`` attribute of the first match, or ``None``
    when the search yields no match at all.
    """
    result = search.by_coordinates(latitude, longitude, radius = radius, returns = returns)
    if not result:
        # An empty result used to raise IndexError and abort the whole apply;
        # a missing value is returned instead so it can be imputed downstream.
        return None
    return result[0].median_household_income

def zip_search_homeval(latitude, longitude, radius = 30, returns = 1):
    """Return the median home value of the first ZIP code found near a point.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates passed to ``search.by_coordinates``.
    radius : int, optional
        Search radius forwarded to ``search.by_coordinates``.
    returns : int, optional
        Maximum number of matches requested; only the first one is used.

    Returns
    -------
    The ``median_home_value`` attribute of the first match, or ``None`` when
    the search yields no match at all.
    """
    result = search.by_coordinates(latitude, longitude, radius = radius, returns = returns)
    if not result:
        # An empty result used to raise IndexError and abort the whole apply;
        # a missing value is returned instead so it can be imputed downstream.
        return None
    return result[0].median_home_value

In [112]:
# Look up both ZIP-code statistics for every row from its coordinates.
TwinCityNeighborhoods['median_household_income'] = TwinCityNeighborhoods.apply(
    lambda row: zip_search_income(row['latitude'], row['longitude']),
    axis=1,
)
TwinCityNeighborhoods['median_home_value'] = TwinCityNeighborhoods.apply(
    lambda row: zip_search_homeval(row['latitude'], row['longitude']),
    axis=1,
)

## Looks like there are 3-5 neighborhoods that return NaN values for the median household income and home value. Instead of dropping those neighborhoods, we'll impute the NaN with the median for that column.

In [119]:
# Fill remaining gaps with each numeric column's median.
# numeric_only=True is needed on pandas >= 2.0, where median() raises a
# TypeError if text columns such as 'geography' or 'address' are included.
# Re-assigning (instead of inplace=True) keeps the cell safe to re-run.
TwinCityNeighborhoods = TwinCityNeighborhoods.fillna(
    TwinCityNeighborhoods.median(numeric_only=True)
)

In [120]:
TwinCityNeighborhoods.head()

Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood,latitude,longitude,lat-lon,address,walkscore,median_household_income,median_home_value
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis",44.998862,-93.217771,"44.9988622,-93.2177712","Broadway St NE & Hoover St, Minneapolis, MN 55...",34,45518.0,195800.0
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis",44.97399,-93.227728,"44.97399,-93.2277285","Oak St SE & Washington Ave SE, Minneapolis, MN...",77,53316.989899,214517.525773
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis",45.00312,-93.241263,"45.0031203,-93.2412634","1653 Fillmore St NE, Minneapolis, MN 55413, USA",62,45518.0,195800.0
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis",44.994943,-93.2416,"44.994943,-93.2415998","453 Fillmore St NE, Minneapolis, MN 55413, USA",59,45518.0,195800.0
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis",44.975911,-93.254587,"44.9759107,-93.25458719999999","1001 S Washington Ave, Minneapolis, MN 55415, USA",90,52736.0,241000.0


## Now let's export this .csv so we have an intermediate result in case we have to restart the kernel. Don't want to re-hit the GoogleAPI and run into paying. 

In [130]:
# Serialise the frame to CSV text (row labels omitted) and store it in the
# project's storage, replacing any earlier copy with the same file name.
project.save_data(
    data=TwinCityNeighborhoods.to_csv(index=False),
    file_name='TwinCityNeighborhoods.csv',
    overwrite=True,
)


{'file_name': 'TwinCityNeighborhoods.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'minnesotanice-donotdelete-pr-m1b1j2ihuwlryd',
 'asset_id': '12a09502-9234-4208-9577-6d8e26cdfa27'}

## Read the Twin City Neighborhoods dataframe back in...

In [131]:
# Fetch the saved CSV object from the project bucket and take its streaming body.
saved_object = client_a28f8de00eed48e5bb907b36c94b68c9.get_object(
    Bucket='minnesotanice-donotdelete-pr-m1b1j2ihuwlryd',
    Key='TwinCityNeighborhoods.csv',
)
body = saved_object['Body']
# pandas needs a file-like object, so attach an __iter__ method when it is missing
if not hasattr(body, "__iter__"):
    body.__iter__ = types.MethodType(__iter__, body)

TwinCityNeighborhoodsDF = pd.read_csv(body)
TwinCityNeighborhoodsDF.head()


Unnamed: 0,geography,City,household_size,total_population,share_population_under18,neighborhood,latitude,longitude,lat-lon,address,walkscore,median_household_income,median_home_value
0,Mid-City Industrial,Minneapolis,1.480278,213.0,0.03125,"Mid-City Industrial, Minneapolis",44.998862,-93.217771,"44.9988622,-93.2177712","Broadway St NE & Hoover St, Minneapolis, MN 55...",34,45518.0,195800.0
1,University of Minnesota,Minneapolis,3.722899,5421.0,0.384615,"University of Minnesota, Minneapolis",44.97399,-93.227728,"44.97399,-93.2277285","Oak St SE & Washington Ave SE, Minneapolis, MN...",77,53316.989899,214517.525773
2,Northeast Park,Minneapolis,2.390534,672.0,0.288684,"Northeast Park, Minneapolis",45.00312,-93.241263,"45.0031203,-93.2412634","1653 Fillmore St NE, Minneapolis, MN 55413, USA",62,45518.0,195800.0
3,Beltrami,Minneapolis,2.886937,1248.0,0.364183,"Beltrami, Minneapolis",44.994943,-93.2416,"44.994943,-93.2415998","453 Fillmore St NE, Minneapolis, MN 55413, USA",59,45518.0,195800.0
4,Downtown East,Minneapolis,1.629117,1254.0,0.110879,"Downtown East, Minneapolis",44.975911,-93.254587,"44.9759107,-93.25458719999999","1001 S Washington Ave, Minneapolis, MN 55415, USA",90,52736.0,241000.0


## Now I'm just going to start dropping in the code from the Toronto Analysis

In [134]:
from sklearn.cluster import KMeans


In [122]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

ModuleNotFoundError: No module named 'folium'

In [None]:
address = 'Toronto'

# Resolve the place name to coordinates with geopy's Nominatim geocoder.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the coordinates looked up earlier.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row of DF, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(DF['Latitude'], DF['Longitude'], DF['Borough'], DF['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [None]:
# Keep only the rows whose borough name contains "Toronto".
is_toronto_borough = DF['Borough'].str.contains('Toronto')
Toronto_DF = DF[is_toronto_borough]
# Display only: the result is not assigned, so Toronto_DF keeps 'Postal Code'.
Toronto_DF.drop(columns='Postal Code')

In [None]:
# Closer-zoom base map centred on the same coordinates.
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle per row of Toronto_DF, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    Toronto_DF['Latitude'],
    Toronto_DF['Longitude'],
    Toronto_DF['Borough'],
    Toronto_DF['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto2)

map_toronto2

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    """Collect Foursquare "explore" venues around each named location.

    One GET request is sent to the ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, authenticated with the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` values. Each name is
    printed as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences describing the locations to query.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    LIMIT : int, optional
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no
        venue is returned at all the frame is empty but still has these columns.
    """
    venue_columns = ['Neighbourhood',
                  'Neighbourhood Latitude',
                  'Neighbourhood Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps one stalled call from hanging the loop
        results = requests.get(url, timeout=30).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category used to raise IndexError here
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor (instead of assigning them
    # afterwards) also works when no venue was found, where the later
    # assignment failed with a length-mismatch ValueError.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [None]:
# Query venues around every neighbourhood in Toronto_DF (default radius and limit).
TorontoVenues = getNearbyVenues(
    names=Toronto_DF['Neighbourhood'],
    latitudes=Toronto_DF['Latitude'],
    longitudes=Toronto_DF['Longitude'],
)

In [None]:
print(TorontoVenues.shape)
TorontoVenues.head()

In [None]:
TorontoVenues.groupby('Neighbourhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(TorontoVenues['Venue Category'].unique())))

In [None]:


# One indicator column per venue category (no prefix added to the category names).
Toronto_onehot = pd.get_dummies(
    TorontoVenues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Re-attach the neighbourhood of each venue; it lands in the last position ...
Toronto_onehot['Neighbourhood'] = TorontoVenues['Neighbourhood']

# ... so rotate that last column to the front.
last_column = Toronto_onehot.columns[-1]
fixed_columns = [last_column, *Toronto_onehot.columns[:-1]]
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()



In [None]:
Toronto_onehot.shape

In [None]:
# Mean of every indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category.
Toronto_grouped = (
    Toronto_onehot
    .groupby('Neighbourhood')
    .mean()
    .reset_index()
)
Toronto_grouped

In [None]:
Toronto_grouped.shape

In [None]:
num_top_venues = 5

# Print, for every neighbourhood, its most frequent venue categories.
for hood in Toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    hood_rows = Toronto_grouped[Toronto_grouped['Neighbourhood'] == hood]
    # Transpose so each category becomes a row: (category name, frequency).
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue','freq']
    # Row 0 is the 'Neighbourhood' label itself, not a category.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (it holds the neighbourhood name);
    the remaining entries are ranked from largest to smallest value and the
    index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also have
    # hidden unrelated errors (and KeyboardInterrupt) behind the 'th' fallback.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = Toronto_grouped['Neighbourhood']

# Fill every row with that neighbourhood's top categories, most frequent first.
for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 5

# Cluster on the category frequencies only. The axis must be named explicitly:
# the positional form drop('Neighbourhood', 1) is rejected by pandas >= 2.0.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering (fixed random_state so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:


# Put the k-means label of every neighbourhood in front of its ranked venue columns.
# NOTE(review): insert() does not replace an existing column, so re-running this
# cell without rebuilding neighborhoods_venues_sorted is expected to fail.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach labels and ranked venues to the coordinates held in Toronto_DF,
# matching rows on the neighbourhood name.
venues_by_neighbourhood = neighborhoods_venues_sorted.set_index('Neighbourhood')
Toronto_merged = Toronto_DF.join(venues_by_neighbourhood, on='Neighbourhood')

Toronto_merged.head() # check the last columns!



In [None]:
# Base map centred on the coordinates looked up earlier.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# One circle per neighbourhood, coloured by its cluster label.
markers_colors = []
cluster_rows = zip(
    Toronto_merged['Latitude'],
    Toronto_merged['Longitude'],
    Toronto_merged['Neighbourhood'],
    Toronto_merged['Cluster Labels'],
)
for lat, lon, poi, cluster in cluster_rows:
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster-1 maps label 0 to the last colour in the list (index -1).
    cluster_color = rainbow[cluster-1]
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters