# 1. Import our libraries

First, let's import all the libraries needed for the project.

In [1]:
import os # Operating System
import pandas as pd
import numpy as np
import datetime as dt # Datetime
import hmac
import json # library to handle JSON files
!pip install shapely 

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.distance import vincenty

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium #import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Collecting shapely
  Downloading https://files.pythonhosted.org/packages/a2/6c/966fa320a88fc685c956af08135855fa84a1589631256abebf73721c26ed/Shapely-1.6.4.post2-cp35-cp35m-manylinux1_x86_64.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 689kB/s eta 0:00:01
[?25hInstalling collected packages: shapely
Successfully installed shapely-1.6.4.post2
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.19.0-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  24.43 MB/s
geopy-1.19.0-p 100% |################################| Time: 0:00:00  33.84 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTA

Before using data, we will have to explore and understand it.

# 2. Explore and Understand Data

We read the dataset that we collected from the Kaggle website into a pandas DataFrame and display its first five rows below.
The data covers more than 32,000 neighborhoods in the USA, with geolocation and income data for each.

In [99]:
import types
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that the
# credential is never stored in (or shared with) the notebook itself. A missing
# variable raises KeyError here, before any request is made.
client_0f2552ad732c46e6af256afaeed2ca10 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=os.environ['IBM_COS_API_KEY'],
    ibm_auth_endpoint="https://iam.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_0f2552ad732c46e6af256afaeed2ca10.get_object(Bucket='finalproject-donotdelete-pr-tov1wfmrfkusdh',Key='kaggle_income.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data = pd.read_csv(body, encoding = "ISO-8859-1")
df_data.head()


Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.77145,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
4,1011040,1,Alabama,AL,Mobile County,Dauphin Island,Dauphin Island,Town,place,36528,251,16204185,413605152,30.250913,-88.171268,77948,67225,54270,282.320328


In [100]:
df_data.shape

(32526, 19)

Our dataset consists of over 32000 rows and 19 columns. We will now prepare and preprocess data accordingly.

# 3. Data preparation and preprocessing

At this stage, we prepare our dataset for the modeling process, opting for the most suitable machine learning algorithm for our scope. Accordingly, we perform the following steps:

-  Rename the column names.
-  Select data only for the specific state we investigate.
-  Make a list of neighborhoods sorted by average income.
-  Narrow the research on the most profitable neighborhoods.
-  Create clusters for close neighborhoods.
-  Search Foursquare for the shopping malls nearest to the coordinates of the selected neighborhoods.
-  Create a target list of best shopping malls based on average income of inhabitants.
-  Plot recommended locations on the state map to add clear visibility of the commercial plan.

In [101]:
# Remove the columns this analysis does not use; only the identifier, state,
# city, coordinates and mean income are kept.

unused_columns = [
    'State_Code', 'State_ab', 'County', 'Place', 'Type', 'Primary',
    'Zip_Code', 'Area_Code', 'ALand', 'AWater', 'Median', 'Stdev', 'sum_w',
]
df_data.drop(unused_columns, axis=1, inplace=True)
df_data.head()

Unnamed: 0,id,State_Name,City,Lat,Lon,Mean
0,1011000,Alabama,Chickasaw,30.77145,-88.079697,38773
1,1011010,Alabama,Louisville,31.708516,-85.611039,37725
2,1011020,Alabama,Columbiana,33.191452,-86.615618,54606
3,1011030,Alabama,Satsuma,30.874343,-88.009442,63919
4,1011040,Alabama,Dauphin Island,30.250913,-88.171268,77948


In [102]:
# Give the identifier, income and coordinate columns more descriptive names

column_names = {
    'id': 'Id_neighborhood',
    'Mean': 'Avg_income',
    'Lat': 'Latitude',
    'Lon': 'Longitude',
}
df_data.rename(columns=column_names, inplace=True)
df_data.head()

Unnamed: 0,Id_neighborhood,State_Name,City,Latitude,Longitude,Avg_income
0,1011000,Alabama,Chickasaw,30.77145,-88.079697,38773
1,1011010,Alabama,Louisville,31.708516,-85.611039,37725
2,1011020,Alabama,Columbiana,33.191452,-86.615618,54606
3,1011030,Alabama,Satsuma,30.874343,-88.009442,63919
4,1011040,Alabama,Dauphin Island,30.250913,-88.171268,77948


In [103]:
# Selection of a specific state for our research.

State_name = 'California'

In [104]:
# Keep only the rows that belong to the state under study.

outside_state = (df_data['State_Name'] != State_name).values
df_data.drop(df_data.index[outside_state], inplace=True)
df_data.head()

Unnamed: 0,Id_neighborhood,State_Name,City,Latitude,Longitude,Avg_income
1588,6011848,California,Bieber,41.199312,-120.956779,54602
1589,6011858,California,Blocksburg,40.161347,-123.615729,24830
1590,6011868,California,Alpine,32.842286,-116.756044,91662
1591,6011878,California,Watsonville,36.959483,-121.781739,63761
1592,6011888,California,Aptos,36.991147,-121.892758,96841


In [105]:
df_data.shape

(3280, 6)

In [106]:
print('Now, our dataset is made of '+ str(df_data.shape[0])+ ' locations in our specific state.')

Now, our dataset is made of 3280 locations in our specific state.


Let's describe the dataset.

In [107]:
df_data['Avg_income'].describe()

count      3280.000000
mean      78126.737805
std       33477.779862
min           0.000000
25%       53095.500000
50%       72331.500000
75%       98073.000000
max      242857.000000
Name: Avg_income, dtype: float64

In [108]:
# Rows with a reported Avg_income of 0 are removed from the dataset.

zero_income = (df_data['Avg_income'] == 0).values
df_data.drop(df_data.index[zero_income], inplace=True)
df_data.head()

Unnamed: 0,Id_neighborhood,State_Name,City,Latitude,Longitude,Avg_income
1588,6011848,California,Bieber,41.199312,-120.956779,54602
1589,6011858,California,Blocksburg,40.161347,-123.615729,24830
1590,6011868,California,Alpine,32.842286,-116.756044,91662
1591,6011878,California,Watsonville,36.959483,-121.781739,63761
1592,6011888,California,Aptos,36.991147,-121.892758,96841


In [109]:
df_data.shape

(3258, 6)

In [110]:
df_data['Avg_income'].describe()

count      3258.000000
mean      78654.297115
std       32967.022720
min       15740.000000
25%       53554.250000
50%       72636.000000
75%       98369.500000
max      242857.000000
Name: Avg_income, dtype: float64

In [111]:
# Order the neighborhoods from highest to lowest average income so that the
# most valuable locations come first.

df_data = df_data.sort_values('Avg_income', ascending=False)

In [112]:
df_data.head(100)

Unnamed: 0,Id_neighborhood,State_Name,City,Latitude,Longitude,Avg_income
3341,60224419,California,San Diego,32.737719,-117.197744,242857
2910,60220539,California,Huntington Beach,33.679397,-118.020316,203910
2157,60213769,California,Los Angeles,34.043325,-118.404305,201716
3677,60227449,California,Menlo Park,37.436442,-122.192328,192856
3814,60228689,California,Palo Alto,37.451186,-122.147024,190876
2147,60213679,California,Los Angeles,34.091809,-118.508627,189728
2547,60217279,California,Manhattan Beach,33.880731,-118.410369,187441
3451,60225409,California,Solana Beach,32.989037,-117.240270,186307
3811,60228659,California,Los Altos,37.391307,-122.107655,184961
2919,60220619,California,Huntington Beach,33.722749,-118.061046,182565


# 4. Select the most valuable locations

In [113]:
# We will concentrate on the top income neighborhoods.

top = 50

# df_data was sorted by Avg_income in descending order above, so its first
# `top` rows are the highest-income neighborhoods. A positional slice is used
# instead of dropping `tail(shape[0] - top)`: when fewer than `top` rows are
# available that difference is negative and tail() would select (and drop)
# nearly every row. Slicing simply keeps all available rows in that case.
df_data = df_data.iloc[:top].copy()

In [114]:
df_data.head(top)

Unnamed: 0,Id_neighborhood,State_Name,City,Latitude,Longitude,Avg_income
3341,60224419,California,San Diego,32.737719,-117.197744,242857
2910,60220539,California,Huntington Beach,33.679397,-118.020316,203910
2157,60213769,California,Los Angeles,34.043325,-118.404305,201716
3677,60227449,California,Menlo Park,37.436442,-122.192328,192856
3814,60228689,California,Palo Alto,37.451186,-122.147024,190876
2147,60213679,California,Los Angeles,34.091809,-118.508627,189728
2547,60217279,California,Manhattan Beach,33.880731,-118.410369,187441
3451,60225409,California,Solana Beach,32.989037,-117.24027,186307
3811,60228659,California,Los Altos,37.391307,-122.107655,184961
2919,60220619,California,Huntington Beach,33.722749,-118.061046,182565


In [115]:
df_data.shape

(50, 6)

# 5. Create a map of the most valuable locations

Now we create a map of the state and plot all the selected locations in our dataset.

In [116]:
address = State_name+', USA'

# Nominatim requires an application-specific user agent; constructing it
# without one is deprecated in geopy 1.x and rejected by newer releases.
geolocator = Nominatim(user_agent='shopping_mall_location_analysis')
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of '+State_name+' are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinates of California are 36.7014631, -118.7559974.


In [117]:
# create a map centred on the selected state's coordinates
map_state = folium.Map(location=[latitude, longitude], zoom_start=7)

# add one circle marker per selected neighborhood; the popup shows the city
# name and the average income
neighborhood_rows = zip(df_data['Latitude'], df_data['Longitude'], df_data['Avg_income'], df_data['City'])
for lat, lng, income, city in neighborhood_rows:
    label = folium.Popup('{}, {}'.format(city, income), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_state)

map_state

In [118]:
print('We can see all the '+str(top)+' most valuable locations plotted on the state map.')

We can see all the 50 most valuable locations plotted on the state map.


# 6. Create geolocalisation clusters and extract centroids

Run DBSCAN to group nearby neighborhoods into clusters and find the centermost point of each cluster.

In [119]:
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

# (latitude, longitude) pairs as a 2-column NumPy array, one row per location.
# DataFrame.as_matrix() is deprecated and removed in later pandas releases;
# selecting the columns and taking .values yields the same array.
coords = df_data[['Latitude', 'Longitude']].values


In [120]:
# The haversine metric works on angles in radians, so the 8 km neighbourhood
# radius is divided by the kilometres-per-radian constant and the coordinates
# are converted with np.radians before fitting.
kms_per_radian = 6371.0088
epsilon = 8 / kms_per_radian

db = DBSCAN(
    eps=epsilon,
    min_samples=1,
    algorithm='ball_tree',
    metric='haversine',
)
db.fit(np.radians(coords))

cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))

# One array of member coordinates per cluster label 0 .. num_clusters - 1.
clusters = pd.Series([coords[cluster_labels == label] for label in range(num_clusters)])
print('We reduce the number of locations (clusters centroids) from '+str(top)+' to {}.'.format(num_clusters))

We reduce the number of locations (clusters centroids) from 50 to 23.


In [121]:
# For each cluster, pick the member location that lies closest to the cluster's centroid.

def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies nearest to its centroid.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        The points of one cluster, in the same order as the columns of
        ``coords`` (latitude first, then longitude).

    Returns
    -------
    tuple
        The coordinates of the cluster member with the smallest great-circle
        distance to the centroid of all members.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def distance_to_centroid(point):
        # Great-circle distance between this member and the centroid.
        return great_circle(point, centroid).m

    return tuple(min(cluster, key=distance_to_centroid))
centermost_points = clusters.map(get_centermost_point)

In [122]:
# Split the centermost points into latitude and longitude sequences and store
# them in a dataframe whose index is named 'centroid_id'.
lats, lons = zip(*centermost_points)
rep_points = pd.DataFrame({'lon':lons, 'lat':lats}).rename_axis('centroid_id')

In [123]:
rep_points

Unnamed: 0_level_0,lat,lon
centroid_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,32.737719,-117.197744
1,33.722749,-118.061046
2,34.108023,-118.456964
3,37.371934,-122.101819
4,33.880731,-118.410369
5,32.989037,-117.24027
6,37.879801,-122.136483
7,34.160627,-118.773024
8,33.639447,-117.629962
9,37.309034,-121.739794


# 7. Create a map of the defined clusters

In [124]:
# create a map of the state centred on its coordinates, for the cluster centroids
map_state_centroids = folium.Map(location=[latitude, longitude], zoom_start=7)

# add one green circle marker per centroid; the popup shows its coordinates
centroid_rows = zip(rep_points['lat'], rep_points['lon'])
for lat, lng in centroid_rows:
    label = folium.Popup('{}, {}'.format(lat, lng), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_state_centroids)

map_state_centroids

In [125]:
print('We can see the '+str(num_clusters)+' most valuable locations centroids plotted on the state map.')

We can see the 23 most valuable locations centroids plotted on the state map.


# 8. Explore data of shopping malls close to the centroids

Now we will create a request to identify the closest shopping malls to our most valuable locations centroids.
We will use Foursquare API to get our data.

In [126]:
# Define Foursquare Credentials and Version

# The credentials are read from environment variables so that they are never
# stored in the notebook. A missing variable raises KeyError here, before any
# API request is attempted.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # Foursquare Secret
VERSION = '20190101' # Foursquare API version

# Do not echo the credentials: cell outputs are saved with the notebook.
print('Foursquare credentials loaded.')

Your credentials:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [127]:
# Define Foursquare Category ID for Shopping Malls from Foursquare website

Malls_id='4bf58dd8d48988d1fd941735'

In [128]:
# Create a function to explore shopping malls locations near specific coordinates.

def getNearbyVenues(names, latitudes, longitudes, radius=2000, LIMIT=10):
    """Query the Foursquare "explore" endpoint for shopping malls near each location.

    The request uses the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``,
    ``VERSION`` and ``Malls_id`` values.

    Parameters
    ----------
    names : iterable
        Label of each location; printed as a progress indicator.
    latitudes, longitudes : iterable
        Coordinates of each location, paired element-wise with ``names``.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    LIMIT : int, optional
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Mall name',
        'Mall Latitude', 'Mall Longitude' and 'Category'. The frame is empty
        (but still has these four columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Mall name', 'Mall Latitude', 'Mall Longitude', 'Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build (and URL-encode) the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'categoryId': Malls_id,
            'limit': LIMIT,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces API errors (bad credentials, quota) directly
        # instead of as a KeyError while parsing the payload.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Some venues carry no category; record None rather than fail.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the frame well-formed
    # when `rows` is empty; assigning four names to a column-less frame
    # afterwards raises a length-mismatch ValueError.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [129]:
# Run the above function on each centroid in rep_points and store the result in a new dataframe called location_malls.

location_malls = getNearbyVenues(names=rep_points.index,
                                   latitudes=rep_points['lat'],
                                   longitudes=rep_points['lon']
                                  )

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


# 9. Create a list of the most valuable shopping malls with coordinates

In [130]:
# Foursquare sometimes returns the same shopping mall several times with slightly
# different coordinates. Keep only the first row for each mall name.

location_malls = location_malls.drop_duplicates(subset=['Mall name'], keep='first')
location_malls

Unnamed: 0,Mall name,Mall Latitude,Mall Longitude,Category
0,Fiesta de Reyes,32.755402,-117.19734,Shopping Mall
1,Loma Square,32.747835,-117.205684,Shopping Mall
2,Sunset Beach Farmer's Market,33.723658,-118.076168,Shopping Mall
3,Harbour Pointe,33.72287,-118.041826,Shopping Mall
4,Huntington Harbour Mall,33.7174,-118.051013,Shopping Mall
5,Rancho Shopping Center,37.360838,-122.097417,Shopping Mall
6,Metlox - Manhattan Beach,33.885976,-118.408416,Shopping Mall
7,Flower Hill Promenade,32.981108,-117.251224,Shopping Mall
8,Polo Plaza,32.983147,-117.229405,Shopping Mall
9,Lomas Santa Fe Plaza,32.995811,-117.257901,Shopping Mall


In [131]:
location_malls.shape

(23, 4)

In [132]:
# The shopping-mall category ID can also return venues of other categories
# (e.g. parking lots). Keep only the rows whose category is 'Shopping Mall'.

is_shopping_mall = location_malls['Category'] == 'Shopping Mall'
location_malls = location_malls[is_shopping_mall]

In [133]:
location_malls

Unnamed: 0,Mall name,Mall Latitude,Mall Longitude,Category
0,Fiesta de Reyes,32.755402,-117.19734,Shopping Mall
1,Loma Square,32.747835,-117.205684,Shopping Mall
2,Sunset Beach Farmer's Market,33.723658,-118.076168,Shopping Mall
3,Harbour Pointe,33.72287,-118.041826,Shopping Mall
4,Huntington Harbour Mall,33.7174,-118.051013,Shopping Mall
5,Rancho Shopping Center,37.360838,-122.097417,Shopping Mall
6,Metlox - Manhattan Beach,33.885976,-118.408416,Shopping Mall
7,Flower Hill Promenade,32.981108,-117.251224,Shopping Mall
8,Polo Plaza,32.983147,-117.229405,Shopping Mall
9,Lomas Santa Fe Plaza,32.995811,-117.257901,Shopping Mall


In [134]:
location_malls.shape

(23, 4)

In [135]:
print('Our final list consist of '+str(location_malls.shape[0])+' most valuable shopping malls in '+State_name+'.')

Our final list consist of 23 most valuable shopping malls in California.


# 10. Create a map of the most valuable shopping malls in the state

In [136]:
# create a map of the state centred on its coordinates, for the selected malls
map_state_malls = folium.Map(location=[latitude, longitude], zoom_start=7)

# add one circle marker per shopping mall; the popup shows its name and coordinates
mall_rows = zip(location_malls['Mall Latitude'], location_malls['Mall Longitude'], location_malls['Mall name'])
for lat, lng, name in mall_rows:
    label = folium.Popup('{}, {}, {}'.format(name, lat, lng), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='purple',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_state_malls)

map_state_malls

In [137]:
# Report the number of malls actually plotted (rows of location_malls), not the
# number of clusters: the two only match by coincidence.
print('We can see the '+str(location_malls.shape[0])+' most valuable shopping malls plotted on the map of '+State_name+'.')

We can see the 23 most valuable shopping malls plotted on the map of California.


Our analysis is over. :)