# IBM Applied Data Science Capstone Course by Coursera
### Week 5 Final Report
**_Opening a New Spa in Mumbai_**
- Build a dataframe of neighborhoods in Mumbai  by web scraping the data from Wikipedia page
- Get the geographical coordinates of the neighborhoods
- Obtain the venue data for the neighborhoods from Foursquare API
- Explore and cluster the neighborhoods
- Select the best cluster to open a new shopping mall
***
### 1. Import libraries

In [1]:
#!conda update -n base -c defaults conda --yes
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

#import BeautifulSoup # library to parse HTML and XML documents
!conda install -c conda beautifulsoup4 --yes

#import beautifulsoup4
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library
#!conda install -c conda geocoder --yes
#import geocoder # to get coordinates

print("Libraries imported.")

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge

The following packages will be UPDATED:

    certifi: 2019.6.16-py36_1  --> 2019.6.16-py36_1  conda-forge

The following packages will be DOWNGRADED:

    openssl: 1.0.2s-h7b6447c_0 --> 1.0.2r-h14c3975_0 conda-forge


Downloading and Extracting Packages
certifi-2019.6.16    | 149 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

## 2. Scrap data from Wikipedia page into a DataFrame

In [2]:
data = requests.get("https://en.wikipedia.org/wiki/Category:Neighbourhoods_in_Mumbai").text

In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [4]:
# create a list to store neighborhood data
neighborhoodList = []

In [5]:
# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

In [6]:
# create a new DataFrame from the list
kl_df = pd.DataFrame({"Neighborhood": neighborhoodList})
kl_df = kl_df.drop(kl_df.index[0])
kl_df['Neighborhood'] = kl_df['Neighborhood'].str.replace(',', ' ')
kl_df['Neighborhood'] = kl_df['Neighborhood'].str.replace('(', '')
kl_df['Neighborhood'] = kl_df['Neighborhood'].str.replace(')', '')
kl_df['Neighborhood'] = kl_df['Neighborhood'].str.replace('`', '')
kl_df['Neighborhood'] = kl_df['Neighborhood'].str.replace('.', '')
kl_df

Unnamed: 0,Neighborhood
1,Aarey Milk Colony
2,Agripada
3,Altamount Road
4,Amboli Mumbai
5,Amrut Nagar
6,Antop Hill
7,Anushakti Nagar
8,Asalfa
9,Badhwar Park
10,Baiganwadi


In [7]:
# print the number of rows of the dataframe
kl_df.shape

(136, 1)

## 3. Get the geographical coordinates

In [8]:
geolocator = Nominatim(user_agent="specify_your_app_name_here")
d = pd.DataFrame()
for neighborhood in kl_df["Neighborhood"].tolist():
    lat_lng_coords = geolocator.geocode('{}, Mumbai'.format(neighborhood), timeout=None)
    if lat_lng_coords:
        if lat_lng_coords.latitude:
            if lat_lng_coords.longitude:
                #print(neighborhood,lat_lng_coords.latitude,lat_lng_coords.longitude)
                temp = pd.DataFrame({'Neighborhood': [neighborhood], 'Latitude': [lat_lng_coords.latitude], 'Longitude':[lat_lng_coords.longitude]})
                d = pd.concat([d, temp])

kl_df = pd.DataFrame(d)

  after removing the cwd from sys.path.


In [15]:
# save the DataFrame as CSV file
kl_df.to_csv("kl_df.csv", index=False)

## 4. Create a map of Mumbai with neighborhoods superimposed on top

In [16]:
# get the coordinates of Mumbai
address = 'Mumbai'

geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Mumbai {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Mumbai 18.9387711, 72.8353355.


In [17]:
# create map of Mumbai using latitude and longitude values
map_kl = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_kl)  
    
map_kl

In [18]:
# save the map as HTML file
map_kl.save('map_kl.html')

## 5. Use the Foursquare API to explore the neighborhoods

In [19]:
# define Foursquare Credentials and Version
#CLIENT_ID = 'UKQKXBNYSA0MCNAHWV2MD4RUUMGOL0DUQJGIE5JOWRIWRLTM' # your Foursquare ID  
#CLIENT_SECRET = 'AA10SDBSCHFCVIU0MOQS0QFL0OZ2123DV3UHQYFQR0U5WAGC' # your Foursquare Secret

#CLIENT_ID = 'NXRIWY0MHBMSOII4A4TQ0QAY2LHN5XNOD1JUCGCW1QNZDUOX' # your Foursquare ID    
#CLIENT_SECRET = 'NHVZK1LYGWN0KZKLEZ20B2RYX2J0XV4GKWHEY0JXEWAROYNN' # your Foursquare Secret


CLIENT_ID = 'XF0NPKISHXD2Z0HQVRRYZAD3Y4P10ZTIT3P0RT15QLCXUMII' # your Foursquare ID    
CLIENT_SECRET = 'HRXFXCZIQSQJWABUFIPGHNSL51BIOPV3JFH1YNGDISMDU1TG' # your Foursquare Secret

VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XF0NPKISHXD2Z0HQVRRYZAD3Y4P10ZTIT3P0RT15QLCXUMII
CLIENT_SECRET:HRXFXCZIQSQJWABUFIPGHNSL51BIOPV3JFH1YNGDISMDU1TG


In [20]:
search_query = 'spa'
radius = 2000
LIMIT = 50
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=XF0NPKISHXD2Z0HQVRRYZAD3Y4P10ZTIT3P0RT15QLCXUMII&client_secret=HRXFXCZIQSQJWABUFIPGHNSL51BIOPV3JFH1YNGDISMDU1TG&ll=18.9387711,72.8353355&v=20180605&query=spa&radius=2000&limit=50'

In [21]:
results = requests.get(url).json()
#results

In [22]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.tail()

Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.crossStreet,location.lat,location.lng,location.labeledLatLngs,location.distance,location.postalCode,location.cc,location.city,location.state,location.country,location.formattedAddress
17,4e53fdf022711d4da5c37e3b,Spacematrix Mumbai,"[{'id': '4bf58dd8d48988d174941735', 'name': 'C...",v-1566240689,False,,,18.933575,72.824279,"[{'label': 'display', 'lat': 18.933575, 'lng':...",1299,,IN,Mumbai,Mahārāshtra,India,"[Mumbai, Mahārāshtra, India]"
18,4df98115e4cd56a649835da6,Space Matrix,"[{'id': '4bf58dd8d48988d124941735', 'name': 'O...",v-1566240689,False,,,18.933897,72.825356,"[{'label': 'display', 'lat': 18.933897, 'lng':...",1182,,IN,Mumbai,Mahārāshtra,India,"[Mumbai, Mahārāshtra, India]"
19,4b854372f964a5207a5331e3,Spaghetti Kitchen,[],v-1566240689,False,"CR2, 1st Floor, CR2, Inox Building,","Opp. Bajaj Bhavan, Nariman Point",18.926485,72.822358,"[{'label': 'display', 'lat': 18.92648457897770...",1933,400 021,IN,Mumbai,Mahārāshtra,India,"[CR2, 1st Floor, CR2, Inox Building, (Opp. Baj..."
20,54ddac44498ee59badc17324,DBS Corporate Services - Commercial Office Space,"[{'id': '4bf58dd8d48988d124941735', 'name': 'O...",v-1566240689,False,"Raheja Chambers, 2nd Floor, Nariman Point,",,18.923878,72.823592,"[{'label': 'display', 'lat': 18.92387808125605...",2068,400021,IN,Mumbai,Mahārāshtra,India,"[Raheja Chambers, 2nd Floor, Nariman Point,, M..."
21,4f2f48d5e4b0bd78b8cad496,Consulate General of Spain,"[{'id': '4bf58dd8d48988d12c951735', 'name': 'E...",v-1566240689,False,"Maker Chambers IV,",Nariman point,18.934619,72.826741,"[{'label': 'display', 'lat': 18.93461945611798...",1016,,IN,Mumbai,Mahārāshtra,India,"[Maker Chambers IV, (Nariman point), Mumbai, M..."


In [23]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]
dataframe_filtered = dataframe_filtered.rename(columns = {"location.lat": "lat", 
                                  "location.lng":"lng"})

dataframe_filtered['name'] = dataframe_filtered['name'].str.replace(',', ' ')
dataframe_filtered['name'] = dataframe_filtered['name'].str.replace('(', '')
dataframe_filtered['name'] = dataframe_filtered['name'].str.replace(')', '')
dataframe_filtered['name'] = dataframe_filtered['name'].str.replace('`', '')
dataframe_filtered['name'] = dataframe_filtered['name'].str.replace('.', '')
dataframe_filtered['name'] = dataframe_filtered['name'].str.replace('-', '')

dataframe_filtered

Unnamed: 0,name,categories,location.address,location.crossStreet,lat,lng,location.labeledLatLngs,location.distance,location.postalCode,location.cc,location.city,location.state,location.country,location.formattedAddress,id
0,Jiva Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",P. J. Ramchandani Marg,Apollo Bunder,18.921927,72.832845,"[{'label': 'display', 'lat': 18.92192744189440...",1893,400 001,IN,Mumbai,Mahārāshtra,India,"[P. J. Ramchandani Marg (Apollo Bunder), Mumba...",4b0587e7f964a52074a622e3
1,Monalisa salon spa,"[{'id': '4bf58dd8d48988d110951735', 'name': 'S...","Shop No.2,2&4,Fantasy Apts.J.P. Road, 7Bungalo...",,18.931828,72.830999,"[{'label': 'display', 'lat': 18.93182798364119...",897,,IN,MUMBAI-400061,,India,"[Shop No.2,2&4,Fantasy Apts.J.P. Road, 7Bungal...",50a12db7e4b0c6336f56cdd6
2,Mudra Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",,,18.930599,72.823088,"[{'label': 'display', 'lat': 18.93059939634803...",1578,,IN,,,India,[India],51a6122a498eb8bf3e6b993a
3,The Palms Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",Dhanraj Mahal,,18.924603,72.833486,"[{'label': 'display', 'lat': 18.92460257892114...",1589,400039,IN,Mumbai,Mahārāshtra,India,"[Dhanraj Mahal, Mumbai 400039, Mahārāshtra, In...",4f433c99e4b0c5334e561cc1
4,Myrah spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",,,18.944767,72.824715,"[{'label': 'display', 'lat': 18.944767, 'lng':...",1302,,IN,,,India,[India],56fa1abd498eafeb77ff18c8
5,Oro Salon And Spa,"[{'id': '4bf58dd8d48988d110951735', 'name': 'S...",Jabreya,,18.943823,72.823677,"[{'label': 'display', 'lat': 18.94382286071777...",1350,,IN,,,India,"[Jabreya, India]",54cf5f1d498e775a90c90be1
6,Oberoi Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",,,18.927245,72.820647,"[{'label': 'display', 'lat': 18.92724547743426...",2009,,IN,,,India,[India],50e17384e4b08361cd792a37
7,Sukho Thai Spa COLABA CAUSEWAY,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...","Crescent House, 1st Floor, Near McDonalds, Col...",,18.923409,72.831877,"[{'label': 'display', 'lat': 18.92340869186683...",1748,400039,IN,Mumbai,Mahārāshtra,India,"[Crescent House, 1st Floor, Near McDonalds, Co...",5be6be80772fbc002c8a6eea
8,Muktaa The Luxury Spa,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",Babubhai Chinai Road,1st floor,18.930225,72.82316,"[{'label': 'display', 'lat': 18.930225, 'lng':...",1596,400020,IN,Mumbai,Mahārāshtra,India,"[Babubhai Chinai Road (1st floor), Mumbai 4000...",5a9d20258c35dc30f43939cd
9,Suko Thai Spa Hotel Taj Mahel Mumbai,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",Mumbai,,18.922113,72.832765,"[{'label': 'display', 'lat': 18.92211256518864...",1874,,IN,,,India,"[Mumbai, India]",4feb23c7e4b076a38bc7b986


In [24]:
# add the spa as red circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.name):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='red',
        popup=label,
        fill = True,
        fill_color='red',
        fill_opacity=0.6
    ).add_to(map_kl)

# display map
map_kl