# Coursera Capstone Project - Battle of the Neighbourhood
## Hongfang Lu

This project uses the Foursquare API to create visualizations and perform machine learning analysis.

London Moscow New York

In [39]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

from geopy.geocoders import Nominatim #convert address to latitude and longitude

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.colors as colors
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 1. Get London neighbourhoods information

Get the list of areas in London from the Wikipedia page

In [2]:
# Download the Wikipedia page that lists the areas of London.
page_london = requests.get('https://en.wikipedia.org/wiki/List_of_areas_of_London')
# Echo the response object so the notebook shows the HTTP status.
page_london

<Response [200]>

Using Beautiful Soup to find the table that contains the neighbourhoods information

In [5]:
# Parse the downloaded HTML and collect every <table class="wikitable"> element.
soup = BeautifulSoup(page_london.content, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})

Get the column headers of the table, replacing every space (including Unicode non-breaking spaces) with an underscore and trimming surrounding whitespace

In [218]:
# The header cells are the children of the first row of the first wikitable.
columns_row = tables[0].tbody.contents[0].children

# Normalise each header: non-breaking and ordinary spaces become underscores,
# then surrounding whitespace (such as a trailing newline) is trimmed.
# The result is built as a real list rather than a lazy, single-use ``map``
# object, so it can be concatenated, printed and re-read any number of times
# regardless of which later cells have been run.
columns_name = [
    co.text.replace(u'\xa0', '_').replace(' ', '_').strip()
    for co in columns_row
    if co != '\n'  # skip the newline text nodes between the cells
]

Get the column names

In [217]:
# Make sure the header names are a concrete list, then show them.
columns_name = list(columns_name)
print(columns_name)

['Location', 'London_borough', 'Post_town', 'Postcode_district', 'Dial_code', 'OS_grid_ref']


Get the rows for each neighbourhood from the parsed webpage

In [144]:
# Every child of the table body after the header row.
locations = tables[0].tbody.contents[1:]

all_data = []
for location in locations:
    # Newline text nodes sit between the row elements; they carry no data.
    if location == '\n':
        continue

    loc_data = []
    for co in location:
        if co == '\n':
            continue
        try:
            # When the cell's first child is a link, record its href ahead of
            # the cell text.
            urll = co.contents[0]['href']
        except (AttributeError, IndexError, KeyError, TypeError):
            # The first child is plain text, has no href, or the cell is
            # empty: there is no link to record. Only these lookup failures
            # are tolerated, so interrupts and unrelated errors still surface.
            pass
        else:
            loc_data.append(urll)
        loc_data.append(co.text.strip())
    all_data.append(loc_data)
    

In [215]:
# Number of rows collected from the table.
len(all_data)

533

This utility function converts a coordinate string such as '51.5135°N 0.2707°W' into a `[latitude, longitude]` list of signed decimal degrees, e.g. \[51.5135, -0.2707\] (south and west become negative).

In [131]:
def process_lan_lat(strn):
    """Convert a decimal-degree coordinate string to ``[latitude, longitude]``.

    The string must contain a latitude followed by a longitude, each written
    as a number, an optional degree sign and an upper-case hemisphere letter,
    e.g. ``'51.4864°N 0.1109°E'`` or ``'1.23N, 0.33W'``. Any separator between
    the two values is accepted.

    Parameters
    ----------
    strn : str
        The coordinate text to parse.

    Returns
    -------
    list of float
        ``[latitude, longitude]`` in signed decimal degrees; southern
        latitudes and western longitudes are negative.

    Raises
    ------
    ValueError
        If the text does not contain exactly one N/S value followed by one
        E/W value.
    """
    # Match "<number><optional °><hemisphere>" instead of slicing at fixed
    # offsets, so the result does not depend on the exact separator or on the
    # degree sign being present.
    parts = re.findall(r'([-+]?\d+(?:\.\d+)?)\s*°?\s*([NSEW])', strn)

    if len(parts) != 2 or parts[0][1] not in 'NS' or parts[1][1] not in 'EW':
        raise ValueError('cannot parse coordinates from {!r}'.format(strn))

    (lat, lat_hemi), (lon, lon_hemi) = parts

    lat_sign = 1.0 if lat_hemi == 'N' else -1.0
    lon_sign = 1.0 if lon_hemi == 'E' else -1.0
    return [float(lat) * lat_sign, float(lon) * lon_sign]
        

Parse the Wikipedia page of each neighbourhood and extract its geographical coordinates

In [132]:
%%time
lat_lon_all = [None for i in range(len(all_data))]

for i in range(len(all_data)):
    urll = 'https://en.wikipedia.org' + all_data[i][0]
    page = requests.get(urll)
    
    soup = BeautifulSoup(page.content, 'html.parser')
    
    lls = soup.find_all('span', {'class': 'geo-dec'})
    if len(lls) > 0:
        print(all_data[i][1])
        print(lls[0].text)

        lat_lon_all[i] = str(lls[0].text)


Abbey Wood
51.4864°N 0.1109°E
Acton
51.513519°N 0.270661°W
Addington
51.3583°N 0.0305°W
Addiscombe
51.381°N 0.0663°W
Albany Park
51.4264°N 0.1026°E
Aldborough Hatch
51.583550°N 0.10102165°E
Aldgate
51.5132°N 0.0777°W
Aldwych
51.5132°N 0.1167°W
Alperton
51.5405°N 0.2994°W
Anerley
51.4147°N 0.067°W
Angel
51.5321°N 0.1066°W
Aperfield
51.3069°N 0.0448°E
Archway
51.566°N 0.1338°W
Ardleigh Green
51.5833°N 0.2165°E
Arkley
51.6477°N 0.2311°W
Arnos Grove
51.6163°N 0.1286°W
Balham
51.4434°N 0.1525°W
Bankside
51.5082°N 0.1001°W
Barbican
51.51917°N 0.09389°W
Barking
51.54°N 0.08°E
Barkingside
51.585833°N 0.084444°E
Barnehurst
51.4613°N 0.1690°E
Barnes
51.474°N 0.236°W
Barnes Cray
51.4578°N 0.1960°E
Barnet Gate
51.643083°N 0.240570°W
Barnet (also Chipping Barnet, High Barnet)
51.6444°N 0.1997°W
Barnsbury
51.5442°N 0.1171°W
Battersea
51.46377°N 0.16771°W
Bayswater
51.5095°N 0.1929°W
Beckenham
51.408°N 0.022°W
Beckton
51.5146°N 0.0673°E
Becontree
51.5487°N 0.1427°E
Becontree Heath
51.563595°N 0.15261

Knightsbridge
51.5017°N 0.1621°W
Ladywell
51.453°N 0.017°W
Lambeth
51.4903°N 0.1193°W
Lamorbey
51.438333°N 0.107549°E
Lampton
51.478109°N 0.363268°W
Lea Bridge
51.5621°N 0.0456°W
Leamouth
51.510719°N 0.006317°E
Leaves Green
51.335384°N 0.02952°E
Lee
51.4522°N 0.0086°E
Lessness Heath
51.4857°N 0.15177°E
Lewisham
51.461°N 0.005°W
Leyton
51.560558°N 0.015465°W
Leytonstone
51.569°N 0.010°E
Limehouse
51.5158°N 0.0318°W
Lisson Grove
51.52539°N 0.16969°W
Little Ilford
51.5509°N 0.0549°E
Little Venice
51.5216°N 0.1821°W
Locksbottom
51.3708°N 0.0608°E
Longford
51.4777°N 0.4943°W
Longlands
51.4302°N 0.0854°E
Lower Clapton
51.55212°N 0.04704°W
Lower Morden
51.3884°N 0.2198°W
Loxford
51.54807°N 0.08184°E
Maida Vale
51.5274°N 0.1899°W
Malden Rushett
51.336769°N 0.320285°W
Manor House
51.57182°N 0.09671°W
Manor Park
51.55033°N 0.056219°E
Marks Gate
51.593°N 0.1448°E
Maryland
51.545°N 0.002°W
Marylebone (also St Marylebone)
51.5177°N 0.1470°W
Mayfair
51.508755°N 0.14743°W
Maze Hill
51.481209°N 0.0036

Transform the geographical coordinates into lists

In [145]:
%%time
for i in range(len(all_data)):
    if lat_lon_all[i]:
        lat_lon = process_lan_lat(lat_lon_all[i])
    else:
        lat_lon = [None, None]
    
    all_data[i].extend(lat_lon)

Wall time: 3 ms


Create the dataframe.

In [207]:
# Build the frame: 'Link' first, then the scraped header names, then the
# coordinate pair appended to each row.
london_df = pd.DataFrame(all_data, columns=['Link'] + columns_name + ['Latitude', 'Longitude'])
london_df.head()

Unnamed: 0,Link,Location,London_borough,Post_town,Postcode_district,Dial_code,OS_grid_ref,Latitude,Longitude
0,/wiki/Abbey_Wood,Abbey Wood,"Bexley, Greenwich [1]",LONDON,SE2,20,TQ465785,51.4864,0.1109
1,"/wiki/Acton,_London",Acton,"Ealing, Hammersmith and Fulham[2]",LONDON,"W3, W4",20,TQ205805,51.513519,-0.270661
2,"/wiki/Addington,_London",Addington,Croydon[2],CROYDON,CR0,20,TQ375645,51.3583,-0.0305
3,/wiki/Addiscombe,Addiscombe,Croydon[2],CROYDON,CR0,20,TQ345665,51.381,-0.0663
4,"/wiki/Albany_Park,_Bexley",Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728,51.4264,0.1026


Two rows are missing latitude/longitude information: rows 109 and 355.

In [208]:
# Show the rows whose Latitude is missing.
london_df[london_df['Latitude'].isnull()]

Unnamed: 0,Link,Location,London_borough,Post_town,Postcode_district,Dial_code,OS_grid_ref,Latitude,Longitude
109,/wiki/Colyers_(ward),Colyers,Bexley,ERITH,DA8,1322,TQ512768,,
355,/wiki/Oval,Oval,Lambeth,LONDON,"SW8, SW9, SE11",20,TQ315575,,


Remove the rows with NaN values.

In [210]:
# Drop, in place, every row that contains a missing value.
london_df.dropna(axis=0, inplace=True)

In [211]:
# (rows, columns) of the frame at this point.
london_df.shape

(531, 9)

In [212]:
# Place name handed to the geocoder.
address = 'London, England'

# Geocode the city with geopy's Nominatim client.
geolocator = Nominatim(user_agent="ld_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London are {}, {}.'.format(latitude, longitude))

# NOTE: despite the "lon_lat" name, the list is ordered [latitude, longitude].
london_lon_lat = [latitude, longitude]

The geograpical coordinate of London are 51.5073219, -0.1276474.


In [214]:
# create map of London using latitude and longitude values
london = folium.Map(location=london_lon_lat, zoom_start=11)

# add one circle marker per neighbourhood; the popup reads "<neighbourhood>, <borough>"
marker_rows = zip(
    london_df['Latitude'],
    london_df['Longitude'],
    london_df['London_borough'],
    london_df['Location'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(london)

london

## 2. Get Moscow information

In [220]:
# Download the Wikipedia page on the administrative divisions of Moscow.
page_mos = requests.get('https://en.wikipedia.org/wiki/Administrative_divisions_of_Moscow')
# Echo the response object so the notebook shows the HTTP status.
page_mos

<Response [200]>

In [242]:
# Parse the page and find the <div> elements labelled by the element with id
# "Administrative_divisions_of_Moscow".
# NOTE(review): presumably this is the navigation box listing all districts - confirm against the live page.
soup_mos = BeautifulSoup(page_mos.content, 'html.parser')
tables_mos = soup_mos.find_all('div', attrs={"aria-labelledby": "Administrative_divisions_of_Moscow"})

In [255]:
# Keep children 1..10 of the first match's table body; later cells iterate
# them as one entry per okrug.
# NOTE(review): the hard-coded slice depends on the page layout at scrape time - verify before re-running.
tables_mos_det = tables_mos[0].table.tbody.contents[1:11]

In [265]:
# Sanity check: list items of the first entry, and the text of the first item.
listss = tables_mos_det[0].td.find_all('li')
listss[0].text

'Arbat'

In [266]:
# Flatten the okrug -> district structure into one record per district:
# [district link, district name, okrug link, okrug name].
all_data_mos = []
for okrug in tables_mos_det:
    # The header cell of each entry carries the okrug's name and link.
    okrug_name = okrug.th.text
    okrug_link = okrug.th.a['href']
    all_data_mos.extend([
        [divi.a['href'], divi.text, okrug_link, okrug_name]
        for divi in okrug.td.find_all('li')
    ])

In [269]:
# Inspect the collected [district link, district name, okrug link, okrug name] records.
all_data_mos

[['/wiki/Arbat_District',
  'Arbat',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Basmanny_District',
  'Basmanny',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Khamovniki_District',
  'Khamovniki',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Krasnoselsky_District,_Moscow',
  'Krasnoselsky',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Meshchansky_District',
  'Meshchansky',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Presnensky_District',
  'Presnensky',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Tagansky_District',
  'Tagansky',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Tverskoy_District',
  'Tverskoy',
  '/wiki/Central_Administrative_Okrug',
  'Central Administrative Okrug'],
 ['/wiki/Yakimanka_District',
  

In [270]:
%%time
lat_lon_all_mos = [None for i in range(len(all_data_mos))]

for i in range(len(all_data_mos)):
    urll = 'https://en.wikipedia.org' + all_data_mos[i][0]
    page = requests.get(urll)
    
    soup = BeautifulSoup(page.content, 'html.parser')
    
    lls = soup.find_all('span', {'class': 'geo-dec'})
    if len(lls) > 0:
        print(all_data_mos[i][1])
        print(lls[0].text)

        lat_lon_all_mos[i] = str(lls[0].text)

Arbat
55.751°N 37.590°E
Basmanny
55.7649472°N 37.6715833°E
Khamovniki
55.72611°N 37.57111°E
Krasnoselsky
55.77861°N 37.65639°E
Meshchansky
55.77583°N 37.62750°E
Presnensky
55.74667°N 37.53694°E
Tagansky
55.74139°N 37.65417°E
Tverskoy
55.767°N 37.600°E
Yakimanka
55.73139°N 37.60389°E
Zamoskvorechye
55.73972°N 37.62500°E
Aeroport
55.79139°N 37.55944°E
Begovoy
55.78667°N 37.57556°E
Beskudnikovsky
55.86528°N 37.56500°E
Dmitrovsky
55.88861°N 37.52361°E
Golovinsky
55.84583°N 37.51694°E
Khoroshyovsky
55.78833°N 37.52944°E
Khovrino
55.87222°N 37.50472°E
Koptevo
55.83333°N 37.51889°E
Levoberezhny
55.867°N 37.467°E
Molzhaninovsky
55.93083°N 37.38917°E
Savyolovsky
55.81028°N 37.56250°E
Sokol
55.800°N 37.500°E
Timiryazevsky
55.82028°N 37.56000°E
Vostochnoye Degunino
55.87306°N 37.56889°E
Voykovsky
55.817°N 37.500°E
Zapadnoye Degunino
55.8652806°N 37.5366694°E
Alexeyevsky
55.817°N 37.650°E
Altufyevsky
55.87972°N 37.58417°E
Babushkinsky
55.86972°N 37.66444°E
Bibirevo
55.89167°N 37.61667°E
Butyrsky
5

In [271]:
%%time
for i in range(len(all_data_mos)):
    if lat_lon_all_mos[i]:
        lat_lon = process_lan_lat(lat_lon_all_mos[i])
    else:
        lat_lon = [None, None]
    
    all_data_mos[i].extend(lat_lon)

Wall time: 1e+03 µs


In [272]:
# Build the frame: district link and name, okrug link and name ('OLink',
# 'OLocation'), then the coordinate pair appended to each record.
moscow_df = pd.DataFrame(all_data_mos, columns=['Link', 'Location', 'OLink', 'OLocation', 'Latitude', 'Longitude'])
moscow_df.head()

Unnamed: 0,Link,Location,OLink,OLocation,Latitude,Longitude
0,/wiki/Arbat_District,Arbat,/wiki/Central_Administrative_Okrug,Central Administrative Okrug,55.751,37.59
1,/wiki/Basmanny_District,Basmanny,/wiki/Central_Administrative_Okrug,Central Administrative Okrug,55.764947,37.671583
2,/wiki/Khamovniki_District,Khamovniki,/wiki/Central_Administrative_Okrug,Central Administrative Okrug,55.72611,37.57111
3,"/wiki/Krasnoselsky_District,_Moscow",Krasnoselsky,/wiki/Central_Administrative_Okrug,Central Administrative Okrug,55.77861,37.65639
4,/wiki/Meshchansky_District,Meshchansky,/wiki/Central_Administrative_Okrug,Central Administrative Okrug,55.77583,37.6275


In [273]:
# Drop, in place, every row that contains a missing value, then report the
# remaining (rows, columns).
moscow_df.dropna(axis=0, inplace=True)
moscow_df.shape

(125, 6)

In [274]:
# Place name handed to the geocoder.
address = 'Moscow, Russia'

# Geocode the city with geopy's Nominatim client.
geolocator = Nominatim(user_agent="mos_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Moscow are {}, {}.'.format(latitude, longitude))

# NOTE: despite the "lon_lat" name, the list is ordered [latitude, longitude].
moscow_lon_lat = [latitude, longitude]

The geograpical coordinate of Moscow are 55.7504461, 37.6174943.


In [277]:
# create map of Moscow using latitude and longitude values
moscow = folium.Map(location=moscow_lon_lat, zoom_start=10)

# add markers to map
# 'OLocation' (the okrug) plays the borough role and 'Location' (the district)
# is the neighbourhood, so each popup reads "<district>, <okrug>" - the same
# "<neighbourhood>, <borough>" order used for the London map.
for lat, lng, borough, neighborhood in zip(moscow_df['Latitude'],
                                           moscow_df['Longitude'],
                                           moscow_df['OLocation'],
                                           moscow_df['Location']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(moscow)

moscow

## 3. Get the neighbourhoods of San Francisco

In [278]:
# Download the Wikipedia category page for neighborhoods in San Francisco.
page_san = requests.get('https://en.wikipedia.org/wiki/Category:Neighborhoods_in_San_Francisco')
# Echo the response object so the notebook shows the HTTP status.
page_san

<Response [200]>

In [279]:
soup_san = BeautifulSoup(page_san.content, 'html.parser')
# All <div class="mw-category"> blocks on the category page.
tables_san = soup_san.find_all('div', attrs={"class": "mw-category"})
# Keep the second block.
# NOTE(review): presumably the first block lists subcategories and the second lists pages - confirm against the live page.
tables_san = tables_san[1]
# Skip the block's first child; later cells iterate the remaining children
# and read the list items inside each one.
tables_san = tables_san.contents[1:]

In [297]:
# Sanity check: the list items of the first group.
tables_san[0].ul.find_all('li')

[<li><a href="/wiki/Alamo_Square,_San_Francisco" title="Alamo Square, San Francisco">Alamo Square, San Francisco</a></li>,
 <li><a href="/wiki/Alta_Plaza" title="Alta Plaza">Alta Plaza</a></li>,
 <li><a href="/wiki/Anza_Vista,_San_Francisco" title="Anza Vista, San Francisco">Anza Vista, San Francisco</a></li>]

In [298]:
# One [link, name] record per list item, across every group of the category listing.
all_data_san = [
    [loc.a['href'], loc.text]
    for letters in tables_san
    for loc in letters.ul.find_all('li')
]
len(all_data_san)

94

In [302]:
def lat_long_to_dec(strn):
    """Convert a degrees/minutes/seconds coordinate string to decimal degrees.

    The string must contain a latitude followed by a longitude, each written
    as degrees, optional minutes, optional seconds and an upper-case
    hemisphere letter, e.g. ``37°47'0"N 122°25'0"W``. Both the ASCII quote
    marks and the typographic prime marks are accepted for minutes and
    seconds.

    Parameters
    ----------
    strn : str
        The coordinate text to parse.

    Returns
    -------
    list of float
        ``[latitude, longitude]`` in signed decimal degrees; southern
        latitudes and western longitudes are negative.

    Raises
    ------
    ValueError
        If the text does not contain exactly one N/S value followed by one
        E/W value.
    """
    dms = re.compile(
        r"(\d+(?:\.\d+)?)\s*°\s*"           # degrees
        r"(?:(\d+(?:\.\d+)?)\s*['′]\s*)?"   # optional minutes
        r'(?:(\d+(?:\.\d+)?)\s*["″]\s*)?'   # optional seconds
        r"([NSEW])"                         # hemisphere
    )
    parts = dms.findall(strn)

    if len(parts) != 2 or parts[0][3] not in 'NS' or parts[1][3] not in 'EW':
        raise ValueError('cannot parse coordinates from {!r}'.format(strn))

    res = []
    for degrees, minutes, seconds, hemisphere in parts:
        # Missing minutes or seconds come back as empty strings; treat as zero.
        value = float(degrees) + float(minutes or 0) / 60 + float(seconds or 0) / 3600
        res.append(-value if hemisphere in 'SW' else value)
    return res

# Show how a sample DMS string breaks into its components.
test11 = """37°47'0"N 122°25'0"W"""
re.split('[°\'" ]+', test11)

['37', '47', '0', 'N', '122', '25', '0', 'W']