# Stage 1. Discovering Bangkok Neighbourhoods

In [1]:
import numpy as np # library to handle data in a vectorized manner
import os

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Installing BeautifulSoup4 (if not installed yet - uncomment the line below)
#!pip install BeautifulSoup4
from bs4 import BeautifulSoup
from datetime import datetime as dt
import json # library to handle JSON files

#uncomment the line below if you need to install Geopy
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): this import path is deprecated in newer pandas in favour of pandas.json_normalize - confirm against the installed version
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# uncomment this line if Folium not found on your system
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('\n','>>> Libraries imported.')


 >>> Libraries imported.


## 1. Getting the list of all Bangkok neighbourhoods (khwaengs)

From Wikipedia page "Khwaeng"

Downloading the page from the Web:

In [2]:
# Download the Wikipedia page that lists the khwaengs.
link = 'https://en.wikipedia.org/wiki/Khwaeng'
# requests applies no timeout by default; set one so the cell cannot hang
# forever when the server does not answer.
page = requests.get(link, timeout=30)
print(page.status_code)
# Stop here on an HTTP error instead of parsing an error page in the next cells.
page.raise_for_status()

200


Cooking BeautifulSoup from the retrieved page:

In [3]:
# Parse the text of the downloaded page with the 'html.parser' backend.
soup = BeautifulSoup(markup=page.text, features='html.parser')

The data of interest are in a table, so get tables from the soup:

In [4]:
# Collect every <table> element of the parsed page and show how many there are.
tables = soup.find_all('table')
len(tables)

2

There are two tables on the page. The khwaengs are listed in the first one:

In [5]:
# Take the first table found on the page.
khwaengs_table = tables[0]

# Every <tr> element of that table.
row_tags = khwaengs_table.find_all('tr')

# Preview the first nine rows: the raw markup, then the text it contains.
for row_tag in row_tags[:9]:
    print(row_tag)
    print(row_tag.text, end="\n\n")

<tr>
<th colspan="3">District (<i>khet</i>)
</th>
<th colspan="3">Subdistrict (<i>khwaeng</i>)
</th>
<th rowspan="2">Notes
</th></tr>

District (khet)

Subdistrict (khwaeng)

Notes


<tr>
<th>Code
</th>
<th>Name
</th>
<th>Name (Thai)
</th>
<th>Code
</th>
<th>Name
</th>
<th>Name (Thai)
</th></tr>

Code

Name

Name (Thai)

Code

Name

Name (Thai)


<tr>
<td rowspan="12" valign="top">01
</td>
<td rowspan="12" valign="top"><a href="/wiki/Phra_Nakhon_district" title="Phra Nakhon district">Phra Nakhon</a>
</td>
<td rowspan="12" valign="top">พระนคร
</td>
<td>01
</td>
<td><a href="/wiki/Phra_Borom_Maha_Ratchawang_subdistrict" title="Phra Borom Maha Ratchawang subdistrict">Phra Borom Maha Ratchawang</a>
</td>
<td>พระบรมมหาราชวัง
</td>
<td>
</td></tr>

01

Phra Nakhon

พระนคร

01

Phra Borom Maha Ratchawang

พระบรมมหาราชวัง




<tr>
<td>02
</td>
<td><a href="/wiki/Wang_Burapha_Phirom_subdistrict" title="Wang Burapha Phirom subdistrict">Wang Burapha Phirom</a>
</td>
<td>วังบูรพาภิรมย์
</td>
<td>


There are a total of 6 data columns (plus a Notes column), and the first two rows are headers.

Parse the Khwaengs table into a list of rows:

In [6]:
# Turn each table row into a list of cell texts: trim the newlines around the
# row's text, then split it on the blank line found between cells.
rows = [row_tag.text.strip('\n').split('\n\n') for row_tag in row_tags]

print('Top 9 rows:')
for row in rows[:9]:
    print(row)
print('Bottom 9 rows:')
for row in rows[-9:]:
    print(row)
print()
print('Total rows including the headers:', len(rows))

Top 9 rows:
['District (khet)', 'Subdistrict (khwaeng)', 'Notes']
['Code', 'Name', 'Name (Thai)', 'Code', 'Name', 'Name (Thai)']
['01', 'Phra Nakhon', 'พระนคร', '01', 'Phra Borom Maha Ratchawang', 'พระบรมมหาราชวัง']
['02', 'Wang Burapha Phirom', 'วังบูรพาภิรมย์']
['03', 'Wat Ratchabophit', 'วัดราชบพิธ']
['04', 'Samran Rat', 'สำราญราษฎร์']
['05', 'San Chaopho Suea', 'ศาลเจ้าพ่อเสือ']
['06', 'Sao Chingcha', 'เสาชิงช้า', 'Seat of BMA office']
['07', 'Bowon Niwet', 'บวรนิเวศ']
Bottom 9 rows:
['03', 'Bang Na Tai', 'บางนาใต้']
['48', 'Thawi Watthana', 'ทวีวัฒนา', '01', 'Thawi Watthana', 'ทวีวัฒนา', 'District seat']
['02', 'Sala Thammasop', 'ศาลาธรรมสพน์']
['49', 'Thung Khru', 'ทุ่งครุ', '01', 'Bang Mot', 'บางมด']
['02', 'Thung Khru', 'ทุ่งครุ', 'District seat']
['50', 'Bang Bon', 'บางบอน', '02', 'Bang Bon Nuea', 'บางบอนเหนือ']
['03', 'Bang Bon Tai', 'บางบอนใต้', 'District seat']
['04', 'Khlong Bang Phran', 'คลองบางพราน']
['05', 'Khlong Bang Bon', 'คลองบางบอน']

Total rows including the heade

Making all rows uniform by adding district data to the rows which only contain neighbourhood data (a result of the complex table layout), while excluding the header:

In [7]:
# Rows at least as long as the first data row are treated as complete and
# supply the three leading (district) cells; shorter rows get the most recent
# leading cells prepended.
full_width = len(rows[2])  # the first complete row as a reference
fullrows = []
for cells in rows[2:]:  # skip the header
    if len(cells) < full_width:
        cells = leading_cells + cells
    else:
        leading_cells = cells[:3]
    fullrows.append(cells)

print('Top 9 rows:')
for row in fullrows[:9]:
    print(row)
print('Bottom 9 rows:')
for row in fullrows[-9:]:
    print(row)
print()
print('Total rows:', len(fullrows))

Top 9 rows:
['01', 'Phra Nakhon', 'พระนคร', '01', 'Phra Borom Maha Ratchawang', 'พระบรมมหาราชวัง']
['01', 'Phra Nakhon', 'พระนคร', '02', 'Wang Burapha Phirom', 'วังบูรพาภิรมย์']
['01', 'Phra Nakhon', 'พระนคร', '03', 'Wat Ratchabophit', 'วัดราชบพิธ']
['01', 'Phra Nakhon', 'พระนคร', '04', 'Samran Rat', 'สำราญราษฎร์']
['01', 'Phra Nakhon', 'พระนคร', '05', 'San Chaopho Suea', 'ศาลเจ้าพ่อเสือ']
['01', 'Phra Nakhon', 'พระนคร', '06', 'Sao Chingcha', 'เสาชิงช้า', 'Seat of BMA office']
['01', 'Phra Nakhon', 'พระนคร', '07', 'Bowon Niwet', 'บวรนิเวศ']
['01', 'Phra Nakhon', 'พระนคร', '08', 'Talat Yot', 'ตลาดยอด']
['01', 'Phra Nakhon', 'พระนคร', '09', 'Chana Songkhram', 'ชนะสงคราม']
Bottom 9 rows:
['47', 'Bang Na', 'บางนา', '03', 'Bang Na Tai', 'บางนาใต้']
['48', 'Thawi Watthana', 'ทวีวัฒนา', '01', 'Thawi Watthana', 'ทวีวัฒนา', 'District seat']
['48', 'Thawi Watthana', 'ทวีวัฒนา', '02', 'Sala Thammasop', 'ศาลาธรรมสพน์']
['49', 'Thung Khru', 'ทุ่งครุ', '01', 'Bang Mot', 'บางมด']
['49', 'Thung Khru',

Initializing a Pandas dataframe of Bangkok neighbourhoods by setting column names and creating a dataframe object:

In [8]:
# Column labels of the neighbourhood dataframe.
columns = [
    'DCode', 'District', 'DistrictThai',
    'NCode', 'Neighbourhood', 'NeighbourhoodThai',
    'Latitude', 'Longitude',
]

# Start from an empty dataframe that carries only the column labels.
bkk_khwaengs = pd.DataFrame(columns=columns)
bkk_khwaengs

Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude


Populating the dataframe with neighbourhood data column by column accessed by index, taking corresponding element from each row of the list:

In [9]:
# Fill every column except the last two from the parsed rows; the last two
# are not filled as the data are yet to be obtained.
n_fillable = len(bkk_khwaengs.columns) - 2
for i in range(n_fillable):
    name = columns[i]
    print(f"Filling column {i}: {name} ...", end='')
    bkk_khwaengs[name] = [cells[i] for cells in fullrows]
    print(" OK")

bkk_khwaengs

Filling column 0: DCode ... OK
Filling column 1: District ... OK
Filling column 2: DistrictThai ... OK
Filling column 3: NCode ... OK
Filling column 4: Neighbourhood ... OK
Filling column 5: NeighbourhoodThai ... OK


Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude
0,1,Phra Nakhon,พระนคร,1,Phra Borom Maha Ratchawang,พระบรมมหาราชวัง,,
1,1,Phra Nakhon,พระนคร,2,Wang Burapha Phirom,วังบูรพาภิรมย์,,
2,1,Phra Nakhon,พระนคร,3,Wat Ratchabophit,วัดราชบพิธ,,
3,1,Phra Nakhon,พระนคร,4,Samran Rat,สำราญราษฎร์,,
4,1,Phra Nakhon,พระนคร,5,San Chaopho Suea,ศาลเจ้าพ่อเสือ,,
5,1,Phra Nakhon,พระนคร,6,Sao Chingcha,เสาชิงช้า,,
6,1,Phra Nakhon,พระนคร,7,Bowon Niwet,บวรนิเวศ,,
7,1,Phra Nakhon,พระนคร,8,Talat Yot,ตลาดยอด,,
8,1,Phra Nakhon,พระนคร,9,Chana Songkhram,ชนะสงคราม,,
9,1,Phra Nakhon,พระนคร,10,Ban Phan Thom,บ้านพานถม,,


Saving the dataframe to a .csv file just in case:

In [10]:
# Save a checkpoint of the dataframe.
# to_csv does not create missing directories, so make sure the target folder
# exists first (otherwise this cell fails on a fresh checkout).
os.makedirs('csv', exist_ok=True)
bkk_khwaengs.to_csv('csv/khwaengs.csv', index = False)
print('>>> Saved.')

>>> Saved.


In [11]:
# Optional checkpoint: restore the dataframe from the csv file when resuming
# work from this point; skip this cell otherwise.
# The imports still have to be rerun first.
# All columns are requested with the 'str' dtype.
bkk_khwaengs = pd.read_csv(filepath_or_buffer='csv/khwaengs.csv', dtype='str')
bkk_khwaengs

Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude
0,1,Phra Nakhon,พระนคร,1,Phra Borom Maha Ratchawang,พระบรมมหาราชวัง,,
1,1,Phra Nakhon,พระนคร,2,Wang Burapha Phirom,วังบูรพาภิรมย์,,
2,1,Phra Nakhon,พระนคร,3,Wat Ratchabophit,วัดราชบพิธ,,
3,1,Phra Nakhon,พระนคร,4,Samran Rat,สำราญราษฎร์,,
4,1,Phra Nakhon,พระนคร,5,San Chaopho Suea,ศาลเจ้าพ่อเสือ,,
5,1,Phra Nakhon,พระนคร,6,Sao Chingcha,เสาชิงช้า,,
6,1,Phra Nakhon,พระนคร,7,Bowon Niwet,บวรนิเวศ,,
7,1,Phra Nakhon,พระนคร,8,Talat Yot,ตลาดยอด,,
8,1,Phra Nakhon,พระนคร,9,Chana Songkhram,ชนะสงคราม,,
9,1,Phra Nakhon,พระนคร,10,Ban Phan Thom,บ้านพานถม,,


In [12]:
# Show the dtype of every column.
bkk_khwaengs.dtypes

DCode                object
District             object
DistrictThai         object
NCode                object
Neighbourhood        object
NeighbourhoodThai    object
Latitude             object
Longitude            object
dtype: object

In [13]:
# Summary statistics for each column.
bkk_khwaengs.describe()

Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude
count,180,180,180,180,180,180,0.0,0.0
unique,50,50,50,12,178,178,0.0,0.0
top,1,Phra Nakhon,พระนคร,2,Bang Chak,บางจาก,,
freq,12,12,12,43,2,2,,


Applying the .describe() method to the dataframe instantly reveals an anomaly: the total count of neighbourhoods is 180, but there are only 178 unique names. The most frequently occurring neighbourhood name is Bang Chak, with a frequency of 2, so there are likely two names that each appear twice.

Checking the data for duplicate khwaeng names:

In [14]:
# Every row whose neighbourhood name appears more than once
# (keep=False marks all occurrences, not only the repeats).
dup = bkk_khwaengs[bkk_khwaengs['Neighbourhood'].duplicated(keep=False)]
dup

Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude
43,9,Phra Khanong,พระโขนง,5,Bang Chak,บางจาก,,
91,22,Phasi Charoen,ภาษีเจริญ,6,Bang Chak,บางจาก,,
133,35,Chom Thong,จอมทอง,3,Bang Mot,บางมด,,
174,49,Thung Khru,ทุ่งครุ,1,Bang Mot,บางมด,,


And indeed, there are 2 pairs of same-name khwaengs belonging to different districts.

To resolve this issue, district name will have to be added to the neighbourhood name.

In [15]:
# Index labels of all rows whose neighbourhood name occurs more than once.
to_fix = (
    bkk_khwaengs
    .loc[bkk_khwaengs.duplicated(subset='Neighbourhood', keep=False)]
    .index
    .tolist()
)
to_fix

[43, 91, 133, 174]

In [16]:
# Make the repeated neighbourhood names unique by appending the district name
# in parentheses, for both the English and the Thai name.
#
# The assignment is done in a single .loc[rows, column] step. The chained form
# df.iloc[i][col] = value writes to a temporary Series: it triggers
# SettingWithCopyWarning and is not guaranteed to change the dataframe at all
# (it is a no-op when pandas Copy-on-Write is active).
# to_fix holds index labels, so label-based .loc is used rather than
# positional .iloc.
bkk_khwaengs.loc[to_fix, 'Neighbourhood'] = (
    bkk_khwaengs.loc[to_fix, 'Neighbourhood']
    + ' (' + bkk_khwaengs.loc[to_fix, 'District'] + ')'
)
bkk_khwaengs.loc[to_fix, 'NeighbourhoodThai'] = (
    bkk_khwaengs.loc[to_fix, 'NeighbourhoodThai']
    + ' (' + bkk_khwaengs.loc[to_fix, 'DistrictThai'] + ')'
)

bkk_khwaengs.loc[to_fix]

Unnamed: 0,DCode,District,DistrictThai,NCode,Neighbourhood,NeighbourhoodThai,Latitude,Longitude
43,9,Phra Khanong,พระโขนง,5,Bang Chak (Phra Khanong),บางจาก (พระโขนง),,
91,22,Phasi Charoen,ภาษีเจริญ,6,Bang Chak (Phasi Charoen),บางจาก (ภาษีเจริญ),,
133,35,Chom Thong,จอมทอง,3,Bang Mot (Chom Thong),บางมด (จอมทอง),,
174,49,Thung Khru,ทุ่งครุ,1,Bang Mot (Thung Khru),บางมด (ทุ่งครุ),,


In [17]:
# saving corrected data
# to_csv does not create missing directories, so make sure the target folder
# exists first.
os.makedirs('csv', exist_ok=True)
bkk_khwaengs.to_csv('csv/khwaengs.csv', index = False)
print('>>> Saved.')

>>> Saved.
