# Case Study 1 - Apply ML on COVID-19 Data

## 1. Prepare the work environment

Import some necessary libraries

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from glob import glob

from bokeh import plotting as bplt
from bokeh import models as bmdl

# Turn on in-Jupyter viz
bplt.output_notebook()

We use Google's Geocoding API to convert each textual address into GPS coordinates (latitude and longitude).

In [2]:
import googlemaps
from datetime import datetime

# Read the Google Maps API key from the local `api_keys` file. The context
# manager closes the file handle, and strip() removes surrounding whitespace
# (e.g. a trailing newline) that would otherwise be sent as part of the key.
with open('api_keys', 'r') as key_file:
    api_key = key_file.read().strip()

# replace api_key by your own Google Map api key
gmaps = googlemaps.Client(key=api_key)

Define a helper function for plotting graphs and maps

In [3]:
# Helper function for visuals
def base_plot(data=None, padding=None,
              tools='pan,wheel_zoom,reset',
              plot_width=500,
              plot_height=500, x_range=(0, 100), y_range=(0, 100), **plot_args):
    """Create a Bokeh figure with no grid lines, optionally auto-scaled to data.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose first column supplies the x values and whose second column
        supplies the y values. Only used when ``padding`` is also given.
    padding : float, optional
        Margin added on every side of the data extent. When both ``data`` and
        ``padding`` are given, they override ``x_range`` and ``y_range``.
    tools : str
        Comma-separated Bokeh tool names passed to ``figure``.
    plot_width, plot_height : int
        Figure size passed to ``figure``.
    x_range, y_range : tuple
        Axis limits used when no auto-scaling takes place.
    **plot_args
        Extra keyword arguments forwarded to ``bokeh.plotting.figure``
        (for example ``title``).

    Returns
    -------
    The configured Bokeh figure, with a ``BoxZoomTool`` added.
    """
    # if we send in two columns of data, we can use them to auto-size the scale
    if data is not None and padding is not None:
        xs, ys = data.iloc[:, 0], data.iloc[:, 1]
        # Series.min()/max() skip NaN; the built-in min()/max() would return
        # NaN or an order-dependent value when a coordinate is missing.
        x_range = (xs.min() - padding, xs.max() + padding)
        y_range = (ys.min() - padding, ys.max() + padding)

    p = bplt.figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
        x_range=x_range, y_range=y_range, outline_line_color=None,
        min_border=0, min_border_left=0, min_border_right=0,
        min_border_top=0, min_border_bottom=0,
        **plot_args)

    p.axis.visible = True
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    # match_aspect keeps the zoom box at the aspect ratio of the plot
    p.add_tools(bmdl.BoxZoomTool(match_aspect=True))

    return p

Let's read one of the CSV files containing the building list

In [4]:
# Load only the two address columns, then append empty coordinate columns
# (all NaN) for the geocoding step to fill in.
df = (
    pd.read_csv('data/history_data/building_list_eng_20200912.csv',
                usecols=['District', 'Building name'])
    .assign(lat=np.nan, long=np.nan)
)

In [5]:
# Show the last rows; lat/long were just initialised to NaN, so they are still empty here.
df.tail()

Unnamed: 0,District,Building name,lat,long
302,Wan Chai,Fashion Walk Food Street Burgeroom (non-reside...,,
303,Tuen Mun,Tuen Mun Town Plaza Phase 1 Noble Dim Sum (non...,,
304,Yuen Long,Full Yau Court (non-residential),,
305,Yau Tsim Mong,Pak Shing Building McDonald's (non-residential),,
306,Yau Tsim Mong,Metropark Hotel Mongkok Hong Kong Victoria Har...,,


In [6]:
# Build a geocoding query for one sample row: "<building>, <district>, Hong Kong".
# Columns are selected by name rather than position, and the suffix is added
# outside of any str() call so a non-string district cannot raise a TypeError.
sample_row = df.iloc[261]
addr = f"{sample_row['Building name']}, {sample_row['District']}, Hong Kong"
print(addr)

2W Sai Yeung Choi Street South 101肉燥飯專門店 (non-residential), Yau Tsim Mong, Hong Kong


We can use the Google Map API to get the GPS coordinates of the address. It will return the result in json format

In [7]:
# Network call: send the address string to the geocoder through the googlemaps
# client created earlier, then print the raw response.
geocode_result = gmaps.geocode(addr)
print(geocode_result)

[{'address_components': [{'long_name': 'Sai Yeung Choi Street South', 'short_name': 'Sai Yeung Choi St S', 'types': ['route']}, {'long_name': 'Mong Kok', 'short_name': 'Mong Kok', 'types': ['neighborhood', 'political']}, {'long_name': 'Kowloon', 'short_name': 'Kowloon', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'Hong Kong', 'short_name': 'HK', 'types': ['country', 'political']}], 'formatted_address': 'Sai Yeung Choi St S, Mong Kok, Hong Kong', 'geometry': {'location': {'lat': 22.3156355, 'lng': 114.170925}, 'location_type': 'GEOMETRIC_CENTER', 'viewport': {'northeast': {'lat': 22.3169844802915, 'lng': 114.1722739802915}, 'southwest': {'lat': 22.3142865197085, 'lng': 114.1695760197085}}}, 'partial_match': True, 'place_id': 'ChIJdZRF3sYABDQRsxk02bUOi2k', 'plus_code': {'compound_code': '858C+79 Mong Kok, Hong Kong', 'global_code': '7PJP858C+79'}, 'types': ['establishment', 'food', 'point_of_interest', 'restaurant']}]


Or you can read it like this

In [8]:
# Take the coordinate pair from the first match in the response.
location = geocode_result[0]['geometry']['location']
(location['lat'], location['lng'])

(22.3156355, 114.170925)

We use the Google Maps API to convert the building addresses recorded from 1 Sep to 12 Sep into GPS coordinates. We don't run these cells now, because the conversion takes time and consumes network traffic (and API quota).

In [None]:
def gmap_process(source_file):
    """Geocode every building listed in ``source_file`` and save the result.

    Reads the ``District`` and ``Building name`` columns, queries the
    module-level ``gmaps`` client once per row with
    ``"<building>, <district>, Hong Kong"`` (the ``(non-residential)`` tag is
    removed first), and stores the first match's coordinates in new ``lat``
    and ``long`` columns. Rows with no match keep NaN coordinates and are
    reported with an ``Error!`` line.

    The output is written next to the input naming scheme: ``.csv`` becomes
    ``_geo.csv`` and every ``history`` in the path becomes ``processed``.

    Parameters
    ----------
    source_file : str
        Path of the CSV file to geocode.

    Returns
    -------
    None
    """
    df = pd.read_csv(source_file, usecols=['District', 'Building name'])

    lats, lngs = [], []
    for building, district in zip(df['Building name'], df['District']):
        # Skip missing parts instead of concatenating them: a NaN district is a
        # float and used to abort the whole batch with a TypeError.
        parts = [str(part) for part in (building, district) if pd.notna(part)]
        addr = ', '.join(parts + ['Hong Kong'])
        addr = addr.replace('(non-residential)', '')
        geocode_result = gmaps.geocode(addr)
        if geocode_result:
            location = geocode_result[0]['geometry']['location']
            lats.append(location['lat'])
            lngs.append(location['lng'])
            print(addr + " >>> " + str(location))
        else:
            # No match: leave the coordinates empty and report it.
            lats.append(np.nan)
            lngs.append(np.nan)
            print("Error! ", geocode_result)

    df['lat'] = pd.Series(lats, index=df.index, dtype='float64')
    df['long'] = pd.Series(lngs, index=df.index, dtype='float64')

    dest_file = source_file.replace('.csv', '_geo.csv')
    dest_file = dest_file.replace('history', 'processed')
    df.to_csv(dest_file, index=False)

You can run the conversion on a whole batch of CSV files at once

In [None]:
# Geocode every raw daily list. The pattern points at data/history_data/, the
# folder the single-file example reads from, so the *_geo.csv outputs land in
# data/processed_data/ where the later cells load them. Sorting keeps the
# processing order stable between runs.
for source in sorted(glob("data/history_data/*.csv")):
    print(source)
    print("="*50)
    gmap_process(source)

## Examine One Processed Location Data with GPS Coordinates

In [9]:
# Load the geocoded list for 12 Sep 2020. This rebinds `df`, replacing the un-geocoded frame loaded earlier.
df = pd.read_csv('data/processed_data/building_list_eng_20200912_geo.csv')

Examine the first 25 rows of records

In [10]:
# First 25 records of the geocoded list.
df.head(25)

Unnamed: 0,District,Building name,lat,long
0,Yau Tsim Mong,57 Pilkem Street,22.304745,114.170985
1,Tsuen Wan,"Block 3, Lei Muk Shue Estate",22.379055,114.135748
2,Kwun Tong,"Kui Tai House, On Tai Estate",22.328484,114.228749
3,Eastern,Tai Ning House,22.284473,114.148464
4,Sai Kung,"Sheung Lai House, Sheung Tak Estate",22.31172,114.26107
5,Sai Kung,"Sheung Lai House, Sheung Tak Estate",22.31172,114.26107
6,Sai Kung,"Sheung Lai House, Sheung Tak Estate",22.31172,114.26107
7,Kwun Tong,"Choi Chun House, Choi Tak Estate",22.330744,114.214298
8,Wong Tai Sin,"Mei Yan House, Mei Tung Estate",22.333203,114.187448
9,Kowloon City,"Shun Man House, Oi Man Estate",22.310579,114.178979


Let's plot the coordinates together and see the distribution of the data. You should see the following:

![geo_distribution_single_day](image/geo_distribution_single_day.png)

In [11]:
# Marker styling for the individual case locations.
options = dict(line_color=None,
               fill_color='blue',
               size=10,    # Reduce size to make points more distinct
               #alpha=.1    # Reduce alpha to avoid overplotting
               )

# Axis ranges are auto-sized from the data: latitude on x, longitude on y.
p = base_plot(data=df[['lat', 'long']],
              padding=0.01,
              title="Distribution of Quarantine Cases on 12 Sep 2020")

p.circle(x=list(df['lat']), y=list(df['long']), **options)

bplt.show(p)

Let's find the centroids

In [12]:
# K-means starts from randomly chosen centres, so random_state is pinned to get
# the same centroids on every run. Rows without coordinates are excluded
# because KMeans rejects NaN input.
coords = df[['lat', 'long']].dropna()
km = KMeans(n_clusters=8, random_state=42).fit(coords)
centroids = km.cluster_centers_
print(centroids)

# Keep the centroids in a frame with the same column names as the case data for plotting.
km_df = pd.DataFrame(centroids, columns=['lat', 'long'])

[[ 22.4450366  114.02550509]
 [ 22.33436875 114.17627113]
 [ 22.39602891 113.97726719]
 [ 22.3699456  114.12451086]
 [ 22.29243297 114.17630465]
 [ 22.48433837 114.14750036]
 [ 22.302163   113.93763484]
 [ 22.31473667 114.22980908]]


Overlay the kmeans centroids on the plot like this:

![kmean](image/kmeans.png)

In [13]:
from bokeh.transform import linear_cmap

# Case locations in blue with the k-means centroids overlaid in red.
dist = {
    'line_color': None,
    'fill_color': 'blue',
    'size': 5,
}

cent = {
    'line_color': 'black',
    'line_width': 2,
    'fill_color': 'red',
    'size': 15,
    'alpha': .5,
}

p = base_plot(df[['lat', 'long']], padding=0.01)

# Same axis assignment for both layers: latitude on x, longitude on y.
case_x, case_y = list(df['lat']), list(df['long'])
centroid_x, centroid_y = list(km_df['lat']), list(km_df['long'])

p.circle(x=case_x, y=case_y, **dist)
p.circle(x=centroid_x, y=centroid_y, **cent)

bplt.show(p)

In [14]:
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap

# Direct Bokeh output to gmap.html and centre the Google map on Hong Kong.
output_file("gmap.html")
map_options = GMapOptions(lat=22.319629, lng=114.172809, map_type="roadmap", zoom=11)

p = gmap(api_key, map_options, title='Hong Kong COVID-19 Distribution on 12 Sep 2020')

source = ColumnDataSource(data={'lat': list(df['lat']), 'lon': list(df['long'])})
kmean_source = ColumnDataSource(data={'lat': list(km_df['lat']), 'lon': list(km_df['long'])})

# One entry per layer: (data source, marker size, fill colour, fill alpha).
# Cases are drawn first, k-means centroids on top.
for layer_source, marker_size, colour, opacity in (
    (source, 8, "blue", 0.5),
    (kmean_source, 15, "red", 0.8),
):
    p.circle(x="lon", y="lat", size=marker_size, fill_color=colour,
             fill_alpha=opacity, source=layer_source)

show(p)


# Part 3: Combine all history of building list since 1 Sep 2020

In [15]:
# Use the `glob` function already imported at the top of the notebook
# (`from glob import glob`). A second `import glob` here would rebind the name
# to the module and break the batch geocoding cell on a later re-run.
# Sorting the paths makes the row order of the combined frame deterministic.
df_new = pd.concat(
    [pd.read_csv(f) for f in sorted(glob('data/processed_data/*.csv'))],
    ignore_index=True,
)

In [16]:
# Display the combined frame built from all processed files.
df_new

Unnamed: 0,District,Building name,lat,long
0,Yau Tsim Mong,Cheung Hing Mansion,22.317798,114.173150
1,Kwun Tong,"Sau Wah House, Sau Mau Ping Estate",22.318554,114.232409
2,Yuen Long,"Tower 7, The Parcville",22.450419,114.033281
3,Eastern,"Block 5, Harmony Garden",22.273389,114.236078
4,North,Ho Sheung Heung,22.511836,114.108369
...,...,...,...,...
5272,Yau Tsim Mong,2W Sai Yeung Choi Street South 101肉燥飯專門店 (non-...,22.315636,114.170925
5273,Yau Tsim Mong,Hang Shing Building (non-residential),22.307918,114.171106
5274,Yau Tsim Mong,MTR Kowloon Station McDonald's (non-residential),22.297572,114.172205
5275,Yau Tsim Mong,MTR Kowloon Station (non-residential),22.304306,114.161475


Drop the records with null values in the lat / long fields

In [17]:
nan_value = float("NaN")

# Treat empty strings as missing, then discard every row lacking either coordinate.
df_new = (
    df_new
    .replace("", nan_value)
    .dropna(subset=["lat", "long"])
)

Let's preview the first rows of the cleaned, combined dataset.

In [18]:
# First rows of the combined frame after the rows with missing coordinates were dropped.
df_new.head()

Unnamed: 0,District,Building name,lat,long
0,Yau Tsim Mong,Cheung Hing Mansion,22.317798,114.17315
1,Kwun Tong,"Sau Wah House, Sau Mau Ping Estate",22.318554,114.232409
2,Yuen Long,"Tower 7, The Parcville",22.450419,114.033281
3,Eastern,"Block 5, Harmony Garden",22.273389,114.236078
4,North,Ho Sheung Heung,22.511836,114.108369


Use K-means to find the centroids of the combined dataset. Here we look for 12 cluster centroids. You can change the value of `n_clusters` to any positive integer.

In [19]:
# K-means starts from randomly chosen centres; pinning random_state makes the
# 12 centroids reproducible from run to run.
km = KMeans(n_clusters=12, random_state=42).fit(df_new[['lat', 'long']])
centroids = km.cluster_centers_
print("New Centroids found from K-Means")
print(centroids)

# put the centroids into a DataFrame for easier plotting
km_df = pd.DataFrame(centroids, columns=['lat', 'long'])

New Centroids found from K-Means
[[ 22.39593822 113.97257031]
 [ 22.31829353 114.16846712]
 [ 22.45886284 114.00180076]
 [ 22.36982703 114.12367732]
 [ 22.47528331 114.15697697]
 [ 22.27989136 114.21425682]
 [ 22.31385249 114.24378639]
 [ 22.38114669 114.19068667]
 [ 22.27727705 114.16051386]
 [ 22.32762319 114.20106653]
 [ 22.30393815 113.94199746]
 [ 22.44141938 114.03223909]]


We can plot the data on the Google Map. Run the following cell and you will see a result similar to this.

![covid19](image/covid_19_distribution.png)

In [20]:
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions, HoverTool
from bokeh.plotting import gmap, figure

# Direct Bokeh output to gmap.html and centre the Google map on Hong Kong.
output_file("gmap.html")
map_options = GMapOptions(lat=22.319629, lng=114.172809, map_type="roadmap", zoom=11)


# Case locations, with the building name kept alongside for the hover tooltip.
source = ColumnDataSource(
    data=dict(lat=list(df_new['lat']),
              lon=list(df_new['long']),
              addr=list(df_new['Building name'])
))

# K-means centroids (coordinates only, no address column).
kmean_source = ColumnDataSource(
    data=dict(lat=list(km_df['lat']),
              lon=list(km_df['long'])
))

TOOLTIPS = [
    ("Address", "@addr"),
]

p = gmap(api_key, map_options, title='Hong Kong COVID-19 Distribution',
         plot_width=800, plot_height=800,)

# plot case distribution
case_renderer = p.circle(x="lon", y="lat", size=4, fill_color="blue", fill_alpha=0.5, source=source)

# plot k-means centroids
p.circle(x="lon", y="lat", size=15, fill_color="red", fill_alpha=0.8, source=kmean_source)

# Only the case markers carry an `addr` column, so the hover tool is limited to
# that renderer; otherwise hovering a centroid shows a tooltip with no address.
p.add_tools(HoverTool(tooltips=TOOLTIPS, renderers=[case_renderer]))

show(p)
