## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Render DataFrames without truncating cell text or the number of rows shown.
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes are used so the relative path resolves on
# every OS and contains no backslash escape sequences ("\d", "\m" are invalid
# escapes in a normal string literal); this also matches the '../data/...'
# style used when the CSV is written at the end of the notebook.
pdf = pdfplumber.open("../data/moh_annex_jun_25.pdf")

In [4]:
# Sanity check: number of page indices visited by the extraction loop
len(range(6))

6

In [5]:
# Build one DataFrame per PDF page (pages 0-5), keyed by page index.
# The extracted table is nine columns wide; only three carry data, so the
# filler columns get an empty label and are removed once they are all-NaN.
column_labels = ['Date', '', '', 'Time', '', '', 'Location', '', '']
df_list = {}
for page_no in range(6):
    rows = pdf.pages[page_no].extract_table()
    # Row 1 is used as the initial header and data starts at row 3
    page_df = pd.DataFrame(rows[3:], columns=rows[1])
    page_df.columns = column_labels
    df_list[page_no] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames, in page order, into one frame with a fresh 0..n-1 index
df_all_raw = pd.concat(list(df_list.values()), ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,11 Jun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
1,11 Jun,0600h to 0900h,ION Orchard (2 Orchard Turn)
2,11 Jun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
3,11 Jun,0630h to 1230h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,11 Jun,0800h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
5,11 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
6,11 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
7,11 Jun,1000h to 1735h,Katong Shopping Centre (865 Mountbatten Road) \n Union United Employment Pte Ltd
8,11 Jun,1000h to 1930h,Telok Blangah Food Centre (79 Telok Blangah Drive) \n Al-barakah Food Corner
9,11 Jun,1030h to 1900h,ION Orchard (2 Orchard Turn)


In [7]:
## Data Cleaning
# Replacing values that need to be reformatted.
# NOTE(review): the row labels used here (and in the drop below) are tied to this
# particular PDF extract -- re-check them if the source file changes.
df_all_raw.loc[11, 'Location'] = "Chinatown Complex (335 Smith Street): Wet Market"
df_all_raw.loc[21, 'Location'] = "Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd"
df_all_raw.loc[36, 'Location'] = "Green Effect Aquarium (801 French Road)"
df_all_raw.loc[49, 'Location'] = "VivoCity (1 HarbourFront Walk): Toys 'R' Us"
df_all_raw.loc[103, 'Location'] = "Katong Shopping Centre (865 Mountbatten Road): Central Recruitment Solutions Pte Ltd"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern below is a literal string, so regex=False is passed explicitly on
# each call: the default of `regex` differs between pandas versions, and this keeps
# all replacements consistent with one another.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("18/20 Kim Tian Road", "18 Kim Tian Road", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("Toys “R” Us", "Toys 'R' Us", regex=False)
# Swap the inner parentheses for a slash so the later split on "(" only sees the address
df_all_raw['Location'] = df_all_raw['Location'].str.replace("Mulan (Kyawt Na Di)", "Mulan/Kyawt Na Di", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("’", "'", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("\n", "", regex=False)
# The first " \uf0b7 " marker in a value becomes ":"; any further ones become ","
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" \uf0b7 ", ":", n=1, regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" \uf0b7 ", ",", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" :", ":", regex=False)

# Removing redundant rows and renumbering the index from 0
df_all_clean = df_all_raw.drop([22, 50])
df_all_clean = df_all_clean.reset_index(drop=True)

In [8]:
# Show the cleaned visits, then the (row count, column count) of the frame
display(df_all_clean)
(len(df_all_clean.index), len(df_all_clean.columns))

Unnamed: 0,Date,Time,Location
0,11 Jun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
1,11 Jun,0600h to 0900h,ION Orchard (2 Orchard Turn)
2,11 Jun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
3,11 Jun,0630h to 1230h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,11 Jun,0800h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
5,11 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
6,11 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
7,11 Jun,1000h to 1735h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
8,11 Jun,1000h to 1930h,Telok Blangah Food Centre (79 Telok Blangah Drive): Al-barakah Food Corner
9,11 Jun,1030h to 1900h,ION Orchard (2 Orchard Turn)


(141, 3)

In [9]:
# Move the optional store name (the text after the first ': ') into its own
# 'Store' column; work on a copy so df_all_clean is left untouched.
df_all_clean_split = df_all_clean.copy()
location_parts = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_parts
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column.
# 'Location' is split at the first "(": the text before it stays in 'Location'
# and the remainder becomes 'Address'.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The split leaves the space that preceded "(" at the end of the venue name
# (e.g. "ION Orchard "); strip it so 'Location' is as clean as 'Store' and
# 'Address' and can be grouped or joined on reliably.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing parenthesis and surrounding whitespace from the address
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,11 Jun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
1,11 Jun,0600h to 0900h,ION Orchard,,2 Orchard Turn
2,11 Jun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
3,11 Jun,0630h to 1230h,Tiong Bahru Yong Tao Hu,,56 Eng Hoon Street
4,11 Jun,0800h to 1400h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
5,11 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
6,11 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
7,11 Jun,1000h to 1735h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
8,11 Jun,1000h to 1930h,Telok Blangah Food Centre,Al-barakah Food Corner,79 Telok Blangah Drive
9,11 Jun,1030h to 1900h,ION Orchard,,2 Orchard Turn


In [11]:
# Persist the split visit table as CSV (the DataFrame index is written as the first column)
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')