## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: show full cell contents and every row when rendering frames
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF that the extraction cells read from.
# Forward slashes are used on purpose: a backslash-separated literal would contain
# invalid escape sequences (\d, \m) and would only resolve on Windows. This also
# matches the '../data/' form used when the CSV is written at the end of the notebook.
pdf = pdfplumber.open("../data/moh_annex_jun_21.pdf")

In [4]:
# Number of page indices covered by range(0, 5), the range the extraction loop iterates over
len(range(5))

5

In [5]:
# Extracting tables into dataframes
# One DataFrame is built per page for page indices 0-4 and stored under that index.
column_labels = ['Date', '', '', 'Time', '', '', 'Location', '', '']
df_list = {}
for page_idx in range(5):
    rows = pdf.pages[page_idx].extract_table()
    # Rows from index 3 onward become the data; row 1 supplies provisional
    # column labels, which are immediately replaced by the fixed labels above.
    frame = pd.DataFrame(rows[3:], columns=rows[1])
    frame.columns = column_labels
    # Discard columns that hold no values at all
    df_list[page_idx] = frame.dropna(axis=1, how="all")

In [6]:
# Binding all dataframes into one big dataframe
# The per-page frames are stacked in insertion order and given a fresh 0..n-1 index.
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,7 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
1,7 to 9 \nJun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
2,7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
3,7 Jun,0630h to 1110h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,7 Jun,0900h to 1000h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
5,7 Jun,1000h to 1610h,Katong Shopping Centre (865 Mountbatten Road) \n Union United Employment Pte Ltd
6,7 Jun,1120h to 1200h,Tiong Bahru Plaza (298 Tiong Bahru Road)
7,7 Jun,1220h to 1300h,ION Orchard (2 Orchard Turn)
8,7 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n Guardian
9,7 Jun,1300h to 1505h,Ngee Ann City (391A Orchard Road) \n Takashimaya Shopping Centre


In [7]:
## Data Cleaning
# Work on a copy so that df_all_raw keeps the values exactly as extracted and
# this cell produces the same result every time it is re-run.
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted
# NOTE: the row labels below are positions in the concatenated frame; they must be
# re-checked whenever the source PDF or the extraction step changes.
df_all_clean.loc[9, 'Location'] = "Takashimaya (391A Orchard Road)"
df_all_clean.loc[30, 'Location'] = "NTUC Health Senior Activity Centre SilverAce - Henderson (117 Bukit Merah View)"
df_all_clean.loc[51, 'Location'] = "Chinatown Complex (335 Smith Street): Wet Market"
df_all_clean.loc[71, 'Location'] = "Green Effect Aquarium (801 French Road)"
df_all_clean.loc[83, 'Location'] = "VivoCity (1 Harbourfront Walk): Toys 'R' Us"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data
# regex=False makes every pattern a literal string, independent of the pandas
# version's default for the `regex` argument.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("’", "'", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("\n", "", regex=False)
# The first "\uf0b7" marker in a value becomes ":"; any further markers become ","
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" \uf0b7 ", ":", n=1, regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" \uf0b7 ", ",", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" :", ":", regex=False)

# Removing redundant rows (currently disabled; all rows are kept)
# df_all_clean = df_all_clean.drop([21,47,72])
# df_all_clean = df_all_clean.reset_index(drop=True)

In [8]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell output
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,7 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
1,7 to 9 Jun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
2,7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
3,7 Jun,0630h to 1110h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,7 Jun,0900h to 1000h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
5,7 Jun,1000h to 1610h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
6,7 Jun,1120h to 1200h,Tiong Bahru Plaza (298 Tiong Bahru Road)
7,7 Jun,1220h to 1300h,ION Orchard (2 Orchard Turn)
8,7 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
9,7 Jun,1300h to 1505h,Takashimaya (391A Orchard Road)


(115, 3)

In [9]:
# Splitting Store info into separate column
df_all_clean_split = df_all_clean.copy(deep=True)
df_all_clean_split[['Location', 'Store']] = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,7 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
1,7 to 9 Jun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
2,7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
3,7 Jun,0630h to 1110h,Tiong Bahru Yong Tao Hu,,56 Eng Hoon Street
4,7 Jun,0900h to 1000h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
5,7 Jun,1000h to 1610h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
6,7 Jun,1120h to 1200h,Tiong Bahru Plaza,,298 Tiong Bahru Road
7,7 Jun,1220h to 1300h,ION Orchard,,2 Orchard Turn
8,7 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
9,7 Jun,1300h to 1505h,Takashimaya,,391A Orchard Road


In [11]:
# Write the split frame to CSV; the row index is written as the first, unnamed column
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv', index=True)