## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Render dataframes without truncating cell contents or the number of rows
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes keep the relative path portable across
# operating systems and avoid the invalid "\d" / "\m" escape sequences that the
# backslash form contained.
pdf = pdfplumber.open("../data/moh_annex_jun_15.pdf")

In [4]:
# Number of page indices covered by range(0, 4)
len(range(4))

4

In [5]:
# Build one dataframe per PDF page (page indices 0-3), keyed by page index
df_list = {}
for i in range(4):
    table = pdf.pages[i].extract_table()
    # Row 1 of the extracted table is used as the header, rows 3 onward as data
    header, body = table[1], table[3:]
    page_df = pd.DataFrame(body, columns=header)
    # Replace the extracted header with fixed labels (nine columns expected)
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard columns that contain no values at all
    df_list[i] = page_df.dropna(axis=1, how="all")

In [12]:
# Stack the per-page dataframes (in insertion order) into a single frame
# with a fresh 0..n-1 index
page_frames = [df_list[page_no] for page_no in df_list]
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,1 to 7 \nJun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,1 Jun,0735h to 0810h,Sheng Siong Supermarket (446A Fajar Road)
2,1 Jun,0900h to 0930h,Beo Crescent Market (38A Beo Crescent)
3,1 Jun,1000h to 1230h,JIC Inspection Services (53 Pioneer Road)
4,1 Jun,1100h to 1230h,VICOM Inspection Centre (511 Bukit Batok Street 23)
5,1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n Guardian
6,1 Jun,1620h to 1810h,Hair Flare (201B Tampines Street 21)
7,1 Jun,1725h to 1800h,Yew Tee Point (21 Choa Chu Kang North 6) \n NTUC FairPrice
8,1 Jun,1955h to 2030h,Seletar Mall (33 Sengkang West Avenue) \n FairPrice Finest
9,1 Jun,2100h to 2130h,HDB Hub (500 Toa Payoh Lorong 6) \n NTUC FairPrice


In [13]:
## Data Cleaning
# Clean a copy so that df_all_raw keeps the values exactly as extracted
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted
# NOTE(review): row label 39 is hard-coded; confirm it still targets the intended row if the PDF changes
df_all_clean.loc[39, 'Location'] = "Takashimaya (391A Orchard Road)"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern is a literal string, so regex=False is passed explicitly.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # The first bullet becomes ":" and any further bullets become ","
    .str.replace(" \uf0b7", ":", n=1, regex=False)
    .str.replace(" \uf0b7", ",", regex=False)
    .str.replace(" • ", ":", n=1, regex=False)
    .str.replace(" • ", ",", regex=False)
    # Remove the space left in front of a colon
    .str.replace(" :", ":", regex=False)
)

# Removing redundant rows
# NOTE(review): this step is currently disabled; confirm whether it is still needed or delete it
# df_all_clean = df_all_clean.drop([23])
# df_all_clean = df_all_clean.reset_index(drop=True)

In [14]:
# Render the cleaned frame, then show its shape as the cell's result
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,1 to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,1 Jun,0735h to 0810h,Sheng Siong Supermarket (446A Fajar Road)
2,1 Jun,0900h to 0930h,Beo Crescent Market (38A Beo Crescent)
3,1 Jun,1000h to 1230h,JIC Inspection Services (53 Pioneer Road)
4,1 Jun,1100h to 1230h,VICOM Inspection Centre (511 Bukit Batok Street 23)
5,1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
6,1 Jun,1620h to 1810h,Hair Flare (201B Tampines Street 21)
7,1 Jun,1725h to 1800h,Yew Tee Point (21 Choa Chu Kang North 6): NTUC FairPrice
8,1 Jun,1955h to 2030h,Seletar Mall (33 Sengkang West Avenue): FairPrice Finest
9,1 Jun,2100h to 2130h,HDB Hub (500 Toa Payoh Lorong 6): NTUC FairPrice


(74, 3)

In [15]:
# Separate the store list from the location: everything after the first ": "
# goes into a new Store column, the part before it stays in Location
location_and_store = df_all_clean['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split = df_all_clean.copy(deep=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
# Trim surrounding whitespace from the store text
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [16]:
# Splitting Address info into separate column: the text before the first "("
# stays in Location, the text after it becomes Address
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The name part keeps the space that preceded "(", so trim it the same way
# Store and Address are trimmed
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop closing parentheses from the address and trim surrounding whitespace
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,1 to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
1,1 Jun,0735h to 0810h,Sheng Siong Supermarket,,446A Fajar Road
2,1 Jun,0900h to 0930h,Beo Crescent Market,,38A Beo Crescent
3,1 Jun,1000h to 1230h,JIC Inspection Services,,53 Pioneer Road
4,1 Jun,1100h to 1230h,VICOM Inspection Centre,,511 Bukit Batok Street 23
5,1 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
6,1 Jun,1620h to 1810h,Hair Flare,,201B Tampines Street 21
7,1 Jun,1725h to 1800h,Yew Tee Point,NTUC FairPrice,21 Choa Chu Kang North 6
8,1 Jun,1955h to 2030h,Seletar Mall,FairPrice Finest,33 Sengkang West Avenue
9,1 Jun,2100h to 2130h,HDB Hub,NTUC FairPrice,500 Toa Payoh Lorong 6


In [17]:
# Write the split frame to CSV (the dataframe index is written as the first column)
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')