## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: show every row and never truncate cell text when a
# DataFrame is rendered, so long Location strings can be inspected in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes are used because the previous
# backslash-separated literal contained the invalid escapes "\d" and "\m" and
# only resolved on Windows; "/" works on every platform and matches the
# '../data/...' path used for the CSV export.
pdf = pdfplumber.open("../data/moh_annex_jun_12.pdf")

In [4]:
# Number of page indices covered by range(3), i.e. 0, 1 and 2
len(range(3))

3

In [5]:
# Extracting tables into dataframes
def _page_to_frame(page):
    """Turn the table extracted from one PDF page into a DataFrame.

    Row 1 of the extracted table is used as the initial header and the data
    is taken from row 3 onwards. The nine columns are then relabelled so that
    only Date, Time and Location carry a name, and every column whose values
    are all missing is dropped.
    """
    rows = page.extract_table()
    frame = pd.DataFrame(rows[3:], columns=rows[1])
    frame.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    return frame.dropna(axis=1, how="all")


# One DataFrame per page, keyed by page index (pages 0, 1 and 2)
df_list = {page_no: _page_to_frame(pdf.pages[page_no]) for page_no in range(3)}

In [7]:
# Stack the per-page dataframes into a single frame with a fresh 0..n-1 index
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,29 May \nto 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n• Guardian
1,29 to 31 \nMay,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
2,29 to 31 \nMay,0645h to 1700h,Takashimaya (391A Orchard Road) \n• St. Leaven
3,29 May,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n• Fu Fa Food Court
4,29 to 31 \nMay,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
5,29 May,0000h to 0730h,NTUC Foodfare (308 Anchorvale Road)
6,29 May,0110h to 0530h,Marina Bay Sands Casino (10 Bayfront Avenue)
7,29 May,0600h to 1300h,NTUC Foodfare (308 Anchorvale Road)
8,29 May \nto 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
9,29 May,0630h to 1300h,Hougang Green Shopping Mall (21 Hougang Street 51) \n• Fu Fa Food Court


In [10]:
## Data Cleaning
# Replacing values that need to be reformatted: the Location at row label 22
# is overwritten by hand with its final, already-cleaned text.
df_all_raw.loc[22, 'Location'] = "Plaza Singapura (68 Orchard Road): Cold Storage"

# Date: remove the line breaks embedded in the extracted cell text
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "")

# Time: fix the one known typo ("1900j" instead of "1900h")
df_all_raw['Time'] = df_all_raw['Time'].str.replace("1030h to 1900j", "1030h to 1900h")

# Location: normalise apostrophes, remove line breaks, then turn the bullet
# markers into separators. The first " •" in each value becomes ":" and any
# further ones become ","; finally " :" is tightened to ":".
df_all_raw['Location'] = (
    df_all_raw['Location']
    .str.replace("’", "'")
    .str.replace("\n", "")
    .str.replace(" •", ":", n=1)
    .str.replace(" •", ",")
    .str.replace(" :", ":")
)

# Removing redundant rows: drop row label 23 and renumber the index
# NOTE(review): presumably row 23 is a leftover of the entry fixed at row 22 - confirm against the PDF
df_all_clean = df_all_raw.drop([23]).reset_index(drop=True)


In [11]:
# Render the cleaned frame for inspection, then let the cell's final
# expression report its (rows, columns) shape.
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,29 May to 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
1,29 to 31 May,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
2,29 to 31 May,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
3,29 May,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court
4,29 to 31 May,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
5,29 May,0000h to 0730h,NTUC Foodfare (308 Anchorvale Road)
6,29 May,0110h to 0530h,Marina Bay Sands Casino (10 Bayfront Avenue)
7,29 May,0600h to 1300h,NTUC Foodfare (308 Anchorvale Road)
8,29 May to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
9,29 May,0630h to 1300h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court


(69, 3)

In [12]:
# Splitting Store info into separate column
df_all_clean_split = df_all_clean.copy(deep=True)
df_all_clean_split[['Location', 'Store']] = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [13]:
# Splitting Address info into separate column
# Split on the first "(" only: the text before it stays in Location, the
# bracketed remainder becomes Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The text before "(" still ends with the space that preceded the bracket
# (e.g. "ION Orchard "). Strip it so Location is as clean as Store and Address;
# otherwise the padded value ends up in the exported CSV.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Remove the closing bracket (literal match, not a regex) and surrounding whitespace
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,29 May to 1 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
1,29 to 31 May,0000h to 2359h,Tuas Amenity Centre,,71 Pioneer Road
2,29 to 31 May,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
3,29 May,1000h to 2200h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51
4,29 to 31 May,1200h to 1900h,Atatcutz Singapore,,348 Bedok Road
5,29 May,0000h to 0730h,NTUC Foodfare,,308 Anchorvale Road
6,29 May,0110h to 0530h,Marina Bay Sands Casino,,10 Bayfront Avenue
7,29 May,0600h to 1300h,NTUC Foodfare,,308 Anchorvale Road
8,29 May to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
9,29 May,0630h to 1300h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51


In [14]:
# Export the split visit data to CSV. No `index` argument is passed, so pandas'
# documented default applies and the row index is written as the first column.
df_all_clean_split.to_csv('../data/visit_data_raw.csv')