## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: show full cell text (no column-width truncation) and
# render every row of a frame instead of eliding the middle.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes keep the relative path portable across
# operating systems and avoid the invalid "\d" / "\m" escape sequences that a
# backslash-separated path produces inside a normal string literal.
pdf = pdfplumber.open("../data/moh_annex_jun_10.pdf")

In [4]:
# Sanity check: how many page indices the extraction loop below iterates over
len(range(4))

4

In [5]:
# Extracting tables into dataframes: one frame per page, keyed by page index
df_list = {}
for i in range(4):
    tbl = pdf.pages[i].extract_table()
    # Row 1 of the extracted table supplies the initial column labels and the
    # data rows are taken from row 3 onward.
    page_df = pd.DataFrame(tbl[3:], columns=tbl[1])
    # Relabel all nine columns; the blank-named ones are filler columns.
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard every column that holds no values at all.
    df_list[i] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames vertically into one frame; ignore_index=True
# renumbers the rows 0..n-1 instead of keeping each page's own row labels.
df_all_raw = pd.concat(list(df_list.values()), ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,27 to 28 \nMay,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
1,27 to 29 \nMay,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n• Fu Fa Food Court
2,27 May,0900h to 1700h,Sri Murugan Trading Pte Ltd (308 Anchorvale Road)
3,27 May,0000h to 0730h,NTUC Foodfare (308 Anchorvale Road)
4,27 to 31 \nMay,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
5,27 to 28 \nMay,0600h to 2100h,NTUC Foodfare (308 Anchorvale Road)
6,27 to 28 \nMay,0630h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n• Fu Fa Food Court
7,27 to 31 \nMay,0645h to 1700h,Takashimaya (391A Orchard Road) \n• St. Leaven
8,27 May,0750h to 0820h,Sheng Siong Supermarket (417 Fernvale Link)
9,27 May,0835h to 1105h,Marina Bay Sands (2 Bayfront Avenue) \n• Miu Miu


In [7]:
## Data Cleaning
# Work on a copy so that df_all_raw keeps the values exactly as extracted from
# the PDF (the frame displayed by the previous cell stays accurate).
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted.
# NOTE: row label 75 is specific to this particular PDF extract.
df_all_clean.loc[75, 'Location'] = "ION Orchard (2 Orchard Turn): Four Leaves"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern below is literal text, so regex=False is passed explicitly: the
# result then does not depend on the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("’", "'", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("\n", "", regex=False)
# The first bullet separates the location from its stores; later bullets separate stores.
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" •", ":", n=1, regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" •", ",", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" :", ":", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("Westgate (3 Westgate Drive)",
                                                                "Westgate (3 Gateway Drive)", regex=False)

# Removing redundant rows (row label 76 is specific to this PDF extract),
# then renumbering the remaining rows from 0.
df_all_clean = df_all_clean.drop([76]).reset_index(drop=True)


In [8]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell's result
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,27 to 28 May,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
1,27 to 29 May,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court
2,27 May,0900h to 1700h,Sri Murugan Trading Pte Ltd (308 Anchorvale Road)
3,27 May,0000h to 0730h,NTUC Foodfare (308 Anchorvale Road)
4,27 to 31 May,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
5,27 to 28 May,0600h to 2100h,NTUC Foodfare (308 Anchorvale Road)
6,27 to 28 May,0630h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court
7,27 to 31 May,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
8,27 May,0750h to 0820h,Sheng Siong Supermarket (417 Fernvale Link)
9,27 May,0835h to 1105h,Marina Bay Sands (2 Bayfront Avenue): Miu Miu


(80, 3)

In [9]:
# Splitting Store info into separate column
# Start from an independent copy so df_all_clean itself is left untouched.
df_all_clean_split = df_all_clean.copy(deep=True)
# Split at the first ": " only: part 0 is the location text, part 1 (missing
# for rows without ": ") is the store list.
location_parts = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Location'] = location_parts[0]
df_all_clean_split['Store'] = location_parts[1].str.strip()

In [10]:
# Splitting Address info into separate column
# Split "Name (Address)" at the first "(": the left part stays in Location,
# the right part becomes Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The split keeps the space that preceded "(" on the location name
# (e.g. "Food Hub "), so strip it just like Store and Address are stripped.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing parenthesis left over from the split and trim whitespace.
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False).str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,27 to 28 May,0600h to 1900h,Food Hub,,455 Sengkang West Avenue
1,27 to 29 May,1000h to 2200h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51
2,27 May,0900h to 1700h,Sri Murugan Trading Pte Ltd,,308 Anchorvale Road
3,27 May,0000h to 0730h,NTUC Foodfare,,308 Anchorvale Road
4,27 to 31 May,0000h to 2359h,Tuas Amenity Centre,,71 Pioneer Road
5,27 to 28 May,0600h to 2100h,NTUC Foodfare,,308 Anchorvale Road
6,27 to 28 May,0630h to 2200h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51
7,27 to 31 May,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
8,27 May,0750h to 0820h,Sheng Siong Supermarket,,417 Fernvale Link
9,27 May,0835h to 1105h,Marina Bay Sands,Miu Miu,2 Bayfront Avenue


In [11]:
# Persist the split frame as CSV; the row index is written as the first column
# (index=True is the to_csv default, spelled out here for clarity).
df_all_clean_split.to_csv('../data/visit_data_raw.csv', index=True)