## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: never truncate cell text and never elide rows when a
# DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes keep the relative path portable across
# operating systems and avoid backslash escape sequences ("\d", "\m") inside
# the string literal; this also matches the path style used for the CSV export.
pdf = pdfplumber.open("../data/moh_annex_jun_9.pdf")

In [4]:
# Scratch check: how many indices range(0, 4) covers (the same range the
# table-extraction loop iterates over).
len(range(0,4))

4

In [5]:
# Extracting tables into dataframes
# Builds one DataFrame per page for page indices 0-3, keyed by page index.
df_list = {}
for page_idx in range(4):
    page_rows = pdf.pages[page_idx].extract_table()
    # Extracted row 1 supplies the provisional header; rows 0-2 are not used as data.
    page_df = pd.DataFrame(page_rows[3:], columns=page_rows[1])
    # Relabel the nine extracted columns; only three of them get a real name.
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard every column that holds no values at all.
    df_list[page_idx] = page_df.dropna(axis=1, how="all")

In [6]:
# Binding all dataframes into one big dataframe
# The per-page frames are stacked in page order and given a fresh 0..n-1 index.
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,26 to 28 \nMay,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
1,26 to 29 \nMay,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n• Fu Fa Food Court
2,26 to 27 \nMay,0900h to 1700h,Sri Murugan Trading Pte Ltd (308 Anchorvale Road)
3,26 May,0900h to 2100h,NTUC Foodfare (308 Anchorvale Road)
4,26 May,0905h to 1200h,McDonald’s (293 Yishun Street 22)
5,26 May,1040h to 1140h,Marsiling Lane Food Centre (20 Marsiling Lane)
6,26 May,1130h to 2100h,Tiong Bahru Plaza (298 Tiong Bahru Road) \n• 328 Katong Laksa
7,26 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
8,26 May,1355h to 1700h,VivoCity (1 HarbourFront Walk)
9,26 May,1400h to 1430h,New Trendy Salon LLP (84 Redhill Lane)


In [7]:
## Data Cleaning
# NOTE: the clean-up steps below modify df_all_raw in place; df_all_clean is
# derived from it at the end of the cell.

# Replacing values that need to be reformatted
# (row label 79 is hard-coded for this particular extract)
df_all_raw.loc[79, 'Location'] = "Yew Tee Point (21 Choa Chu Kang North 6): NTUC FairPrice"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern here is a plain string, so regex=False is passed explicitly:
# the result then does not depend on the pandas version's default for `regex`.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)
df_all_raw['Location'] = (
    df_all_raw['Location']
    .str.replace("’", "'", regex=False)        # typographic apostrophe -> ASCII apostrophe
    .str.replace("\n", "", regex=False)        # line breaks
    .str.replace(" •", ":", n=1, regex=False)  # first bullet becomes the ":" separator
    .str.replace(" •", ",", regex=False)       # any further bullets become commas
    .str.replace(" :", ":", regex=False)       # no space before the separator
    .str.replace("Westgate (3 Westgate Drive)",
                 "Westgate (3 Gateway Drive)", regex=False)
)

# Removing redundant rows
# (row label 80 is hard-coded for this particular extract)
df_all_clean = df_all_raw.drop([80]).reset_index(drop=True)


In [8]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell result.
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,26 to 28 May,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
1,26 to 29 May,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court
2,26 to 27 May,0900h to 1700h,Sri Murugan Trading Pte Ltd (308 Anchorvale Road)
3,26 May,0900h to 2100h,NTUC Foodfare (308 Anchorvale Road)
4,26 May,0905h to 1200h,McDonald's (293 Yishun Street 22)
5,26 May,1040h to 1140h,Marsiling Lane Food Centre (20 Marsiling Lane)
6,26 May,1130h to 2100h,Tiong Bahru Plaza (298 Tiong Bahru Road): 328 Katong Laksa
7,26 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
8,26 May,1355h to 1700h,VivoCity (1 HarbourFront Walk)
9,26 May,1400h to 1430h,New Trendy Salon LLP (84 Redhill Lane)


(89, 3)

In [9]:
# Splitting Store info into separate column
# Work on a copy so df_all_clean itself is left untouched.
df_all_clean_split = df_all_clean.copy()
# Split once on the first ": " — the text before it stays in Location,
# the text after it goes to Store.
location_and_store = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column
# Split once on the first "(": the text before it stays in Location,
# the text after it goes to Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The text before "(" still carries the space that preceded the bracket,
# so strip Location as well, the same way Store and Address are stripped.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Remove the closing bracket and surrounding whitespace from the address.
df_all_clean_split['Address'] = (
    df_all_clean_split['Address']
    .str.replace(")", "", regex=False)
    .str.strip()
)
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,26 to 28 May,0600h to 1900h,Food Hub,,455 Sengkang West Avenue
1,26 to 29 May,1000h to 2200h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51
2,26 to 27 May,0900h to 1700h,Sri Murugan Trading Pte Ltd,,308 Anchorvale Road
3,26 May,0900h to 2100h,NTUC Foodfare,,308 Anchorvale Road
4,26 May,0905h to 1200h,McDonald's,,293 Yishun Street 22
5,26 May,1040h to 1140h,Marsiling Lane Food Centre,,20 Marsiling Lane
6,26 May,1130h to 2100h,Tiong Bahru Plaza,328 Katong Laksa,298 Tiong Bahru Road
7,26 May,1320h to 1530h,NTUC FairPrice,,301 Yishun Avenue 2
8,26 May,1355h to 1700h,VivoCity,,1 HarbourFront Walk
9,26 May,1400h to 1430h,New Trendy Salon LLP,,84 Redhill Lane


In [11]:
# Write the split frame to CSV; with pandas defaults the row index is written
# too, as an unnamed first column.
df_all_clean_split.to_csv('../data/visit_data_raw.csv')