## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Show full cell contents and every row when a dataframe is displayed
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Forward slashes keep this relative path valid on every operating system and
# avoid the invalid "\d" / "\m" escape sequences of the backslash spelling.
pdf = pdfplumber.open("../data/moh_annex_jun_5.pdf")

In [4]:
# Number of indices produced by a range of 5 (the extraction loop uses the same range)
len(range(5))

5

In [5]:
# Build one dataframe per PDF page (page indices 0-4), keyed by page index
page_columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
df_list = {}
for page_idx in range(5):
    rows = pdf.pages[page_idx].extract_table()
    # rows[1] is used as the initial header and rows[3:] as the data rows;
    # the header is then replaced by the fixed column names above
    page_df = pd.DataFrame(rows[3:], columns=rows[1])
    page_df.columns = page_columns
    # Keep only the columns that hold at least one non-missing value
    df_list[page_idx] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page dataframes into one frame with a fresh 0..n-1 row index
page_frames = [df_list[page_key] for page_key in df_list]
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,22 May,0000h to 0400h,The Woodgrove (30 Woodlands Avenue 1) \n• McDonald’s
1,22 May,0740h to 2000h,West Mall (1 Bukit Batok Central Link) \n• Subway
2,22 May,0830h to 0930h,Fu Nan Traditional Chinese Medicine Centre (152 Bukit \nBatok Street 11)
3,22 May,1220h to 1300h,Hup Choon Eating House (1 Binjai Park)
4,22 May,1230h to 1300h,NTUC FairPrice (1 Jalan Bukit Merah)
5,22 May,1240h to 1310h,NTUC FairPrice (1 Jalan Bukit Merah)
6,22 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
7,22 May,1435h to 1510h,Sunshine Place (475 Choa Chu Kang Avenue 3) \n• Giant Supermarket
8,22 May,1500h to 2100h,Westgate (3 Gateway Drive) \n• Pizza Hut
9,22 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)


In [7]:
## Data Cleaning
# Work on a copy so that df_all_raw keeps the values exactly as extracted
# (the raw frame displayed above stays accurate and this cell can be re-run).
df_all_clean = df_all_raw.copy(deep=True)

# Replacing values that need to be reformatted
# df_all_clean.loc[8, 'Location'] = "Al Forno East Coast Pte Ltd (400 East Coast Road)"

# Cleaning line breaks ("\n"), bullet markers (" •") and other inconsistencies in data.
# Every pattern below is a literal string, so regex=False is passed explicitly:
# the result then does not depend on the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)

location = df_all_clean['Location']
location = location.str.replace("’", "'", regex=False)
location = location.str.replace("\n", "", regex=False)
# The first bullet separates the location from its stores, later bullets separate stores
location = location.str.replace(" •", ":", n=1, regex=False)
location = location.str.replace(" •", ",", regex=False)
location = location.str.replace(" :", ":", regex=False)
# Normalising address spellings so the same place is always written the same way
location = location.str.replace("Westgate (3 Westgate Drive)",
                                "Westgate (3 Gateway Drive)", regex=False)
location = location.str.replace("37 & 39 Sultan Gate", "39 Sultan Gate", regex=False)
df_all_clean['Location'] = location

# Removing redundant rows
# df_all_clean = df_all_clean.drop()
# df_all_clean = df_all_clean.reset_index(drop=True)

In [8]:
# Render the cleaned dataframe as rich output
display(df_all_clean)
# Last expression of the cell: the (rows, columns) shape of the cleaned dataframe
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,22 May,0000h to 0400h,The Woodgrove (30 Woodlands Avenue 1): McDonald's
1,22 May,0740h to 2000h,West Mall (1 Bukit Batok Central Link): Subway
2,22 May,0830h to 0930h,Fu Nan Traditional Chinese Medicine Centre (152 Bukit Batok Street 11)
3,22 May,1220h to 1300h,Hup Choon Eating House (1 Binjai Park)
4,22 May,1230h to 1300h,NTUC FairPrice (1 Jalan Bukit Merah)
5,22 May,1240h to 1310h,NTUC FairPrice (1 Jalan Bukit Merah)
6,22 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
7,22 May,1435h to 1510h,Sunshine Place (475 Choa Chu Kang Avenue 3): Giant Supermarket
8,22 May,1500h to 2100h,Westgate (3 Gateway Drive): Pizza Hut
9,22 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)


(110, 3)

In [9]:
# Move the text after the first ": " of Location into its own Store column
df_all_clean_split = df_all_clean.copy(deep=True)
location_and_store = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
# Trim surrounding whitespace from the store names
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column
location_and_address = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
df_all_clean_split[['Location', 'Address']] = location_and_address
# Splitting on "(" keeps the space that preceded it at the end of Location
# (e.g. "The Woodgrove "), so trim it like the Store and Address columns.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing parenthesis left over from the split, then trim whitespace
df_all_clean_split['Address'] = (
    df_all_clean_split['Address']
    .str.replace(")", "", regex=False)
    .str.strip()
)
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,22 May,0000h to 0400h,The Woodgrove,McDonald's,30 Woodlands Avenue 1
1,22 May,0740h to 2000h,West Mall,Subway,1 Bukit Batok Central Link
2,22 May,0830h to 0930h,Fu Nan Traditional Chinese Medicine Centre,,152 Bukit Batok Street 11
3,22 May,1220h to 1300h,Hup Choon Eating House,,1 Binjai Park
4,22 May,1230h to 1300h,NTUC FairPrice,,1 Jalan Bukit Merah
5,22 May,1240h to 1310h,NTUC FairPrice,,1 Jalan Bukit Merah
6,22 May,1320h to 1530h,NTUC FairPrice,,301 Yishun Avenue 2
7,22 May,1435h to 1510h,Sunshine Place,Giant Supermarket,475 Choa Chu Kang Avenue 3
8,22 May,1500h to 2100h,Westgate,Pizza Hut,3 Gateway Drive
9,22 May,1320h to 1530h,NTUC FairPrice,,301 Yishun Avenue 2


In [11]:
# Save the split visit data as CSV (no index argument is passed, so the pandas default applies)
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')