## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Notebook-wide display settings: never truncate long cell text and never
# elide rows when a DataFrame is rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes are used because they work on every OS
# (including Windows) and avoid the invalid "\d" / "\m" escape sequences that a
# backslash-separated path inside a normal string literal produces.
# NOTE: the handle stays open because later cells read `pdf.pages`.
pdf = pdfplumber.open("../data/moh_annex_jun_23.pdf")

In [4]:
# Quick check of how many indices range(0, 5) yields.
len(range(0,5))

5

In [5]:
# Extracting tables into dataframes
# One table is read from each of the first N_TABLE_PAGES pages and stored in
# `df_list`, a dict keyed by page index (0-based).
N_TABLE_PAGES = 5
# The first rows of each extracted table are skipped; presumably they hold the
# title/header lines rather than data -- TODO confirm against the PDF.
N_SKIPPED_ROWS = 3
# The extracted table has 9 raw columns. Only three carry data; the unnamed
# ones are removed below when they contain nothing but missing values.
RAW_COLUMNS = ['Date', '', '', 'Time', '', '', 'Location', '', '']

df_list = {}
for i in range(N_TABLE_PAGES):
    p = pdf.pages[i]
    tbl = p.extract_table()
    if tbl is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable".
        raise ValueError(f"No table could be extracted from page index {i}")
    # Name the columns directly; the header row read from the PDF is not used.
    df_list[i] = pd.DataFrame(tbl[N_SKIPPED_ROWS:], columns=RAW_COLUMNS)
    # Drop columns that are entirely missing (the empty filler columns).
    df_list[i] = df_list[i].dropna(axis=1, how="all")

In [9]:
# Binding all dataframes into one big dataframe
# The per-page frames are stacked in the order they were stored, and the row
# index is renumbered from 0 across the combined result.
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,9 to 11 \nJun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
1,9 to 11 \nJun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
2,9 to 10 \nJun,0600h to 1900h,ION Orchard (2 Orchard Turn)
3,9 Jun,0630h to 1100h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,9 to 10 \nJun,0730h to 0815h,Redhill Market (79 Redhill Lane)
5,9 Jun,0800h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
6,9 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
7,9 Jun,1000h to 1100h,NTUC Health Senior Activity Centre (SilverACE) – \nHenderson (117 Bukit Merah View)
8,9 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road) \n Union United Employment Pte Ltd
9,9 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n Guardian


In [10]:
## Data Cleaning
# Clean a copy so that `df_all_raw` keeps the untouched extraction result and
# this cell can be re-run without depending on earlier in-place edits.
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted
# (row labels refer to the concatenated frame's 0-based index)
df_all_clean.loc[7, 'Location'] = "NTUC Health Senior Activity Centre SilverAce - Henderson (117 Bukit Merah View)"
df_all_clean.loc[30, 'Location'] = "Chinatown Complex (335 Smith Street): Wet Market"
df_all_clean.loc[53, 'Location'] = "Green Effect Aquarium (801 French Road)"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data
# Every pattern below is a literal string, so regex=False is passed explicitly;
# the result then does not depend on the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("18/20 Kim Tian Road", "18 Kim Tian Road", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("Toys “R” Us", "Toys 'R' Us", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("’", "'", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace("\n", "", regex=False)
# The first "\uf0b7" marker becomes the Location/Store separator (":");
# any further markers become commas between stores.
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" \uf0b7 ", ":", n=1, regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" \uf0b7 ", ",", regex=False)
df_all_clean['Location'] = df_all_clean['Location'].str.replace(" :", ":", regex=False)

# Removing redundant rows (currently disabled, so every extracted row is kept)
# df_all_clean = df_all_clean.drop([21,47,72])
# df_all_clean = df_all_clean.reset_index(drop=True)

In [11]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell result.
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,9 to 11 Jun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
1,9 to 11 Jun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
2,9 to 10 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
3,9 Jun,0630h to 1100h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
4,9 to 10 Jun,0730h to 0815h,Redhill Market (79 Redhill Lane)
5,9 Jun,0800h to 1500h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
6,9 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
7,9 Jun,1000h to 1100h,NTUC Health Senior Activity Centre SilverAce - Henderson (117 Bukit Merah View)
8,9 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
9,9 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian


(125, 3)

In [12]:
# Splitting Store info into separate column
# Text after the first ": " is the store; text before it stays as the location.
# Rows without ": " get a missing Store value.
df_all_clean_split = df_all_clean.copy(deep=True)
location_and_store = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [13]:
# Splitting Address info into separate column
# "Name (Address)" is split at the first "(": the left part stays in Location,
# the right part becomes Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The split leaves the space that preceded "(" at the end of Location
# (e.g. "ION Orchard "), so trim it to keep location names clean.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Remove the closing parenthesis and surrounding whitespace from the address.
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,9 to 11 Jun,0500h to 1800h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
1,9 to 11 Jun,0600h to 1500h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
2,9 to 10 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
3,9 Jun,0630h to 1100h,Tiong Bahru Yong Tao Hu,,56 Eng Hoon Street
4,9 to 10 Jun,0730h to 0815h,Redhill Market,,79 Redhill Lane
5,9 Jun,0800h to 1500h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
6,9 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
7,9 Jun,1000h to 1100h,NTUC Health Senior Activity Centre SilverAce - Henderson,,117 Bukit Merah View
8,9 Jun,1000h to 2000h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
9,9 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn


In [14]:
# Persist the split table as CSV; with pandas' default index=True the row index
# is also written as the first (unnamed) column.
df_all_clean_split.to_csv('../data/visit_data_raw.csv')