## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Never truncate cell text or the number of rows when a frame is rendered
pd.set_option('display.max_rows', None)
pd.options.display.max_colwidth = None

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes are used so the path resolves on every
# platform and contains no invalid backslash escapes ("\d", "\m").
# The handle is left open on purpose: the extraction cell reads pdf.pages.
pdf = pdfplumber.open("../data/moh_annex_jun_3.pdf")

In [4]:
# Sanity check: how many page indices range(6) yields
len(range(6))

6

In [5]:
# Build one dataframe per page from the table extracted on pages 0-5
column_labels = ['Date', '', '', 'Time', '', '', 'Location', '', '']
df_list = {}
for page_idx in range(6):
    rows = pdf.pages[page_idx].extract_table()
    # Row 1 supplies the initial header; the data is taken from row 3 onwards
    page_df = pd.DataFrame(rows[3:], columns=rows[1])
    # Relabel all nine columns, then discard every column that is entirely empty
    page_df.columns = column_labels
    df_list[page_idx] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames in page order under a fresh 0..n-1 index
page_frames = [df_list[page] for page in sorted(df_list)]
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,20 May,0800h to 2200h,Westgate (3 Gateway Drive)
1,20 to 21 \nMay,0930h to 2130h,Holland Piazza (3 Lorong Liput) \n Tiger Sugar
2,20 May,0910h to 1255h,Resorts World Sentosa Casino (8 Sentosa Gateway)
3,20 May,1150h to 1225h,Prime Supermarket (373 Bukit Batok Street 31)
4,20 May,1340h to 1510h,Kaki Bukit Recreation Centre (7 Kaki Bukit Avenue 3) \n POSB Account Services Centre
5,20 May,1420h to 1520h,ION Orchard (2 Orchard Turn) \n FLNT at 1-ATICO
6,20 May,1510h to 2100h,Westgate (3 Gateway Drive) \n Pizza Hut
7,20 May,1600h to 2300h,Punggol Plaza (168 Punggol Field)
8,20 May,1700h to 2200h,Poke Theory (238 Thomson Road)
9,20 May,1730h to 1820h,Chinatown Point (133 New Bridge Road)


In [7]:
## Data Cleaning
# Clean a copy so that df_all_raw keeps the values exactly as extracted and
# df_all_clean is a genuinely separate frame (previously both names pointed at
# the same object, so the "raw" frame was silently modified).
df_all_clean = df_all_raw.copy(deep=True)

# Replacing values that need to be reformatted
# NOTE(review): row label 18 is hard-coded and only meaningful for this
# particular extract - confirm it still targets the intended row if the PDF changes.
df_all_clean.loc[18, 'Location'] = "Al Forno East Coast Pte Ltd (400 East Coast Road)"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern below is a literal string, so regex=False is passed explicitly;
# the result then does not depend on the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # The first " \uf0b7" in a cell becomes ":", any remaining ones become ","
    .str.replace(" \uf0b7", ":", n=1, regex=False)
    .str.replace(" \uf0b7", ",", regex=False)
    .str.replace(" :", ":", regex=False)
    # Harmonise address spellings that differ between rows
    .str.replace("Westgate (3 Westgate Drive)", "Westgate (3 Gateway Drive)", regex=False)
    .str.replace("37 & 39 Sultan Gate", "39 Sultan Gate", regex=False)
)

# Removing redundant rows: no rows are dropped at present.

In [8]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell's value
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,20 May,0800h to 2200h,Westgate (3 Gateway Drive)
1,20 to 21 May,0930h to 2130h,Holland Piazza (3 Lorong Liput): Tiger Sugar
2,20 May,0910h to 1255h,Resorts World Sentosa Casino (8 Sentosa Gateway)
3,20 May,1150h to 1225h,Prime Supermarket (373 Bukit Batok Street 31)
4,20 May,1340h to 1510h,Kaki Bukit Recreation Centre (7 Kaki Bukit Avenue 3): POSB Account Services Centre
5,20 May,1420h to 1520h,ION Orchard (2 Orchard Turn): FLNT at 1-ATICO
6,20 May,1510h to 2100h,Westgate (3 Gateway Drive): Pizza Hut
7,20 May,1600h to 2300h,Punggol Plaza (168 Punggol Field)
8,20 May,1700h to 2200h,Poke Theory (238 Thomson Road)
9,20 May,1730h to 1820h,Chinatown Point (133 New Bridge Road)


(135, 3)

In [9]:
# Separate the store name (the text after the first ": ") from Location
df_all_clean_split = df_all_clean.copy(deep=True)
location_parts = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Location'] = location_parts[0]
# Rows without a ": " have no store part; surrounding whitespace is trimmed on the rest
df_all_clean_split['Store'] = location_parts[1].str.strip()

In [10]:
# Splitting Address info into separate column
# Location has the form "<name> (<address>)": split once at the first "(".
name_and_address = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The name part still carries the space that preceded "(", so strip it as well;
# otherwise Location is written out with trailing whitespace while Store and
# Address are already trimmed.
df_all_clean_split['Location'] = name_and_address[0].str.strip()
# Remove the closing parenthesis and surrounding whitespace from the address part
df_all_clean_split['Address'] = name_and_address[1].str.replace(")", "", regex=False).str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,20 May,0800h to 2200h,Westgate,,3 Gateway Drive
1,20 to 21 May,0930h to 2130h,Holland Piazza,Tiger Sugar,3 Lorong Liput
2,20 May,0910h to 1255h,Resorts World Sentosa Casino,,8 Sentosa Gateway
3,20 May,1150h to 1225h,Prime Supermarket,,373 Bukit Batok Street 31
4,20 May,1340h to 1510h,Kaki Bukit Recreation Centre,POSB Account Services Centre,7 Kaki Bukit Avenue 3
5,20 May,1420h to 1520h,ION Orchard,FLNT at 1-ATICO,2 Orchard Turn
6,20 May,1510h to 2100h,Westgate,Pizza Hut,3 Gateway Drive
7,20 May,1600h to 2300h,Punggol Plaza,,168 Punggol Field
8,20 May,1700h to 2200h,Poke Theory,,238 Thomson Road
9,20 May,1730h to 1820h,Chinatown Point,,133 New Bridge Road


In [11]:
# Write the split visit table to CSV using to_csv's default settings
visit_csv_path = '../data/visit_data_raw.csv'
df_all_clean_split.to_csv(visit_csv_path)