## Set-up

In [7]:
import numpy as np, pandas as pd, pdfplumber

In [8]:
# Display settings: show full cell text and every row when rendering dataframes
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)

## Extracting COVID-19 Case Visit Data from PDF

In [9]:
# Open the source PDF. Forward slashes keep the relative path portable across
# operating systems and avoid backslash escape sequences inside the literal.
pdf = pdfplumber.open("../data/moh_annex_jun_20.pdf")

In [10]:
# Quick check of how many indices range(5) yields
len(range(5))

5

In [11]:
# Read the table found on each of the first five PDF pages into its own dataframe,
# keyed by the page index
df_list = {}
for page_idx in range(5):
    raw_rows = pdf.pages[page_idx].extract_table()
    # Rows from the fourth onward hold the data; the second row supplies the initial header
    page_df = pd.DataFrame(raw_rows[3:], columns=raw_rows[1])
    # Relabel the nine extracted columns; the blank-named ones are expected to be empty
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard every column that holds no values at all
    df_list[page_idx] = page_df.dropna(axis=1, how="all")

In [12]:
# Binding all dataframes into one big dataframe
df_all_raw = pd.concat(df_list, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,6 to 7 \nJun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,6 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
2,6 to 9 \nJun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
3,6 Jun,0645h to 1700h,Takashimaya (391A Orchard Road) \n St. Leaven
4,6 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
5,6 Jun,1000h to 1100h,The Centrepoint (176 Orchard Road) \n Decathlon
6,6 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road) \n Union United Employment Pte Ltd
7,6 Jun,1100h to 1145h,Orchard Central (181 Orchard Road) \n UNIQLO
8,6 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n Guardian
9,6 Jun,1240h to 1345h,ION Orchard (2 Orchard Turn)


In [13]:
## Data Cleaning
# Replacing values that need to be reformatted.
# The keys are row labels of df_all_raw and are specific to this extraction;
# re-check them if the source PDF or the extraction step changes.
location_overrides = {
    20: "ION Orchard (2 Orchard Turn): Guardian",
    22: "Takashimaya (391A Orchard Road)",
    43: "NTUC Health Senior Activity Centre SilverAce - Henderson (117 Bukit Merah View)",
    46: "Tiong Bahru Plaza (298 Tiong Bahru Road): FairPrice Finest",
    65: "Chinatown Complex (335 Smith Street): Wet Market",
    71: "Katong V Mall (30 East Coast Road): FairPrice Finest",
    85: "Green Effect Aquarium (801 French Road)",
    97: "VivoCity (1 Harbourfront Walk): Toys 'R' Us",
}
for row_label, fixed_location in location_overrides.items():
    df_all_raw.loc[row_label, 'Location'] = fixed_location

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern is a plain substring, so regex=False is passed explicitly: the result
# then does not depend on the pandas version's default for `regex`.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)
df_all_raw['Location'] = (
    df_all_raw['Location']
    .str.replace("’", "'", regex=False)               # curly apostrophe -> straight apostrophe
    .str.replace("\n", "", regex=False)               # drop line breaks
    .str.replace(" \uf0b7 ", ":", n=1, regex=False)   # first "\uf0b7" separator becomes ":"
    .str.replace(" \uf0b7 ", ",", regex=False)        # any remaining separators become ","
    .str.replace(" :", ":", regex=False)              # no space before a colon
)

# Removing redundant rows, then renumbering the index from zero
df_all_clean = df_all_raw.drop([21, 47, 72]).reset_index(drop=True)

In [14]:
# Render the cleaned dataframe, then show its dimensions as the cell's output
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,6 to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,6 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
2,6 to 9 Jun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
3,6 Jun,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
4,6 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
5,6 Jun,1000h to 1100h,The Centrepoint (176 Orchard Road): Decathlon
6,6 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
7,6 Jun,1100h to 1145h,Orchard Central (181 Orchard Road): UNIQLO
8,6 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
9,6 Jun,1240h to 1345h,ION Orchard (2 Orchard Turn)


(112, 3)

In [15]:
# Splitting Store info into separate column
df_all_clean_split = df_all_clean.copy(deep=True)
df_all_clean_split[['Location', 'Store']] = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [16]:
# Splitting Address info into separate column
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,6 to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
1,6 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
2,6 to 9 Jun,0600h to 1400h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
3,6 Jun,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
4,6 Jun,0930h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
5,6 Jun,1000h to 1100h,The Centrepoint,Decathlon,176 Orchard Road
6,6 Jun,1000h to 2000h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
7,6 Jun,1100h to 1145h,Orchard Central,UNIQLO,181 Orchard Road
8,6 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
9,6 Jun,1240h to 1345h,ION Orchard,,2 Orchard Turn


In [17]:
# Write the split table to a CSV file in the data folder
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')