## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: never truncate cell text and never elide rows when a
# DataFrame is rendered.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes are used so the relative path resolves on
# every OS; a backslash-separated literal depends on the invalid escape
# sequences "\d" and "\m" and only works on Windows.
# The handle stays open because the pages are read from `pdf` further down.
pdf = pdfplumber.open("../data/moh_annex_jun_19.pdf")

In [5]:
# Scratch check: how many indices range(0, 5) covers.
len(range(5))

5

In [6]:
# Extracting tables into dataframes
# One DataFrame per PDF page (page indices 0 to 4), stored under its page index.
df_list = {}
for page_idx in range(5):
    raw_rows = pdf.pages[page_idx].extract_table()
    # Rows from index 3 onward become the data; row 1 is the initial header,
    # which is immediately replaced by the fixed nine labels below.
    page_df = pd.DataFrame(raw_rows[3:], columns=raw_rows[1])
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard every column whose values are all missing.
    df_list[page_idx] = page_df.dropna(axis=1, how="all")

In [10]:
# Binding all dataframes into one big dataframe
# The per-page frames are stacked in insertion order and given a fresh 0..n-1 index.
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,5 to 7 \nJun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,5 to 6 \nJun,0645h to 1700h,Takashimaya (391A Orchard Road) \n• St. Leaven
2,5 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
3,5 Jun,0900h to 2130h,ION Orchard (2 Orchard Turn) \n• Four Leaves
4,5 Jun,1000h to 1045h,Divine Beauty (303 Woodlands Street 31)
5,5 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road) \n• Union United Employment Pte Ltd
6,5 Jun,1015h to 1200h,Harvest Care Centre (165 Sims Avenue) \n• Harvester Community Church
7,5 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n• Guardian
8,5 Jun,1510h to 1545h,ION Orchard (2 Orchard Turn)
9,5 Jun,1550h to 1625h,NTUC FairPrice (849 Yishun Ring Road)


In [11]:
## Data Cleaning
# Replacing values that need to be reformatted.
# Keys are row labels of the concatenated frame (before any row is dropped).
# NOTE: df_all_raw is modified in place by this cell.
location_overrides = {
    27: "Takashimaya (391A Orchard Road)",
    48: "NTUC Health Senior Activity Centre SilverAce - Henderson (117 Bukit Merah View)",
    69: "Chinatown Complex (335 Smith Street): Wet Market",
    88: "Green Effect Aquarium (801 French Road)",
    95: "Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd",
    101: "VivoCity (1 Harbourfront Walk): Toys 'R' Us",
}
for row_label, fixed_location in location_overrides.items():
    df_all_raw.loc[row_label, 'Location'] = fixed_location

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data
# `regex=` is always passed explicitly so the result does not depend on the
# pandas version's default for Series.str.replace.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)

# Bullet glyph as it appears in the extracted text: either "•" or the
# private-use "\uf0b7", with any surrounding whitespace swallowed.
bullet_pattern = r"\s*[•\uf0b7]\s*"
df_all_raw['Location'] = (
    df_all_raw['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # First bullet separates the location from the store. It must become
    # ": " (colon + space) to match the manual overrides above and the
    # ': ' separator used when the store is split out later.
    .str.replace(bullet_pattern, ": ", n=1, regex=True)
    # Any further bullets separate additional stores.
    .str.replace(bullet_pattern, ",", regex=True)
    .str.replace(" :", ":", regex=False)
)

# Removing redundant rows
df_all_clean = df_all_raw.drop([96]).reset_index(drop=True)

In [12]:
# Render the cleaned frame, then let the cell's last expression report its
# (rows, columns) shape.
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,5 to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,5 to 6 Jun,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
2,5 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
3,5 Jun,0900h to 2130h,ION Orchard (2 Orchard Turn): Four Leaves
4,5 Jun,1000h to 1045h,Divine Beauty (303 Woodlands Street 31)
5,5 Jun,1000h to 2000h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
6,5 Jun,1015h to 1200h,Harvest Care Centre (165 Sims Avenue): Harvester Community Church
7,5 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
8,5 Jun,1510h to 1545h,ION Orchard (2 Orchard Turn)
9,5 Jun,1550h to 1625h,NTUC FairPrice (849 Yishun Ring Road)


(115, 3)

In [13]:
# Splitting Store info into separate column
# Split each Location once on ': ': piece 0 stays the location, piece 1
# (missing when there is no separator) becomes the whitespace-trimmed Store.
location_parts = df_all_clean['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split = df_all_clean.assign(
    Location=location_parts[0],
    Store=location_parts[1].str.strip(),
)

In [14]:
# Splitting Address info into separate column
# Split once on the first '(': the text before it is the location name, the
# text after it is the address (still carrying its closing parenthesis).
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The name piece keeps the space that preceded '(' (e.g. "ION Orchard "), so
# trim it just like Store and Address are trimmed.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
df_all_clean_split['Address'] = (
    df_all_clean_split['Address']
    .str.replace(")", "", regex=False)
    .str.strip()
)
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,5 to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
1,5 to 6 Jun,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
2,5 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
3,5 Jun,0900h to 2130h,ION Orchard,Four Leaves,2 Orchard Turn
4,5 Jun,1000h to 1045h,Divine Beauty,,303 Woodlands Street 31
5,5 Jun,1000h to 2000h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
6,5 Jun,1015h to 1200h,Harvest Care Centre,Harvester Community Church,165 Sims Avenue
7,5 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
8,5 Jun,1510h to 1545h,ION Orchard,,2 Orchard Turn
9,5 Jun,1550h to 1625h,NTUC FairPrice,,849 Yishun Ring Road


In [15]:
# Write the split table to disk; the DataFrame index is written as the first
# (unnamed) CSV column.
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')