## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: never truncate long cell text and never elide rows,
# so the extracted tables can be inspected in full.
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF for table extraction. Forward slashes keep the relative path
# portable across operating systems and avoid the invalid "\d" / "\m" escape
# sequences that a backslash-separated literal would contain.
# The handle is left open because the extraction cell below reads pdf.pages.
pdf = pdfplumber.open("../data/moh_annex_jun_17.pdf")

In [4]:
# Sanity check: range(0,4) yields 4 indices -- the same range the extraction loop below iterates over.
len(range(0,4))

4

In [5]:
# Extracting tables into dataframes
# One DataFrame per page for page indices 0-3, stored in a dict keyed by page index.
df_list = {}
for page_no in range(4):
    rows = pdf.pages[page_no].extract_table()
    # rows[1] seeds the header and rows[3:] supplies the data; rows 0 and 2 are not used.
    page_df = pd.DataFrame(rows[3:], columns=rows[1])
    # Overwrite the header: only three of the nine extracted columns receive a name.
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Drop every column that is empty in all rows.
    df_list[page_no] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames vertically into one frame with a fresh 0..n-1 row index
df_all_raw = pd.concat(objs=df_list, axis=0, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,3 to 7 \nJun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,3 to 6 \nJun,0645h to 1700h,Takashimaya (391A Orchard Road) \n• St. Leaven
2,3 to 12 \nJun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit \nMerah View)
3,3 Jun,0845h to 1050h,ION Orchard (2 Orchard Turn) \n• Dolce & Gabbana
4,3 Jun,0900h to 1600h,ION Orchard (2 Orchard Turn) \n• Four Leaves
5,3 to 4 \nJun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n• Guardian
6,3 Jun,1645h to 1720h,People’s Park Complex (1 Park Road) \n• Scarlett Supermarket
7,3 Jun,1720h to 1920h,Angel Salon (38 Beo Crescent)
8,3 Jun,1845h to 1935h,Oasis Terraces (681 Punggol Drive) \n• NTUC FairPrice
9,3 Jun,1900h to 1950h,Peninsula Plaza (111 North Bridge Road)


In [7]:
## Data Cleaning
# Work on a copy so df_all_raw keeps the untouched extraction output; assigning
# df_all_raw directly would make both names refer to the same (mutated) frame.
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted (keyed by row label)
location_overrides = {
    31: "Takashimaya (391A Orchard Road)",
    63: "Chinatown Complex (335 Smith Street): Wet Market",
    78: "Green Effect Aquarium (801 French Road)",
    87: "VivoCity (1 Harbourfront Walk): Toys 'R' Us",
}
# .loc assignment silently appends a new row when the label does not exist,
# so fail loudly if the extracted table no longer has the expected rows.
missing_labels = [label for label in location_overrides if label not in df_all_clean.index]
if missing_labels:
    raise KeyError(f"Override rows not found in extracted data: {missing_labels}")
for label, location in location_overrides.items():
    df_all_clean.loc[label, 'Location'] = location

# Cleaning line breaks ("\n"), curly apostrophes and bullet separators.
# Every pattern is a literal string, hence regex=False throughout.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    .str.replace(" • ", ":", n=1, regex=False)  # first bullet becomes ":"
    .str.replace(" • ", ",", regex=False)       # any remaining bullets become ","
    .str.replace(" :", ":", regex=False)
)

# No rows are dropped at this stage (an earlier drop of row 23 is disabled).

In [8]:
# Render the cleaned frame (the display options set at the top remove row/width limits),
# then show its (rows, columns) shape as the cell result.
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,3 to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
1,3 to 6 Jun,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
2,3 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre (115 Bukit Merah View)
3,3 Jun,0845h to 1050h,ION Orchard (2 Orchard Turn): Dolce & Gabbana
4,3 Jun,0900h to 1600h,ION Orchard (2 Orchard Turn): Four Leaves
5,3 to 4 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
6,3 Jun,1645h to 1720h,People's Park Complex (1 Park Road): Scarlett Supermarket
7,3 Jun,1720h to 1920h,Angel Salon (38 Beo Crescent)
8,3 Jun,1845h to 1935h,Oasis Terraces (681 Punggol Drive): NTUC FairPrice
9,3 Jun,1900h to 1950h,Peninsula Plaza (111 North Bridge Road)


(93, 3)

In [9]:
# Separate the store name (text after the first ': ') from the location text.
# A deep copy keeps df_all_clean unchanged.
df_all_clean_split = df_all_clean.copy(deep=True)
location_and_store = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
# Trim surrounding whitespace from the store names
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column: text before the first '(' stays in
# Location, text after it becomes Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# Splitting on '(' leaves the space that preceded it at the end of the location
# name (e.g. "ION Orchard "), so trim it like the Store and Address columns.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Remove the closing parenthesis and surrounding whitespace from the address
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,3 to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
1,3 to 6 Jun,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
2,3 to 12 Jun,0830h to 1430h,115 Bukit Merah View Market & Hawker Centre,,115 Bukit Merah View
3,3 Jun,0845h to 1050h,ION Orchard,Dolce & Gabbana,2 Orchard Turn
4,3 Jun,0900h to 1600h,ION Orchard,Four Leaves,2 Orchard Turn
5,3 to 4 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
6,3 Jun,1645h to 1720h,People's Park Complex,Scarlett Supermarket,1 Park Road
7,3 Jun,1720h to 1920h,Angel Salon,,38 Beo Crescent
8,3 Jun,1845h to 1935h,Oasis Terraces,NTUC FairPrice,681 Punggol Drive
9,3 Jun,1900h to 1950h,Peninsula Plaza,,111 North Bridge Road


In [11]:
# Persist the split frame to CSV. `index` is not passed, so pandas' default applies
# and the row index is written as the first (unnamed) column.
df_all_clean_split.to_csv('../data/visit_data_raw.csv')