## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Show full cell contents and every row when a frame is rendered
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [12]:
# Open the source PDF. Forward slashes are used so the relative path resolves on
# every OS: the previous "..\data\..." literal depended on "\d" and "\m" being
# passed through as invalid escape sequences and on backslash being a path
# separator, which only holds on Windows.
# NOTE: the handle is left open because a later cell reads `pdf.pages`;
# close it once extraction is finished.
pdf = pdfplumber.open("../data/moh_annex_jun_14.pdf")

In [13]:
# Scratch check: number of indices in range(0, 4), the same range the
# extraction loop below iterates over
len(range(0,4))

4

In [14]:
# Extracting tables into dataframes (one frame per page, keyed by page index)
df_list = {}
for page_no in range(4):
    rows = pdf.pages[page_no].extract_table()
    # Row 1 supplies the initial header; the data is taken from row 3 onwards
    page_df = pd.DataFrame(rows[3:], columns=rows[1])
    # Relabel the nine extracted columns; only three carry a real name
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard columns that hold no values at all
    df_list[page_no] = page_df.dropna(axis=1, how="all")

In [15]:
# Stack the per-page frames row-wise into one frame with a fresh 0..n-1 index
df_all_raw = pd.concat(objs=df_list, axis=0, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,31 May,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
1,31 May \nto 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
2,31 May,0645h to 1700h,Takashimaya (391A Orchard Road) \n• St. Leaven
3,31 May,1140h to 1500h,Marina Bay Sands (2 Bayfront Avenue) \nChurch’s
4,31 May,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
5,31 May \nto 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \nGuardian
6,31 May,1835h to 1920h,Woodlands Galaxy Community Club (31 Woodlands \nAvenue 6)
7,31 May,1940h to 2010h,Sheng Siong Supermarket (446A Fajar Road)
8,31 May,2005h to 0005h,Marina Bay Sands Casino (10 Bayfront Avenue)
9,1 Jun,0735h to 0810h,Sheng Siong Supermarket (446A Fajar Road)


In [16]:
## Data Cleaning
# NOTE: the row labels used below (3, 5, 22 and 23) are hard-coded for the frame
# built from this particular PDF; re-check them if the source file changes.
# NOTE: df_all_raw is updated in place, so it no longer matches the raw
# extraction displayed by the previous cell once this cell has run.

# Replacing values that need to be reformatted
df_all_raw.loc[3, 'Location'] = "Marina Bay Sands (2 Bayfront Avenue): Church’s"
df_all_raw.loc[5, 'Location'] = "ION Orchard (2 Orchard Turn): Guardian"
df_all_raw.loc[22, 'Location'] = "ION Orchard (2 Orchard Turn): Four Leaves"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern is a plain substring, so regex=False is passed explicitly: the
# result then does not depend on the pandas version's default for `regex`, and
# it matches the explicit literal replacement used in the Address cell.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)
df_all_raw['Location'] = (
    df_all_raw['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # First bullet becomes the location/store separator ...
    .str.replace(" • ", ":", n=1, regex=False)
    # ... and any remaining bullets become commas
    .str.replace(" • ", ",", regex=False)
    .str.replace(" :", ":", regex=False)
    # Fix the address misspelling present in the source data
    .str.replace("1 Habourfront Walk", "1 Harbourfront Walk", regex=False)
)

# Removing redundant rows
df_all_clean = df_all_raw.drop([23]).reset_index(drop=True)


In [17]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell result
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,31 May,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
1,31 May to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
2,31 May,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
3,31 May,1140h to 1500h,Marina Bay Sands (2 Bayfront Avenue): Church's
4,31 May,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
5,31 May to 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
6,31 May,1835h to 1920h,Woodlands Galaxy Community Club (31 Woodlands Avenue 6)
7,31 May,1940h to 2010h,Sheng Siong Supermarket (446A Fajar Road)
8,31 May,2005h to 0005h,Marina Bay Sands Casino (10 Bayfront Avenue)
9,1 Jun,0735h to 0810h,Sheng Siong Supermarket (446A Fajar Road)


(72, 3)

In [18]:
# Splitting Store info into separate column
# Work on an independent copy so df_all_clean itself is left untouched
df_all_clean_split = df_all_clean.copy(deep=True)
# Split once on the first ': ' -> column 0 is the location, column 1 the store
location_parts = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Location'] = location_parts[0]
df_all_clean_split['Store'] = location_parts[1].str.strip()

In [19]:
# Splitting Address info into separate column
# Split once on the first "(": the text before it is the location name, the
# text after it is the address (still carrying the closing parenthesis).
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The split leaves the blank that preceded "(" at the end of the name; strip it
# so Location is whitespace-clean like Store and Address.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,31 May,0000h to 2359h,Tuas Amenity Centre,,71 Pioneer Road
1,31 May to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
2,31 May,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
3,31 May,1140h to 1500h,Marina Bay Sands,Church's,2 Bayfront Avenue
4,31 May,1200h to 1900h,Atatcutz Singapore,,348 Bedok Road
5,31 May to 1 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
6,31 May,1835h to 1920h,Woodlands Galaxy Community Club,,31 Woodlands Avenue 6
7,31 May,1940h to 2010h,Sheng Siong Supermarket,,446A Fajar Road
8,31 May,2005h to 0005h,Marina Bay Sands Casino,,10 Bayfront Avenue
9,1 Jun,0735h to 0810h,Sheng Siong Supermarket,,446A Fajar Road


In [20]:
# Persist the split frame as CSV; the index is not suppressed, so it is written too
df_all_clean_split.to_csv(path_or_buf='../data/visit_data_raw.csv')