## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Lift pandas' display limits: show full cell contents and every row
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Forward slashes resolve on every OS (Windows included) and avoid the invalid
# "\d" / "\m" escape sequences that the backslash form of this path contained.
pdf = pdfplumber.open("../data/moh_annex_jun_13.pdf")

In [4]:
# Scratch check: number of indices in the range used by the extraction loop
len(range(3))

3

In [5]:
# Build one DataFrame per PDF page, keyed by page index
df_list = {}
for page_no in range(3):
    rows = pdf.pages[page_no].extract_table()
    # Row 1 of the extracted table is used as the initial header and rows 3
    # onward as the records; the header is then replaced by fixed names.
    frame = pd.DataFrame(rows[3:], columns=rows[1])
    frame.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard columns in which every cell is missing
    df_list[page_no] = frame.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames, in insertion order, into a single frame
# with a fresh 0..n-1 row index
page_frames = [df_list[key] for key in df_list]
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,30 May \nto 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn) \n Guardian
1,30 to 31 \nMay,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
2,30 to 31 \nMay,0645h to 1700h,Takashimaya (391A Orchard Road) \n St. Leaven
3,30 to 31 \nMay,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
4,30 May \nto 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
5,30 May,1930h to 0000h,NTUC Foodfare (308 Anchorvale Road)
6,30 May,1100h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n Pizza Hut
7,30 May,1125h to 1200h,Ang Mo Supermarket (642 Hougang Avenue 8)
8,30 May,1200h to 1300h,Bugis Street (3 New Bugis Street)
9,30 May,1350h to 1440h,NTUC FairPrice (37 Teban Gardens Road)


In [7]:
## Data Cleaning
# NOTE: the assignments below modify df_all_raw in place; df_all_clean is
# derived from the modified frame at the end of this cell.

# Replacing values that need to be reformatted
# (row label 21 is hard-coded -- NOTE(review): confirm it still points at the
# intended row if the source PDF changes)
df_all_raw.loc[21, 'Location'] = "Yew Tee Point (21 Choa Chu Kang North 6): NTUC FairPrice"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern here is a plain literal, so regex=False is passed explicitly:
# the result no longer depends on the pandas version's default for `regex`.
df_all_raw['Date'] = df_all_raw['Date'].str.replace("\n", "", regex=False)
df_all_raw['Time'] = df_all_raw['Time'].str.replace("1030h to 1900j",
                                                    "1030h to 1900h",
                                                    regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("’", "'", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace("\n", "", regex=False)
# The first " \uf0b7" in a cell becomes ":"; any further ones become ","
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" \uf0b7", ":", n=1, regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" \uf0b7", ",", regex=False)
df_all_raw['Location'] = df_all_raw['Location'].str.replace(" :", ":", regex=False)

# Removing redundant rows
# (row label 22 is hard-coded -- NOTE(review): presumably the leftover of the
# entry rewritten at label 21 above; verify against the PDF)
df_all_clean = df_all_raw.drop([22])
df_all_clean = df_all_clean.reset_index(drop=True)


In [8]:
# Render the cleaned frame, then leave its (rows, columns) shape as the cell's output
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,30 May to 1 Jun,1230h to 2030h,ION Orchard (2 Orchard Turn): Guardian
1,30 to 31 May,0000h to 2359h,Tuas Amenity Centre (71 Pioneer Road)
2,30 to 31 May,0645h to 1700h,Takashimaya (391A Orchard Road): St. Leaven
3,30 to 31 May,1200h to 1900h,Atatcutz Singapore (348 Bedok Road)
4,30 May to 7 Jun,0600h to 1900h,ION Orchard (2 Orchard Turn)
5,30 May,1930h to 0000h,NTUC Foodfare (308 Anchorvale Road)
6,30 May,1100h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Pizza Hut
7,30 May,1125h to 1200h,Ang Mo Supermarket (642 Hougang Avenue 8)
8,30 May,1200h to 1300h,Bugis Street (3 New Bugis Street)
9,30 May,1350h to 1440h,NTUC FairPrice (37 Teban Gardens Road)


(58, 3)

In [9]:
# Move the store name (the text after the first ": ") out of Location
# into its own Store column, working on a deep copy of the cleaned frame
df_all_clean_split = df_all_clean.copy(deep=True)
location_and_store = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_and_store
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [10]:
# Splitting Address info into separate column
# Location is cut at the first "(": the left part stays in Location and the
# right part becomes Address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# Cutting "Name (address)" at "(" leaves the space before the bracket on the
# name, so Location is stripped as well (Store and Address already are).
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing bracket(s) and surrounding whitespace from the address
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,30 May to 1 Jun,1230h to 2030h,ION Orchard,Guardian,2 Orchard Turn
1,30 to 31 May,0000h to 2359h,Tuas Amenity Centre,,71 Pioneer Road
2,30 to 31 May,0645h to 1700h,Takashimaya,St. Leaven,391A Orchard Road
3,30 to 31 May,1200h to 1900h,Atatcutz Singapore,,348 Bedok Road
4,30 May to 7 Jun,0600h to 1900h,ION Orchard,,2 Orchard Turn
5,30 May,1930h to 0000h,NTUC Foodfare,,308 Anchorvale Road
6,30 May,1100h to 2200h,Hougang Green Shopping Mall,Pizza Hut,21 Hougang Street 51
7,30 May,1125h to 1200h,Ang Mo Supermarket,,642 Hougang Avenue 8
8,30 May,1200h to 1300h,Bugis Street,,3 New Bugis Street
9,30 May,1350h to 1440h,NTUC FairPrice,,37 Teban Gardens Road


In [11]:
# Write the split frame to CSV. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first column.
df_all_clean_split.to_csv('../data/visit_data_raw.csv')