## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Display settings: never truncate cell text and never elide rows when a frame is shown
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Open the source PDF. Forward slashes keep the path portable: the backslash
# form only resolved on Windows and contained the invalid escape sequences
# "\d" and "\m". This also matches the '../data/...' style used for the CSV output.
pdf = pdfplumber.open("../data/moh_annex_jun_7.pdf")

In [4]:
# Sanity check: range(0,4) yields 4 indices -- the same range is used to index pdf.pages when extracting tables
len(range(0,4))

4

In [5]:
# Extracting tables into dataframes
# One DataFrame per page (page indices 0-3), stored under its page index.
df_list = {}
for i in range(4):
    page_table = pdf.pages[i].extract_table()
    # Data starts at the fourth extracted row; the second row serves as a
    # provisional header that is replaced by the fixed nine-column layout below.
    page_df = pd.DataFrame(page_table[3:], columns=page_table[1])
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard the columns that hold no values at all
    df_list[i] = page_df.dropna(axis=1, how="all")

In [6]:
# Stack the per-page frames into one frame with a fresh, continuous index
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,24 May,0600h to 1500h,The Woodgrove (30 Woodlands Avenue 1) \n McDonald’s
1,24 to 28 \nMay,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
2,24 to 29 \nMay,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51) \n Fu Fa Food Court
3,24 May,1155h to 1625h,Ramada (16 Ah Hood Road)
4,24 May,1230h to 1300h,D'FoodStop@ABC (2 Jalan Bukit Merah)
5,24 May,1230h to 1300h,Fajar Shopping Centre (445 Fajar Road) \n Giant Supermarket
6,24 May,1235h to 1310h,NTUC FairPrice (30 Bendemeer Road)
7,24 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
8,24 May,1535h to 1640h,Funan (107 North Bridge Road) \n Popsical
9,24 May,1745h to 1845h,Yew Tee Community Club (20 Choa Chu Kang Street 52)


In [11]:
## Data Cleaning
# Work on a copy so the extracted frame (df_all_raw) is left exactly as it was
# displayed after extraction; only df_all_clean carries the corrections.
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted
# NOTE: the row labels 53 and 54 refer to positions in the concatenated raw
# table; re-check them if the source PDF or the extraction step changes.
df_all_clean.loc[53, 'Location'] = "Hougang Green Shopping Mall (21 Hougang Street 51): Pizza Hut"

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern is a literal string, so regex=False is passed explicitly: the
# default value of `regex` differs between pandas versions.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # The first marker becomes ":" (later used to split off the store) ...
    .str.replace(" \uf0b7", ":", n=1, regex=False)
    # ... and any further markers become "," separators.
    .str.replace(" \uf0b7", ",", regex=False)
    .str.replace(" :", ":", regex=False)
    .str.replace("Westgate (3 Westgate Drive)",
                 "Westgate (3 Gateway Drive)", regex=False)
)

# Removing redundant rows
df_all_clean = df_all_clean.drop([54]).reset_index(drop=True)


In [12]:
# Render the cleaned table, then report its (rows, columns) shape as the cell result
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,24 May,0600h to 1500h,The Woodgrove (30 Woodlands Avenue 1): McDonald's
1,24 to 28 May,0600h to 1900h,Food Hub (455 Sengkang West Avenue)
2,24 to 29 May,1000h to 2200h,Hougang Green Shopping Mall (21 Hougang Street 51): Fu Fa Food Court
3,24 May,1155h to 1625h,Ramada (16 Ah Hood Road)
4,24 May,1230h to 1300h,D'FoodStop@ABC (2 Jalan Bukit Merah)
5,24 May,1230h to 1300h,Fajar Shopping Centre (445 Fajar Road): Giant Supermarket
6,24 May,1235h to 1310h,NTUC FairPrice (30 Bendemeer Road)
7,24 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
8,24 May,1535h to 1640h,Funan (107 North Bridge Road): Popsical
9,24 May,1745h to 1845h,Yew Tee Community Club (20 Choa Chu Kang Street 52)


(90, 3)

In [13]:
# Splitting Store info into separate column
# Text after the first ": " is the store; the part before it stays in Location.
df_all_clean_split = df_all_clean.copy(deep=True)
store_split = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = store_split
# Trim stray whitespace around the store text
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [14]:
# Splitting Address info into separate column
# Everything before the first "(" is the location name; the rest is the address.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# The name keeps the blank that preceded "(" (e.g. "Ramada " instead of "Ramada"),
# so strip it too -- otherwise exact matching and grouping on Location misbehave.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing parenthesis and surrounding whitespace from the address
df_all_clean_split['Address'] = (
    df_all_clean_split['Address']
    .str.replace(")", "", regex=False)
    .str.strip()
)
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,24 May,0600h to 1500h,The Woodgrove,McDonald's,30 Woodlands Avenue 1
1,24 to 28 May,0600h to 1900h,Food Hub,,455 Sengkang West Avenue
2,24 to 29 May,1000h to 2200h,Hougang Green Shopping Mall,Fu Fa Food Court,21 Hougang Street 51
3,24 May,1155h to 1625h,Ramada,,16 Ah Hood Road
4,24 May,1230h to 1300h,D'FoodStop@ABC,,2 Jalan Bukit Merah
5,24 May,1230h to 1300h,Fajar Shopping Centre,Giant Supermarket,445 Fajar Road
6,24 May,1235h to 1310h,NTUC FairPrice,,30 Bendemeer Road
7,24 May,1320h to 1530h,NTUC FairPrice,,301 Yishun Avenue 2
8,24 May,1535h to 1640h,Funan,Popsical,107 North Bridge Road
9,24 May,1745h to 1845h,Yew Tee Community Club,,20 Choa Chu Kang Street 52


In [16]:
# Persist the split table to CSV; the DataFrame index is written as well since index=False is not passed
df_all_clean_split.to_csv('../data/visit_data_raw.csv')