## Set-up

In [2]:
import numpy as np, pandas as pd, pdfplumber

In [3]:
# Never truncate cell text or the number of rows when rendering dataframes
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [4]:
# Open the source PDF. Forward slashes avoid the invalid "\d" / "\m" escape
# sequences of the backslash form and work on every platform, matching the
# '../data/...' path style used when the CSV is written.
pdf = pdfplumber.open("../data/moh_annex_jun_2.pdf")

In [5]:
len(range(6))

6

In [6]:
# Build one dataframe per PDF page (pages 0-5), keyed by page index
table_columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
df_list = {}
for page_idx in range(6):
    raw_rows = pdf.pages[page_idx].extract_table()
    # Row 1 supplies the initial header and the data starts at row 3; the
    # header is then replaced by the fixed names above.
    page_frame = pd.DataFrame(raw_rows[3:], columns=raw_rows[1])
    page_frame.columns = table_columns
    # Discard the columns that contain no values at all
    df_list[page_idx] = page_frame.dropna(axis=1, how="all")

In [7]:
# Stack the per-page dataframes into a single frame with a fresh running index
df_all_raw = pd.concat(objs=df_list, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,19 to 21 \nMay,0800h to 2200h,Westgate (3 Gateway Drive)
1,19 to 21 \nMay,0930h to 2130h,Holland Piazza (3 Lorong Liput) \n Tiger Sugar
2,19 May,1030h to 2000h,Westgate (3 Westgate Drive) \n Fun Toast
3,19 to 30 \nMay,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
4,19 May,1420h to 1500h,Health Management Centre (809 French Road)
5,19 May,1420h to 1525h,Woodlands Civic Centre (900 South Woodlands Drive) \n Workforce Singapore’s Careers Connect
6,19 May,1430h to 1535h,NTUC FairPrice (301 Yishun Avenue 2)
7,19 May,1515h to 1550h,Causeway Point (1 Woodlands Square) \n Tonkotsu Hototogisu
8,19 May,1520h to 2100h,Westgate (3 Gateway Drive) \n Pizza Hut
9,19 May,1600h to 2300h,Punggol Plaza (168 Punggol Field)


In [9]:
## Data Cleaning
# Work on a copy: df_all_raw keeps the untouched extraction, and this cell can
# be re-run without compounding its own edits.
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted (keys are row labels of the concatenated frame)
location_overrides = {
    26: "Al Forno East Coast Pte Ltd (400 East Coast Road)",
    44: "Boon Lay Shopping Centre (221 Boon Lay Place): NTUC FairPrice",
    70: "Funan Mall (107 North Bridge Road): Popsical",
}
for row_label, location_text in location_overrides.items():
    df_all_clean.loc[row_label, 'Location'] = location_text

# Cleaning unknown characters ("\uf0b7"), line breaks ("\n") and other inconsistencies in data.
# Every pattern is a literal string, so regex=False is passed explicitly; the
# result then does not depend on the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # The first bullet separates the location from its stores; later bullets separate stores
    .str.replace(" \uf0b7", ":", n=1, regex=False)
    .str.replace(" \uf0b7", ",", regex=False)
    .str.replace(" :", ":", regex=False)
    # Normalise two spellings so the same place always has the same address text
    .str.replace("Westgate (3 Westgate Drive)", "Westgate (3 Gateway Drive)", regex=False)
    .str.replace("37 & 39 Sultan Gate", "39 Sultan Gate", regex=False)
)

# No rows are removed at this stage.

In [10]:
# Render the cleaned frame, then report its (rows, columns) shape as the cell's result
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,19 to 21 May,0800h to 2200h,Westgate (3 Gateway Drive)
1,19 to 21 May,0930h to 2130h,Holland Piazza (3 Lorong Liput): Tiger Sugar
2,19 May,1030h to 2000h,Westgate (3 Gateway Drive): Fun Toast
3,19 to 30 May,1320h to 1530h,NTUC FairPrice (301 Yishun Avenue 2)
4,19 May,1420h to 1500h,Health Management Centre (809 French Road)
5,19 May,1420h to 1525h,Woodlands Civic Centre (900 South Woodlands Drive): Workforce Singapore's Careers Connect
6,19 May,1430h to 1535h,NTUC FairPrice (301 Yishun Avenue 2)
7,19 May,1515h to 1550h,Causeway Point (1 Woodlands Square): Tonkotsu Hototogisu
8,19 May,1520h to 2100h,Westgate (3 Gateway Drive): Pizza Hut
9,19 May,1600h to 2300h,Punggol Plaza (168 Punggol Field)


(125, 3)

In [11]:
# Splitting Store info into separate column
df_all_clean_split = df_all_clean.copy(deep=True)
df_all_clean_split[['Location', 'Store']] = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split['Store'] = df_all_clean_split['Store'].str.strip()

In [12]:
# Splitting Address info (the text inside the parentheses) into separate column
location_and_address = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
df_all_clean_split[['Location', 'Address']] = location_and_address
# Splitting on "(" leaves the space that preceded it on the name ("Westgate "),
# so trim Location as well as Address.
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
df_all_clean_split['Address'] = (
    df_all_clean_split['Address']
    .str.replace(")", "", regex=False)
    .str.strip()
)
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,19 to 21 May,0800h to 2200h,Westgate,,3 Gateway Drive
1,19 to 21 May,0930h to 2130h,Holland Piazza,Tiger Sugar,3 Lorong Liput
2,19 May,1030h to 2000h,Westgate,Fun Toast,3 Gateway Drive
3,19 to 30 May,1320h to 1530h,NTUC FairPrice,,301 Yishun Avenue 2
4,19 May,1420h to 1500h,Health Management Centre,,809 French Road
5,19 May,1420h to 1525h,Woodlands Civic Centre,Workforce Singapore's Careers Connect,900 South Woodlands Drive
6,19 May,1430h to 1535h,NTUC FairPrice,,301 Yishun Avenue 2
7,19 May,1515h to 1550h,Causeway Point,Tonkotsu Hototogisu,1 Woodlands Square
8,19 May,1520h to 2100h,Westgate,Pizza Hut,3 Gateway Drive
9,19 May,1600h to 2300h,Punggol Plaza,,168 Punggol Field


In [13]:
# Write the split visit data to CSV
output_csv_path = '../data/visit_data_raw.csv'
df_all_clean_split.to_csv(output_csv_path)