## Set-up

In [1]:
import numpy as np, pandas as pd, pdfplumber

In [2]:
# Notebook display settings: never truncate cell text and never cap the number of rows shown
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', None)

## Extracting COVID-19 Case Visit Data from PDF

In [3]:
# Forward slashes resolve on every OS (Windows included) and avoid the invalid
# "\d" / "\m" escape sequences that the backslash form put inside a normal string
pdf = pdfplumber.open("../data/moh_annex_jun_28.pdf")

In [4]:
# Sanity check: number of indices produced by the 0..3 range used for the extraction loop
len(range(4))

4

In [6]:
# Build one dataframe per PDF page, keyed by the page's position (0-3)
df_list = {}
for page_no in range(0, 4):
    page_rows = pdf.pages[page_no].extract_table()
    # Rows from index 3 onward are used as the data; row 1 only serves as a
    # temporary header because the labels are replaced on the next line
    page_df = pd.DataFrame(page_rows[3:], columns=page_rows[1])
    page_df.columns = ['Date', '', '', 'Time', '', '', 'Location', '', '']
    # Discard the columns that hold no values at all
    df_list[page_no] = page_df.dropna(axis=1, how="all")

In [7]:
# Stack the per-page dataframes into a single dataframe with a fresh running index
page_frames = list(df_list.values())
df_all_raw = pd.concat(page_frames, ignore_index=True)
df_all_raw

Unnamed: 0,Date,Time,Location
0,14 Jun,0500h to 1620h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
1,14 Jun,0630h to 1600h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
2,14 Jun,0745h to 0820h,Eastpoint Mall (3 Simei Street 6) \n• NTUC FairPrice
3,14 Jun,1010h to 1400h,Katong Shopping Centre (865 Mountbatten Road) \n• Union United Employment Pte Ltd
4,14 Jun,1130h to 1205h,Teoh Hock Seng (129 Kim Tian Road)
5,14 Jun,1200h to 2100h,Bake King (10 Haig Road)
6,14 Jun,1430h to 1500h,Redhill Market (79 Redhill Lane)
7,14 Jun,1600h to 1700h,COURTS Megastore (50 Tampines North Drive 2)
8,14 Jun,1655h to 1830h,IKEA Tampines (60 Tampines North Drive 2)
9,14 Jun,1755h to 1925h,City Square Mall (180 Kitchener Road)


In [8]:
## Data Cleaning
# Clean a copy so that df_all_raw keeps the values exactly as extracted and this
# cell can be re-run safely (the "112 Jalan Bukit Merah" fix below would otherwise
# append its suffix again on every run).
df_all_clean = df_all_raw.copy()

# Replacing values that need to be reformatted
# NOTE(review): the row labels 25/78 (and 26/79 further down) are hard-coded for
# this particular PDF - re-check them whenever the input file changes.
df_all_clean.loc[25, 'Location'] = "Katong Shopping Centre (865 Mountbatten Road): Central Recruitment Solutions Pte Ltd"
df_all_clean.loc[78, 'Location'] = "Tampines Central Community Complex (866A Tampines Street 83): NTUC FairPrice"

# Cleaning line breaks, bullet markers and other inconsistencies in data.
# regex=False is passed on every call so each pattern is matched literally,
# independent of the pandas version's default for `regex`.
df_all_clean['Date'] = df_all_clean['Date'].str.replace("\n", "", regex=False)
df_all_clean['Location'] = (
    df_all_clean['Location']
    .str.replace("18/20 Kim Tian Road", "18 Kim Tian Road", regex=False)
    .str.replace("Toys “R” Us", "Toys 'R' Us", regex=False)
    .str.replace("Mulan (Kyawt Na Di)", "Mulan/Kyawt Na Di", regex=False)
    .str.replace("112 Jalan Bukit Merah Market and Food Centre",
                 "112 Jalan Bukit Merah Market and Food Centre (112 Jalan Bukit Merah)",
                 regex=False)
    .str.replace("’", "'", regex=False)
    .str.replace("\n", "", regex=False)
    # Only the first bullet becomes ":"; any further bullets become ","
    .str.replace(" • ", ":", n=1, regex=False)
    .str.replace(" • ", ",", regex=False)
    .str.replace(" :", ":", regex=False)
)

# Removing redundant rows
# (presumably the leftovers of the two entries rewritten by hand above - verify against the PDF)
df_all_clean = df_all_clean.drop([26, 79]).reset_index(drop=True)

In [9]:
# Render the cleaned dataframe, then report its (rows, columns) shape as the cell output
display(df_all_clean)
df_all_clean.shape

Unnamed: 0,Date,Time,Location
0,14 Jun,0500h to 1620h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
1,14 Jun,0630h to 1600h,Tiong Bahru Yong Tao Hu (56 Eng Hoon Street)
2,14 Jun,0745h to 0820h,Eastpoint Mall (3 Simei Street 6): NTUC FairPrice
3,14 Jun,1010h to 1400h,Katong Shopping Centre (865 Mountbatten Road): Union United Employment Pte Ltd
4,14 Jun,1130h to 1205h,Teoh Hock Seng (129 Kim Tian Road)
5,14 Jun,1200h to 2100h,Bake King (10 Haig Road)
6,14 Jun,1430h to 1500h,Redhill Market (79 Redhill Lane)
7,14 Jun,1600h to 1700h,COURTS Megastore (50 Tampines North Drive 2)
8,14 Jun,1655h to 1830h,IKEA Tampines (60 Tampines North Drive 2)
9,14 Jun,1755h to 1925h,City Square Mall (180 Kitchener Road)


(104, 3)

In [10]:
# Move everything after the first ": " of Location into a new Store column
df_all_clean_split = df_all_clean.copy(deep=True)
location_store_parts = df_all_clean_split['Location'].str.split(pat=': ', n=1, expand=True)
df_all_clean_split[['Location', 'Store']] = location_store_parts
# Trim surrounding whitespace from the store text
df_all_clean_split = df_all_clean_split.assign(Store=df_all_clean_split['Store'].str.strip())

In [11]:
# Splitting Address info into separate column
# NOTE: this cell rewrites df_all_clean_split in place; re-run the Store-splitting
# cell first if this cell has to be executed again.
df_all_clean_split[['Location','Address']] = df_all_clean_split['Location'].str.split(pat='(', n=1, expand=True)
# Splitting on "(" leaves the blank that preceded it at the end of the place name,
# so trim Location as well (Store and Address are already trimmed)
df_all_clean_split['Location'] = df_all_clean_split['Location'].str.strip()
# Drop the closing bracket of the address and trim what remains
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.replace(")", "", regex=False)
df_all_clean_split['Address'] = df_all_clean_split['Address'].str.strip()
df_all_clean_split

Unnamed: 0,Date,Time,Location,Store,Address
0,14 Jun,0500h to 1620h,Tiong Bahru Yong Tao Hu,,56 Eng Hoon Street
1,14 Jun,0630h to 1600h,Tiong Bahru Yong Tao Hu,,56 Eng Hoon Street
2,14 Jun,0745h to 0820h,Eastpoint Mall,NTUC FairPrice,3 Simei Street 6
3,14 Jun,1010h to 1400h,Katong Shopping Centre,Union United Employment Pte Ltd,865 Mountbatten Road
4,14 Jun,1130h to 1205h,Teoh Hock Seng,,129 Kim Tian Road
5,14 Jun,1200h to 2100h,Bake King,,10 Haig Road
6,14 Jun,1430h to 1500h,Redhill Market,,79 Redhill Lane
7,14 Jun,1600h to 1700h,COURTS Megastore,,50 Tampines North Drive 2
8,14 Jun,1655h to 1830h,IKEA Tampines,,60 Tampines North Drive 2
9,14 Jun,1755h to 1925h,City Square Mall,,180 Kitchener Road


In [12]:
# Persist the split visit data as CSV (with to_csv defaults, the row index is written too)
output_csv_path = '../data/visit_data_raw.csv'
df_all_clean_split.to_csv(output_csv_path)