In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the raw SDWA facilities extract. dtype=str reads every column as text,
# so pandas performs no numeric type inference on the raw values.
facilities_df = pd.read_csv('../data/ECHO_data_csv/SDWA_FACILITIES.csv', dtype=str)

In [3]:
# Preview the first rows to see which columns the extract contains.
facilities_df.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,FACILITY_ID,FACILITY_NAME,STATE_FACILITY_ID,FACILITY_ACTIVITY_CODE,FACILITY_DEACTIVATION_DATE,FACILITY_TYPE_CODE,SUBMISSION_STATUS_CODE,IS_SOURCE_IND,WATER_TYPE_CODE,AVAILABILITY_CODE,SELLER_TREATMENT_CODE,SELLER_PWSID,SELLER_PWS_NAME,FILTRATION_STATUS_CODE,IS_SOURCE_TREATED_IND,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,2021Q2,MO4201034,104814,300 GAL HYDROPNEUMATIC TANK,ST 12540,A,,ST,Y,N,,,,,,,,05/24/2014,06/01/2021
1,2021Q2,MO3281046,105032,119 GAL BLADDER TANK,ST 64182,A,,ST,Y,N,,,,,,,,05/24/2014,03/26/2020
2,2021Q2,CA5000392,1,WELL,,I,01/01/1990,WL,Y,Y,GW,P,,,,,U,,02/13/2002
3,2021Q2,CA5000393,1,WELL,,I,01/01/1996,WL,Y,Y,GW,P,,,,,U,,02/13/2002
4,2021Q2,MO5048992,104963,TWO 5HP BOOSTER PUMPS,PF 91051,A,,PF,Y,N,,,,,,,,05/24/2014,06/01/2021


In [4]:
# shape[0] is the number of rows in the loaded table.
print(f'Total number of records in facilities table is {facilities_df.shape[0]}')

Total number of records in facilities table is 1449838


In [5]:
# Count the DISTINCT (PWSID, FACILITY_ID) pairs. The pair is a unique key only
# if this number equals the total row count reported above.
# (Summing the group sizes, as an earlier version of this check did, always
# reproduces the row count whether or not duplicates exist, so it proves nothing.)
len(facilities_df[['PWSID', 'FACILITY_ID']].drop_duplicates())

1449838

The column set {PWSID, FACILITY_ID} uniquely identifies each record

In [6]:
# Number of missing values in each column.
facilities_df.isnull().sum()

SUBMISSIONYEARQUARTER               0
PWSID                               0
FACILITY_ID                         0
FACILITY_NAME                   29333
STATE_FACILITY_ID              470665
FACILITY_ACTIVITY_CODE              0
FACILITY_DEACTIVATION_DATE     912446
FACILITY_TYPE_CODE                  0
SUBMISSION_STATUS_CODE              0
IS_SOURCE_IND                       0
WATER_TYPE_CODE                756323
AVAILABILITY_CODE              730788
SELLER_TREATMENT_CODE         1403679
SELLER_PWSID                  1405727
SELLER_PWS_NAME               1405775
FILTRATION_STATUS_CODE        1417823
IS_SOURCE_TREATED_IND          953488
FIRST_REPORTED_DATE            851633
LAST_REPORTED_DATE               9926
dtype: int64

In [7]:
# Inspect the raw string format of the three date columns.
date_columns = ['FACILITY_DEACTIVATION_DATE', 'FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']
facilities_df[date_columns].head()

Unnamed: 0,FACILITY_DEACTIVATION_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,,05/24/2014,06/01/2021
1,,05/24/2014,03/26/2020
2,01/01/1990,,02/13/2002
3,01/01/1996,,02/13/2002
4,,05/24/2014,06/01/2021


In [8]:
# Convert each date column from its MM/DD/YYYY text form to datetime64,
# overwriting the column in place on facilities_df.
DATE_FORMAT = '%m/%d/%Y'
for column_name in date_columns:
    raw_values = facilities_df[column_name]
    facilities_df[column_name] = pd.to_datetime(raw_values, format=DATE_FORMAT)

In [9]:
# Preview the date columns again to confirm the conversion result.
facilities_df[date_columns].head()

Unnamed: 0,FACILITY_DEACTIVATION_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,NaT,2014-05-24,2021-06-01
1,NaT,2014-05-24,2020-03-26
2,1990-01-01,NaT,2002-02-13
3,1996-01-01,NaT,2002-02-13
4,NaT,2014-05-24,2021-06-01


In [10]:
# Print the earliest and latest value found in each date column.
for date_col in date_columns:
    values = facilities_df[date_col]
    earliest, latest = values.min(), values.max()
    print('Range of {} is between {} and {}'.format(date_col, earliest, latest))

Range of FACILITY_DEACTIVATION_DATE is between 1900-02-01 00:00:00 and 2022-10-31 00:00:00
Range of FIRST_REPORTED_DATE is between 2005-10-27 00:00:00 and 2021-07-01 00:00:00
Range of LAST_REPORTED_DATE is between 1995-07-22 00:00:00 and 2021-07-01 00:00:00


In [11]:
# Number of columns in the facilities table.
len(facilities_df.columns)

19

### Web scraping column descriptions and data types

In [12]:
def get_html():
    """Download the ECHO SDWA download-summary page and return it parsed.

    Returns:
        BeautifulSoup: the page content parsed with the 'html.parser' backend.

    Raises:
        requests.exceptions.RequestException: on connection failure, on timeout,
            or when the server answers with an HTTP error status.
    """
    URL = "https://echo.epa.gov/tools/data-downloads/sdwa-download-summary"
    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    r = requests.get(URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    r.raise_for_status()
    html_soup = BeautifulSoup(r.content, 'html.parser')
    return html_soup
html_soup = get_html()

In [13]:
# Build `column_datatype`: the data-type table that follows the <h3> heading
# mentioning `filename`, restricted to rows whose Element is a facilities_df column.
filename = 'SDWA_FACILITIES.csv'

# Resolve the content container and its headings once instead of re-querying
# the whole document for every loop.
content_div = html_soup.find_all("div", {"class": 'field-item even'})[0]
headings = content_div.find_all('h3')

idx = next((i for i, item in enumerate(headings) if filename in item.getText()), None)
if idx is None:
    # Without this guard a changed page layout surfaces as a bare IndexError.
    raise ValueError(f'No <h3> heading mentioning {filename!r} was found on the page')

datatype_table = headings[idx].find_next_sibling('table')

# Non-breaking spaces (\xa0) are stripped from every header and cell text.
cols = [th.get_text().replace(u'\xa0', u'') for th in datatype_table.find("thead").find_all("th")]
rows = [
    [td.get_text().replace(u'\xa0', u'') for td in row.find_all('td')]
    for row in datatype_table.find("tbody").find_all("tr")
]

column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(facilities_df.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,FACILITY_ID,Char,12.0
3,FACILITY_NAME,Char,100.0
4,STATE_FACILITY_ID,Char,40.0
5,FACILITY_ACTIVITY_CODE,Char,1.0
6,FACILITY_DEACTIVATION_DATE,Date,
7,FACILITY_TYPE_CODE,Char,2.0
8,SUBMISSION_STATUS_CODE,Char,4.0
9,IS_SOURCE_IND,Char,1.0


In [28]:
# Map each facilities_df column name to the text of the paragraph describing it.
# A paragraph qualifies when it contains exactly one <strong> tag whose text
# (with non-breaking spaces removed) is a facilities_df column name.
data_dictionary = {}
for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p'):
    strong_tags = p.find_all('strong')
    if len(strong_tags) != 1:
        continue
    element = strong_tags[0].getText().replace(u'\xa0', u'')
    if element in list(facilities_df.columns):
        data_dictionary[element] = p.getText().replace(u'\xa0', u' ')

# Show full description text instead of truncating long cells.
pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame(list(data_dictionary.values()), columns=['raw_desc'])

# Split on the first hyphen only, so hyphens inside the description are kept.
# `n` must be passed by keyword: pandas 2.x no longer accepts it positionally.
col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', n=1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,AVAILABILITY_CODE,A single-character code for how the water source is utilized by a water system.
1,FACILITY_ACTIVITY_CODE,A single-character code identifying the current status of the facility.
2,FACILITY_DEACTIVATION_DATE,The date the facility was deactivated (no longer actively serving water) or removed from federal oversight because it no longer met SDWA criteria as a public water system (MM/DD/YYYY format).
3,FACILITY_ID,"Facility ID that, when used with the PWSID, uniquely identifies a water system facility."
4,FACILITY_NAME,The name of the water system facility.
5,FACILITY_TYPE_CODE,Code identifying the type of facility.
6,FILTRATION_STATUS_CODE,A code reported by the state to indicate whether a non-emergency surface water source or a non-emergency ground water under the influence of surface water source is required to install filtration by a certain date or is successfully avoiding filtration.
7,FIRST_REPORTED_DATE,The first reported date for the milestone event. The date format is MM/DD/YYYY.
8,IS_SOURCE_IND,"Indicates whether the water system facility is designated a source (either a Consecutive Connection (CC), Infiltration Gallery (IG), Intake (IN), Non-piped (NP), Roof Catchment (RC), Reservoir (RS), Spring (SP), Well (WL), or Non-piped non-purchased (NN))."
9,IS_SOURCE_TREATED_IND,Indicates whether the water system source water is being treated or not.


### Generating the SQL CREATE TABLE statement for the 'facilities' table, which stores facility details

In [30]:
temp_dict = {}

def space(n):
    """Return the padding that follows a column name of length `n`.

    The padding brings every name out to the width of the longest
    facilities_df column name plus five characters.
    """
    widest_name = max(len(name) for name in facilities_df.columns)
    return ' ' * (widest_name + 5 - n)

# Translate each scraped (Element, Data Type, Length) row into a SQL column type.
for col_name, scraped_type, length in zip(
    column_datatype['Element'], column_datatype['Data Type'], column_datatype['Length']
):
    if scraped_type == 'Char':
        # Use the scraped length when present, otherwise fall back to 255.
        sql_type = 'VARCHAR(' + length + ')' if length != '' else 'VARCHAR(255)'
    elif scraped_type == 'Date':
        sql_type = 'DATE'
    else:
        sql_type = 'INT'
    temp_dict[col_name] = sql_type

# Emit one line per column in facilities_df order; every line except the one
# for the last column name ends with a comma.
last_name = facilities_df.columns[-1]
print('CREATE TABLE FACILITIES (')
for name in facilities_df.columns:
    line_end = ',\n' if name != last_name else '\n'
    print(f'\t{name}{space(len(name))}{temp_dict[name]}', end=line_end)
print(') ENGINE = InnoDB;')

CREATE TABLE FACILITIES (
	SUBMISSIONYEARQUARTER          VARCHAR(6),
	PWSID                          VARCHAR(9),
	FACILITY_ID                    VARCHAR(12),
	FACILITY_NAME                  VARCHAR(100),
	STATE_FACILITY_ID              VARCHAR(40),
	FACILITY_ACTIVITY_CODE         VARCHAR(1),
	FACILITY_DEACTIVATION_DATE     DATE,
	FACILITY_TYPE_CODE             VARCHAR(2),
	SUBMISSION_STATUS_CODE         VARCHAR(4),
	IS_SOURCE_IND                  VARCHAR(1),
	WATER_TYPE_CODE                VARCHAR(2),
	AVAILABILITY_CODE              VARCHAR(1),
	SELLER_TREATMENT_CODE          VARCHAR(4),
	SELLER_PWSID                   VARCHAR(9),
	SELLER_PWS_NAME                VARCHAR(100),
	FILTRATION_STATUS_CODE         VARCHAR(4),
	IS_SOURCE_TREATED_IND          VARCHAR(1),
	FIRST_REPORTED_DATE            DATE,
	LAST_REPORTED_DATE             DATE
) ENGINE = InnoDB;


In [32]:
# Write the table sorted by (PWSID, FACILITY_ID); index=False keeps the DataFrame index out of the CSV.
facilities_df.sort_values(['PWSID', 'FACILITY_ID'], ascending=[True, True]).to_csv('../data/processed_data/FACILITIES.csv', index=False)

In [36]:
# Print a MySQL LOAD DATA statement for FACILITIES.csv. Each CSV field is read
# into a user variable (@COLUMN) and then assigned in the SET clause, so empty
# strings become NULL and date columns are parsed with STR_TO_DATE.
columns = facilities_df.columns.to_list()

date_columns = ['FACILITY_DEACTIVATION_DATE', 'FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/FACILITIES.csv'
INTO TABLE FACILITIES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')

# Field list: one user variable per CSV column, in file order.
print('(' + ','.join(f'@{col}' for col in columns) + ')')

print('SET')
set_clauses = []
for col in columns:
    if col in date_columns:
        set_clauses.append(f"{col} = IF(@{col} = '', NULL, STR_TO_DATE(@{col}, '%Y-%m-%d'))")
    else:
        set_clauses.append(f"{col} = IF(@{col} = '', NULL, @{col})")

# Join with commas so the final assignment has no trailing comma, then
# terminate the statement; a dangling comma is a SQL syntax error.
print(',\n'.join(set_clauses) + ';')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/FACILITIES.csv'
INTO TABLE FACILITIES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@FACILITY_ID,@FACILITY_NAME,@STATE_FACILITY_ID,@FACILITY_ACTIVITY_CODE,@FACILITY_DEACTIVATION_DATE,@FACILITY_TYPE_CODE,@SUBMISSION_STATUS_CODE,@IS_SOURCE_IND,@WATER_TYPE_CODE,@AVAILABILITY_CODE,@SELLER_TREATMENT_CODE,@SELLER_PWSID,@SELLER_PWS_NAME,@FILTRATION_STATUS_CODE,@IS_SOURCE_TREATED_IND,@FIRST_REPORTED_DATE,@LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
FACILITY_ID = IF(@FACILITY_ID = '', NULL, @FACILITY_ID),
FACILITY_NAME = IF(@FACILITY_NAME = '', NULL, @FACILITY_NAME),
STATE_FACILITY_ID = IF(@STATE_FACILITY_ID = '', NULL, @STATE_FACILITY_ID),
FACILITY_ACTIVITY_CODE = IF(@FACILITY_ACTIVITY_CODE = '', NULL, @FACILITY_ACTIVITY_CODE),
FACILITY