In [321]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [322]:
# Load the SDWA public water systems extract; dtype=str makes pandas read
# every column as text.
water_systems_df = pd.read_csv(
    '../data/ECHO_data_csv/SDWA_PUB_WATER_SYSTEMS.csv',
    dtype=str,
)

In [324]:
# Dimensions of the loaded frame as (rows, columns).
water_systems_df.shape

(425899, 51)

In [297]:
## Inspect the raw date formats
# Columns of the frame that hold dates.
date_columns = [
    'PWS_DEACTIVATION_DATE',
    'FIRST_REPORTED_DATE',
    'LAST_REPORTED_DATE',
    'SOURCE_PROTECTION_BEGIN_DATE',
    'OUTSTANDING_PERFORM_BEGIN_DATE',
    'REDUCED_MONITORING_BEGIN_DATE',
    'REDUCED_MONITORING_END_DATE',
]

# Show only rows where these four dates are all present, so the preview
# contains actual values rather than mostly missing ones.
water_systems_df.dropna(
    subset=[
        'PWS_DEACTIVATION_DATE',
        'SOURCE_PROTECTION_BEGIN_DATE',
        'OUTSTANDING_PERFORM_BEGIN_DATE',
        'REDUCED_MONITORING_BEGIN_DATE',
    ]
)[date_columns].head()

Unnamed: 0,PWS_DEACTIVATION_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE,SOURCE_PROTECTION_BEGIN_DATE,OUTSTANDING_PERFORM_BEGIN_DATE,REDUCED_MONITORING_BEGIN_DATE,REDUCED_MONITORING_END_DATE
247169,09/30/2016,12/15/1988,05/18/2021,10/02/2003,01/01/1987,04/01/2016,
247189,04/27/2016,01/23/1991,05/18/2021,10/15/2003,01/01/1987,04/01/2016,
247190,03/31/2016,01/23/1991,05/18/2021,10/15/2003,01/01/1987,04/01/2016,
247636,04/11/2018,03/05/1992,05/18/2021,10/15/2003,01/01/1987,04/01/2016,
247645,03/05/2021,02/05/1999,05/18/2021,10/15/2003,01/01/1987,04/01/2016,


In [298]:
# First five distinct non-null REDUCED_MONITORING_END_DATE values, in order
# of first appearance.
water_systems_df['REDUCED_MONITORING_END_DATE'].dropna().unique().tolist()[:5]

['09/30/2018', '12/31/2018', '12/31/2019', '10/31/2017', '07/31/2019']

In [300]:
# Parse each date column from MM/DD/YYYY text into datetimes, in place.
DATE_FORMAT = '%m/%d/%Y'
for column in date_columns:
    parsed = pd.to_datetime(water_systems_df[column], format=DATE_FORMAT)
    water_systems_df[column] = parsed

In [301]:
# Preview the date columns for rows where these four dates are all present.
water_systems_df.dropna(
    subset=[
        'PWS_DEACTIVATION_DATE',
        'SOURCE_PROTECTION_BEGIN_DATE',
        'OUTSTANDING_PERFORM_BEGIN_DATE',
        'REDUCED_MONITORING_BEGIN_DATE',
    ]
)[date_columns].head()

Unnamed: 0,PWS_DEACTIVATION_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE,SOURCE_PROTECTION_BEGIN_DATE,OUTSTANDING_PERFORM_BEGIN_DATE,REDUCED_MONITORING_BEGIN_DATE,REDUCED_MONITORING_END_DATE
247169,2016-09-30,1988-12-15,2021-05-18,2003-10-02,1987-01-01,2016-04-01,NaT
247189,2016-04-27,1991-01-23,2021-05-18,2003-10-15,1987-01-01,2016-04-01,NaT
247190,2016-03-31,1991-01-23,2021-05-18,2003-10-15,1987-01-01,2016-04-01,NaT
247636,2018-04-11,1992-03-05,2021-05-18,2003-10-15,1987-01-01,2016-04-01,NaT
247645,2021-03-05,1999-02-05,2021-05-18,2003-10-15,1987-01-01,2016-04-01,NaT


In [302]:
# Report the earliest and latest value found in each date column.
RANGE_TEMPLATE = 'Range of {} is between {} and {}'
for column in date_columns:
    values = water_systems_df[column]
    print(RANGE_TEMPLATE.format(column, values.min(), values.max()))

Range of PWS_DEACTIVATION_DATE is between 1900-02-01 00:00:00 and 2021-06-30 00:00:00
Range of FIRST_REPORTED_DATE is between 1979-02-10 00:00:00 and 2021-07-01 00:00:00
Range of LAST_REPORTED_DATE is between 1995-07-22 00:00:00 and 2021-07-01 00:00:00
Range of SOURCE_PROTECTION_BEGIN_DATE is between 1986-12-31 00:00:00 and 2021-06-09 00:00:00
Range of OUTSTANDING_PERFORM_BEGIN_DATE is between 1987-01-01 00:00:00 and 2021-06-17 00:00:00
Range of REDUCED_MONITORING_BEGIN_DATE is between 1990-03-01 00:00:00 and 2021-05-07 00:00:00
Range of REDUCED_MONITORING_END_DATE is between 2016-04-01 00:00:00 and 2021-03-31 00:00:00


In [304]:
# Preview the first five rows of the frame.
water_systems_df.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,PWS_NAME,PRIMACY_AGENCY_CODE,EPA_REGION,SEASON_BEGIN_DATE,SEASON_END_DATE,PWS_ACTIVITY_CODE,PWS_DEACTIVATION_DATE,PWS_TYPE_CODE,...,LAST_REPORTED_DATE,STATE_CODE,SOURCE_WATER_PROTECTION_CODE,SOURCE_PROTECTION_BEGIN_DATE,OUTSTANDING_PERFORMER,OUTSTANDING_PERFORM_BEGIN_DATE,REDUCED_RTCR_MONITORING,REDUCED_MONITORING_BEGIN_DATE,REDUCED_MONITORING_END_DATE,SEASONAL_STARTUP_SYSTEM
0,2021Q2,OK3000805,CYRIL,OK,6,,,A,NaT,CWS,...,2021-06-24,OK,N,NaT,,NaT,,NaT,NaT,
1,2021Q2,OK1010840,FALLS CREEK BAPTIST CONFERENCE CENTER,OK,6,,,A,NaT,CWS,...,2021-06-24,OK,N,NaT,,NaT,,NaT,NaT,
2,2021Q2,OK1010822,DAVIS,OK,6,,,A,NaT,CWS,...,2021-06-24,OK,Y,2009-08-10,,NaT,,NaT,NaT,
3,2021Q2,OK1010824,DOUGHERTY,OK,6,,,A,NaT,CWS,...,2021-06-24,OK,Y,2009-08-10,,NaT,,NaT,NaT,
4,2021Q2,OK2000808,EAKLY DEVELOPMENT CORP,OK,6,,,A,NaT,CWS,...,2021-06-24,OK,N,NaT,,NaT,,NaT,NaT,


### Web scraping column descriptions and data types

In [325]:
def get_html(url="https://echo.epa.gov/tools/data-downloads/sdwa-download-summary", timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Page to fetch. Defaults to the ECHO SDWA download summary page.
    timeout : float
        Seconds to wait for the server before giving up, so the cell cannot
        hang indefinitely on a stalled connection.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the 'html.parser' backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # breaking later in the scraping cells.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


html_soup = get_html()

In [326]:
# Locate the table that is the nearest previous sibling of the 15th <h3>
# inside the first 'field-item even' div.
# NOTE(review): index 14 is tied to the page layout at scrape time; presumably
# this is the SDWA_PUB_WATER_SYSTEMS layout table. Confirm if the page changes.
content_div = html_soup.find_all("div", {"class": 'field-item even'})[0]
datatype_table = content_div.find_all('h3')[14].find_previous_sibling('table')

# Header cells, with non-breaking spaces removed.
cols = [
    th.get_text().replace(u'\xa0', u'')
    for th in datatype_table.find("thead").find_all("th")
]

# One list of cell texts per body row; the misspelled element name
# 'SERVICE_CONECTIONS_COUNT' is replaced with 'SERVICE_CONNECTIONS_COUNT'.
rows = [
    [
        td.get_text().replace('SERVICE_CONECTIONS_COUNT','SERVICE_CONNECTIONS_COUNT').replace(u'\xa0', u'')
        for td in tr.find_all('td')
    ]
    for tr in datatype_table.find("tbody").find_all("tr")
]

column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,PWS_NAME,Char,100.0
3,PRIMACY_AGENCY_CODE,Char,2.0
4,EPA_REGION,Char,3.0
5,PWS_ACTIVITY_CODE,Char,1.0
6,SEASON_BEGIN_DATE,Char,5.0
7,SEASON_END_DATE,Char,5.0
8,PWS_DEACTIVATION_DATE,Date,
9,PWS_TYPE_CODE,Char,6.0


In [130]:
# Column names of the loaded CSV. Assigned here so this cell runs on a fresh
# kernel instead of relying on `columns` being defined by a later cell.
columns = water_systems_df.columns.to_list()

# Labels to keep: every CSV column, plus the combined label that is mapped
# to PWS_TYPE_CODE below.
wanted_labels = set(columns) | {'PWS_TYPE_CODE/PWS_TYPE_SHORT'}

# Map column name -> full paragraph text, for every <p> in the first
# 'field-item even' div that has exactly one <strong> tag whose text is a
# wanted label.
data_dictionary = {}
for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p'):
    strong_tags = p.find_all('strong')
    if len(strong_tags) != 1:
        continue
    label = strong_tags[0].getText().replace(u'\xa0', u'')
    if label not in wanted_labels:
        continue
    key = label.replace('PWS_TYPE_CODE/PWS_TYPE_SHORT', 'PWS_TYPE_CODE')
    data_dictionary[key] = p.getText().replace(u'\xa0', u' ')

pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame(list(data_dictionary.values()), columns=['raw_desc'])

# Split each paragraph at its first '-' into name and description. `n` is
# passed by keyword because pandas 2.x rejects it as a positional argument.
col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', n=1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc


Unnamed: 0,COLUMN,DESCRIPTION
0,ADMIN_NAME,Name of the water system administrative contact – usually a person’s name.
1,ADDRESS_LINE1,The first line of an address applicable to a legal entity.
2,ADDRESS_LINE2,"The second line of an address applicable to a legal entity. This field is the street address, rural route, etc."
3,ALT_PHONE_NUMBER,Administrative contact's alternative telephone number.
4,CDS_ID,Combined distribution system identifier.
5,CITY_NAME,The city in which a legal entity is located.
6,COUNTRY_CODE,Two-character abbreviation of the country code where the administrative contact is located.
7,DBPR_SCHEDULE_CAT_CODE,Stage 2 Disinfectant Byproducts Rule schedule category code.
8,EMAIL_ADDR,Email address of the administrative contact.
9,EPA_REGION,A two-character code identifying the EPA Region in which the system is located.


### Generating the CREATE TABLE statement for the 'pws' table that stores water system details

In [337]:
# Column order of the loaded CSV. Assigned here so this cell runs on a fresh
# kernel instead of relying on `columns` being defined by a later cell.
columns = water_systems_df.columns.to_list()

# Width of the name field in the generated DDL: longest column name plus 5.
# Computed once rather than on every call to space().
name_width = max(len(col) for col in columns) + 5


def space(n):
    """Return the run of spaces that follows a column name of length ``n``."""
    return ' ' * (name_width - n)


def sql_type(data_type, length):
    """Map a scraped (Data Type, Length) pair to a MySQL column type.

    'Char' becomes VARCHAR(<length>), or VARCHAR(255) when no length is
    given; 'Date' becomes DATE; anything else becomes INT.
    """
    if data_type == 'Char':
        return 'VARCHAR(' + (length if length not in ('', None) else '255') + ')'
    if data_type == 'Date':
        return 'DATE'
    return 'INT'


# Column name -> SQL type, taken from the scraped data-type table.
temp_dict = {
    element: sql_type(data_type, length)
    for element, data_type, length in zip(
        column_datatype['Element'],
        column_datatype['Data Type'],
        column_datatype['Length'],
    )
}

# One definition line per CSV column, in CSV order. Joining with ',\n' places
# the separators by position, so the result does not depend on comparing each
# name against the last one.
column_lines = [f'\t{col}{space(len(col))}{temp_dict[col]}' for col in columns]

print('CREATE TABLE pws (')
print(',\n'.join(column_lines))
print(') ENGINE = InnoDB;')

CREATE TABLE pws (
	SUBMISSIONYEARQUARTER              VARCHAR(6),
	PWSID                              VARCHAR(9),
	PWS_NAME                           VARCHAR(100),
	PRIMACY_AGENCY_CODE                VARCHAR(2),
	EPA_REGION                         VARCHAR(3),
	SEASON_BEGIN_DATE                  VARCHAR(5),
	SEASON_END_DATE                    VARCHAR(5),
	PWS_ACTIVITY_CODE                  VARCHAR(1),
	PWS_DEACTIVATION_DATE              DATE,
	PWS_TYPE_CODE                      VARCHAR(6),
	DBPR_SCHEDULE_CAT_CODE             INT,
	CDS_ID                             VARCHAR(100),
	GW_SW_CODE                         VARCHAR(2),
	LT2_SCHEDULE_CAT_CODE              INT,
	OWNER_TYPE_CODE                    VARCHAR(1),
	POPULATION_SERVED_COUNT            INT,
	POP_CAT_2_CODE                     INT,
	POP_CAT_3_CODE                     INT,
	POP_CAT_4_CODE                     INT,
	POP_CAT_5_CODE                     INT,
	POP_CAT_11_CODE                    INT,
	PRIMACY_TYPE                  

In [346]:
# Generate a MySQL LOAD DATA statement that reads every CSV field into a user
# variable and stores NULL in the table column when the field is empty.
columns = water_systems_df.columns.to_list()

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/SDWA_PUB_WATER_SYSTEMS.csv'
INTO TABLE pws
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')

# Variable list: one @variable per CSV column, in file order.
print('(' + ','.join(f'@{col}' for col in columns) + ')')

# NOTE(review): the raw CSV dates seen earlier in this notebook are MM/DD/YYYY
# text. Confirm whether the DATE columns need STR_TO_DATE(@col, '%m/%d/%Y')
# here rather than a direct assignment.
print('SET')
# Join with ',\n' so the last assignment has no trailing comma, then close the
# statement with ';' so the output is valid SQL as printed.
print(',\n'.join(f"{col} = IF(@{col} = '', NULL, @{col})" for col in columns) + ';')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/SDWA_PUB_WATER_SYSTEMS.csv'
INTO TABLE pws
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@PWS_NAME,@PRIMACY_AGENCY_CODE,@EPA_REGION,@SEASON_BEGIN_DATE,@SEASON_END_DATE,@PWS_ACTIVITY_CODE,@PWS_DEACTIVATION_DATE,@PWS_TYPE_CODE,@DBPR_SCHEDULE_CAT_CODE,@CDS_ID,@GW_SW_CODE,@LT2_SCHEDULE_CAT_CODE,@OWNER_TYPE_CODE,@POPULATION_SERVED_COUNT,@POP_CAT_2_CODE,@POP_CAT_3_CODE,@POP_CAT_4_CODE,@POP_CAT_5_CODE,@POP_CAT_11_CODE,@PRIMACY_TYPE,@PRIMARY_SOURCE_CODE,@IS_GRANT_ELIGIBLE_IND,@IS_WHOLESALER_IND,@IS_SCHOOL_OR_DAYCARE_IND,@SERVICE_CONNECTIONS_COUNT,@SUBMISSION_STATUS_CODE,@ORG_NAME,@ADMIN_NAME,@EMAIL_ADDR,@PHONE_NUMBER,@PHONE_EXT_NUMBER,@FAX_NUMBER,@ALT_PHONE_NUMBER,@ADDRESS_LINE1,@ADDRESS_LINE2,@CITY_NAME,@ZIP_CODE,@COUNTRY_CODE,@FIRST_REPORTED_DATE,@LAST_REPORTED_DATE,@STATE_CODE,@SOURCE_WATER_PROTECTION_CODE,@SOURCE_PROTECTION_BEGIN_DATE,@O