In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load every column as text (dtype=str) so no field is numerically coerced on read.
service_areas_df = pd.read_csv(
    '../data/ECHO_data_csv/SDWA_SERVICE_AREAS.csv',
    dtype=str,
)

In [3]:
# Preview the first five rows of the raw table.
service_areas_df.head(5)

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,SERVICE_AREA_TYPE_CODE,IS_PRIMARY_SERVICE_AREA_CODE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,2021Q2,MI2038708,RS,Y,,08/20/2005
1,2021Q2,MI2034214,OA,Y,,06/17/2021
2,2021Q2,IL3145870,WH,Y,,03/11/2010
3,2021Q2,IL3145888,WH,Y,,11/20/2013
4,2021Q2,IL3145896,WH,Y,,02/23/2012


In [4]:
# Report how many rows the service areas table contains.
n_service_area_records = len(service_areas_df)
print(f'Total number of records in service areas table is {n_service_area_records}')

Total number of records in service areas table is 413562


In [7]:
# Number of distinct PWSID values present in the table.
service_areas_df['PWSID'].nunique()

370728

In [8]:
# Count the (PWSID, SERVICE_AREA_TYPE_CODE) combinations that occur exactly once.
key_group_sizes = service_areas_df.groupby(['PWSID', 'SERVICE_AREA_TYPE_CODE']).size()
int(key_group_sizes.eq(1).sum())

413562

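Every (PWSID, SERVICE_AREA_TYPE_CODE) group contains exactly one row — the number of single-row groups (413562) equals the total number of records — so the column set {PWSID, SERVICE_AREA_TYPE_CODE} uniquely identifies each record.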

In [10]:
# Missing-value count for each column.
service_areas_df.isna().sum()

SUBMISSIONYEARQUARTER                0
PWSID                                0
SERVICE_AREA_TYPE_CODE               0
IS_PRIMARY_SERVICE_AREA_CODE     98360
FIRST_REPORTED_DATE             334707
LAST_REPORTED_DATE                1106
dtype: int64

In [12]:
# Inspect the raw date strings on rows where FIRST_REPORTED_DATE is present.
date_columns = ['FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']
has_first_reported = service_areas_df['FIRST_REPORTED_DATE'].notna()
service_areas_df.loc[has_first_reported, date_columns].head()

Unnamed: 0,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
14,07/30/2013,06/01/2021
31,07/30/2013,03/08/2017
32,11/17/2008,06/01/2021
38,11/17/2008,06/01/2021
40,11/17/2008,06/01/2021


In [13]:
# Parse the MM/DD/YYYY strings in both date columns into datetime values.
service_areas_df[date_columns] = service_areas_df[date_columns].apply(
    pd.to_datetime, format='%m/%d/%Y'
)

In [14]:
# Re-display rows that have a FIRST_REPORTED_DATE to confirm the datetime conversion.
first_reported_present = service_areas_df['FIRST_REPORTED_DATE'].notna()
service_areas_df.loc[first_reported_present, date_columns].head()

Unnamed: 0,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
14,2013-07-30,2021-06-01
31,2013-07-30,2017-03-08
32,2008-11-17,2021-06-01
38,2008-11-17,2021-06-01
40,2008-11-17,2021-06-01


In [15]:
# Report the earliest and latest value found in each date column.
for date_col in date_columns:
    earliest = service_areas_df[date_col].min()
    latest = service_areas_df[date_col].max()
    print('Range of {} is between {} and {}'.format(date_col, earliest, latest))

Range of FIRST_REPORTED_DATE is between 2005-10-27 00:00:00 and 2021-07-01 00:00:00
Range of LAST_REPORTED_DATE is between 1995-07-22 00:00:00 and 2021-07-01 00:00:00


In [16]:
# Number of columns in the table.
service_areas_df.shape[1]

6

### Web scraping column descriptions and data types

In [17]:
def get_html(url="https://echo.epa.gov/tools/data-downloads/sdwa-download-summary", timeout=30):
    """Download a web page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str, optional
        Page to fetch. Defaults to the ECHO SDWA download-summary page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.exceptions.RequestException
        For connection failures or when the timeout is exceeded.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an error response rather than parsing an error page and
    # hitting an unrelated lookup failure in a later cell.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


html_soup = get_html()

In [18]:
# Scrape the element/data-type table documented for this file and keep only
# the rows that describe columns present in service_areas_df.
filename = 'SDWA_SERVICE_AREAS.csv'

# Resolve the content container and its <h3> headings once, instead of
# re-running the same find_all chain for every lookup.
content_div = html_soup.find_all("div", {"class": 'field-item even'})[0]
section_headings = content_div.find_all('h3')
matching_positions = [
    i for i, heading in enumerate(section_headings) if filename in heading.getText()
]
if not matching_positions:
    # Give a clear message rather than a bare IndexError if the page layout changes.
    raise ValueError(f'No <h3> heading mentioning {filename} was found on the page')
idx = matching_positions[0]
spec_table = section_headings[idx].find_next_sibling('table')

# Header cells give the column names; non-breaking spaces are removed from all cell text.
cols = [th.get_text().replace(u'\xa0', u'') for th in spec_table.find("thead").find_all("th")]
rows = [
    [td.get_text().replace(u'\xa0', u'') for td in tr.find_all('td')]
    for tr in spec_table.find("tbody").find_all("tr")
]

column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(service_areas_df.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,SERVICE_AREA_TYPE_CODE,Char,4.0
3,IS_PRIMARY_SERVICE_AREA_CODE,Char,1.0
4,FIRST_REPORTED_DATE,Date,
5,LAST_REPORTED_DATE,Date,


In [19]:
# Build {column name: full paragraph text} from the <p> elements that contain
# exactly one <strong> tag whose text is a column of service_areas_df.
known_columns = set(service_areas_df.columns)
data_dictionary = {}
for paragraph in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p'):
    strong_tags = paragraph.find_all('strong')
    if len(strong_tags) != 1:
        continue
    element_name = strong_tags[0].getText().replace(u'\xa0', u'')
    if element_name in known_columns:
        data_dictionary[element_name] = paragraph.getText().replace(u'\xa0', u' ')

pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame(list(data_dictionary.values()), columns=['raw_desc'])

# Split each paragraph at its first '-' into name and description. `n` is
# passed by keyword because positional use is deprecated in pandas 1.x and
# rejected by pandas 2.x.
col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', n=1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,FIRST_REPORTED_DATE,The first reported date for the milestone event. The date format is MM/DD/YYYY.
1,IS_PRIMARY_SERVICE_AREA_CODE,Indicates whether the area is the primary service area served by the water system.
2,LAST_REPORTED_DATE,The last reported date for the milestone event. The date format is MM/DD/YYYY.
3,PWSID,"A unique identifying code for a public water system in SDWIS. The PWSID consists of a two-letter state or region code, followed by seven digits."
4,SERVICE_AREA_TYPE_CODE,Service area type code. For a full list of visit codes and their descriptions see SDWA_REF_CODE_VALUES.csv under VALUE_TYPE=SERVICE_AREA_CODE.
5,SUBMISSIONYEARQUARTER,The fiscal year and quarter when the event took place.


### Generating the `CREATE TABLE` SQL statement for the 'service_areas' table, which stores service area details

In [20]:
def space(n):
    """Return the run of spaces that follows a column name of length ``n``.

    The padding is sized so that every data type starts five characters past
    the end of the longest column name in ``service_areas_df``.
    """
    widest_name = max(map(len, service_areas_df.columns))
    return ' ' * (widest_name + 5 - n)


# Translate each scraped data type into its SQL column type.
sql_base_types = {'Char': 'VARCHAR', 'Date': 'DATE'}
temp_dict = {}
for element, source_type, length in zip(
    column_datatype['Element'], column_datatype['Data Type'], column_datatype['Length']
):
    sql_type = sql_base_types.get(source_type, 'INT')  # anything else maps to INT
    if sql_type == 'VARCHAR':
        if length != '':
            sql_type = sql_type + '(' + length + ')'
        else:
            # No length on the page: fall back to 255.
            sql_type = sql_type + '(255)'
    temp_dict[element] = sql_type

# Emit the statement, one column per line, with no comma after the last column.
print('CREATE TABLE SERVICE_AREAS (')
for col in service_areas_df.columns:
    line_end = '\n' if col == service_areas_df.columns[-1] else ',\n'
    print(f'\t{col}{space(len(col))}{temp_dict[col]}', end=line_end)
print(') ENGINE = InnoDB;')

CREATE TABLE SERVICE_AREAS (
	SUBMISSIONYEARQUARTER            VARCHAR(6),
	PWSID                            VARCHAR(9),
	SERVICE_AREA_TYPE_CODE           VARCHAR(4),
	IS_PRIMARY_SERVICE_AREA_CODE     VARCHAR(1),
	FIRST_REPORTED_DATE              DATE,
	LAST_REPORTED_DATE               DATE
) ENGINE = InnoDB;


In [22]:
# Write the table to the processed-data folder, ordered by PWSID and then
# SERVICE_AREA_TYPE_CODE, without the DataFrame index.
(
    service_areas_df
    .sort_values(by=['PWSID', 'SERVICE_AREA_TYPE_CODE'], ascending=[True, True])
    .to_csv('../data/processed_data/SERVICE_AREAS.csv', index=False)
)

In [23]:
# Print a MySQL LOAD DATA statement for the exported CSV. Each field is read
# into a user variable so empty strings can be stored as NULL and the date
# columns can be parsed from their YYYY-MM-DD text.
columns = service_areas_df.columns.to_list()

date_columns = ['FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/SERVICE_AREAS.csv'
INTO TABLE SERVICE_AREAS 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')
print('(' + ','.join(f'@{col}' for col in columns) + ')')
print('SET')

assignments = []
for col in columns:
    if col in date_columns:
        value_expr = f"STR_TO_DATE(@{col}, '%Y-%m-%d')"
    else:
        value_expr = f'@{col}'
    assignments.append(f"{col} = IF(@{col} = '', NULL, {value_expr})")

# Join with commas so the last assignment has no trailing comma, and close the
# statement with ';' so the printed SQL can be run as-is.
print(',\n'.join(assignments) + ';')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/SERVICE_AREAS.csv'
INTO TABLE SERVICE_AREAS 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@SERVICE_AREA_TYPE_CODE,@IS_PRIMARY_SERVICE_AREA_CODE,@FIRST_REPORTED_DATE,@LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
SERVICE_AREA_TYPE_CODE = IF(@SERVICE_AREA_TYPE_CODE = '', NULL, @SERVICE_AREA_TYPE_CODE),
IS_PRIMARY_SERVICE_AREA_CODE = IF(@IS_PRIMARY_SERVICE_AREA_CODE = '', NULL, @IS_PRIMARY_SERVICE_AREA_CODE),
FIRST_REPORTED_DATE = IF(@FIRST_REPORTED_DATE = '', NULL, STR_TO_DATE(@FIRST_REPORTED_DATE, '%Y-%m-%d')),
LAST_REPORTED_DATE = IF(@LAST_REPORTED_DATE = '', NULL, STR_TO_DATE(@LAST_REPORTED_DATE, '%Y-%m-%d')),
