In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
lcr_samples_df = pd.read_csv('../data/ECHO_data_csv/SDWA_LCR_SAMPLES.csv', dtype=str)

In [5]:
lcr_samples_df.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,SAMPLE_ID,SAMPLING_END_DATE,SAMPLING_START_DATE,RECONCILIATION_ID,SAMPLE_FIRST_REPORTED_DATE,SAMPLE_LAST_REPORTED_DATE,SAR_ID,CONTAMINANT_CODE,RESULT_SIGN_CODE,SAMPLE_MEASURE,UNIT_OF_MEASURE,SAR_FIRST_REPORTED_DATE,SAR_LAST_REPORTED_DATE
0,2021Q2,WI2650907,1,12/31/1993,07/01/1993,,04/03/1997,04/03/1997,3249839,PB90,,0.013,mg/L,,
1,2021Q2,WI2650907,2,06/30/1994,01/01/1994,,04/03/1997,04/03/1997,3249840,PB90,,0.007,mg/L,,
2,2021Q2,WI2650907,3,12/31/1996,07/01/1996,,04/03/1997,04/03/1997,3249841,PB90,,0.00495,mg/L,,
3,2021Q2,WI2650907,5,12/31/1998,07/01/1998,,02/27/1999,02/27/1999,3257671,PB90,,0.0022,mg/L,,
4,2021Q2,WI2650907,11,12/31/1995,07/01/1995,,03/04/2000,03/04/2000,3609018,PB90,,0.00725,mg/L,,


In [6]:
print(f'Total number of records in lcr_samples table is {lcr_samples_df.shape[0]}')

Total number of records in lcr_samples table is 760988


In [7]:
lcr_samples_df.SAMPLE_ID.nunique()

536061

In [8]:
sum(lcr_samples_df.groupby(['PWSID', 'SAMPLE_ID']).size().tolist())

760988

The column set {PWSID, SAMPLE_ID} uniquely identifies each record

In [9]:
lcr_samples_df.isnull().sum()

SUBMISSIONYEARQUARTER              0
PWSID                              0
SAMPLE_ID                          0
SAMPLING_END_DATE                  0
SAMPLING_START_DATE                0
RECONCILIATION_ID             760367
SAMPLE_FIRST_REPORTED_DATE      2630
SAMPLE_LAST_REPORTED_DATE      46012
SAR_ID                             0
CONTAMINANT_CODE                   0
RESULT_SIGN_CODE              755270
SAMPLE_MEASURE                     0
UNIT_OF_MEASURE                    0
SAR_FIRST_REPORTED_DATE       235229
SAR_LAST_REPORTED_DATE        277264
dtype: int64

In [11]:
## Seeing date formats
date_columns = ['SAMPLING_START_DATE', 'SAMPLING_END_DATE', 'SAMPLE_FIRST_REPORTED_DATE', 'SAMPLE_LAST_REPORTED_DATE', 'SAR_FIRST_REPORTED_DATE', 'SAR_LAST_REPORTED_DATE']
lcr_samples_df[~lcr_samples_df['SAR_FIRST_REPORTED_DATE'].isnull()][date_columns].head()

Unnamed: 0,SAMPLING_START_DATE,SAMPLING_END_DATE,SAMPLE_FIRST_REPORTED_DATE,SAMPLE_LAST_REPORTED_DATE,SAR_FIRST_REPORTED_DATE,SAR_LAST_REPORTED_DATE
7,01/01/2006,12/31/2006,02/26/2007,11/13/2007,02/26/2007,11/13/2007
8,01/01/2003,12/31/2003,03/02/2006,11/21/2006,03/02/2006,11/21/2006
9,01/01/2009,12/31/2009,02/23/2010,11/16/2010,02/23/2010,11/16/2010
17,01/01/2004,06/30/2004,03/02/2006,02/26/2007,03/02/2006,02/26/2007
18,07/01/2004,12/31/2004,03/02/2006,02/26/2007,03/02/2006,02/26/2007


In [12]:
for datecol in date_columns:
    lcr_samples_df[datecol] = pd.to_datetime(lcr_samples_df[datecol], format='%m/%d/%Y')

In [13]:
lcr_samples_df[~lcr_samples_df['SAR_FIRST_REPORTED_DATE'].isnull()][date_columns].head()

Unnamed: 0,SAMPLING_START_DATE,SAMPLING_END_DATE,SAMPLE_FIRST_REPORTED_DATE,SAMPLE_LAST_REPORTED_DATE,SAR_FIRST_REPORTED_DATE,SAR_LAST_REPORTED_DATE
7,2006-01-01,2006-12-31,2007-02-26,2007-11-13,2007-02-26,2007-11-13
8,2003-01-01,2003-12-31,2006-03-02,2006-11-21,2006-03-02,2006-11-21
9,2009-01-01,2009-12-31,2010-02-23,2010-11-16,2010-02-23,2010-11-16
17,2004-01-01,2004-06-30,2006-03-02,2007-02-26,2006-03-02,2007-02-26
18,2004-07-01,2004-12-31,2006-03-02,2007-02-26,2006-03-02,2007-02-26


In [14]:
for datacol in date_columns:
    print('Range of {} is between {} and {}'.format(datacol, lcr_samples_df[datacol].min(), lcr_samples_df[datacol].max()))

Range of SAMPLING_START_DATE is between 1991-07-01 00:00:00 and 2021-01-01 00:00:00
Range of SAMPLING_END_DATE is between 1992-01-01 00:00:00 and 2028-12-31 00:00:00
Range of SAMPLE_FIRST_REPORTED_DATE is between 1992-08-22 00:00:00 and 2021-07-01 00:00:00
Range of SAMPLE_LAST_REPORTED_DATE is between 1993-05-20 00:00:00 and 2021-07-01 00:00:00
Range of SAR_FIRST_REPORTED_DATE is between 2006-02-28 00:00:00 and 2021-07-01 00:00:00
Range of SAR_LAST_REPORTED_DATE is between 2006-02-28 00:00:00 and 2021-07-01 00:00:00


In [15]:
len(lcr_samples_df.columns)

15

### Web Scraping Column description and data type

In [16]:
def get_html():
    URL = "https://echo.epa.gov/tools/data-downloads/sdwa-download-summary"
    r = requests.get(URL)  
    html_soup = BeautifulSoup(r.content, 'html.parser') 
    return html_soup
html_soup = get_html()

In [20]:
cols = []
filename = 'SDWA_LCR_Samples.csv'
idx = [i for i, item in enumerate(list(html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3'))) if filename in item.getText()][0]
for th in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[idx].find_next_sibling('table').find("thead").find_all("th"):
    cols.append(th.get_text().replace(u'\xa0', u''))
rows = []
for row in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[idx].find_next_sibling('table').find("tbody").find_all("tr"):
    r = []
    for td in row.find_all('td'):
        r.append(td.get_text().replace(u'\xa0', u''))
    rows.append(r)
column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(lcr_samples_df.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,SAMPLE_ID,Char,
3,SAMPLING_END_DATE,Date,
4,SAMPLING_START_DATE,Date,
5,RECONCILIATION_ID,Char,40.0
6,SAMPLE_FIRST_REPORTED_DATE,Date,
7,SAMPLE_LAST_REPORTED_DATE,Date,
8,SAR_ID,Num,9.0
9,CONTAMINANT_CODE,Char,4.0


In [21]:
data_dictionary = {
    p.find_all('strong')[0].getText().replace(u'\xa0', u'') : p.getText().replace(u'\xa0', u' ')
    for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p') 
    if p.find_all('strong') and len(p.find_all('strong')) == 1 and p.find_all('strong')[0].getText().replace(u'\xa0', u'') in list(lcr_samples_df.columns)
}
pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame([data_dictionary[col] for col in data_dictionary], columns=['raw_desc'])

col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', 1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,CONTAMINANT_CODE,A code value that represents a contaminant for which a public water system has incurred a violation of a primary drinking water regulation. A full description of the codes can be accessed in the SDWA_REF_CODE_VALUES.csv.
1,PWSID,"A unique identifying code for a public water system in SDWIS. The PWSID consists of a two-letter state or region code, followed by seven digits."
2,RECONCILIATION_ID,An identifier used for reconciliation with the state data system or LAB assigned identifiers.
3,RESULT_SIGN_CODE,Indicates if the sample result was below the minimum detection limit or equal to the value reported.
4,SAMPLE_FIRST_REPORTED_DATE,Date that the sample was first reported. The date format is MM/DD/YYYY.
5,SAMPLE_ID,Identifier used to identify the sample.
6,SAMPLE_LAST_REPORTED_DATE,The most recent date of reporting. The date of format is MM/DD/YYYY.
7,SAMPLE_MEASURE,The measured value of the contaminant as reported from the sampling analysis.
8,SAMPLING_END_DATE,Date of the last day of the monitoring period in which 90th percentile data for lead or copper was acquired (MM/DD/YYYY format).
9,SAMPLING_START_DATE,Date of the first day of the monitoring period in which 90th percentile data for lead or copper was acquired (MM/DD/YYYY format).


### Generating create SQL statement to create 'lcr_samples' table to store lcr samples details

In [23]:
temp_dict = {}

def space(n):
    return ' ' * (max([len(col) for col in lcr_samples_df.columns]) + 5 - n)

for ind in column_datatype.index:
    col_name = column_datatype['Element'][ind]
    data_type = column_datatype['Data Type'][ind]
    length = column_datatype['Length'][ind]
    data_type = 'VARCHAR' if data_type == 'Char' else 'DATE' if data_type == 'Date' else 'INT'
    data_type = data_type + '('+ length +')' if length != '' and data_type == 'VARCHAR' else data_type + '(255)' if data_type == 'VARCHAR' else data_type
    temp_dict[col_name] = data_type
print('CREATE TABLE LCR_SAMPLES (')
for col in lcr_samples_df.columns:
    print(f'\t{col}{space(len(col))}{temp_dict[col]}', end='')
    if col != lcr_samples_df.columns[-1]:
        print(',')
    else:
        print()
print(') ENGINE = InnoDB;')

CREATE TABLE LCR_SAMPLES (
	SUBMISSIONYEARQUARTER          VARCHAR(6),
	PWSID                          VARCHAR(9),
	SAMPLE_ID                      VARCHAR(255),
	SAMPLING_END_DATE              DATE,
	SAMPLING_START_DATE            DATE,
	RECONCILIATION_ID              VARCHAR(40),
	SAMPLE_FIRST_REPORTED_DATE     DATE,
	SAMPLE_LAST_REPORTED_DATE      DATE,
	SAR_ID                         INT,
	CONTAMINANT_CODE               VARCHAR(4),
	RESULT_SIGN_CODE               VARCHAR(2),
	SAMPLE_MEASURE                 INT,
	UNIT_OF_MEASURE                VARCHAR(4),
	SAR_FIRST_REPORTED_DATE        DATE,
	SAR_LAST_REPORTED_DATE         DATE
) ENGINE = InnoDB;


In [26]:
lcr_samples_df.sort_values(['PWSID', 'SAMPLE_ID'], ascending=[True, True]).to_csv('../data/processed_data/LCR_SAMPLES.csv', index=False) # violations.groupby(['PWSID', 'VIOLATION_ID']).size()

In [27]:
columns = lcr_samples_df.columns.to_list()

date_columns = ['SAMPLING_START_DATE', 'SAMPLING_END_DATE', 'SAMPLE_FIRST_REPORTED_DATE', 'SAMPLE_LAST_REPORTED_DATE', 'SAR_FIRST_REPORTED_DATE', 'SAR_LAST_REPORTED_DATE']

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/LCR_SAMPLES.csv'
INTO TABLE LCR_SAMPLES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')
print('(', end='')
for col in columns:
    print(f'@{col}', end='')
    if col != columns[-1]:
        print(',', end='')
print(')')
print('SET')
for col in columns:
    if col in date_columns:
        print(f'{col} = IF(@{col} = \'\', NULL, STR_TO_DATE(@{col}, \'%Y-%m-%d\')),')
    else:
        print(f'{col} = IF(@{col} = \'\', NULL, @{col}),')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/LCR_SAMPLES.csv'
INTO TABLE LCR_SAMPLES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@SAMPLE_ID,@SAMPLING_END_DATE,@SAMPLING_START_DATE,@RECONCILIATION_ID,@SAMPLE_FIRST_REPORTED_DATE,@SAMPLE_LAST_REPORTED_DATE,@SAR_ID,@CONTAMINANT_CODE,@RESULT_SIGN_CODE,@SAMPLE_MEASURE,@UNIT_OF_MEASURE,@SAR_FIRST_REPORTED_DATE,@SAR_LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
SAMPLE_ID = IF(@SAMPLE_ID = '', NULL, @SAMPLE_ID),
SAMPLING_END_DATE = IF(@SAMPLING_END_DATE = '', NULL, STR_TO_DATE(@SAMPLING_END_DATE, '%Y-%m-%d')),
SAMPLING_START_DATE = IF(@SAMPLING_START_DATE = '', NULL, STR_TO_DATE(@SAMPLING_START_DATE, '%Y-%m-%d')),
RECONCILIATION_ID = IF(@RECONCILIATION_ID = '', NULL, @RECONCILIATION_ID),
SAMPLE_FIRST_REPORTED_DATE = IF(@SAMPLE_FIRS