In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
events_milestones_df = pd.read_csv('../data/ECHO_data_csv/SDWA_EVENTS_MILESTONES.csv', dtype=str)

In [5]:
events_milestones_df.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,EVENT_SCHEDULE_ID,EVENT_END_DATE,EVENT_ACTUAL_DATE,EVENT_COMMENTS_TEXT,EVENT_MILESTONE_CODE,EVENT_REASON_CODE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,2021Q2,LA1033019,COR-LA160554879,09/11/2011,05/15/2012,GWR - APP CORRECTIVE ACTION PLAN (FED)SO30-SRC...,SDFF,GW,08/16/2012,06/01/2015
1,2021Q2,LA1033019,COR-LA160564880,09/11/2011,05/15/2012,GWR - APP CORRECTIVE ACTION PLAN (FED)SO33-SRC...,SDFF,GW,08/16/2012,06/01/2015
2,2021Q2,LA1033019,COR-LA160574881,09/11/2011,05/15/2012,GWR - APP CORRECTIVE ACTION PLAN (FED)SO30-SRC...,SDFF,GW,08/16/2012,06/01/2015
3,2021Q2,LA1033019,COR-LA160584882,09/11/2011,05/15/2012,GWR - APP CORRECTIVE ACTION PLAN (FED)SO30-SRC...,SDFF,GW,08/16/2012,06/01/2015
4,2021Q2,LA1033019,COR-LA160594883,09/11/2011,05/15/2012,GWR - APP CORRECTIVE ACTION PLAN (FED)SO30-SRC...,SDFF,GW,08/16/2012,06/01/2015


In [42]:
print(f'Total number of records in events milestones table is {events_milestones_df.shape[0]}')

Total number of records in events milestones table is 178726


In [8]:
sum(events_milestones_df.groupby(['PWSID', 'EVENT_SCHEDULE_ID']).size().tolist())

178726

The column set {PWSID, EVENT_SCHEDULE_ID} uniquely identifies each record

In [9]:
events_milestones_df.isnull().sum()

SUBMISSIONYEARQUARTER        0
PWSID                        0
EVENT_SCHEDULE_ID            0
EVENT_END_DATE           87922
EVENT_ACTUAL_DATE            0
EVENT_COMMENTS_TEXT      85244
EVENT_MILESTONE_CODE         0
EVENT_REASON_CODE        14508
FIRST_REPORTED_DATE          0
LAST_REPORTED_DATE        7904
dtype: int64

In [11]:
## Seeing date formats
date_columns = ['EVENT_END_DATE', 'EVENT_ACTUAL_DATE', 'FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']
events_milestones_df[date_columns].head()

Unnamed: 0,EVENT_END_DATE,EVENT_ACTUAL_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,09/11/2011,05/15/2012,08/16/2012,06/01/2015
1,09/11/2011,05/15/2012,08/16/2012,06/01/2015
2,09/11/2011,05/15/2012,08/16/2012,06/01/2015
3,09/11/2011,05/15/2012,08/16/2012,06/01/2015
4,09/11/2011,05/15/2012,08/16/2012,06/01/2015


In [12]:
for datecol in date_columns:
    events_milestones_df[datecol] = pd.to_datetime(events_milestones_df[datecol], format='%m/%d/%Y')

In [13]:
events_milestones_df[date_columns].head()

Unnamed: 0,EVENT_END_DATE,EVENT_ACTUAL_DATE,FIRST_REPORTED_DATE,LAST_REPORTED_DATE
0,2011-09-11,2012-05-15,2012-08-16,2015-06-01
1,2011-09-11,2012-05-15,2012-08-16,2015-06-01
2,2011-09-11,2012-05-15,2012-08-16,2015-06-01
3,2011-09-11,2012-05-15,2012-08-16,2015-06-01
4,2011-09-11,2012-05-15,2012-08-16,2015-06-01


In [14]:
for datacol in date_columns:
    print('Range of {} is between {} and {}'.format(datacol, events_milestones_df[datacol].min(), events_milestones_df[datacol].max()))

Range of EVENT_END_DATE is between 1993-09-27 00:00:00 and 2022-06-30 00:00:00
Range of EVENT_ACTUAL_DATE is between 1991-07-01 00:00:00 and 2099-06-30 00:00:00
Range of FIRST_REPORTED_DATE is between 2000-12-08 00:00:00 and 2021-07-01 00:00:00
Range of LAST_REPORTED_DATE is between 2000-12-08 00:00:00 and 2021-07-01 00:00:00


In [15]:
len(events_milestones_df.columns)

10

### Web Scraping Column description and data type

In [16]:
def get_html():
    URL = "https://echo.epa.gov/tools/data-downloads/sdwa-download-summary"
    r = requests.get(URL)  
    html_soup = BeautifulSoup(r.content, 'html.parser') 
    return html_soup
html_soup = get_html()

In [26]:
cols = []
filename = 'SDWA_EVENTS_MILESTONES.csv'
idx = [i for i, item in enumerate(list(html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3'))) if filename in item.getText()][0]
for th in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[idx].find_next_sibling('table').find("thead").find_all("th"):
    cols.append(th.get_text().replace(u'\xa0', u''))
rows = []
for row in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[idx].find_next_sibling('table').find("tbody").find_all("tr"):
    r = []
    for td in row.find_all('td'):
        r.append(td.get_text().replace(u'\xa0', u''))
    rows.append(r)
column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(events_milestones_df.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,EVENT_SCHEDULE_ID,Char,
3,EVENT_END_DATE,Date,
4,EVENT_ACTUAL_DATE,Date,
5,EVENT_COMMENTS_TEXT,Char,
6,EVENT_MILESTONE_CODE,Char,4.0
7,EVENT_REASON_CODE,Char,4.0
8,FIRST_REPORTED_DATE,Date,
9,LAST_REPORTED_DATE,Date,


In [27]:
data_dictionary = {
    p.find_all('strong')[0].getText().replace(u'\xa0', u'') : p.getText().replace(u'\xa0', u' ')
    for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p') 
    if p.find_all('strong') and len(p.find_all('strong')) == 1 and p.find_all('strong')[0].getText().replace(u'\xa0', u'') in list(events_milestones_df.columns)
}
pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame([data_dictionary[col] for col in data_dictionary], columns=['raw_desc'])

col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', 1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,EVENT_ACTUAL_DATE,The date on which the milestone was conducted or achieved. Date format is MM/DD/YYY.
1,EVENT_COMMENTS_TEXT,Description of the milestone event.
2,EVENT_END_DATE,The date on which the milestone event ended. Date format is MM/DD/YYYY.
3,EVENT_MILESTONE_CODE,A four-character code identifying the event milestone.
4,EVENT_REASON_CODE,Code identifying the reason for the milestone event.
5,EVENT_SCHEDULE_ID,Identifier used to identify milestone events.
6,FIRST_REPORTED_DATE,The first reported date for the milestone event. The date format is MM/DD/YYYY.
7,LAST_REPORTED_DATE,The last reported date for the milestone event. The date format is MM/DD/YYYY.
8,PWSID,"A unique identifying code for a public water system in SDWIS. The PWSID consists of a two-letter state or region code, followed by seven digits."
9,SUBMISSIONYEARQUARTER,The fiscal year and quarter when the event took place.


In [41]:
## Finding maximum length of events comments column
events_milestones_df.EVENT_COMMENTS_TEXT.dropna().map(lambda x: len(x)).max()

1796

### Generating create SQL statement to create 'events_milestones' table to store events milestones details

In [29]:
temp_dict = {}

def space(n):
    return ' ' * (max([len(col) for col in events_milestones_df.columns]) + 5 - n)

for ind in column_datatype.index:
    col_name = column_datatype['Element'][ind]
    data_type = column_datatype['Data Type'][ind]
    length = column_datatype['Length'][ind]
    data_type = 'VARCHAR' if data_type == 'Char' else 'DATE' if data_type == 'Date' else 'INT'
    data_type = data_type + '('+ length +')' if length != '' and data_type == 'VARCHAR' else data_type + '(255)' if data_type == 'VARCHAR' else data_type
    temp_dict[col_name] = data_type
print('CREATE TABLE EVENTS_MILESTONES (')
for col in events_milestones_df.columns:
    print(f'\t{col}{space(len(col))}{temp_dict[col]}', end='')
    if col != events_milestones_df.columns[-1]:
        print(',')
    else:
        print()
print(') ENGINE = InnoDB;')

CREATE TABLE EVENTS_MILESTONES (
	SUBMISSIONYEARQUARTER     VARCHAR(6),
	PWSID                     VARCHAR(9),
	EVENT_SCHEDULE_ID         VARCHAR(255),
	EVENT_END_DATE            DATE,
	EVENT_ACTUAL_DATE         DATE,
	EVENT_COMMENTS_TEXT       VARCHAR(255),
	EVENT_MILESTONE_CODE      VARCHAR(4),
	EVENT_REASON_CODE         VARCHAR(4),
	FIRST_REPORTED_DATE       DATE,
	LAST_REPORTED_DATE        DATE
) ENGINE = InnoDB;


In [35]:
events_milestones_df.sort_values(['PWSID', 'EVENT_SCHEDULE_ID'], ascending=[True, True]).to_csv('../data/processed_data/EVENTS_MILESTONES.csv', index=False) # violations.groupby(['PWSID', 'VIOLATION_ID']).size()

In [34]:
columns = events_milestones_df.columns.to_list()

date_columns = ['EVENT_END_DATE', 'EVENT_ACTUAL_DATE', 'FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EVENTS_MILESTONES.csv'
INTO TABLE EVENTS_MILESTONES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')
print('(', end='')
for col in columns:
    print(f'@{col}', end='')
    if col != columns[-1]:
        print(',', end='')
print(')')
print('SET')
for col in columns:
    if col in date_columns:
        print(f'{col} = IF(@{col} = \'\', NULL, STR_TO_DATE(@{col}, \'%Y-%m-%d\')),')
    else:
        print(f'{col} = IF(@{col} = \'\', NULL, @{col}),')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EVENTS_MILESTONES.csv'
INTO TABLE EVENTS_MILESTONES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@EVENT_SCHEDULE_ID,@EVENT_END_DATE,@EVENT_ACTUAL_DATE,@EVENT_COMMENTS_TEXT,@EVENT_MILESTONE_CODE,@EVENT_REASON_CODE,@FIRST_REPORTED_DATE,@LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
EVENT_SCHEDULE_ID = IF(@EVENT_SCHEDULE_ID = '', NULL, @EVENT_SCHEDULE_ID),
EVENT_END_DATE = IF(@EVENT_END_DATE = '', NULL, STR_TO_DATE(@EVENT_END_DATE, '%Y-%m-%d')),
EVENT_ACTUAL_DATE = IF(@EVENT_ACTUAL_DATE = '', NULL, STR_TO_DATE(@EVENT_ACTUAL_DATE, '%Y-%m-%d')),
EVENT_COMMENTS_TEXT = IF(@EVENT_COMMENTS_TEXT = '', NULL, @EVENT_COMMENTS_TEXT),
EVENT_MILESTONE_CODE = IF(@EVENT_MILESTONE_CODE = '', NULL, @EVENT_MILESTONE_CODE),
EVENT_REASON_CODE = IF(@EVE