In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
ref_code_values_df = pd.read_csv('../data/ECHO_data_csv/SDWA_REF_CODE_VALUES.csv', dtype=str)

In [3]:
ref_code_values_df.head()

Unnamed: 0,VALUE_TYPE,VALUE_CODE,VALUE_DESCRIPTION
0,ACTIVITY_CODE,A,Active
1,ACTIVITY_CODE,I,Inactive
2,ACTIVITY_CODE,N,Changed from public to non-public
3,ACTIVITY_CODE,M,Merged with another system
4,ACTIVITY_CODE,P,Potential future system to be regulated


In [6]:
print(f'Total number of records in ref code values table is {ref_code_values_df.shape[0]}')

Total number of records in ref code values table is 2212


In [8]:
print(f'Total number of value types in ref code values table is {ref_code_values_df.VALUE_TYPE.nunique()}')

Total number of value types in ref code values table is 51


In [9]:
ref_code_values_df.VALUE_TYPE.unique()

array(['ACTIVITY_CODE', 'AREA_TYPE_CODE', 'AVAILABILITY_CODE',
       'COMPLIANCE_STATUS_CODE', 'COORDINATE_SOURCE_CODE',
       'REFERENCE_POINT_CODE', 'RESULT_SIGN_CODE',
       'SELLER_TREATMENT_CODE', 'SERVICE_AREA_TYPE_CODE',
       'SITE_VISIT_EVAL_TYPE_CODE', 'SOURCE_MAP_SCALE_CODE',
       'SUBMISSION_STATUS_CODE', 'TREATMENT_OBJECTIVE_CODE',
       'TREATMENT_PROCESS_CODE', 'DBPR_SCHEDULE_CAT_CODE',
       'DRINKING_WATER_RULE_CODE', 'ENFORCEMENT_ACTION_TYPE_CODE',
       'EPA_REGION', 'EVENT_REASON_CODE', 'EVENT_MILESTONE_CODE',
       'FACILITY_TYPE_CODE', 'FILTRATION_STATUS_CODE',
       'GEOMETRY_TYPE_CODE', 'GW_SW_CODE', 'HORIZ_COLL_METHOD_CODE',
       'HORIZ_REF_DATUM_CODE', 'LT2_SCHEDULE_CAT_CODE',
       'AGENCY_TYPE_CODE', 'ORIGINATOR_CODE', 'OWNER_TYPE_CODE',
       'POP_CAT_11_CODE', 'POP_CAT_2_CODE', 'POP_CAT_3_CODE',
       'POP_CAT_4_CODE', 'POP_CAT_5_CODE', 'PRIMACY_AGENCY_CODE',
       'PWS_TYPE_CODE', 'PRIMARY_SOURCE_CODE', 'CONTAMINANT_CODE',
       'TRIBAL_

In [24]:
for type_code in ref_code_values_df.VALUE_TYPE.unique():
    filename = type_code+'.csv'
    ref_code_values_df[(ref_code_values_df.VALUE_TYPE == type_code)][['VALUE_CODE', 'VALUE_DESCRIPTION']].rename(
        columns = {'VALUE_CODE': type_code, 'VALUE_DESCRIPTION': 'DESCRIPTION'}
    ).to_csv('../data/processed_data/ref_code_values_tables/' + filename, index=False)

In [29]:
ref_code_values_df.isnull().sum()

VALUE_TYPE           0
VALUE_CODE           7
VALUE_DESCRIPTION    0
dtype: int64

In [27]:
ref_code_values_df[ref_code_values_df.VALUE_CODE.isnull()]

Unnamed: 0,VALUE_TYPE,VALUE_CODE,VALUE_DESCRIPTION
5,AREA_TYPE_CODE,,Unknown Area Type
106,SOURCE_MAP_SCALE_CODE,,SCALE NOT APPLICABLE TO COLLECTION METHOD
263,ENFORCEMENT_ACTION_TYPE_CODE,,Unknown Enforcement Action Type
436,AGENCY_TYPE_CODE,,Unknown Agency Type
446,AGENCY_TYPE_CODE,,Native American Tribe
975,TRIBAL_CODE,,Unknown Tribe
2167,CONTAMINANT_CODE,,Not applicable


In [28]:
columns = ref_code_values_df.VALUE_TYPE.unique()

### Web Scraping Column description and data type

In [30]:
def get_html():
    URL = "https://echo.epa.gov/tools/data-downloads/sdwa-download-summary"
    r = requests.get(URL)  
    html_soup = BeautifulSoup(r.content, 'html.parser') 
    return html_soup
html_soup = get_html()

In [46]:
html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')

[<h3>SDWA Dashboard Dataset</h3>,
 <h3>SDWA Dataset</h3>,
 <h3>Public Water Systems (SDWA_PUB_WATER_SYSTEMS.csv)</h3>,
 <h3>Site Visits (SDWA_SITE_VISITS.csv)</h3>,
 <h3><br/>
 	Violations (SDWA_VIOLATIONS.csv)</h3>,
 <h3>Violation to Enforcement (SDWA_VIOLATION_TO_ENFORCEMENT.csv)</h3>,
 <h3>Serious Violators (SDWA_SERIOUS_VIOLATORS.csv)</h3>,
 <h3>Enforcements (SDWA_ENFORCEMENTS.csv)</h3>,
 <h3>Return to Compliance (SDWA_RETURN_TO_COMPLIANCE.csv)</h3>,
 <h3>Events Milestones (SDWA_EVENTS_MILESTONES.csv)</h3>,
 <h3>Facilities (SDWA_FACILITIES.csv)</h3>,
 <h3>LCR (SDWA_LCR_Samples.csv)</h3>,
 <h3>PN Violations (SDWA_PN_VIOLATION_ASSOC.csv)</h3>,
 <h3>Public Water System (SDWA_PUB_WATER_SYSTEMS.csv)</h3>,
 <h3>ANSI (SDWA_REF_ANSI_AREAS.csv)</h3>,
 <h3>Reference Codes (SDWA_REF_CODE_VALUES.csv)</h3>,
 <h3>Service Areas (SDWA_SERVICE_AREAS.csv)</h3>,
 <h3>Site Visits (SDWA_SITE_VISITS.csv)</h3>,
 <h3>Violations and Enforcement (SDWA_VIOLATIONS_ENFORCEMENT.csv)</h3>]

In [49]:
cols = ['Element', 'Data Type', 'Length']
# filename = 'SDWA_EVENTS_MILESTONES.csv'
# idx = [i for i, item in enumerate(list(html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3'))) if filename in item.getText()][0]
# for th in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[idx].find_next_sibling('table').find("thead").find_all("th"):
#     cols.append(th.get_text().replace(u'\xa0', u''))
count = 1
rows = []
for h3 in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('h3')[2:]:
    for row in h3.find_next_sibling('table').find("tbody").find_all("tr"):
        r = []
        _flag = False
        for td in row.find_all('td'):
            if td.get_text().replace(u'\xa0', u'') in columns:
                _flag = True
            if _flag:
                count += 1
                r.append(td.get_text().replace(u'\xa0', u''))
        rows.append(r)
column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
4,EPA_REGION,Char,3.0
5,PWS_TYPE_CODE,Char,8.0
18,PWS_TYPE_CODE,Char,8.0
31,PWS_TYPE_CODE,Char,8.0
55,PWS_TYPE_CODE,Char,8.0
67,PWS_TYPE_CODE,Char,8.0
83,PWS_TYPE_CODE,Char,8.0
97,EVENT_MILESTONE_CODE,Char,4.0
98,EVENT_REASON_CODE,Char,4.0
108,FACILITY_TYPE_CODE,Char,2.0


In [115]:
# column_datatype[column_datatype.Element.isin(column_datatype[column_datatype.duplicated(subset=['Element'])].Element.unique().tolist())]
column_datatype = column_datatype.drop_duplicates(subset=['Element'])

In [116]:
column_datatype[column_datatype['Data Type'] == 'Num']

Unnamed: 0,Element,Data Type,Length
153,DBPR_SCHEDULE_CAT_CODE,Num,1
156,LT2_SCHEDULE_CAT_CODE,Num,6
159,POP_CAT_2_CODE,Num,1
160,POP_CAT_3_CODE,Num,1
161,POP_CAT_4_CODE,Num,1
162,POP_CAT_5_CODE,Num,1
163,POP_CAT_11_CODE,Num,2
250,RULE_CODE,Num,3
251,RULE_GROUP_CODE,Num,3
252,RULE_FAMILY_CODE,Num,3


In [101]:
rows = []
for file in os.listdir('../SQL'):
    with open('../SQL/' + file, 'r') as f:
        _flag = False
        for line in f.readlines():
            if 'CREATE' in line:
                _flag = True
            if 'InnoDB' in line:
                _flag = False
            if _flag and any([col in line for col in columns]):
                rows.append(line.replace('\t', '').replace('\n', '').replace(',', '').split())
#             if any([col in line for col in columns]) and '@' not in line and 'INDEX' not in line:
#                 print(line)
temp = pd.DataFrame(rows, columns=['Column', 'DataType'])
temp[temp['DataType'] == 'INT']

Unnamed: 0,Column,DataType
24,RULE_CODE,INT
25,RULE_GROUP_CODE,INT
26,RULE_FAMILY_CODE,INT
31,DBPR_SCHEDULE_CAT_CODE,INT
33,LT2_SCHEDULE_CAT_CODE,INT
35,POP_CAT_2_CODE,INT
36,POP_CAT_3_CODE,INT
37,POP_CAT_4_CODE,INT
38,POP_CAT_5_CODE,INT
39,POP_CAT_11_CODE,INT


In [117]:
# temp[temp.Column.isin(['CONTAMINANT_CODE', 'SUBMISSION_STATUS_CODE'])]
# temp.drop_duplicates()

In [122]:
[col for col in columns if col not in column_datatype.Element.tolist()] 

['ACTIVITY_CODE',
 'AREA_TYPE_CODE',
 'COMPLIANCE_STATUS_CODE',
 'COORDINATE_SOURCE_CODE',
 'REFERENCE_POINT_CODE',
 'SITE_VISIT_EVAL_TYPE_CODE',
 'SOURCE_MAP_SCALE_CODE',
 'TREATMENT_OBJECTIVE_CODE',
 'TREATMENT_PROCESS_CODE',
 'DRINKING_WATER_RULE_CODE',
 'GEOMETRY_TYPE_CODE',
 'HORIZ_COLL_METHOD_CODE',
 'HORIZ_REF_DATUM_CODE',
 'ORIGINATOR_CODE',
 'TRIBAL_CODE',
 'VERIFICATION_METHOD_CODE',
 'VERT_COLL_METHOD_CODE',
 'VERT_REF_DATUM_CODE']

In [106]:
data_dictionary = {
    p.find_all('strong')[0].getText().replace(u'\xa0', u'') : p.getText().replace(u'\xa0', u' ')
    for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p') 
    if p.find_all('strong') and len(p.find_all('strong')) == 1 and p.find_all('strong')[0].getText().replace(u'\xa0', u'') in list(columns)
}
pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame([data_dictionary[col] for col in data_dictionary], columns=['raw_desc'])

col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', 1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,AREA_TYPE_CODE,Code identifying the area type where the facility is located.
1,AVAILABILITY_CODE,A single-character code for how the water source is utilized by a water system.
2,AGENCY_TYPE_CODE,The agency type that conducted the site visit.
3,CONTAMINANT_CODE,A code value that represents a contaminant for which a public water system has incurred a violation of a primary drinking water regulation. A full description of the codes can be accessed in the SDWA_REF_CODE_VALUES.csv.
4,DBPR_SCHEDULE_CAT_CODE,Stage 2 Disinfectant Byproducts Rule schedule category code.
5,"ENFORCEMENT_ACTION_TYPE_CODE – A designated attribute which indicates the coded type of enforcement follow up action was taken by a federal or state agency. It also indicates whether enforcement action was formal, informal, subcategory, or others. For a full list of enforcement action codes and their descriptions see SDWA_REF_CODE_VALUES.csv under VALUE_TYPE = ENFORCEMENT_ACTION_TYPE_CODE.",
6,EPA_REGION,A two-character code identifying the EPA Region in which the system is located.
7,EVENT_MILESTONE_CODE,A four-character code identifying the event milestone.
8,EVENT_REASON_CODE,Code identifying the reason for the milestone event.
9,FACILITY_TYPE_CODE,Code identifying the type of facility.


### Generating create SQL statement to create 'events_milestones' table to store events milestones details

In [144]:
temp_dict = {}

def space(n, col):
    return ' ' * (max([len(c) for c in [col, 'DESCRIPTION']]) + 2 - n)

for ind in column_datatype.index:
    col_name = column_datatype['Element'][ind]
    data_type = column_datatype['Data Type'][ind]
    length = column_datatype['Length'][ind]
    data_type = 'VARCHAR' if data_type == 'Char' else 'DATE' if data_type == 'Date' else 'INT'
    data_type = data_type + '('+ length +')' if length != '' and data_type == 'VARCHAR' else data_type + '(255)' if data_type == 'VARCHAR' else data_type
    temp_dict[col_name] = data_type

for col in columns:
    with open('../SQL/'+col+'_table.sql', 'w') as f:
        f.write(f'CREATE TABLE {col} (\n')
        if col not in temp_dict:
            temp_dict[col] = 'VARCHAR(255)'
        f.write(f'\t{col}{space(len(col), col)}VARCHAR(40),\n')
        f.write(f'\tDESCRIPTION{space(11, col)}{temp_dict[col]}\n')
        f.write(') ENGINE = InnoDB;\n')
        filename = col +'.csv'
        f.write('\n')
        f.write('LOAD DATA INFILE \'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/'+filename+'\'\nINTO TABLE EVENTS_MILESTONES\nFIELDS TERMINATED BY \',\'\nESCAPED BY \'\'\nOPTIONALLY ENCLOSED BY \'"\'\nLINES TERMINATED BY \'\\\n\'\nIGNORE 1 ROWS\n')

In [35]:
events_milestones_df.sort_values(['PWSID', 'EVENT_SCHEDULE_ID'], ascending=[True, True]).to_csv('../data/processed_data/EVENTS_MILESTONES.csv', index=False) # violations.groupby(['PWSID', 'VIOLATION_ID']).size()

In [34]:
columns = events_milestones_df.columns.to_list()

date_columns = ['EVENT_END_DATE', 'EVENT_ACTUAL_DATE', 'FIRST_REPORTED_DATE', 'LAST_REPORTED_DATE']

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EVENTS_MILESTONES.csv'
INTO TABLE EVENTS_MILESTONES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')
print('(', end='')
for col in columns:
    print(f'@{col}', end='')
    if col != columns[-1]:
        print(',', end='')
print(')')
print('SET')
for col in columns:
    if col in date_columns:
        print(f'{col} = IF(@{col} = \'\', NULL, STR_TO_DATE(@{col}, \'%Y-%m-%d\')),')
    else:
        print(f'{col} = IF(@{col} = \'\', NULL, @{col}),')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/EVENTS_MILESTONES.csv'
INTO TABLE EVENTS_MILESTONES 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@EVENT_SCHEDULE_ID,@EVENT_END_DATE,@EVENT_ACTUAL_DATE,@EVENT_COMMENTS_TEXT,@EVENT_MILESTONE_CODE,@EVENT_REASON_CODE,@FIRST_REPORTED_DATE,@LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
EVENT_SCHEDULE_ID = IF(@EVENT_SCHEDULE_ID = '', NULL, @EVENT_SCHEDULE_ID),
EVENT_END_DATE = IF(@EVENT_END_DATE = '', NULL, STR_TO_DATE(@EVENT_END_DATE, '%Y-%m-%d')),
EVENT_ACTUAL_DATE = IF(@EVENT_ACTUAL_DATE = '', NULL, STR_TO_DATE(@EVENT_ACTUAL_DATE, '%Y-%m-%d')),
EVENT_COMMENTS_TEXT = IF(@EVENT_COMMENTS_TEXT = '', NULL, @EVENT_COMMENTS_TEXT),
EVENT_MILESTONE_CODE = IF(@EVENT_MILESTONE_CODE = '', NULL, @EVENT_MILESTONE_CODE),
EVENT_REASON_CODE = IF(@EVE