In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

https://www.ansi.org/about/introduction

https://www.ansi.org/news/standards-news/all-news/2017/08/epa-approves-ansi-as-a-recognized-accreditation-body-under-the-formaldehyde-emission-standards-for-c-02

https://www.epa.gov/data-standards/federal-national-and-international-data-standards

https://sor.epa.gov/sor_internet/registry/substreg/searchandretrieve/searchbylist/search.do?search=&searchCriteria.substanceList=196&searchCriteria.substanceType=-1

In [2]:
# Load the raw ANSI areas reference table; dtype=str reads every column as text.
ansi_areas_df = pd.read_csv(
    '../data/ECHO_data_csv/SDWA_REF_ANSI_AREAS.csv',
    dtype=str,
)

In [3]:
# Preview the first five rows of the table.
ansi_areas_df.head(5)

Unnamed: 0,ANSI_STATE_CODE,ANSI_ENTITY_CODE,ANSI_NAME,STATE_CODE
0,6,95,Solano,CA
1,6,97,Sonoma,CA
2,6,99,Stanislaus,CA
3,6,101,Sutter,CA
4,6,103,Tehama,CA


In [4]:
# Number of rows in the reference table.
record_count = len(ansi_areas_df)
print(f'Total number of records in ANSI AREAS table is {record_count}')

Total number of records in ANSI AREAS table is 3235


In [6]:
# Distinct entity codes across the whole table (ignores the state code).
ansi_areas_df['ANSI_ENTITY_CODE'].nunique()

330

In [9]:
# Count the (state code, entity code) pairs that occur exactly once.
key_group_sizes = ansi_areas_df.groupby(['ANSI_STATE_CODE', 'ANSI_ENTITY_CODE']).size()
int(key_group_sizes.eq(1).sum())

3235

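The column pair {ANSI_STATE_CODE, ANSI_ENTITY_CODE} uniquely identifies each record: all 3235 (state code, entity code) groups contain exactly one row, which matches the total number of records.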

In [10]:
# Missing values per column.
ansi_areas_df.isna().sum()

ANSI_STATE_CODE     0
ANSI_ENTITY_CODE    0
ANSI_NAME           0
STATE_CODE          0
dtype: int64

In [11]:
# Number of columns in the table.
ansi_areas_df.shape[1]

4

### Web scraping column descriptions and data types

In [12]:
def get_html(url="https://echo.epa.gov/tools/data-downloads/sdwa-download-summary", timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Page to fetch. Defaults to the ECHO SDWA download-summary page.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here instead of parsing an error page and hitting an opaque
    # IndexError in the scraping cells further down.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


html_soup = get_html()

In [13]:
filename = 'SDWA_REF_ANSI_AREAS.csv'

# Section headings are looked up inside the first 'field-item even' container.
content_div = html_soup.find_all("div", {"class": 'field-item even'})[0]
headings = content_div.find_all('h3')

# Position of the first heading whose text mentions the file name.
idx = [i for i, heading in enumerate(headings) if filename in heading.getText()][0]

# The table that follows that heading lists the file's columns.
spec_table = headings[idx].find_next_sibling('table')

# Header and body cells, with non-breaking spaces removed.
cols = [
    th.get_text().replace(u'\xa0', u'')
    for th in spec_table.find("thead").find_all("th")
]
rows = [
    [td.get_text().replace(u'\xa0', u'') for td in tr.find_all('td')]
    for tr in spec_table.find("tbody").find_all("tr")
]

# Keep only the elements that are actual columns of the loaded CSV.
column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype['Element'].isin(ansi_areas_df.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,ANSI_STATE_CODE,Num,2.0
1,ANSI_ENTITY_CODE,Num,3.0
2,ANSI_NAME,Char,
3,STATE_CODE,Char,2.0


In [14]:
# Build {column name: full description paragraph} from the scraped page.
# A paragraph qualifies when it holds exactly one <strong> tag whose text
# (non-breaking spaces removed) is one of the columns of ansi_areas_df.
known_columns = set(ansi_areas_df.columns)
data_dictionary = {}
for paragraph in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p'):
    strong_tags = paragraph.find_all('strong')
    if len(strong_tags) != 1:
        continue
    element = strong_tags[0].getText().replace(u'\xa0', u'')
    if element in known_columns:
        data_dictionary[element] = paragraph.getText().replace(u'\xa0', u' ')

# Show full descriptions instead of truncating long cells.
pd.set_option('display.max_colwidth', None)
col_desc = pd.DataFrame(list(data_dictionary.values()), columns=['raw_desc'])

# Split each paragraph at the first '-' into column name and description.
# `n` is passed by keyword: pandas 2.0 made every argument of
# Series.str.split except `pat` keyword-only, so the positional form
# str.split('-', 1, expand=True) raises TypeError there.
col_desc[['COLUMN', 'DESCRIPTION']] = col_desc['raw_desc'].str.split('-', n=1, expand=True)
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,ANSI_ENTITY_CODE,"American National Standards Institute (ANSI) code for the county, city, or legal statistical area. Taken with the ANSI state code, uniquely identifies a county (or other area) within the country. See the U.S. Census Bureau webpage Exit for more information about ANSI codes."
1,ANSI_NAME,Name of the area associated with the ANSI entity and state code.
2,ANSI_STATE_CODE,Standardized facility location code as issued by the American National Standards Institute (ANSI). A full description of the codes can be accessed in the SDWA_REF_ANSI_AREAS.csv. See the U.S. Census Bureau webpage Exit for more information about ANSI codes.
3,STATE_CODE,A coded value that represents the U.S. Postal Service (USPS) state abbreviation in which a legal entity is located. Must be one of the USPS Postal State Codes.


### Generating the SQL CREATE TABLE statement for the 'ANSI_AREAS' table, which stores ANSI area details

In [15]:
temp_dict = {}


def space(n):
    """Return the padding that follows a column name of length ``n``.

    The pad extends every name to the length of the longest column name in
    ``ansi_areas_df`` plus five characters, so the SQL types line up.
    """
    widest = max(len(col) for col in ansi_areas_df.columns)
    return ' ' * (widest + 5 - n)


# Translate each scraped (Element, Data Type, Length) row into a SQL type.
for col_name, source_type, length in zip(
    column_datatype['Element'],
    column_datatype['Data Type'],
    column_datatype['Length'],
):
    if source_type == 'Char':
        # Use the documented length when there is one, otherwise 255.
        sql_type = 'VARCHAR(' + length + ')' if length != '' else 'VARCHAR(255)'
    elif source_type == 'Date':
        sql_type = 'DATE'
    else:
        sql_type = 'INT'
    temp_dict[col_name] = sql_type

# Emit the statement; every column line except the last ends with a comma.
print('CREATE TABLE ANSI_AREAS (')
for col in ansi_areas_df.columns:
    line_end = ',\n' if col != ansi_areas_df.columns[-1] else '\n'
    print(f'\t{col}{space(len(col))}{temp_dict[col]}', end=line_end)
print(') ENGINE = InnoDB;')

CREATE TABLE ANSI_AREAS (
	ANSI_STATE_CODE      INT,
	ANSI_ENTITY_CODE     INT,
	ANSI_NAME            VARCHAR(255),
	STATE_CODE           VARCHAR(2)
) ENGINE = InnoDB;


In [None]:
# Write the table ordered by state code, then entity code. The columns were
# loaded with dtype=str, so the ordering is lexicographic, not numeric.
ansi_areas_sorted = ansi_areas_df.sort_values(['ANSI_STATE_CODE', 'ANSI_ENTITY_CODE'])
ansi_areas_sorted.to_csv('../data/processed_data/ANSI_AREAS.csv', index=False)

In [16]:
columns = ansi_areas_df.columns.to_list()

# Columns that must be parsed with STR_TO_DATE; this table has none.
date_columns = []

# Each CSV field is read into a user variable (@COL) so that empty strings
# can be turned into NULL in the SET clause.
user_variables = ','.join(f'@{col}' for col in columns)
assignments = [
    f"{col} = IF(@{col} = '', NULL, STR_TO_DATE(@{col}, '%Y-%m-%d'))"
    if col in date_columns
    else f"{col} = IF(@{col} = '', NULL, @{col})"
    for col in columns
]

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/ANSI_AREAS.csv'
INTO TABLE ANSI_AREAS 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')
print(f'({user_variables})')
print('SET')
# Separate assignments with commas and end the statement with ';'.
# A comma after the last assignment would make the statement invalid SQL.
print(',\n'.join(assignments) + ';')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/ANSI_AREAS.csv'
INTO TABLE ANSI_AREAS 
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@ANSI_STATE_CODE,@ANSI_ENTITY_CODE,@ANSI_NAME,@STATE_CODE)
SET
ANSI_STATE_CODE = IF(@ANSI_STATE_CODE = '', NULL, @ANSI_STATE_CODE),
ANSI_ENTITY_CODE = IF(@ANSI_ENTITY_CODE = '', NULL, @ANSI_ENTITY_CODE),
ANSI_NAME = IF(@ANSI_NAME = '', NULL, @ANSI_NAME),
STATE_CODE = IF(@STATE_CODE = '', NULL, @STATE_CODE),
