In [8]:
import glob
import os
import urllib
import urllib.request

import pandas as pd
import requests

The Envirofacts database is maintained by the US Environmental Protection Agency (EPA).
Envirofacts allows you to retrieve environmental information from EPA databases on air, chemicals, facility information, hazardous waste, risk management plans, Superfund, toxic releases, and water permits. This notebook focuses on the Safe Drinking Water Information System (SDWIS), which contains information about public water systems and their violations of EPA's drinking water regulations. These regulations, and the statutes behind them, establish maximum contaminant levels, treatment techniques, and monitoring and reporting requirements to ensure that water provided to customers is safe for human consumption.

https://enviro.epa.gov

https://www.epa.gov/enviro/web-services

 

In [9]:
# SDWIS tables requested from the Envirofacts web service, in processing order.
tables = [
    'ENFORCEMENT_ACTION',
    'GEOGRAPHIC_AREA',
    'LCR_SAMPLE_RESULT',
    'LCR_SAMPLE',
    'SERVICE_AREA',
    'TREATMENT',
    'WATER_SYSTEM',
    'WATER_SYSTEM_FACILITY',
    'VIOLATION',
    'VIOLATION_ENF_ASSOC',
]

In [10]:
# Ask the Envirofacts COUNT endpoint how many rows each table in `tables` holds.
# Result: table_total_rows maps table name -> row count (int), in `tables` order.
# `requests` is imported in the notebook's top import cell.

# Without a timeout requests.get can block forever on a stalled connection.
# NOTE(review): 120 s is a guess for the largest table's COUNT query; tune as needed.
REQUEST_TIMEOUT_SECONDS = 120

table_total_rows = {}
for table in tables:
    data_url = 'https://data.epa.gov/efservice/'+table+'/COUNT/JSON'
    response = requests.get(data_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP errors here instead of failing later while parsing an
    # error page as JSON.
    response.raise_for_status()
    table_total_rows[table] = int(response.json()[0]['TOTALQUERYRESULTS'])

In [15]:
# Report the row count the web service returned for every table.
print('Total number of rows in each table (result got from webservice)')
print('-' * 63)
for name, row_count in table_total_rows.items():
    print(name, ':', row_count)

Total number of rows in each table (result got from webservice)
---------------------------------------------------------------
ENFORCEMENT_ACTION :	 2108953
GEOGRAPHIC_AREA :	 406044
LCR_SAMPLE_RESULT :	 225184
LCR_SAMPLE :	 224842
SERVICE_AREA :	 414314
TREATMENT :	 428353
WATER_SYSTEM :	 426547
WATER_SYSTEM_FACILITY :	 1456300
VIOLATION :	 2058022
VIOLATION_ENF_ASSOC :	 13788197


In [23]:
# Download every table listed in table_total_rows through the Envirofacts
# ROWS endpoint, one file per chunk of CHUNK_SIZE rows, into
# new_data/<table_name_lowercase>/<TABLE_NAME>_<start_index>.csv.
#
# Each request asks for rows start_index .. end_index - 1. The original final
# chunk already used `end_index-1`, i.e. it treated the ROWS range as inclusive
# on both ends; the full-size chunks now follow the same convention so that
# consecutive chunks no longer share their boundary row.
# NOTE(review): confirm the inclusive-range semantics against the Envirofacts
# API documentation.
#
# The response is requested in the service's EXCEL format but stored with a
# .csv extension; the merge step later in this notebook reads these files with
# pd.read_csv. NOTE(review): confirm the service returns delimited text here.
#
# `os` and `urllib.request` are imported in the notebook's top import cell.
CHUNK_SIZE = 100000

for table_name, total_rows in table_total_rows.items():
    print(table_name)
    data_dir = 'new_data/'+ table_name.lower() +'/'
    os.makedirs(data_dir, exist_ok=True)
    print('total_end_index', total_rows // CHUNK_SIZE)

    # range() yields no chunk at all for an empty table and no empty trailing
    # chunk when total_rows is an exact multiple of CHUNK_SIZE.
    for start_index in range(0, total_rows, CHUNK_SIZE):
        end_index = min(start_index + CHUNK_SIZE, total_rows)
        print(table_name, start_index, end_index)
        data_url = 'https://data.epa.gov/efservice/'+ table_name +'/ROWS/'+str(start_index)+':'+str(end_index-1)+'/EXCEL'
        chunk_path = data_dir + table_name + '_' +str(start_index)+'.csv'
        urllib.request.urlretrieve(data_url, chunk_path)
        print('completed')

ENFORCEMENT_ACTION
total_end_index 21
ENFORCEMENT_ACTION 0 100000
completed
ENFORCEMENT_ACTION 100000 200000
completed
ENFORCEMENT_ACTION 200000 300000
completed
ENFORCEMENT_ACTION 300000 400000
completed
ENFORCEMENT_ACTION 400000 500000
completed
ENFORCEMENT_ACTION 500000 600000
completed
ENFORCEMENT_ACTION 600000 700000
completed
ENFORCEMENT_ACTION 700000 800000
completed
ENFORCEMENT_ACTION 800000 900000
completed
ENFORCEMENT_ACTION 900000 1000000
completed
ENFORCEMENT_ACTION 1000000 1100000
completed
ENFORCEMENT_ACTION 1100000 1200000
completed
ENFORCEMENT_ACTION 1200000 1300000
completed
ENFORCEMENT_ACTION 1300000 1400000
completed
ENFORCEMENT_ACTION 1400000 1500000
completed
ENFORCEMENT_ACTION 1500000 1600000
completed
ENFORCEMENT_ACTION 1600000 1700000
completed
ENFORCEMENT_ACTION 1700000 1800000
completed
ENFORCEMENT_ACTION 1800000 1900000
completed
ENFORCEMENT_ACTION 1900000 2000000
completed
ENFORCEMENT_ACTION 2000000 2100000
completed
ENFORCEMENT_ACTION 2100000 2108953
new_da

completed
VIOLATION_ENF_ASSOC 11200000 11300000
completed
VIOLATION_ENF_ASSOC 11300000 11400000
completed
VIOLATION_ENF_ASSOC 11400000 11500000
completed
VIOLATION_ENF_ASSOC 11500000 11600000
completed
VIOLATION_ENF_ASSOC 11600000 11700000
completed
VIOLATION_ENF_ASSOC 11700000 11800000
completed
VIOLATION_ENF_ASSOC 11800000 11900000
completed
VIOLATION_ENF_ASSOC 11900000 12000000
completed
VIOLATION_ENF_ASSOC 12000000 12100000
completed
VIOLATION_ENF_ASSOC 12100000 12200000
completed
VIOLATION_ENF_ASSOC 12200000 12300000
completed
VIOLATION_ENF_ASSOC 12300000 12400000
completed
VIOLATION_ENF_ASSOC 12400000 12500000
completed
VIOLATION_ENF_ASSOC 12500000 12600000
completed
VIOLATION_ENF_ASSOC 12600000 12700000
completed
VIOLATION_ENF_ASSOC 12700000 12800000
completed
VIOLATION_ENF_ASSOC 12800000 12900000
completed
VIOLATION_ENF_ASSOC 12900000 13000000
completed
VIOLATION_ENF_ASSOC 13000000 13100000
completed
VIOLATION_ENF_ASSOC 13100000 13200000
completed
VIOLATION_ENF_ASSOC 13200000 1

All the tables of SDWIS are downloaded

In [7]:
# Show the CSV files currently sitting directly inside ../raw_data.
print('Following files are downloaded using webservice')
print('-' * 47)
for entry in os.listdir('../raw_data'):
    if entry.endswith('.csv'):
        print(entry)

Following files are downloaded using webservice
-----------------------------------------------
WATER_SYSTEM_COUNTY.csv
WATER_SYSTEM_FACILITY.csv
TREATMENT.csv
water_system_summary_submission_2021_q4.csv
GEOGRAPHIC_AREA.csv
LCR_SAMPLE_RESULT.csv
VIOLATION.csv
ENFORCEMENT_ACTION.csv
SERVICE_AREA.csv
WATER_SYSTEM.csv
LCR_SAMPLE.csv


In [16]:
# Collect the sub-directories of ../raw_data, ignoring Jupyter's checkpoint
# folder; the names keep the order in which os.listdir returns them.
directories = []
for entry in os.listdir('../raw_data'):
    if entry == '.ipynb_checkpoints':
        continue
    if os.path.isdir('../raw_data/'+entry):
        directories.append(entry)
directories

['water_system',
 'treatment',
 'service_area',
 'water_system_facility',
 'enforcement_action',
 'lcr_sample_result',
 'violation',
 'water_system_county',
 'geographic_area',
 'lcr_sample',
 'violation_enf_assoc']

In [19]:
# Merge the chunk CSV files of each table directory into a single
# ../raw_data/<TABLE_DIR_UPPERCASE>.csv, reading every column as str and
# stripping the 'SDWISDM_B.<TABLE>.' prefix from the column names.
#
# The original loop used `directories[:-1]`, which skips whichever directory
# os.listdir happened to return last (violation_enf_assoc in the recorded
# run). That order is not guaranteed, so the table is now excluded by name.
# NOTE(review): presumably it is skipped because of its size; confirm.
SKIPPED_TABLE_DIRS = {'violation_enf_assoc'}

for table_dir in directories:
    if table_dir in SKIPPED_TABLE_DIRS:
        continue
    print(table_dir)
    # Sorted so the row order of the merged file is reproducible between runs
    # (lexicographic by file name, not numeric by chunk start).
    all_files = sorted(glob.glob(os.path.join('../raw_data/' + table_dir, "*.csv")))
    if not all_files:
        # pd.concat raises ValueError on an empty sequence; report and move on.
        print('no csv files found in', table_dir, '- skipped')
        continue
    df = pd.concat((pd.read_csv(f, dtype=str) for f in all_files), ignore_index=True)
    column_prefix = 'SDWISDM_B.'+table_dir.upper()+'.'
    df.columns = [col.replace(column_prefix, '') for col in df.columns]
    df.to_csv('../raw_data/' +table_dir.upper()+'.csv', index=False)

water_system
treatment
service_area
water_system_facility
enforcement_action
lcr_sample_result
violation
water_system_county
geographic_area
lcr_sample


In [20]:
# List the CSV datasets now present directly inside ../raw_data/.
print('Following datasets are fetched using EPA ENVIRO WEBSERVICE')
print('-'*58)
for entry in os.listdir('../raw_data/'):
    if entry.endswith('.csv'):
        print(entry)

Following datasets are fetched using EPA ENVIRO WEBSERVICE
----------------------------------------------------------
WATER_SYSTEM_COUNTY.csv
WATER_SYSTEM_FACILITY.csv
TREATMENT.csv
water_system_summary_submission_2021_q4.csv
GEOGRAPHIC_AREA.csv
LCR_SAMPLE_RESULT.csv
VIOLATION.csv
ENFORCEMENT_ACTION.csv
SERVICE_AREA.csv
WATER_SYSTEM.csv
LCR_SAMPLE.csv
