# TDI Capstone Project, Part 0: Download data

In [45]:
import os, stat
import urllib.request
import zipfile
import shutil
import requests
import pandas as pd

from external_variables import data_path

In [39]:
def unzip_file(path_to_file,destination,new_name=None):
    """Extract every member of a zip archive into a directory.

    PARAMETERS:
    **********

    INPUTS:
    path_to_file = file_path/file_name of the zip file.
    destination = destination directory for the zip file's contents.
                  It does not have to exist beforehand; extraction creates it.
    new_name = not used. Kept (now optional) so existing three-argument
               calls keep working.

    OUTPUTS:
    None
    """

    # The context manager closes the archive even when extraction fails
    # part-way through (e.g. corrupt archive, full disk).
    with zipfile.ZipFile(path_to_file, 'r') as zip_ref:
        zip_ref.extractall(destination)

    return

def make_directory(dir_path): 
    """Create dir_path, including any missing parent directories.

    INPUTS:
    dir_path = path of the directory to create.

    OUTPUTS:
    None. If the directory is already there a notice is printed instead.

    RAISES:
    FileExistsError if dir_path exists but is not a directory.
    """
    try: 
        # makedirs (rather than mkdir) so a missing parent folder does not
        # abort the download step.
        os.makedirs(dir_path)
    except FileExistsError: 
        # Only an existing *directory* is harmless; a regular file at this
        # path would make every later write into it fail, so surface that.
        if not os.path.isdir(dir_path):
            raise
        print('directory already exists: '+dir_path)
    return None

def download_file_http(url,final_dest,final_name):  
    """Download url over HTTP(S) and save it as final_dest/final_name.

    INPUTS:
    url = address of the file to download.
    final_dest = existing directory to save into.
    final_name = file name to save under.

    OUTPUTS:
    (final_dest, final_name), unchanged, so the caller can rebuild the path.

    RAISES:
    requests.HTTPError if the server answers with a 4xx/5xx status.
    """
    r = requests.get(url)
    # Fail before touching the disk: otherwise an HTML error page would be
    # saved under final_name and only blow up later when it is unzipped.
    r.raise_for_status()
    with open(os.path.join(final_dest,final_name), 'wb') as f:
        f.write(r.content)
    return final_dest,final_name

def download_file_ftp(url,final_dest,final_name):
    """Fetch url with urllib.request.urlretrieve into final_dest/final_name.

    INPUTS:
    url = address of the file to download.
    final_dest = directory to save into.
    final_name = file name to save under.

    OUTPUTS:
    (final_dest, final_name), exactly as passed in.
    """
    local_path = os.path.join(final_dest, final_name)
    urllib.request.urlretrieve(url, local_path)
    return (final_dest, final_name)

## Download ICIS-Air data

In [51]:
# Folder that will hold the ICIS-Air files, under the project's data_path.
icis_path = os.path.join(data_path,'ICIS-Air')
make_directory(icis_path)

# Download the ICIS-Air archive into that folder, extract it in place,
# then delete the archive so only the extracted files remain.
url = 'https://echo.epa.gov/files/echodownloads/ICIS-AIR_downloads.zip'
file_path,file_name = download_file_http(url,icis_path,'ICIS-Air.zip')
# NOTE: the third argument is accepted by unzip_file but has no effect.
unzip_file(os.path.join(file_path,file_name),file_path,'ICIS-Air')
os.remove(os.path.join(file_path,file_name))

directory already exists: ./data/ICIS-Air


In [70]:
# Unit test to check filenames. 
def TEST_ICISAir_filenames(icis_path): 
    """Assert that every expected ICIS-Air CSV is present in icis_path.

    INPUTS:
    icis_path = directory to inspect.

    OUTPUTS:
    None. Raises AssertionError naming the absent files, in the order
    they are listed below.
    """
    expected = ('ICIS-AIR_FACILITIES.csv','ICIS-AIR_PROGRAMS.csv',
                'ICIS-AIR_FCES_PCES.csv','ICIS-AIR_PROGRAM_SUBPARTS.csv',
                'ICIS-AIR_FORMAL_ACTIONS.csv','ICIS-AIR_STACK_TESTS.csv',
                'ICIS-AIR_INFORMAL_ACTIONS.csv','ICIS-AIR_TITLEV_CERTS.csv',
                'ICIS-AIR_POLLUTANTS.csv','ICIS-AIR_VIOLATION_HISTORY.csv')
    on_disk = os.listdir(icis_path)

    absent = []
    for name in expected:
        if name not in on_disk:
            absent.append(name)

    assert not absent, "The following ICIS-Air files are missing: "+", ".join(absent)

# Run the check now; raises AssertionError if any expected file is absent.
TEST_ICISAir_filenames(icis_path)

In [93]:
def TEST_ICISAir_columns(): 
    """Assert that three ICIS-Air CSVs contain the columns listed below.

    Each file is looked up under data_path/ICIS-Air and only its first ten
    rows are read. Files are checked in the order listed; the first file
    with absent columns raises AssertionError naming them.
    """
    required_by_file = [
        ('ICIS-AIR_FCES_PCES.csv',
         ['PGM_SYS_ID', 'ACTIVITY_ID', 'STATE_EPA_FLAG',
          'ACTIVITY_TYPE_CODE', 'ACTIVITY_TYPE_DESC',
          'COMP_MONITOR_TYPE_CODE', 'COMP_MONITOR_TYPE_DESC',
          'ACTUAL_END_DATE', 'PROGRAM_CODES']),
        ('ICIS-AIR_VIOLATION_HISTORY.csv',
         ['PGM_SYS_ID', 'ACTIVITY_ID', 'AGENCY_TYPE_DESC', 'STATE_CODE',
          'AIR_LCON_CODE', 'COMP_DETERMINATION_UID', 'ENF_RESPONSE_POLICY_CODE',
          'PROGRAM_CODES', 'PROGRAM_DESCS', 'POLLUTANT_CODES', 'POLLUTANT_DESCS',
          'EARLIEST_FRV_DETERM_DATE', 'HPV_DAYZERO_DATE', 'HPV_RESOLVED_DATE']),
        ('ICIS-AIR_FACILITIES.csv',
         ['PGM_SYS_ID', 'REGISTRY_ID', 'FACILITY_NAME', 'STREET_ADDRESS', 'CITY',
          'COUNTY_NAME', 'STATE', 'ZIP_CODE', 'EPA_REGION', 'SIC_CODES',
          'NAICS_CODES', 'FACILITY_TYPE_CODE', 'AIR_POLLUTANT_CLASS_CODE',
          'AIR_POLLUTANT_CLASS_DESC', 'AIR_OPERATING_STATUS_CODE',
          'AIR_OPERATING_STATUS_DESC', 'CURRENT_HPV', 'LOCAL_CONTROL_REGION_CODE',
          'LOCAL_CONTROL_REGION_NAME']),
    ]

    for csv_name, required in required_by_file:
        # A ten-row sample is enough to get the header.
        sample = pd.read_csv(os.path.join(data_path,'ICIS-Air',csv_name),nrows=10)
        absent = [col for col in required if col not in sample.columns]
        assert not absent, ("The following columns are missing in " + csv_name + ": "
                            + ', '.join(absent))
    
# Run the check now; raises AssertionError if an expected column is absent.
TEST_ICISAir_columns()

In [None]:
# Files whose columns are checked in this notebook:
# ICIS-AIR_FCES_PCES.csv
# ICIS-AIR_VIOLATION_HISTORY.csv
# ICIS-AIR_FACILITIES.csv

# FRS_PROGRAM_LINKS.csv

# 2014v2facilities.csv
# 2011neiv2_facility.csv
# 2008neiv3_facility.csv

# ECHO_EXPORTER.csv

## Download ECHO data

In [52]:
# Folder that will hold the ECHO files, under the project's data_path.
echo_path = os.path.join(data_path,'ECHO')
make_directory(echo_path)

# Download the ECHO exporter archive, extract it in place, then delete
# the archive so only the extracted files remain.
url = 'https://echo.epa.gov/files/echodownloads/echo_exporter.zip'
file_path,file_name = download_file_http(url,echo_path,'ECHO.zip')
# NOTE: the third argument is accepted by unzip_file but has no effect.
unzip_file(os.path.join(file_path,file_name),file_path,'ECHO')
os.remove(os.path.join(file_path,file_name))

# Also save the accompanying spreadsheet next to the data (kept as-is, not
# unzipped). Presumably the column reference for the export -- confirm.
url = 'https://echo.epa.gov/system/files/echo_exporter_columns_02282019.xlsx'
file_path,file_name = download_file_http(url,echo_path,'echo_exporter_columns_02282019.xlsx')

In [77]:
def TEST_ECHO_filenames(echo_path): 
    """Assert that both expected ECHO files are present in echo_path.

    INPUTS:
    echo_path = directory to inspect.

    OUTPUTS:
    None. Raises AssertionError naming the absent files.
    """
    expected = ('echo_exporter_columns_02282019.xlsx', 'ECHO_EXPORTER.csv')
    on_disk = os.listdir(echo_path)

    absent = []
    for name in expected:
        if name not in on_disk:
            absent.append(name)

    assert not absent, "The following ECHO files are missing: "+", ".join(absent)
    
# Run the check now; raises AssertionError if any expected file is absent.
TEST_ECHO_filenames(echo_path)

In [96]:
def TEST_ECHO_columns(): 
    """Assert that ECHO_EXPORTER.csv contains every column listed below.

    The file is looked up under data_path/ECHO and only its first ten rows
    are read. Raises AssertionError naming the absent columns.
    """
    required = [
        'REGISTRY_ID','FAC_NAME','FAC_STREET','FAC_CITY','FAC_STATE',
        'FAC_ZIP','FAC_COUNTY','FAC_FIPS_CODE','FAC_EPA_REGION',
        'FAC_INDIAN_CNTRY_FLG','FAC_FEDERAL_FLG','FAC_US_MEX_BORDER_FLG',
        'FAC_CHESAPEAKE_BAY_FLG','FAC_NAA_FLAG','FAC_LAT','FAC_LONG',
        'FAC_MAP_ICON','FAC_COLLECTION_METHOD','FAC_REFERENCE_POINT',
        'FAC_ACCURACY_METERS','FAC_DERIVED_TRIBES','FAC_DERIVED_HUC',
        'FAC_DERIVED_WBD','FAC_DERIVED_STCTY_FIPS','FAC_DERIVED_ZIP',
        'FAC_DERIVED_CD113','FAC_DERIVED_CB2010','FAC_PERCENT_MINORITY',
        'FAC_POP_DEN','FAC_MAJOR_FLAG','FAC_ACTIVE_FLAG','FAC_MYRTK_UNIVERSE',
        'FAC_INSPECTION_COUNT','FAC_DATE_LAST_INSPECTION',
        'FAC_DAYS_LAST_INSPECTION','FAC_INFORMAL_COUNT',
        'FAC_DATE_LAST_INFORMAL_ACTION','FAC_FORMAL_ACTION_COUNT',
        'FAC_DATE_LAST_FORMAL_ACTION','FAC_TOTAL_PENALTIES',
        'FAC_PENALTY_COUNT','FAC_DATE_LAST_PENALTY','FAC_LAST_PENALTY_AMT',
        'FAC_QTRS_WITH_NC','FAC_PROGRAMS_WITH_SNC','FAC_COMPLIANCE_STATUS',
        'FAC_SNC_FLG','FAC_3YR_COMPLIANCE_HISTORY','AIR_FLAG','NPDES_FLAG',
        'SDWIS_FLAG','RCRA_FLAG','TRI_FLAG','GHG_FLAG','AIR_IDS',
        'CAA_PERMIT_TYPES','CAA_NAICS','CAA_SICS','CAA_EVALUATION_COUNT',
        'CAA_DAYS_LAST_EVALUATION','CAA_INFORMAL_COUNT',
        'CAA_FORMAL_ACTION_COUNT','CAA_DATE_LAST_FORMAL_ACTION',
        'CAA_PENALTIES','CAA_LAST_PENALTY_DATE','CAA_LAST_PENALTY_AMT',
        'CAA_QTRS_WITH_NC','CAA_COMPLIANCE_STATUS','CAA_HPV_FLAG',
        'CAA_3YR_COMPL_QTRS_HISTORY','NPDES_IDS','CWA_PERMIT_TYPES',
        'CWA_COMPLIANCE_TRACKING','CWA_NAICS','CWA_SICS',
        'CWA_INSPECTION_COUNT','CWA_DAYS_LAST_INSPECTION',
        'CWA_INFORMAL_COUNT','CWA_FORMAL_ACTION_COUNT',
        'CWA_DATE_LAST_FORMAL_ACTION','CWA_PENALTIES',
        'CWA_LAST_PENALTY_DATE','CWA_LAST_PENALTY_AMT','CWA_QTRS_WITH_NC',
        'CWA_COMPLIANCE_STATUS','CWA_SNC_FLAG','CWA_13QTRS_COMPL_HISTORY',
        'CWA_13QTRS_EFFLNT_EXCEEDANCES','CWA_3_YR_QNCR_CODES','RCRA_IDS',
        'RCRA_PERMIT_TYPES','RCRA_NAICS','RCRA_INSPECTION_COUNT',
        'RCRA_DAYS_LAST_EVALUATION','RCRA_INFORMAL_COUNT',
        'RCRA_FORMAL_ACTION_COUNT','RCRA_DATE_LAST_FORMAL_ACTION',
        'RCRA_PENALTIES','RCRA_LAST_PENALTY_DATE','RCRA_LAST_PENALTY_AMT',
        'RCRA_QTRS_WITH_NC','RCRA_COMPLIANCE_STATUS','RCRA_SNC_FLAG',
        'RCRA_3YR_COMPL_QTRS_HISTORY','SDWA_IDS','SDWA_SYSTEM_TYPES',
        'SDWA_INFORMAL_COUNT','SDWA_FORMAL_ACTION_COUNT','SDWA_COMPLIANCE_STATUS',
        'SDWA_SNC_FLAG','TRI_IDS','TRI_RELEASES_TRANSFERS',
        'TRI_ON_SITE_RELEASES','TRI_OFF_SITE_TRANSFERS','TRI_REPORTER_IN_PAST',
        'FEC_CASE_IDS','FEC_NUMBER_OF_CASES','FEC_LAST_CASE_DATE',
        'FEC_TOTAL_PENALTIES','GHG_IDS','GHG_CO2_RELEASES','DFR_URL',
        'FAC_SIC_CODES','FAC_NAICS_CODES','FAC_DATE_LAST_INSPECTION_EPA',
        'FAC_DATE_LAST_INSPECTION_STATE','FAC_DATE_LAST_FORMAL_ACT_EPA',
        'FAC_DATE_LAST_FORMAL_ACT_ST','FAC_DATE_LAST_INFORMAL_ACT_EPA',
        'FAC_DATE_LAST_INFORMAL_ACT_ST','FAC_FEDERAL_AGENCY','TRI_REPORTER',
        'FAC_IMP_WATER_FLG','EJSCREEN_FLAG_US',
    ]

    # A ten-row sample is enough to get the header.
    sample = pd.read_csv(os.path.join(data_path,'ECHO','ECHO_EXPORTER.csv'),nrows=10)
    header = sample.columns

    absent = []
    for col in required:
        if col not in header:
            absent.append(col)

    assert not absent, ("The following columns are missing in ECHO_EXPORTER.csv: "
                        + ', '.join(absent))

# Run the check now; raises AssertionError if an expected column is absent.
TEST_ECHO_columns()

## Download FRS data

In [26]:
# Folder that will hold the FRS files, under the project's data_path.
frs_path = os.path.join(data_path,'FRS')
make_directory(frs_path)

# Download the FRS archive, extract it in place, then delete the archive
# so only the extracted files remain.
url = 'https://echo.epa.gov/files/echodownloads/frs_downloads.zip'
file_path,file_name = download_file_http(url,frs_path,'FRS.zip')
# NOTE: the third argument is accepted by unzip_file but has no effect.
unzip_file(os.path.join(file_path,file_name),file_path,'FRS')
os.remove(os.path.join(file_path,file_name))

In [76]:
def TEST_FRS_filenames(frs_path): 
    """Assert that every expected FRS CSV is present in frs_path.

    INPUTS:
    frs_path = directory to inspect.

    OUTPUTS:
    None. Raises AssertionError naming the absent files, in the order
    they are listed below.
    """
    expected = ('FRS_NAICS_CODES.csv','FRS_FACILITIES.csv',
                'FRS_SIC_CODES.csv','FRS_PROGRAM_LINKS.csv')
    on_disk = os.listdir(frs_path)

    absent = []
    for name in expected:
        if name not in on_disk:
            absent.append(name)

    assert not absent, "The following FRS files are missing: "+", ".join(absent)

# Run the check now; raises AssertionError if any expected file is absent.
TEST_FRS_filenames(frs_path)

In [105]:
def TEST_FRS_columns(): 
    """Assert that FRS_PROGRAM_LINKS.csv contains every column listed below.

    The file is looked up under data_path/FRS and only its first ten rows
    are read. Raises AssertionError naming the absent columns.
    """
    required = ('PGM_SYS_ACRNM', 'PGM_SYS_ID', 'REGISTRY_ID',
                'PRIMARY_NAME', 'LOCATION_ADDRESS', 'SUPPLEMENTAL_LOCATION',
                'CITY_NAME', 'COUNTY_NAME','FIPS_CODE', 'STATE_CODE',
                'STATE_NAME', 'COUNTRY_NAME', 'POSTAL_CODE')

    # A ten-row sample is enough to get the header.
    sample = pd.read_csv(os.path.join(data_path,'FRS','FRS_PROGRAM_LINKS.csv'),nrows=10)
    header = sample.columns

    absent = []
    for col in required:
        if col not in header:
            absent.append(col)

    assert not absent, ("The following columns are missing in FRS_PROGRAM_LINKS.csv: "
                        + ', '.join(absent))

# Run the check now; raises AssertionError if an expected column is absent.
TEST_FRS_columns()

## Download NEI data

In [40]:
# Folder that will hold the NEI files, under the project's data_path.
nei_path = os.path.join(data_path,'NEI')
make_directory(nei_path)

# Each year follows the same steps: download the archive as NEI.zip,
# extract it in place, then delete the archive.

# For 2008
url = 'ftp://newftp.epa.gov/air/nei/2008/data_summaries/2008neiv3_facility.zip'
file_path,file_name = download_file_ftp(url,nei_path,'NEI.zip')
unzip_file(os.path.join(file_path,file_name),file_path,'NEI')
os.remove(os.path.join(file_path,file_name))
# The 2008 CSV is expected inside a '2008neiv3_facility' sub-folder; move it
# up so all three years sit side by side in nei_path.
os.rename(os.path.join(file_path,'2008neiv3_facility','2008neiv3_facility.csv'),
          os.path.join(file_path,'2008neiv3_facility.csv'))
# os.remove() only deletes files and raises on a directory, so the left-over
# folder is removed with shutil.rmtree instead.
shutil.rmtree(os.path.join(file_path,'2008neiv3_facility'))

# For 2011
url = 'ftp://newftp.epa.gov/air/nei/2011/data_summaries/2011v2/2011neiv2_facility.zip'
file_path,file_name = download_file_ftp(url,nei_path,'NEI.zip')
unzip_file(os.path.join(file_path,file_name),file_path,'NEI')
os.remove(os.path.join(file_path,file_name))

# For 2014
url = 'ftp://newftp.epa.gov/air/nei/2014/data_summaries/2014v2/2014neiv2_facility.zip'
file_path,file_name = download_file_ftp(url,nei_path,'NEI.zip')
unzip_file(os.path.join(file_path,file_name),file_path,'NEI')
os.remove(os.path.join(file_path,file_name))

directory already exists: ./data/NEI


In [75]:
def TEST_NEI_filenames(nei_path): 
    """Assert that the three expected NEI CSVs are present in nei_path.

    INPUTS:
    nei_path = directory to inspect.

    OUTPUTS:
    None. Raises AssertionError naming the absent files, in the order
    they are listed below.
    """
    expected = ('2014v2facilities.csv', '2011neiv2_facility.csv',
                '2008neiv3_facility.csv')
    on_disk = os.listdir(nei_path)

    absent = []
    for name in expected:
        if name not in on_disk:
            absent.append(name)

    assert not absent, "The following NEI files are missing: "+", ".join(absent)
    
# Run the check now; raises AssertionError if any expected file is absent.
TEST_NEI_filenames(nei_path)

In [104]:
def TEST_NEI_columns(): 
    """Assert that the three NEI CSVs contain the columns listed below.

    Each file is looked up under data_path/NEI and only its first ten rows
    are read. Files are checked in the order listed; the first file with
    absent columns raises AssertionError naming them.
    """
    required_by_file = [
        ('2014v2facilities.csv',
         ['eis_facility_site_id', 'program_system_code',
          'alt_agency_id', 'region_cd', 'st_usps_cd',
          'county_name', 'state_and_county_fips_code',
          'tribal_name', 'facility_site_name', 'naics_cd',
          'naics_description', 'facility_source_type',
          'latitude_msr', 'longitude_msr', 'location_address_text',
          'locality', 'addr_state_cd', 'address_postal_code',
          'emissions_operating_type', 'pollutant_cd', 'pollutant_desc',
          'total_emissions', 'uom', 'fips_state_code', 'company_name',
          'reporting_period']),
        ('2011neiv2_facility.csv',
         ['eis_facility_site_id', 'program_system_cd',
          'alt_agency_id', 'region_cd', 'st_usps_cd', 'county_name',
          'state_and_county_fips_code', 'tribal_name',
          'facility_site_name', 'naics_cd', 'facility_source_description',
          'facility_site_status_cd', 'latitude_msr', 'longitude_msr',
          'location_address_text', 'locality', 'addr_state_cd',
          'address_postal_code', 'emissions_op_type_code', 'pollutant_cd',
          'description', 'total_emissions', 'uom']),
        ('2008neiv3_facility.csv',
         ['eis_facility_site_id', 'program_system_cd', 'alt_agency_id',
          'region_cd', 'st_usps_cd', 'county_name',
          'state_and_county_fips_code', 'tribal_name', 'facility_site_name',
          'naics_cd', 'facility_source_description', 'facility_site_status_cd',
          'latitude_msr', 'longitude_msr', 'location_address_text', 'locality',
          'addr_state_cd', 'address_postal_code', 'emissions_op_type_code',
          'pollutant_cd', 'description', 'total_emissions', 'uom']),
    ]

    for csv_name, required in required_by_file:
        # A ten-row sample is enough to get the header.
        sample = pd.read_csv(os.path.join(data_path,'NEI',csv_name),nrows=10)
        absent = [col for col in required if col not in sample.columns]
        assert not absent, ("The following columns are missing in " + csv_name + ": "
                            + ', '.join(absent))

# Run the check now; raises AssertionError if an expected column is absent.
TEST_NEI_columns()