In [3]:
import pandas as pd
import numpy as np
import re

## Start with ProposedNPL PDF

### Initial cleaning:
1. remove extra column
2. remove last row with total count
3. remove row(s) with column names

In [4]:
# df_proposed = pd.read_csv("converted/AllCurrentProposedNPL.csv")

In [5]:
# df_proposed.head(10)

In [6]:
# df_proposed.tail(10)

#### Remove extra column

In [7]:
# df_proposed.drop('Unnamed: 16', axis=1, inplace=True)

In [8]:
# df_proposed.head(2)

#### Remove last row with total count

In [9]:
# df_proposed = df_proposed[:-1]

In [10]:
# df_proposed.tail(2)

#### Remove row(s) with column names

In [11]:
# column_headers = df_proposed.columns.tolist()
# print(column_headers)

In [12]:
# matches = (df_proposed == column_headers).all(axis=1)
# matching_rows = df_proposed[matches]
# print(len(matching_rows))
# print(matching_rows)

In [13]:
# df_proposed = df_proposed[~matches]

In [14]:
# df_proposed.tail(40)

### Flattening rows for certain columns:
1. Native American Entity (NAI Status) 
2. Address 
3. Check if there are any others

General logic for each column:

If row has value in column and DOES have EPA ID (could use other columns as reference but this seems safe) --> 
- check if row below has value in that column and DOES NOT have EPA ID - if so, keep going until reach NaN
- start combining with row above until reach original row that found
- replace value with NaN as join with above

In [15]:
# col is which column we're working with
# df is the relevant df (so can use on future PDFs)

# keep track of which row we want to collapse into
def check_if_combine_into_row(currentRow, col):
    col_value = currentRow[col]
    epa_id = currentRow["EPA ID"]
    return pd.notna(col_value) & pd.notna(epa_id)

# check if row needs to be collapsed
def need_to_collapse_row(currentRow, col):
    col_value = currentRow[col]
    epa_id = currentRow["EPA ID"]
    return pd.notna(col_value) & pd.isna(epa_id)

# create dictionary where key is the index of the row the values need to be combined into, 
# and the value for each key is a list of indeces to collapse into that row
# ChatGPT helped with this function
def create_collapse_dictionary(combine_into_rows_index, collapse_rows_index):
    combine_dict = {}
    
    extended_keys = combine_into_rows_index + [float('inf')]

    # Pointer for collapse_rows_index
    collapse_index = 0  

    # Loop through combine_into_rows_index to construct the dictionary
    for i in range(len(combine_into_rows_index)):
        key = combine_into_rows_index[i]
        next_key = extended_keys[i + 1]  # Get the next key
        
        # Collect consecutive values for this key
        values = []
        while collapse_index < len(collapse_rows_index) and collapse_rows_index[collapse_index] < next_key:
            values.append(collapse_rows_index[collapse_index])
            collapse_index += 1

        combine_dict[key] = values

    return combine_dict

# given the dictionary created above, collapse all values into the correct row
# reset the value of the good rows to the combined values
# reset the value of the collapsed rows to NaN
def handle_collapse_rows(collapse_dictionary, df, col):
    for combine_into_index, collapse_index_list in collapse_dictionary.items():
        combine_into_row = df.iloc[combine_into_index]
        value_combined = combine_into_row[col]
        for collapse_index in collapse_index_list:
            collapse_row = df.iloc[collapse_index]
            value_combined += " " + collapse_row[col]
            collapse_row[col] = np.nan
        # print(value_combined)
        combine_into_row[col] = value_combined

# call all of the functions above
# 1. determine which rows to combine into
# 2. determine which rows have values to collapse
# 3. create dictionary to determine which rows collapse into which
# 4. collapse the rows and reset values
def combine_rows(col, df):
    combine_into_rows_index = []
    collapse_rows_index = []

    for i in range(len(df)):
        currentRow = df.iloc[i]

        combine_into_row = check_if_combine_into_row(currentRow, col)
        if combine_into_row:
            # print(currentRow[col])
            combine_into_rows_index.append(i)

        need_to_collapse = need_to_collapse_row(currentRow, col)
        if need_to_collapse:
            # print(currentRow[col])
            collapse_rows_index.append(i)

    # print(combine_into_rows_index)
    # print(collapse_rows_index)

    collapse_dictionary = create_collapse_dictionary(combine_into_rows_index, collapse_rows_index)
    # print(collapse_dictionary)
    handle_collapse_rows(collapse_dictionary, df, col)

In [16]:
# combine_rows('Native American Entity (NAI Status)', df_proposed)

In [17]:
# df_proposed.head(20)

In [18]:
# cols = df_proposed["Native American Entity (NAI Status)"]
# print(cols[0])

In [19]:
# combine_rows('Address', df_proposed)

In [20]:
# df_proposed.head(40)

In [21]:
# combine_rows('Site Name', df_proposed)

In [22]:
# df_proposed.head(40)

In [23]:
# combine_rows('County', df_proposed)

In [24]:
# df_proposed.head(60)

#### Delete all rows with just NaN values

In [25]:
# test_row = df_proposed.iloc[1]
# print(test_row.isna().all())

# test_row_false = df_proposed.iloc[0]
# print(test_row_false.isna().all())

In [26]:
# proposed_filtered_df = df_proposed[~df_proposed.isna().all(axis=1)]
# proposed_filtered_df = proposed_filtered_df.reset_index(drop=True)

In [27]:
# proposed_filtered_df.tail(40)

In [28]:
# has_nan = proposed_filtered_df['EPA ID'].isna().any()
# print(has_nan)

In [29]:
# proposed_filtered_df.to_csv('AllCurrentProposedNPL_formatted.csv', index=False)

## Now do Final NPL PDF

1. do the cleaning steps
2. check for rows that have EPA column null, and make note of which column that is
3. collapse all rows
4. filter out NaN rows

In [30]:
df_final = pd.read_csv("converted/AllCurrentFinalNPL.csv")

In [31]:
df_final.head(10)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date,Unnamed: 15,Unnamed: 16
0,1.0,CT,BARKHAMSTED-NEW HARTFORD LANDFILL,100255.0,CTD980732333,ROUTE 44,BARKHAMSTED,6063.0,LITCHFIELD,N,N,,41.893947,-72.989337,10/04/89,,
1,1.0,CT,BEACON HEIGHTS LANDFILL,100180.0,CTD072122062,BLACKBERRY HILL ROAD,BEACON FALLS,6403.0,NEW HAVEN,N,N,,41.43195,-73.035281,09/08/83,,
2,1.0,CT,DURHAM MEADOWS,100108.0,CTD001452093,MAIN ST,DURHAM,6422.0,MIDDLESEX,N,N,,41.48111,-72.681388,10/04/89,,
3,1.0,CT,GALLUP'S QUARRY,100201.0,CTD108960972,ROUTE 12,PLAINFIELD,6374.0,WINDHAM,N,N,,41.665281,-71.924161,10/04/89,,
4,1.0,CT,KELLOGG-DEERING WELL FIELD,100252.0,CTD980670814,NORWALK WATER DEPARTMENT,NORWALK,6856.0,FAIRFIELD,N,N,,41.13055,-73.43195,09/21/84,,
5,1.0,CT,"LAUREL PARK, INC.",100232.0,CTD980521165,HUNTERS MTN RD,NAUGATUCK,6770.0,NEW HAVEN,N,N,,41.476939,-73.071661,09/08/83,,
6,,,,,,,BOROUGH,,,,,,,,,,
7,1.0,CT,LINEMASTER SWITCH CORP.,100041.0,CTD001153923,29 PLAINE HILL ROAD,WOODSTOCK,6281.0,WINDHAM,N,N,,41.940561,-71.966939,02/21/90,,
8,1.0,CT,NEW LONDON SUBMARINE BASE,100261.0,CTD980906515,ROUTE 12 CRYSTAL LAKE RD,NEW LONDON,6349.0,NEW LONDON,Y,N,,41.400139,-72.087081,08/30/90,,
9,1.0,CT,PRECISION PLATING CORP.,100156.0,CTD051316313,1050 HARTFORD TURNPIKE ROAD,VERNON,6066.0,TOLLAND,N,N,,41.84975,-72.447911,10/04/89,,


In [32]:
df_final.tail(10)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date,Unnamed: 15,Unnamed: 16
2204,10,WA,QUENDALL TERMINALS,1000875.0,WAD980639215,4503 LK WASHINGTON BLVD N,RENTON,98055.0,KING,N,Y,Muckleshoot Indian Tribe,47.533333,-122.2,04/19/06,,
2205,,,,,,,,,,,,(Current),,,,,
2206,10,WA,SEATTLE MUNICIPAL LANDFILL (KENT,1000889.0,WAD980639462,NE OF MILITARY RD AND KENT DES,KENT,98031.0,KING,N,Y,Muckleshoot Indian Tribe,47.391669,-122.2792,08/30/90,,
2207,,,HIGHLANDS),,,MOINES RD,,,,,,(Current),,,,,
2208,10,WA,"WESTERN PROCESSING CO., INC.",1000662.0,WAD009487513,7215 S 196TH ST,KENT,98031.0,KING,N,Y,Muckleshoot Indian Tribe,47.425,-122.2417,09/08/83,,
2209,,,,,,,,,,,,(Current),,,,,
2210,10,WA,WYCKOFF CO./EAGLE HARBOR,1000612.0,WAD009248295,5350 CREOSOTE PL NE,BAINBRIDGE ISLAND,98110.0,KITSAP,N,Y,Suquamish Indian Tribe of,47.621669,-122.5167,07/22/87,,
2211,,,,,,,,,,,,the Port Madison Reservation,,,,,
2212,,,,,,,,,,,,(Current),,,,,
2213,Total: 1340,,,,,,,,,,,,,,,,


### Initial cleaning:
1. remove extra column
2. remove last row with total count
3. remove row(s) with column names

#### Remove extra column

In [34]:
# print(df_final['Unnamed: 15'].isna().all())
# print(df_final['Unnamed: 16'].isna().all())

df_final.drop('Unnamed: 15', axis=1, inplace=True)
df_final.drop('Unnamed: 16', axis=1, inplace=True)

In [35]:
df_final.head(2)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
0,1,CT,BARKHAMSTED-NEW HARTFORD LANDFILL,100255,CTD980732333,ROUTE 44,BARKHAMSTED,6063,LITCHFIELD,N,N,,41.893947,-72.989337,10/04/89
1,1,CT,BEACON HEIGHTS LANDFILL,100180,CTD072122062,BLACKBERRY HILL ROAD,BEACON FALLS,6403,NEW HAVEN,N,N,,41.43195,-73.035281,09/08/83


#### Remove last row with total count

In [38]:
# df_final.tail(1)
df_final = df_final[:-1]

In [39]:
df_final.tail(2)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
2211,,,,,,,,,,,,the Port Madison Reservation,,,
2212,,,,,,,,,,,,(Current),,,


#### Remove row(s) with column names

In [40]:
column_headers_final = df_final.columns.tolist()
print(column_headers_final)

['Region', 'State', 'Site Name', 'Site ID', 'EPA ID', 'Address', 'City', 'Zip', 'County', 'FF Ind', 'NAI', 'Native American Entity (NAI Status)', 'Latitude', 'Longitude', 'NPL Status Date']


In [43]:
matches_final = (df_final == column_headers_final).all(axis=1)
matching_rows_final = df_final[matches_final]
print(len(matching_rows_final))
# print(matching_rows_final)

29


In [44]:
df_final = df_final[~matches_final]

In [47]:
# df_final.tail(40)

### Flattening rows for certain columns

1. find all columns that have multirow values
2. run function above to collapse rows for each relevant column

In [48]:
nan_rows = df_final[df_final['EPA ID'].isna()]

In [57]:
# print(len(nan_rows))
# nan_rows

cols_to_collapse = []

for index, row in nan_rows.iterrows():
    for col in row.index:
        if pd.notna(row[col]):
            if col not in cols_to_collapse:
                cols_to_collapse.append(col)

print(cols_to_collapse)

['City', 'Site Name', 'Native American Entity (NAI Status)', 'Address', 'County']


In [58]:
for col in cols_to_collapse:
    combine_rows(col, df_final)

In [61]:
df_final.tail(20)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
2193,,,,,,,,,,,,,,,
2194,10.0,WA,PACIFIC SOUND RESOURCES,1000611.0,WAD009248287,2801 SW FLORIDA ST.,SEATTLE,98126.0,KING,N,Y,Muckleshoot Indian Tribe (Current),47.582332,-122.366667,05/31/94
2195,,,,,,,,,,,,,,,
2196,10.0,WA,PALERMO WELL FIELD GROUND WATER CONTAMINATION,1001761.0,WA0000026534,PALERMO AVENUE & O STREET,TUMWATER,98501.0,THURSTON,N,Y,Nisqually Indian Tribe (Current),47.001667,-122.904167,04/01/97
2197,,,,,,,,,,,,,,,
2198,10.0,WA,PASCO SANITARY LANDFILL,1001098.0,WAD991281874,KAHLOTUS RD & HWY 12,PASCO,99301.0,FRANKLIN,N,N,,46.255281,-119.0478,02/21/90
2199,10.0,WA,PUGET SOUND NAVAL SHIPYARD COMPLEX,1001107.0,WA2170023418,Postal Address is unavailable for the Site,BREMERTON,98310.0,KITSAP,Y,Y,Suquamish Indian Tribe of the Port Madison Res...,47.559166,-122.647222,05/31/94
2200,,,,,,,,,,,,,,,
2201,,,,,,,,,,,,,,,
2202,10.0,WA,QUEEN CITY FARMS,1000835.0,WAD980511745,S 1/2 SEC 28-MAPLE VALLEY QUAD,MAPLE VALLEY,98038.0,KING,N,Y,Muckleshoot Indian Tribe (Current),47.45,-122.0417,09/21/84


#### Delete all rows with just NaN values

In [105]:
final_filtered_df = df_final[~df_final.isna().all(axis=1)]
final_filtered_df = final_filtered_df.reset_index(drop=True)

In [106]:
final_filtered_df

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
0,01,CT,BARKHAMSTED-NEW HARTFORD LANDFILL,0100255,CTD980732333,ROUTE 44,BARKHAMSTED,06063,LITCHFIELD,N,N,,+41.893947,-72.989337,10/04/89
1,01,CT,BEACON HEIGHTS LANDFILL,0100180,CTD072122062,BLACKBERRY HILL ROAD,BEACON FALLS,06403,NEW HAVEN,N,N,,+41.431950,-73.035281,09/08/83
2,01,CT,DURHAM MEADOWS,0100108,CTD001452093,MAIN ST,DURHAM,06422,MIDDLESEX,N,N,,+41.481110,-72.681388,10/04/89
3,01,CT,GALLUP'S QUARRY,0100201,CTD108960972,ROUTE 12,PLAINFIELD,06374,WINDHAM,N,N,,41.665281,-71.924161,10/04/89
4,01,CT,KELLOGG-DEERING WELL FIELD,0100252,CTD980670814,NORWALK WATER DEPARTMENT,NORWALK,06856,FAIRFIELD,N,N,,+41.130550,-073.431950,09/21/84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348,10,WA,QUEEN CITY FARMS,1000835,WAD980511745,S 1/2 SEC 28-MAPLE VALLEY QUAD,MAPLE VALLEY,98038,KING,N,Y,Muckleshoot Indian Tribe (Current),+47.450000,-122.041700,09/21/84
1349,10,WA,QUENDALL TERMINALS,1000875,WAD980639215,4503 LK WASHINGTON BLVD N,RENTON,98055,KING,N,Y,Muckleshoot Indian Tribe (Current),+47.533333,-122.200000,04/19/06
1350,10,WA,SEATTLE MUNICIPAL LANDFILL (KENT HIGHLANDS),1000889,WAD980639462,NE OF MILITARY RD AND KENT DES MOINES RD,KENT,98031,KING,N,Y,Muckleshoot Indian Tribe (Current),+47.391669,-122.279200,08/30/90
1351,10,WA,"WESTERN PROCESSING CO., INC.",1000662,WAD009487513,7215 S 196TH ST,KENT,98031,KING,N,Y,Muckleshoot Indian Tribe (Current),+47.425000,-122.241700,09/08/83


In [66]:
total_final_npl_list = 1340

In [67]:
print(len(final_filtered_df) - total_final_npl_list)

13


#### 13 extra rows in the df... need to investigate 

#### It looks like a number of rows have the address in the EPA ID column - find those, separate the Address, and then collapse

In [107]:
nan_address_rows_after_clean = final_filtered_df[final_filtered_df['Address'].isna()]
nan_address_rows_after_clean_indeces = nan_address_rows_after_clean.index.tolist()
nan_address_rows_after_clean

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
861,5.0,OH,FEED MATERIALS PRODUCTION CENTER (USDOE),504934.0,OH6890008976 2 MI W OF JUNCTION RT 128 & RT 126,,FERNALD,45030.0,,Y,N,+39.299450,,-84.688331,11/21/89
862,5.0,OH,FIELDS BROOK,504723.0,OHD980614572 WATER BED OF CREEK R3W T13N,,ASHTABULA,44004.0,ASHTABULA,N,N,+41.891500,,-80.7754,09/08/83
863,5.0,OH,FULTZ LANDFILL,504824.0,OHD980794630 CO RD 52,,JACKSON TOWNSHIP,43723.0,GUERNSEY,N,N,39.984719,,-81.541661,09/08/83
864,5.0,OH,INDUSTRIAL EXCESS LANDFILL,504014.0,OHD000377911 4MI S INTER 619 & CLEVELAND AVENUE,,UNIONTOWN,44685.0,STARK,N,N,40.968331,,-81.404169,06/10/86
865,5.0,OH,LAMMERS BARREL FACTORY,504896.0,OHD981537582 EAST PATTERSON & GRANGE HALL,,BEAVERCREEK,45385.0,GREENE,N,N,39.726389,,-84.084306,09/29/03
866,5.0,OH,LITTLE SCIOTO RIVER,509950.0,OHN000509950 HOLLAND ROAD AT LITTLE SCIOTO,,MARION COUNTY,43302.0,MARION,N,N,40.593061,,-83.183461,09/23/09
1073,7.0,MO,ORONOGO-DUENWEG MINING BELT,701290.0,MOD980686281 VARIOUS LOCATIONS,,JOPLIN,64801.0,JASPER,N,Y,Absentee-Shawnee Tribe of Indians of Oklahoma ...,37.101111,-94.420561,08/30/90
1074,7.0,MO,POOLS PRAIRIE,702918.0,MO0000958835 US HWY. 60 AND US HWY. 71 (2 MILES,,NEOSHO,64850.0,NEWTON,N,N,,36.803056,-94.390278,09/17/99
1075,7.0,MO,QUALITY PLATING,701442.0,MOD980860555 RT 2,,SIKESTON,63801.0,SCOTT,N,N,,36.96305,-89.557219,06/10/86
1076,7.0,MO,RIVERFRONT,702089.0,"MOD981720246 PLUME, NEW HAVEN",,NEW HAVEN,63068.0,FRANKLIN,N,N,,38.613889,-91.214722,12/01/00


In [108]:
print(nan_address_rows_after_clean_indeces)

[861, 862, 863, 864, 865, 866, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1298, 1299, 1300]


In [109]:
print(len(nan_address_rows_after_clean))

28


In [110]:
for index, row in nan_address_rows_after_clean.iterrows():  

    # address = row["Address"]
    # if pd.notna(address):
    #     print(row)
    
    epa_id = row["EPA ID"]
    split = re.split(r"^([A-Za-z0-9]{12})", epa_id)
    
    correct_epa_id = None
    correct_address = None
    
    if len(split) == 3:
        correct_epa_id = split[1]
        correct_address = split[2]
    else:
        correct_epa_id = np.nan
        correct_address = split[0]

    final_filtered_df.at[index, "EPA ID"] = correct_epa_id
    final_filtered_df.at[index, "Address"] = correct_address

In [112]:
final_filtered_df.iloc[1300]

Region                                                                               NaN
State                                                                                NaN
Site Name                                                                            NaN
Site ID                                                                              NaN
EPA ID                                                                               NaN
Address                                                                         PORTLAND
City                                                                                 NaN
Zip                                                                                  NaN
County                                                                               NaN
FF Ind                                                                               NaN
NAI                                                                                  NaN
Native American Entit

In [113]:
final_filtered_df[final_filtered_df['EPA ID'].isna()]

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
1079,,,,,,HWY WW,COUNTY,,,,,Oklahoma (Current); Miami Tribe of Oklahoma (C...,,,
1087,,,MINES,,,47,,,,,,Eastern Shawnee Tribe of Oklahoma (Current); S...,,,
1089,,,POTOSI,,,ROAD,,,,,,Eastern Shawnee Tribe of Oklahoma (Current); S...,,,
1091,,,RICHWOODS,,,WASHINGTON COUNTY,,,,,,Eastern Shawnee Tribe of Oklahoma (Current); S...,,,
1300,,,,,,PORTLAND,,,,,,Bands of the Yakama Nation (Current); Confeder...,,,


#### Run function again to collapse rows based on NaN in EPA ID column

In [114]:
for col in cols_to_collapse:
    combine_rows(col, final_filtered_df)

In [115]:
final_filtered_df = final_filtered_df[~final_filtered_df.isna().all(axis=1)]
final_filtered_df = final_filtered_df.reset_index(drop=True)

In [117]:
print(len(final_filtered_df) - total_final_npl_list)

8


#### Now have 8 extra rows in the df... need to investigate 

#### no columns except for Native American Entity (NAI Status) should have NaN values

In [120]:
non_nan_cols = [x for x in column_headers_final if x != 'Native American Entity (NAI Status)']
non_nan_cols

['Region',
 'State',
 'Site Name',
 'Site ID',
 'EPA ID',
 'Address',
 'City',
 'Zip',
 'County',
 'FF Ind',
 'NAI',
 'Latitude',
 'Longitude',
 'NPL Status Date']

In [122]:
cols_to_clean = []
# row_indeces_to_clean = []

for index, row in final_filtered_df.iterrows():  
    for col in non_nan_cols:
        if pd.isna(row[col]):
            # row_indeces_to_clean.append(index)
            if col not in cols_to_clean:
                cols_to_clean.append(col)

In [124]:
print(cols_to_clean)
# print(row_indeces_to_clean)

['NAI', 'County', 'Latitude', 'Zip']


#### Fix the Latitude column

In [126]:
nan_latitude = final_filtered_df[final_filtered_df['Latitude'].isna()]
nan_latitude

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
486,3,WV,SHAFFER EQUIPMENT/ARBUCKLE CREEK AREA,304017,WVD988768909,WV ROUTE 17 (A.K.A. MINDEN ROAD),MINDEN,25879,FAYETTE,N,N,37.97651,,-81.1265,05/15/19
487,3,WV,SHARON STEEL CORP (FAIRMONT COKE WORKS),302883,WVD000800441,LAFAYETTE ST,FAIRMONT,26554,MARION,N,N,39.493611,,-80.114444,12/23/96
488,3,WV,VIENNA TETRACHLOROETHENE,304759,WVD988798401,"30TH STREET, GRAND CENTRAL AVE",VIENNA,26105,WOOD,N,N,39.325167,,-81.548778,10/22/99
489,3,WV,WEST VIRGINIA ORDNANCE (USARMY),303066,WVD980713036,ROUTE 1 BOX 125,POINT PLEASANT,25550,MASON,Y,N,38.926389,,-82.076389,09/08/83
490,4,AL,ALABAMA ARMY AMMUNITION PLANT,400449,AL6210020008,STATE HWY 235,CHILDERSBURG,35044,TALLADEGA,Y,N,33.3381,,-86.3268,07/22/87
491,4,AL,"ALABAMA PLATING COMPANY, INC.",400129,ALD004022448,570 HIGHWAY 231 NORTH,VINCENT,35178,SHELBY,N,N,33.399722,,-86.405,09/18/12
492,4,AL,AMERICAN BRASS INC.,406299,ALD981868466,HIGHWAY 134 WEST,HEADLAND,36345,HENRY,N,N,31.325,,-85.404167,05/10/99
493,4,AL,ANNISTON ARMY DEPOT (SOUTHEAST INDUSTRIAL AREA),400443,AL3210020027,OFF AL HWY 202,ANNISTON,36253,CALHOUN,Y,N,33.659439,,-85.969439,03/13/89
494,4,AL,CIBA-GEIGY CORP. (MCINTOSH PLANT),400073,ALD001221902,OFF HWY 43,MCINTOSH,36553,WASHINGTON,N,N,31.279,,-87.9955,09/21/84
495,4,AL,INTERSTATE LEAD CO. (ILCO),404344,ALD041906173,"8551 Borden Ave. SE,",LEEDS,35094,JEFFERSON,N,N,33.539016,,-86.533258,06/10/86


In [135]:
for index, row in nan_latitude.iterrows():  
    lat_pattern = r"^\+?[0-9]+(\.[0-9]+)?$"
    nai_status = row["Native American Entity (NAI Status)"]
    match = re.match(lat_pattern, str(nai_status))
    if bool(match):
        final_filtered_df.at[index, "Native American Entity (NAI Status)"] = np.nan
        final_filtered_df.at[index, "Latitude"] = nai_status

In [136]:
final_filtered_df[final_filtered_df['Latitude'].isna()]

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date


#### Fix the NAI column

In [137]:
nan_nai = final_filtered_df[final_filtered_df['NAI'].isna()]
nan_nai

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
43,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
454,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
473,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
822,Region,State,Site Name,Site ID,EPA ID,Address,City TOWNSHIP,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
850,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
958,Region,State,Site Name,Site ID,EPA ID,Address PUEBLO,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
1050,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date
1282,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date


#### Looks like the remaining 8 rows!

In [142]:
# manually fix the 2 that have values to collapse then delete all the rest

# final_filtered_df.iloc[821]
final_filtered_df.at[821, "City"] = "HOWARD TOWNSHIP"

# final_filtered_df.iloc[957]
final_filtered_df.at[821, "Address"] = "SR279 NEAR PAGUATE, LAGUNA PUEBLO"

In [145]:
# delete all the rest of the rows
for index, row in nan_nai.iterrows():  
    final_filtered_df.drop(index, inplace=True)

In [146]:
final_filtered_df[final_filtered_df['NAI'].isna()]

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date


In [148]:
for cols in non_nan_cols:
    nan = final_filtered_df[final_filtered_df[col].isna()]
    print(len(nan))

0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [149]:
print(len(final_filtered_df) - total_final_npl_list)

0


### it looks like this is finished - export to csv now

In [150]:
final_filtered_df.to_csv('AllCurrentFinalNPL_formatted.csv', index=False)

## Now do DELETED NPL PDF

1. do the cleaning steps
2. check for rows that have EPA column null, and make note of which column that is
3. collapse all rows
4. filter out NaN rows

In [152]:
df_deleted = pd.read_csv("converted/AllCurrentDeletedNPL.csv")

### Initial cleaning:
1. remove extra column
2. remove last row with total count
3. remove row(s) with column names

#### Remove extra column

In [153]:
df_deleted.head(10)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date,Unnamed: 15,Unnamed: 16
0,1.0,CT,CHESHIRE GROUND WATER CONTAMINATION,100265.0,CTD981067317,604 WEST JOHNSON AVENUE,CHESHIRE,6410.0,NEW HAVEN,N,N,,41.5575,-72.910833,07/02/97,,
1,1.0,CT,NUTMEG VALLEY ROAD,100250.0,CTD980669261,NUTMEG VALLEY ROAD,WOLCOTT,6716.0,NEW HAVEN,N,N,,41.5748,-72.9986,09/23/05,,
2,1.0,CT,OLD SOUTHINGTON LANDFILL,100251.0,CTD980670806,OLD TURNPIKE RD,SOUTHINGTON,6489.0,HARTFORD,N,N,,41.579169,-72.881939,09/12/18,,
3,1.0,CT,REVERE TEXTILE PRINTS CORP.,100121.0,CTD004532610,RAILROAD AVENUE,STERLING,6377.0,WINDHAM,N,N,,41.708331,-71.829169,09/02/94,,
4,1.0,MA,CANNON ENGINEERING CORP. (CEC),100585.0,MAD079510780,FIRST ST,BRIDGEWATER,2324.0,PLYMOUTH,N,N,,41.9727,-71.0256,09/24/13,,
5,1.0,MA,FORT DEVENS-SUDBURY TRAINING ANNEX,100685.0,MAD980520670,HUDSON RD,SUDBURY,1775.0,MIDDLESEX,Y,N,,42.406944,-71.475278,01/29/02,,
6,1.0,MA,HATHEWAY & PATTERSON,102724.0,MAD001060805,15 COUNTY ROAD,MANSFIELD,2048.0,BRISTOL,N,N,,42.038194,-71.2225,02/16/18,,
7,1.0,MA,MATERIALS TECHNOLOGY LABORATORY,100953.0,MA0213820939,OFF OF US HWY 20/N BEACON ST,WATERTOWN,2172.0,MIDDLESEX,Y,N,,42.360416,-71.166111,11/21/06,,
8,,,(USARMY),,,,,,,,,,,,,,
9,1.0,MA,NORWOOD PCBS,100732.0,MAD980670566,NEAR RTE 1 & DEAN STREET,NORWOOD,2062.0,NORFOLK,N,N,,42.180831,-71.1925,05/31/11,,


In [155]:
print(df_deleted['Unnamed: 15'].isna().all())
print(df_deleted['Unnamed: 16'].isna().all())

True
True


In [156]:
df_deleted.drop('Unnamed: 15', axis=1, inplace=True)
df_deleted.drop('Unnamed: 16', axis=1, inplace=True)

In [157]:
df_deleted.head(2)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
0,1,CT,CHESHIRE GROUND WATER CONTAMINATION,100265,CTD981067317,604 WEST JOHNSON AVENUE,CHESHIRE,6410,NEW HAVEN,N,N,,41.5575,-72.910833,07/02/97
1,1,CT,NUTMEG VALLEY ROAD,100250,CTD980669261,NUTMEG VALLEY ROAD,WOLCOTT,6716,NEW HAVEN,N,N,,41.5748,-72.9986,09/23/05


#### Remove last row with total count

In [160]:
# df_deleted.tail(1)
df_deleted = df_deleted[:-1]

In [161]:
df_deleted.tail(2)

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
602,,,,,,,,,,,,Bands of the Yakama Nation,,,
603,,,,,,,,,,,,(Current),,,


#### Remove row(s) with column names

In [162]:
column_headers_deleted = df_deleted.columns.tolist()
print(column_headers_deleted)

['Region', 'State', 'Site Name', 'Site ID', 'EPA ID', 'Address', 'City', 'Zip', 'County', 'FF Ind', 'NAI', 'Native American Entity (NAI Status)', 'Latitude', 'Longitude', 'NPL Status Date']


In [163]:
matches_deleted = (df_deleted == column_headers_deleted).all(axis=1)
matching_rows_deleted = df_deleted[matches_deleted]
print(len(matching_rows_deleted))
# print(matching_rows_final)

9


In [164]:
df_deleted = df_deleted[~matches_deleted]

In [167]:
# df_deleted.tail(20)

### Flattening rows for certain columns

1. find all columns that have multirow values
2. run function above to collapse rows for each relevant column

In [170]:
nan_rows_deleted = df_deleted[df_deleted['EPA ID'].isna()]

In [171]:
cols_to_collapse_deleted = []

for index, row in nan_rows_deleted.iterrows():
    for col in row.index:
        if pd.notna(row[col]):
            if col not in cols_to_collapse_deleted:
                cols_to_collapse_deleted.append(col)

print(cols_to_collapse_deleted)

['Site Name', 'City', 'Address', 'Native American Entity (NAI Status)', 'County']


In [172]:
for col in cols_to_collapse_deleted:
    combine_rows(col, df_deleted)

In [175]:
# df_deleted.tail(20)

#### Delete all rows with just NaN values

In [176]:
deleted_filtered_df = df_deleted[~df_deleted.isna().all(axis=1)]
deleted_filtered_df = deleted_filtered_df.reset_index(drop=True)

In [177]:
deleted_filtered_df

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
0,01,CT,CHESHIRE GROUND WATER CONTAMINATION,0100265,CTD981067317,604 WEST JOHNSON AVENUE,CHESHIRE,06410,NEW HAVEN,N,N,,+41.557500,-72.910833,07/02/97
1,01,CT,NUTMEG VALLEY ROAD,0100250,CTD980669261,NUTMEG VALLEY ROAD,WOLCOTT,06716,NEW HAVEN,N,N,,+41.574800,-072.998600,09/23/05
2,01,CT,OLD SOUTHINGTON LANDFILL,0100251,CTD980670806,OLD TURNPIKE RD,SOUTHINGTON,06489,HARTFORD,N,N,,+41.579169,-72.881939,09/12/18
3,01,CT,REVERE TEXTILE PRINTS CORP.,0100121,CTD004532610,RAILROAD AVENUE,STERLING,06377,WINDHAM,N,N,,41.708331,-71.829169,09/02/94
4,01,MA,CANNON ENGINEERING CORP. (CEC),0100585,MAD079510780,FIRST ST,BRIDGEWATER,02324,PLYMOUTH,N,N,,+41.972700,-071.025600,09/24/13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,10,WA,TOFTDAHL DRUMS,1000961,WAD980723506,22033 NE 189 ST,BRUSH PRAIRIE,98606,CLARK,N,N,,+45.754900,-122.446400,12/23/88
455,10,WA,TULALIP LANDFILL,1000878,WAD980639256,TULALIP INDIAN RESERVATION,MARYSVILLE,98270,SNOHOMISH,N,Y,Tulalip Tribes of Washington (Current),+48.041667,-122.182222,09/18/02
456,10,WA,VANCOUVER WATER STATION #1 CONTAMINATION,1001733,WAD988519708,E. RESERVE AND N.E. FOURTH PLAIN BLVD.,VANCOUVER,98663,CLARK,N,N,,+45.638471,-122.645417,02/06/18
457,10,WA,VANCOUVER WATER STATION #4 CONTAMINATION,1001371,WAD988475158,5TH + BLANDFORD ST,VANCOUVER,98661,CLARK,N,N,,+45.619500,-122.622472,02/06/18


In [178]:
total_deleted_npl_list = 458

In [179]:
print(len(deleted_filtered_df) - total_deleted_npl_list)

1


### 1 extra row to investigate

In [181]:
non_nan_cols_deleted = [x for x in column_headers_deleted if x != 'Native American Entity (NAI Status)']
non_nan_cols_deleted

['Region',
 'State',
 'Site Name',
 'Site ID',
 'EPA ID',
 'Address',
 'City',
 'Zip',
 'County',
 'FF Ind',
 'NAI',
 'Latitude',
 'Longitude',
 'NPL Status Date']

In [182]:
cols_to_clean_deleted = []

for index, row in deleted_filtered_df.iterrows():  
    for col in non_nan_cols_deleted:
        if pd.isna(row[col]):
            # row_indeces_to_clean.append(index)
            if col not in cols_to_clean:
                cols_to_clean_deleted.append(col)

print(cols_to_clean_deleted)

['Address']


#### investigate NaN value in Address column (which is only column that seems to have a NaN value where it shouldn't)

In [184]:
deleted_filtered_df[deleted_filtered_df['Address'].isna()]

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
44,2,NJ,MONTCLAIR/WEST ORANGE RADIUM SITE,200997,NJD980785653,,MONTCLAIR/WEST ORANGE,7044,ESSEX,N,N,,40.7929,-74.2257,09/02/09


In [186]:
# the NaN value is actually correct, but going to replace with N/A which is what the pdf has

In [187]:
deleted_filtered_df.at[44, "Address"] = "N/A"

In [190]:
# deleted_filtered_df.head(50)

#### Check if there are any other rows that have column headings

In [191]:
deleted_filtered_df[deleted_filtered_df["Region"] == "Region"]

Unnamed: 0,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind,NAI,Native American Entity (NAI Status),Latitude,Longitude,NPL Status Date
339,Region,State,Site Name,Site ID,EPA ID,Address,City,Zip,County,FF Ind NAI Native American Entity\n(NAI Sta...,,,Latitude,Longitude,NPL Status Date


In [192]:
# deleted row with column heading

deleted_filtered_df.drop(339, inplace=True)

In [193]:
print(len(deleted_filtered_df) - total_deleted_npl_list)

0


### it looks like this is finished - export to csv now

In [194]:
deleted_filtered_df.to_csv('AllCurrentDeletedNPL_formatted.csv', index=False)