In [52]:
import pandas as pd
import geopandas as gp
import numpy as np
import os
import itertools
import glob
from datetime import date

In [53]:
def drop_levels(df):
    """Flatten a two-level column header on ``df`` and return ``df``.

    The frame is modified in place (callers may ignore the return value):
    the current row index is moved into the columns, then the second level
    (level 1) of the column MultiIndex is discarded.
    """
    df.reset_index(inplace=True)
    top_level_only = df.columns.droplevel(1)
    df.columns = top_level_only
    return df

In [54]:
## initiate file paths
# label for this sampling outing; it is embedded in the output results file name
sample_outing_name = 'feb24'
# folder containing the spreadsheets from F & B; every *.xlsx file in it is read
folder_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/2023 Screening Results/2024.2"

In [55]:
# path of the CSV file the combined, cleaned results are written to
# (named after the sampling outing)
output_results_path = f"/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/{sample_outing_name}_results.csv"

In [56]:
# initiate file paths for the lookup tables to read in
# NOTE(review): neither path is used in the cells visible here — confirm they are still needed
# screening levels workbook
sl_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/Lookup Tables/Master_Screening_Levels.xlsx"
# PCB aroclor lookup table
pcb_arc_lookup_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/Lookup Tables/PCB_aroclor_lookup.csv"

PROCESS RESULTS SPREADSHEETS

In [57]:
file_extension = '*.xlsx'

# List all files with the specified extension in the folder.
# glob returns paths in arbitrary (filesystem) order, so they are sorted to
# make the row order of the combined table reproducible between runs.
# Names starting with "~$" are the temporary lock files Excel writes next to
# an open workbook; they are not readable workbooks, so they are skipped.
files = sorted(
    path for path in glob.glob(os.path.join(folder_path, file_extension))
    if not os.path.basename(path).startswith('~$')
)
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No '{file_extension}' files found in {folder_path}")

length = 0  # NOTE(review): not used by any cell visible here — confirm before removing

# Read each spreadsheet and replace underscores in its column names with
# spaces so that every file shares the same header spelling.
frames = []
for file_path in files:
    print(file_path)
    df = pd.read_excel(file_path)
    df.columns = df.columns.str.replace("_"," ")
    frames.append(df)

# ignore_index=True gives the combined frame a unique 0..n-1 row index;
# otherwise every file contributes its own 0..k labels and they repeat.
results_df = pd.concat(frames, ignore_index=True)
print(len(results_df))

/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/2023 Screening Results/2024.2/UWB UW IAS 312084 additional SGS EDD.xlsx


/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/2023 Screening Results/2024.2/UWB UW IAS 312067 additional SGS EDD.xlsx
/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/2023 Screening Results/2024.2/UWB University of Washington 402216 EIM.xlsx
1219


In [58]:
# Remove any rows that are not field data, i.e. rows without a field
# collection start date.
# .copy() makes the result an independent frame, so the column assignments in
# later cells act on it directly instead of on a slice of the unfiltered
# frame (which triggers pandas' SettingWithCopyWarning).
results_df = results_df[results_df['Field Collection Start Date'].notna()].copy()
len(results_df)

527

In [59]:
# Create a simplified sample type column from the sample matrix and sample
# source columns. Later rules override earlier ones:
#   1. start from the raw 'Sample Matrix' value, with 'Aqueous' mapped to 'Water'
#   2. any matrix containing 'Solid' becomes 'Soil'
#   3. any sample whose source is 'Groundwater' becomes 'Water'
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Matrix'] == 'Aqueous', 'Water', results_df['Sample Matrix'])
# na=False: for a missing 'Sample Matrix', str.contains yields NaN, and NaN is
# truthy when np.where evaluates the condition, so the row would be labelled
# 'Soil'. With na=False such rows keep their missing value.
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Matrix'].str.contains('Solid', na=False), 'Soil', results_df['Sample Matrix_clean'])
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Source'] == 'Groundwater', 'Water', results_df['Sample Matrix_clean'])

In [60]:
# clean up PCB parameter names so that the join with the F&B results matches correctly
def clean_fun(val):
    """Reduce a PCB parameter name to its second token.

    For a string containing ``'PCB'``, every ``/`` is treated as a space, the
    text is split on single spaces and the second piece is returned, e.g.
    ``'PCB-aroclor 1016/1242'`` -> ``'1016'``.

    Everything else is returned unchanged:
      * strings that do not contain ``'PCB'``;
      * non-string values such as NaN/None (the ``in`` test would otherwise
        raise TypeError);
      * PCB names with no space or slash, which have no second token to take
        (indexing would otherwise raise IndexError).
    """
    if not isinstance(val, str) or 'PCB' not in val:
        return val
    tokens = val.replace("/", " ").split(" ")
    if len(tokens) < 2:
        return val
    return tokens[1]

# Derive the cleaned parameter name for every result row.
parameter_names = results_df['Result Parameter Name']
results_df['Result Parameter Name_clean'] = parameter_names.apply(clean_fun)

In [61]:
# Relabel 'Lube Oil' results as 'Diesel Range Organics'; all other cleaned
# parameter names are kept as they are.
is_lube_oil = results_df['Result Parameter Name_clean'] == 'Lube Oil'
results_df['Result Parameter Name_clean'] = np.where(
    is_lube_oil,
    'Diesel Range Organics',
    results_df['Result Parameter Name_clean'],
)

In [62]:
# Known sample ID corrections, applied in order: wrong ID -> corrected ID.
sample_id_fixes = {
    'DPS1': 'DPS-1',
    'SPB-0159-S-1': 'SPB-O159-S-1',  # digit zero -> capital letter O
}

# Strip surrounding whitespace from the sample IDs before matching them.
results_df['Sample ID'] = results_df['Sample ID'].str.strip()

for wrong_id, right_id in sample_id_fixes.items():
    results_df['Sample ID'] = np.where(results_df['Sample ID'] == wrong_id, right_id, results_df['Sample ID'])

# replace typo from 11/17 -> 11/07: rows whose collection start date equals
# 2023-11-17 00:00:00 are re-dated to 2023-11-07; every other row keeps its value.
# NOTE(review): the comparison is against a string, so this presumably relies on
# the column being datetime-typed — confirm the dtype after the spreadsheets are combined.
results_df['Field Collection Start Date'] = np.where(results_df['Field Collection Start Date']=='2023-11-17 00:00:00', np.datetime64('2023-11-07'), results_df['Field Collection Start Date'])

In [63]:
# Total PCBs are calculated from the EPA1668C results only, so pick out the
# rows reported with that method.
is_epa1668c = results_df['Result Method'].eq('EPA1668C')
tot_pcbs = results_df[is_epa1668c]

In [64]:
# One row per sample / collection date / matrix / units combination, holding
# the sum of 'Result Value' over the rows in that group. The list-valued
# aggregation produces a two-level column header.
total_keys = ['Sample ID', 'Field Collection Start Date', 'Sample Matrix', 'Sample Matrix_clean', 'Result Value Units']
tot_pcbs = (
    tot_pcbs
    .groupby(by=total_keys)
    .agg({'Result Value': ['sum']})
    .reset_index()
)

In [65]:
# Flatten the two-level column header (drop_levels edits the frame in place
# and hands the same frame back), then tag every summed row as a total.
tot_pcbs = drop_levels(tot_pcbs)
tot_pcbs['Result Parameter Name_clean'] = 'Total PCBs'

In [66]:
# Append the total-PCB rows to the individual results.
# ignore_index=True renumbers the combined rows 0..n-1; without it the total
# rows keep their own 0..k labels, which can duplicate labels already present
# in results_df and break later label-based operations.
results_df = pd.concat([results_df, tot_pcbs], ignore_index=True)

In [67]:
# Keep only the columns wanted in the exported results, in this order;
# every other column from the raw data is dropped.
output_columns = [
    'Sample ID',
    'Field Collection Start Date',
    'Sample Matrix_clean',
    'Sample Matrix',
    'Sample Source',
    'Result Parameter Name',
    'Result Parameter Name_clean',
    'Result Value',
    'Result Value Units',
    'Result Reporting Limit',
    'Result Reporting Limit Type',
    'Result Detection Limit',
    'Result Detection Limit Type',
    'Result Data Qualifier',
    'Result Method',
]
results_df = results_df[output_columns]

In [68]:
# Write the cleaned results to the output CSV, leaving out the row index.
results_df.to_csv(path_or_buf=output_results_path, index=False)