In [319]:
import pandas as pd
import geopandas as gp
import numpy as np
import os
import itertools
import glob
from datetime import date

In [320]:
## This script takes the path to a folder of F&B results spreadsheets and the name of the sampling outing,
## then writes a combined, cleaned results CSV and a per-file QA/QC summary CSV.
## Note: All results must be in .xlsx format in order to be read by the script.

In [321]:
def drop_levels(df):
    """Flatten multi-level column labels on ``df``, modifying it in place.

    The current index is first moved into the columns with ``reset_index``,
    then level 1 of the column labels is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels have more than one level (for example the
        result of ``groupby(...).agg({col: [func]})``). It is mutated.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, for convenience.
    """
    # Move the index into regular columns; done in place so callers that
    # ignore the return value still see the change.
    df.reset_index(inplace=True)
    flattened_labels = df.columns.droplevel(1)
    df.columns = flattened_labels
    return df

def clean_pcb(val):
    """Normalize a PCB parameter name to the token used for joining.

    Rules, applied in order:

    * Non-string values (e.g. NaN / None) are returned unchanged.
    * Names containing ``'aroclor'`` are returned unchanged.
    * Names containing ``'PCB'`` have every ``"/"`` treated as a space, are
      split on single spaces, and the second token is returned
      (``"PCB 101"`` -> ``"101"``, ``"PCB 4/PCB 10"`` -> ``"4"``).
      If there is no second token the name is returned unchanged.
    * Anything else is returned unchanged.

    Parameters
    ----------
    val : object
        A parameter name, normally a ``str``.

    Returns
    -------
    object
        The cleaned name, or ``val`` itself when no rule applies.
    """
    # Missing values reach this function through Series.apply; ``in`` on a
    # float would raise TypeError, so pass them through untouched.
    if not isinstance(val, str):
        return val

    # NOTE(review): this match is case-sensitive, so "Aroclor 1254" does not
    # take this branch (it is still returned unchanged because it lacks "PCB").
    if 'aroclor' in val:
        return val

    if 'PCB' in val:
        tokens = val.replace("/", " ").split(" ")
        # Guard against names such as "PCB" that have nothing after the prefix.
        if len(tokens) > 1:
            return tokens[1]

    return val

In [322]:
## initialize the outing name and input locations
# Label for this sampling outing; it is interpolated into the output file names.
sample_outing_name = "feb24_add"
# Folder that is searched for the F&B result spreadsheets.
folder_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/2023 Screening Results/2024.2 add"
# NOTE(review): this path spells the directory "Sampling Results" (space) while
# folder_path uses "Sampling_Results" (underscore) — confirm which is correct.
# The variable is not referenced by any other visible cell.
qaqc_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling Results/qaqc"

In [323]:
# initialize file paths for the lookup tables to read in
# Screening-levels workbook.
sl_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling Results/Lookup Tables/Master_Screening_Levels.xlsx"
# PCB / aroclor lookup table.
pcb_arc_lookup_path = "/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling Results/Lookup Tables/PCB_aroclor_lookup.csv"

In [324]:
# output locations, both named after the sampling outing:
#   output_results_path -> CSV of the combined, cleaned results
#   qaqc_results_path   -> CSV of the per-file QA/QC summary
# NOTE(review): these paths spell the directory "Sampling_Results" (underscore)
# while the lookup-table paths use "Sampling Results" (space) — confirm which is correct.
output_results_path = f"/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/{sample_outing_name}_results.csv"
qaqc_results_path = f"/home/nweiss/gdrive/Year 2/Summer - Duwamish/Sampling_Results/qaqc/{sample_outing_name}_qaqc.csv"

PROCESS RESULTS SPREADSHEETS

In [325]:
file_extension = '*.xlsx'

# List all files with the specified extension in the folder.
# glob returns matches in arbitrary order, so sort for a reproducible row order.
files = sorted(glob.glob(os.path.join(folder_path, file_extension)))
if not files:
    # Fail with a clear message instead of the opaque "No objects to
    # concatenate" error that pd.concat would raise on an empty list.
    raise FileNotFoundError(f"No {file_extension} files found in {folder_path}")

results_df = []   # one DataFrame per spreadsheet; concatenated below
file_path = []    # QA/QC: path of each spreadsheet that was summarized
records = []      # QA/QC: number of rows in that spreadsheet
sample_ids = []   # QA/QC: unique sample IDs in that spreadsheet
methods = []      # QA/QC: unique result methods in that spreadsheet

# Iterate through each file and read its content
for file in files:
    df = pd.read_excel(file)
    # Normalize headers so "Sample_ID" and "Sample ID" are treated alike.
    df.columns = df.columns.str.replace("_", " ")
    results_df.append(df)

    try:
        # Look up both QA/QC columns BEFORE appending anything, so that a
        # spreadsheet missing one of them cannot leave the four QA/QC lists
        # with different lengths (which would break building the QA/QC frame).
        file_sample_ids = list(df['Sample ID'].unique())
        file_methods = list(df['Result Method'].unique())
    except (KeyError, AttributeError):
        # KeyError: column missing; AttributeError: duplicated column name
        # returned a DataFrame. Report the file and leave it out of the QA/QC summary.
        print(file)
        continue

    # write out file paths, length of data frame, and sample IDs for qa / qc
    file_path.append(file)
    records.append(len(df))
    sample_ids.append(file_sample_ids)
    methods.append(file_methods)

results_df = pd.concat(results_df)

In [326]:
# Assemble the per-file QA/QC summary (one row per spreadsheet) and save it.
qaqc = dict(
    file_path=file_path,
    records=records,
    sample_ids=sample_ids,
    method=methods,
)
qaqc_df = pd.DataFrame(qaqc)
qaqc_df.to_csv(qaqc_results_path, index=False)

In [327]:
# remove any rows that were not field data (rows with no field collection start date)
# .copy() makes the filtered frame independent, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
results_df = results_df[results_df['Field Collection Start Date'].notna()].copy()

In [328]:
# create new column of sample type based on sample matrix and sample source columns
# Rules are applied in order, later ones overriding earlier ones:
#   1. 'Aqueous' matrix            -> 'Water'
#   2. matrix containing 'Solid'   -> 'Soil'
#   3. 'Groundwater' sample source -> 'Water'
#   otherwise the original 'Sample Matrix' value is kept.
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Matrix'] == 'Aqueous', 'Water', results_df['Sample Matrix'])
# na=False: without it, a missing 'Sample Matrix' makes str.contains return NaN,
# which np.where treats as truthy and would mislabel the row as 'Soil'.
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Matrix'].str.contains('Solid', na=False), 'Soil', results_df['Sample Matrix_clean'])
results_df['Sample Matrix_clean'] = np.where(results_df['Sample Source'] == 'Groundwater', 'Water', results_df['Sample Matrix_clean'])

In [329]:
# clean up pcb values in order to make the join correctly with F&B results
# (uses the clean_pcb helper defined near the top of the notebook; the previous
# call referenced an undefined name, clean_fun, and failed on a fresh kernel)
results_df['Result Parameter Name_clean'] = results_df['Result Parameter Name'].apply(clean_pcb)

In [330]:
# relabel 'Lube Oil' as 'Diesel Range Organics' in the cleaned parameter name
is_lube_oil = results_df['Result Parameter Name_clean'] == 'Lube Oil'
results_df['Result Parameter Name_clean'] = np.where(
    is_lube_oil,
    'Diesel Range Organics',
    results_df['Result Parameter Name_clean'],
)

In [331]:
# strip leading/trailing whitespace from the sample IDs
results_df['Sample ID'] = results_df['Sample ID'].str.strip()

# known sample ID corrections, applied one after another:
#   DPS1         -> DPS-1
#   SPB-0159-S-1 -> SPB-O159-S-1   (digit zero replaced by the letter O)
sample_id_fixes = {
    'DPS1': 'DPS-1',
    'SPB-0159-S-1': 'SPB-O159-S-1',
}
for wrong_id, right_id in sample_id_fixes.items():
    results_df['Sample ID'] = np.where(results_df['Sample ID'] == wrong_id, right_id, results_df['Sample ID'])

# collection date typo: every row dated 11/17 is moved to 11/07
is_typo_date = results_df['Field Collection Start Date'] == '2023-11-17 00:00:00'
results_df['Field Collection Start Date'] = np.where(
    is_typo_date,
    np.datetime64('2023-11-07'),
    results_df['Field Collection Start Date'],
)

In [332]:
# total PCBs for EPA1668C: sum 'Result Value' over all EPA1668C rows that share
# the same sample, collection date, matrix and units
total_group_keys = [
    'Sample ID',
    'Field Collection Start Date',
    'Sample Matrix',
    'Sample Matrix_clean',
    'Result Value Units',
]
epa1668_rows = results_df[results_df['Result Method'] == 'EPA1668C']
# The list form ['sum'] yields two-level column labels; they are flattened in
# the next cell by drop_levels.
tot_pcbs = (
    epa1668_rows
    .groupby(by=total_group_keys)
    .agg({'Result Value': ['sum']})
    .reset_index()
)

In [333]:
# Flatten the two-level column labels left by the aggregation. drop_levels
# mutates the frame in place and returns that same frame.
tot_pcbs = drop_levels(tot_pcbs)
# Tag every summed row with the parameter name it represents.
tot_pcbs['Result Parameter Name_clean'] = 'Total PCBs'

In [334]:
# Stack the 'Total PCBs' rows underneath the individual results.
frames_to_stack = [results_df, tot_pcbs]
results_df = pd.concat(frames_to_stack)

In [335]:
# keep only the columns wanted in the exported results, in this order
output_columns = [
    'Sample ID',
    'Field Collection Start Date',
    'Sample Matrix_clean',
    'Sample Matrix',
    'Sample Source',
    'Result Parameter Name',
    'Result Parameter Name_clean',
    'Result Value',
    'Result Value Units',
    'Result Reporting Limit',
    'Result Reporting Limit Type',
    'Result Detection Limit',
    'Result Detection Limit Type',
    'Result Data Qualifier',
    'Result Method',
]
results_df = results_df[output_columns]

In [336]:
# Write the cleaned results to disk; the row index is not exported.
results_df.to_csv(path_or_buf=output_results_path, index=False)