In [2]:
import pandas as pd
import os
import glob

# 1. Load Files Here: (extension name needed)
input_folder = 'test/BP_Samples/corrected/neg/'
csv_files = glob.glob(os.path.join(input_folder, '*.csv'))

# 2. Set Parameters Here:
# (An optional S/N >= threshold filter on the raw peaks is currently disabled.)
ppm_error = 0.2
PROTON = 1.00727647


def _normalise_columns(columns):
    """Lower-case the labels, strip outer whitespace, and replace inner whitespace runs with `_`."""
    return columns.str.lower().str.strip().str.replace(r'\s+', '_', regex=True)


# 4. Library Peak List
# The library does not depend on the input file, so it is read and prepared
# once here instead of once per input file inside the loop.
## 4a. Check library file
library_file = pd.read_csv('../PFAS_libraries/PFAS_lib_C2-30.csv')
lib_pks = len(library_file.index)
library_file.columns = _normalise_columns(library_file.columns)

## 4b. Compute the deprotonated ([M-H]-) m/z of every library entry
library_mz_neg_H = library_file.loc[:, 'mass'] - PROTON
list_suspects = library_file.assign(
    mz_neg_H=library_mz_neg_H,
    nominal_mz_neg_H=library_mz_neg_H.round().astype(int),
)
list_suspects = list_suspects.sort_values(by='mass')

# Loop through each file in the folder
for input_file_path in csv_files:
    # Files ending in '_processed.csv' (this also covers '_pre_processed.csv')
    # are outputs of this workflow, not raw data.
    if input_file_path.endswith('_processed.csv'):
        continue

    # 3. Input File Peaks List
    ## 3a. Read the raw data: skip the first 6 lines of the file and drop
    ## every row that contains a missing value.
    input_file = pd.read_csv(input_file_path, skiprows=6).dropna()
    raw_pks = len(input_file.index)

    ## 3b. Convert all of the uppercase column labels to lowercase and replace the spaces with `_`.
    input_file.columns = _normalise_columns(input_file.columns)
    peak_mz = input_file['peak_location']
    nominal_mz = peak_mz.round()
    input_file['nominal_mass'] = nominal_mz.astype(int)

    ## 3c. Calculate experimental KMD values of CF2, CH2, and C2H4O series
    ## KMD = nominal mass - m/z * (nominal repeat-unit mass / exact repeat-unit mass)
    input_file['KMD_CF2'] = (nominal_mz - peak_mz * 50 / 49.99681).round(4)
    input_file['KMD_CH2'] = (nominal_mz - peak_mz * 14 / 14.01565).round(4)
    input_file['KMD_C2H4O'] = (nominal_mz - peak_mz * 44 / 44.02621).round(4)

    # 5. Suspect List Matches
    def search_suspect_list(input_file, list_suspects, ppm_error):
        """Match every experimental peak against the suspect library within a ppm window.

        Args:
            input_file: DataFrame of experimental peaks. Must provide the columns
                'peak_location', 'peak_height', 'scaled_abundance', 'KMD_CF2',
                'KMD_CH2', 'KMD_C2H4O' and 's/n'.
            list_suspects: DataFrame of library entries. Must provide the column
                'mz_neg_H'; all of its columns are carried into the result.
            ppm_error: Half-width of the matching window, in parts per million
                of the experimental m/z (bounds are inclusive).

        Returns:
            DataFrame with one row per (peak, library entry) match: the library
            columns plus 'exper_mz', 'height', 'abund', 'exp_KMD_CF2',
            'exp_KMD_CH2', 'exp_KMD_C2H4O', 'S/N' and 'ppm_error'. Rows keep the
            index labels of `list_suspects`. When nothing matches, an empty
            DataFrame with the same columns is returned so that downstream
            column selection still works.
        """
        added_columns = ['exper_mz', 'height', 'abund', 'exp_KMD_CF2',
                         'exp_KMD_CH2', 'exp_KMD_C2H4O', 'S/N', 'ppm_error']
        library_mz = list_suspects['mz_neg_H']
        per_peak_matches = []

        for _, row in input_file.iterrows():
            exper_mz = row['peak_location']
            tolerance = exper_mz * ppm_error / 1E6
            hits = list_suspects[(library_mz >= exper_mz - tolerance)
                                 & (library_mz <= exper_mz + tolerance)]
            if hits.empty:
                continue

            # assign() returns a new frame, so the library itself is never modified.
            per_peak_matches.append(hits.assign(**{
                'exper_mz': exper_mz,
                'height': row['peak_height'],
                'abund': row['scaled_abundance'],
                'exp_KMD_CF2': row['KMD_CF2'],
                'exp_KMD_CH2': row['KMD_CH2'],
                'exp_KMD_C2H4O': row['KMD_C2H4O'],
                'S/N': row['s/n'],
                'ppm_error': 1E6 * (exper_mz - hits['mz_neg_H']) / hits['mz_neg_H'],
            }))

        if not per_peak_matches:
            # No peak matched: return the expected schema with zero rows.
            no_matches = list_suspects.iloc[0:0].copy()
            for column in added_columns:
                no_matches[column] = pd.Series(dtype='float64')
            return no_matches

        # Concatenate once at the end instead of growing a frame inside the loop.
        return pd.concat(per_peak_matches)

    results = search_suspect_list(input_file, list_suspects, ppm_error)
    
    # 6. Reformat and Print Results
    results.rename(columns={'mass': 'lib_mass'}, inplace=True)
    results.rename(columns={'mz_neg_H': 'library_neg_mz'}, inplace=True)
    results.rename(columns={'c': 'C'}, inplace=True)
    results.rename(columns={'h': 'H'}, inplace=True)
    results.rename(columns={'br': 'Br'}, inplace=True)
    results.rename(columns={'cl': 'Cl'}, inplace=True)
    results.rename(columns={'f': 'F'}, inplace=True)
    results.rename(columns={'i': 'I'}, inplace=True)
    results.rename(columns={'n': 'N'}, inplace=True)
    results.rename(columns={'o': 'O'}, inplace=True)
    results.rename(columns={'p': 'P'}, inplace=True)
    results.rename(columns={'s': 'S'}, inplace=True)
    results.rename(columns={'dbe': 'DBE'}, inplace=True)
    results.columns
    results = results.loc[:, ['exper_mz', 'height', 'abund', 'lib_mass', 'ppm_error', 'S/N', 'formula', 
                            'C', 'H', 'Br', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'DBE']]
    results = results.reset_index()
    
    # Remove those don't have 'H' (which means they can't be deprotonated)
    # results.drop_duplicates(subset=['formula'],inplace=True)
    results = results.loc[results['formula'].str.contains('H')]
    
    #Count the number of unique peaks have assigned
    num_unique = results.loc[:, 'exper_mz'].nunique()

    # Save the temporary results so far:
    temp_file_name = os.path.splitext(os.path.basename(input_file_path))[0] + '_pre_processed.csv'
    temp_file = os.path.join(input_folder, temp_file_name)
    if os.path.exists(temp_file):
        continue
    results.to_csv(temp_file, index=False)