# In [1]:
import pandas as pd
import os
import glob

# 1. Load Files Here: (extension name needed)
# Folder holding the assignment tables to process. The matching raw peak
# lists are read from its 'raw' sub-folder, and the outputs are written back
# into this same folder.
input_folder = 'test/Na_K_NH4_adducts/'
raw_folder = os.path.join(input_folder, 'raw', '')
csv_files = glob.glob(os.path.join(input_folder, '*.csv'))

# Loop through each file in the folder
for input_file_path in csv_files:
    # Output File: the input stem with every '_pre' removed, in the input folder.
    output_file_name_stem = os.path.splitext(os.path.basename(input_file_path))[0]
    output_file_name = f"{output_file_name_stem.replace('_pre', '')}.csv"
    output_file = os.path.join(input_folder, output_file_name)
    # Decide whether to skip BEFORE parsing anything. This also skips files
    # whose name contains no '_pre' (their output path is the file itself),
    # e.g. outputs written by an earlier run that the glob picks up again.
    if os.path.exists(output_file):
        continue

    # Raw Data: assignment table for this file; rows with any missing value
    # are discarded.
    results = pd.read_csv(input_file_path).dropna()

    # Raw peak list: same stem without '_pre_processed', read from the raw
    # folder. The first 6 lines of that file are skipped before the header.
    raw_peak_name = f"{output_file_name_stem.replace('_pre_processed', '')}.csv"
    raw_peak_file = os.path.join(raw_folder, raw_peak_name)
    raw_peak_list = pd.read_csv(raw_peak_file, skiprows=6).dropna()
    
    # 2. Set Parameters Here:
    S_to_N_threshold = 3   # S/N cut-off; defined but not applied in this section
    ppm_error = 0.2        # m/z tolerance, in ppm
    PROTON = 1.00727647

    # 3. Input File Peaks List
    # Row count of the raw peak list.
    raw_pks = raw_peak_list.shape[0]

    # 4. Check the isotopologue
    ## 4a. Define isotopes and calculate iso mass
    # Masses of the lighter isotope of each element.
    C_mass = 12.000000
    Br_mass = 78.918336
    Cl_mass = 34.968853
    O_mass = 15.994915
    S_mass = 31.972072
    # Heavier isotopes: mass, and an abundance written as a ratio of two
    # fractions (presumably heavy-isotope over light-isotope natural abundance).
    C_13_mass, C_13_abund = 13.003355, 0.0107/0.9893
    Br_81_mass, Br_81_abund = 80.916290, 0.4931/0.5069
    Cl_37_mass, Cl_37_abund = 36.965903, 0.2423/0.7577
    O_18_mass, O_18_abund = 17.999159, 0.0020/0.99757
    S_34_mass, S_34_abund = 33.967868, 0.0421/0.9493

    # One entry per isotope checked:
    # (element-count column, isotope label, mass shift, abundance ratio)
    isotope_table = (
        ('C', '13C', C_13_mass - C_mass, C_13_abund),
        ('Br', '81Br', Br_81_mass - Br_mass, Br_81_abund),
        ('Cl', '37Cl', Cl_37_mass - Cl_mass, Cl_37_abund),
        ('S', '34S', S_34_mass - S_mass, S_34_abund),
        ('O', '18O', O_18_mass - O_mass, O_18_abund),
    )

    # Predicted isotopologue m/z: the measured m/z plus the isotope mass shift.
    # Only rows containing the element are filled; the others stay NaN.
    for elem_col, iso_label, delta_mass, heavy_ratio in isotope_table:
        present = results[elem_col] > 0
        results.loc[present, iso_label + '_mass'] = results['exper_mz'] + delta_mass

    # Predicted isotopologue abundance: parent abundance times the number of
    # atoms of the element times the abundance ratio. Only the single-substitution
    # peak is modelled (no doubly substituted term such as 37Cl2).
    for elem_col, iso_label, delta_mass, heavy_ratio in isotope_table:
        present = results[elem_col] > 0
        results.loc[present, iso_label + '_abund'] = results['abund'] * results[elem_col] * heavy_ratio
    ## 4b. Search for isotope peaks
    def search_isotope(results, column_name, intensity_error, peak_list=None, ppm_tolerance=None):
        """Look up the predicted isotopologue peak of every assignment in a peak list.

        For each row of ``results`` the predicted isotopologue m/z
        (``<column_name>_mass``) and abundance (``<column_name>_abund``) are
        compared against ``peak_list``. A peak matches when its
        'Peak Location' lies inside the ppm window (bounds inclusive) and its
        'Scaled Abundance' lies strictly inside the relative intensity window.
        When several peaks match, the first one in peak-list order is used.

        Four columns are (re)created on ``results`` in place, holding '' for
        rows without a match:
        ``<column_name>_iso?`` (matched m/z), ``<column_name>_abund?``
        (matched abundance), ``<column_name>_abund_err?`` (relative abundance
        error, rounded to 4 decimals) and ``<column_name>_match?`` ('Y').

        Parameters:
            results: DataFrame with ``<column_name>_mass`` and
                ``<column_name>_abund`` columns; modified in place.
            column_name: isotope label used as the column prefix, e.g. '13C'.
            intensity_error: allowed relative deviation of the abundance
                (0.15 means +/-15 %).
            peak_list: DataFrame with 'Peak Location' and 'Scaled Abundance'
                columns. Defaults to ``raw_peak_list`` from the enclosing scope.
            ppm_tolerance: m/z tolerance in ppm. Defaults to ``ppm_error``
                from the enclosing scope.

        Returns:
            The same ``results`` object that was passed in.
        """
        # Fall back to the surrounding script's variables so existing
        # three-argument calls keep working unchanged.
        if peak_list is None:
            peak_list = raw_peak_list
        if ppm_tolerance is None:
            ppm_tolerance = ppm_error

        mass_col = column_name + '_mass'
        abund_col = column_name + '_abund'
        iso_col = column_name + '_iso?'
        found_abund_col = column_name + '_abund?'
        abund_err_col = column_name + '_abund_err?'
        match_col = column_name + '_match?'

        # Create the result columns explicitly as object dtype so numeric
        # matches can be stored next to the '' placeholder even when pandas
        # infers a dedicated string dtype for text-only columns.
        for out_col in (iso_col, found_abund_col, abund_err_col, match_col):
            results[out_col] = pd.Series('', index=results.index, dtype=object)

        peak_mz = peak_list['Peak Location']
        peak_abund = peak_list['Scaled Abundance']

        for i, row in results.iterrows():
            expected_mz = row[mass_col]
            expected_abund = row[abund_col]
            mz_window = expected_mz * ppm_tolerance / 1E6
            # Rows without a prediction (NaN) compare False everywhere and
            # therefore never match.
            match = ((peak_mz >= expected_mz - mz_window) &
                     (peak_mz <= expected_mz + mz_window) &
                     (peak_abund > expected_abund * (1 - intensity_error)) &
                     (peak_abund < expected_abund * (1 + intensity_error)))
            if not match.any():
                continue

            # First matching peak; .values keeps these as NumPy scalars, which
            # the .round() call below relies on.
            matched_peak_location = peak_mz[match].values[0]
            matched_scaled_abundance = peak_abund[match].values[0]
            abundance_error = ((matched_scaled_abundance - expected_abund) / expected_abund).round(4)

            results.at[i, iso_col] = matched_peak_location
            results.at[i, found_abund_col] = matched_scaled_abundance
            results.at[i, abund_err_col] = abundance_error
            results.at[i, match_col] = 'Y'
        return results

    # Search the raw peak list for each isotopologue. 18O gets a wider
    # intensity tolerance (60 %) than the others (15 %).
    for iso_label, tolerance in (('13C', 0.15), ('81Br', 0.15), ('37Cl', 0.15),
                                 ('18O', 0.60), ('34S', 0.15)):
        results = search_isotope(results, iso_label, tolerance)

    ## 4c. Eliminate those assignments that contain Br/Cl/S/C but the isotopologue does not match. *Keep all if abundance of isotopologue < minimum
    # A row survives each step when it lacks the element, OR its isotopologue
    # was matched, OR its abundance is below min(abund) / (atom count * ratio).
    # The minimum is taken over the rows still present at that step, so the
    # steps must run in this order.
    for elem_col, iso_label, heavy_ratio in (('Br', '81Br', Br_81_abund),
                                             ('Cl', '37Cl', Cl_37_abund),
                                             ('O', '18O', O_18_abund),
                                             ('S', '34S', S_34_abund)):
        atom_count = results[elem_col]
        lacks_element = atom_count < 1
        iso_confirmed = results[iso_label + '_match?'] == 'Y'
        iso_too_weak = (atom_count >= 1) & (results['abund'] < results['abund'].min() / (atom_count * heavy_ratio))
        results = results[lacks_element | iso_confirmed | iso_too_weak]

    # Carbon has no "lacks the element" clause: a row needs a 13C match or an
    # abundance below the threshold.
    carbon_confirmed = results['13C_match?'] == 'Y'
    carbon_too_weak = results['abund'] < results['abund'].min() / (results['C'] * C_13_abund)
    results = results[carbon_confirmed | carbon_too_weak]

    # 1-based number of the distinct exper_mz value each surviving row belongs
    # to, and the total count of surviving assignments.
    peak_number = results.groupby('exper_mz').ngroup() + 1
    assgnmnts_number = len(results.index)
       
    # 5. Only keep the entries that at least 3 of them have the same KMD value
    # Work on an independent copy: `results` may be the product of boolean
    # filtering, and adding columns to such a frame can trigger pandas'
    # SettingWithCopyWarning.
    results = results.copy()
    nominal_mass = results['lib_mass'].round()

    # For each repeat unit (CF2, CH2, C2H4O): KMD = nominal mass minus the
    # mass rescaled by nominal/exact unit mass, rounded to 4 decimals, and
    # z* = (nominal mass mod nominal unit mass) - nominal unit mass.
    for unit, unit_nominal, unit_exact in (('CF2', 50, 49.99681),
                                           ('CH2', 14, 14.01565),
                                           ('C2H4O', 44, 44.02621)):
        results['KMD_' + unit] = (nominal_mass - results['lib_mass'] * unit_nominal / unit_exact).round(4)
        results['z*_' + unit] = (nominal_mass % unit_nominal) - unit_nominal

    # Keep groups of at least 3 rows sharing KMD, z* and the listed element
    # counts (H for the CF2 series, F for the CH2 and C2H4O series).
    results_CF2_filtered = results.groupby(['KMD_CF2', 'z*_CF2', 'H', 'Cl', 'Br', 'P', 'S']).filter(lambda grp: len(grp) >= 3)
    results_CH2_filtered = results.groupby(['KMD_CH2', 'z*_CH2', 'F', 'Cl', 'Br', 'P', 'S']).filter(lambda grp: len(grp) >= 3)
    results_C2H4O_filtered = results.groupby(['KMD_C2H4O', 'z*_C2H4O', 'F', 'Cl', 'Br', 'P', 'S']).filter(lambda grp: len(grp) >= 3)

    # A row that belongs to more than one series appears once per series in
    # the concatenation, so de-duplicate before building the output table.
    results_filtered = pd.concat([results_CF2_filtered, results_CH2_filtered, results_C2H4O_filtered])
    results_unique = results_filtered.drop_duplicates()

    # Build the output from the de-duplicated rows so each assignment is
    # written once and the table agrees with the counts taken from
    # `results_unique`.
    final = results_unique.loc[:, ['exper_mz', 'height', 'abund', 'lib_mass', 'ppm_error', 'S/N',
                                   'KMD_CF2', 'z*_CF2', 'KMD_CH2', 'z*_CH2', 'KMD_C2H4O', 'z*_C2H4O',
                                   'formula', 'C', 'H', 'Br', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'DBE',
                                   '13C_match?', '13C_iso?', '13C_abund?', '13C_abund_err?',
                                   '81Br_match?', '81Br_iso?', '81Br_abund?', '81Br_abund_err?',
                                   '37Cl_match?', '37Cl_iso?', '37Cl_abund?', '37Cl_abund_err?',
                                   '18O_match?', '18O_iso?', '18O_abund?', '18O_abund_err?',
                                   '34S_match?', '34S_iso?', '34S_abund?', '34S_abund_err?']]

    # reset_index() keeps the previous row labels as an 'index' column and
    # renumbers the rows from 0.
    final = final.reset_index()
    
    # 6. Add counter columns for some stat numbers
    # The statistics are stored in the first row (label 0) of `final`; if
    # `final` is empty, the first assignment creates that row.
    peak_number_after_KMD = results_unique.groupby('exper_mz').ngroup() + 1

    final.loc[0, '#raw_pks'] = int(raw_pks)

    # The largest 1-based group number equals the number of distinct
    # exper_mz values. An empty numbering has no maximum, so test for
    # emptiness explicitly instead of catching the resulting ValueError.
    if len(peak_number):
        final.loc[0, '#matched_pks'] = int(peak_number.max())
        final.loc[0, '#assgnmnts'] = int(assgnmnts_number)
    else:
        print('Warning: No matches found. Setting value to 0.')
        final.loc[0, '#matched_pks'] = 0
        final.loc[0, '#assgnmnts'] = 0

    if len(peak_number_after_KMD):
        final.loc[0, '#KMD>3_pks'] = int(peak_number_after_KMD.max())
        final.loc[0, '#KMD>3_asmnts'] = int(len(results_unique.index))
    else:
        print('Warning: No matches found. Setting value to 0.')
        final.loc[0, '#KMD>3_pks'] = 0
        final.loc[0, '#KMD>3_asmnts'] = 0

    final.to_csv(output_file, index=False)