In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

In [2]:
# Folder scanned for input workbooks and folder that receives the CSV/PNG
# outputs. Only files whose name ends in ".xlsx" are picked up.
input_folder = 'test/'
output_folder = 'test/'
# Sorted so the processing order is reproducible. Names starting with "~$" are
# the lock files Excel leaves next to an open workbook; they match "*.xlsx"
# but cannot be parsed by read_excel, so they are skipped.
xlsx_files = sorted(
    path for path in glob.glob(os.path.join(input_folder, '*.xlsx'))
    if not os.path.basename(path).startswith('~$')
)

# Element-count columns inspected when classifying a formula for colouring.
HETEROATOM_COLUMNS = ('Cl', 'Br', 'N', 'P')
# Fixed legend order; only the classes present in a given file are kept.
HETEROATOM_ORDER = ('regular', 'N', 'P', 'Cl', 'Br', '>1 type')
# Column holding log10 of the normalised abundance; drives the marker size.
SIZE_COLUMN = 'Abund. ($log_{10}$)'
# Isotope-match flags copied through unchanged from the input workbook.
ISOTOPE_COLUMNS = ('13C_match?', '18O_match?', '34S_match?', '37Cl_match?', '81Br_match?')

## Color scheme (one colour per heteroatom class) and marker per category
palette_dict = {'regular':'black', 'Cl':'#669900', 'Br':'#C00000', 'N':'#4472C4', 'P':'#D60093', '>1 type':'#969696'}
marker_dict = {'known': 'o', 'unknown': 'X'}


def heteroatom_type(row):
    """Classify one row by which of Cl, Br, N and P it contains.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with 'Cl', 'Br', 'N' and 'P' counts.

    Returns
    -------
    str
        'regular' when all four counts equal 0; the element symbol when
        exactly one count is > 0 and the other three equal 0; '>1 type'
        in every other case (several heteroatoms, or values that are
        neither 0 nor positive, such as NaN).
    """
    present = [element for element in HETEROATOM_COLUMNS if row[element] > 0]
    absent = [element for element in HETEROATOM_COLUMNS if row[element] == 0]
    if len(absent) == len(HETEROATOM_COLUMNS):
        return 'regular'
    if len(present) == 1 and len(absent) == len(HETEROATOM_COLUMNS) - 1:
        return present[0]
    return '>1 type'


def build_input_data(df):
    """Build the plotting table from one raw workbook sheet.

    Copies the identification and element-count columns of ``df`` under the
    names used by the plots and adds the derived columns (exp_KMD, O/C, F/C,
    abund_norm, NOSC, mass_defect and log10 abundance). The column order is
    the order written to the checkpoint CSV.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the expected input columns.
    """
    carbon = df['C']
    abund_norm = df['abund'] / df['abund'].max()
    columns = {
        'm/z': df['exper_mz'],
        'mass': df['lib_mass'],
        'Category': df['category'],
        'formula': df['formula'],
        'C': carbon,
        'H': df['H'],
        'F': df['F'],
        'Cl': df['Cl'],
        'Br': df['Br'],
        'N': df['N'],
        'P': df['P'],
        'S': df['S'],
        'KMD': df['KMD_CF2'],
        'z*': df['z*_CF2'],
        # Mass defect recomputed from the experimental m/z with the
        # 50/49.99681 scaling (presumably the CF2 Kendrick base used for
        # the KMD_CF2 column -- confirm).
        'exp_KMD': (df['exper_mz'].round() - df['exper_mz']*50/49.99681).round(4),
        'O/C': df['O'] / carbon,
        'F/C': df['F'] / carbon,
        'abund': df['abund'],
        'abund_norm': abund_norm,
        # NOTE(review): Cl enters this expression but Br does not -- confirm
        # that omitting Br is intended.
        'NOSC': 4 - ((4*carbon - df['F'] + df['H'] - 2*df['O'] + 4*df['S']
                      - 3*df['N'] - df['Cl'] + 5*df['P']) / carbon),
        'mass_defect': df['lib_mass'] - df['lib_mass'].round(),
        SIZE_COLUMN: np.log10(abund_norm),
    }
    for isotope_column in ISOTOPE_COLUMNS:
        columns[isotope_column] = df[isotope_column]
    return pd.DataFrame(columns)


def select_plot_rows(input_data):
    """Return the rows that are plotted.

    Keeps every category 'A' row, plus the category 'B'/'C' rows whose
    (KMD, z*, H, Cl, P, S) combination occurs at least three times among
    the B/C rows.
    """
    cat_a = input_data[input_data['Category'] == 'A']
    cat_bc = input_data[input_data['Category'].isin(['B', 'C'])]
    cat_bc = cat_bc.groupby(['KMD', 'z*', 'H', 'Cl', 'P', 'S']).filter(lambda group: len(group) >= 3)
    return pd.concat([cat_a, cat_bc])


def present_hue_order(heteroatom_labels):
    """Return the entries of HETEROATOM_ORDER that occur in ``heteroatom_labels``."""
    seen = set(heteroatom_labels)
    return [label for label in HETEROATOM_ORDER if label in seen]


def output_path_for(input_path, folder, suffix):
    """Derive an output path: input base name without '_series', plus ``suffix``."""
    stem = os.path.splitext(os.path.basename(input_path))[0]
    return os.path.join(folder, stem.replace('_series', '') + suffix)


def save_vk_scatter(data, x, y, hue_order, xlabel, ylabel, xlim, ylim,
                    minor_step, output_path, aspect=1, xticks=None):
    """Draw one scatter plot of ``data`` and save it to ``output_path``.

    Parameters
    ----------
    data : DataFrame with the ``x``/``y`` columns plus 'Heteroatoms' (colour),
        'Category' (marker, values 'known'/'unknown') and SIZE_COLUMN (size).
    x, y : column names plotted on the two axes.
    hue_order : heteroatom classes, in legend order.
    xlabel, ylabel : axis labels.
    xlim, ylim : [low, high] axis limits.
    minor_step : (x_step, y_step) spacing of the minor ticks.
    output_path : destination image file.
    aspect : width/height ratio passed to seaborn.
    xticks : optional explicit major tick positions for the x axis.
    """
    sns.set_theme(style='white')
    g = sns.relplot(data=data, x=x, y=y,
                    size=SIZE_COLUMN, hue='Heteroatoms', hue_order=hue_order,
                    palette=palette_dict, markers=marker_dict, style='Category',
                    sizes=(3, 30), facet_kws=dict(despine=False),
                    height=5, aspect=aspect)
    ax = g.ax

    g.set(xscale='linear', yscale='linear')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    if xticks is not None:
        ax.set_xticks(xticks)
    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)

    ## Set x and y gridline locations
    x_step, y_step = minor_step
    x_start, x_end = ax.get_xlim()
    y_start, y_end = ax.get_ylim()
    ax.xaxis.set_ticks(np.arange(x_start, x_end, x_step), minor=True)
    ax.yaxis.set_ticks(np.arange(y_start, y_end, y_step), minor=True)
    ax.grid(True, which='major', linewidth=.2, color='silver')

    # Replace the figure-level legend (outside the axes) by one inside the axes.
    if g._legend is not None:
        g._legend.remove()
    # NOTE(review): this picks up whatever labelled artists are on the axes;
    # verify the legend content against the installed seaborn version.
    ax.legend(prop={'size': 7}, loc='lower right')

    # Save and close this specific figure so figures do not pile up in memory.
    ax.figure.savefig(output_path, dpi=900, bbox_inches='tight')
    plt.close(ax.figure)


if xlsx_files:
    os.makedirs(output_folder, exist_ok=True)

# Loop through each file in the folder
for input_file_path in xlsx_files:
    # Input File:
    df = pd.read_excel(input_file_path)

    # 0. Prepare the dataframe:
    input_data = build_input_data(df)
    Cat_ABC_data = select_plot_rows(input_data)
    if Cat_ABC_data.empty:
        # Nothing to classify or plot; skip instead of aborting the batch.
        print(f'Skipping {input_file_path}: no category A rows and no B/C series with at least 3 members.')
        continue

    ## Heteroatoms
    Cat_ABC_data['Heteroatoms'] = Cat_ABC_data.apply(heteroatom_type, axis=1)
    hue_order_list = present_hue_order(Cat_ABC_data['Heteroatoms'])

    # Checkpoint: save the selected rows (with the original A/B/C categories)
    # so the plotted data can be inspected.
    Cat_ABC_data.to_csv(output_path_for(input_file_path, output_folder, '_vkplots.csv'), index=False)

    # Plots distinguish only known (A) from unknown (B, C) formulas.
    Cat_ABC_data['Category'] = Cat_ABC_data['Category'].replace({'A': 'known', 'B': 'unknown', 'C': 'unknown'})

    # 1. van Krevelen plot for O/C vs. F/C:
    save_vk_scatter(Cat_ABC_data, x='O/C', y='F/C', hue_order=hue_order_list,
                    xlabel='O/C', ylabel='F/C',
                    xlim=[-0.1, 1.5], ylim=[-0.1, 3.0], minor_step=(0.01, 0.01),
                    output_path=output_path_for(input_file_path, output_folder, '_OCFC.png'),
                    aspect=1)

    # 2. van Krevelen plot for Mass vs. NOSC:
    save_vk_scatter(Cat_ABC_data, x='mass', y='NOSC', hue_order=hue_order_list,
                    xlabel='Mass (Da)', ylabel='NOSC',
                    xlim=[150, 1150], ylim=[-2.5, 4], minor_step=(5, 0.05),
                    output_path=output_path_for(input_file_path, output_folder, '_NOSC.png'),
                    aspect=1.02, xticks=list(range(200, 1200, 100)))