In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import glob
import os

# Load Files Here: (extension name needed)
input_folder = 'test/'
output_folder = 'test/'
xlsx_files = glob.glob(os.path.join(input_folder, '*.xlsx'))

# Loop through each file in the folder:
for input_file_path in xlsx_files:    
    df = pd.read_excel(input_file_path, sheet_name='CF2_series')
    df = df[~df['category'].isin(['E'])].reset_index(drop=True)
    peak_values = df['exper_mz']
    peak_values.drop_duplicates(inplace=True)
    peak_values.reset_index(drop=True, inplace=True)

    input_file_name = os.path.splitext(os.path.basename(input_file_path))[0]
            
    tolerance = 2E-4 # This value corresponds to 0.2 ppm at m/z 1000
    G = nx.Graph()

    for peak in peak_values:
        G.add_node(peak)

    dummy_nodes = []
    for i in range(len(peak_values)):
        for j in range(i+1, len(peak_values)):
            delta_mz = abs(peak_values.iloc[i] - peak_values.iloc[j])
            if abs(delta_mz - 49.99681) <= tolerance:
                G.add_edge(peak_values.iloc[i], peak_values.iloc[j], weight=49.99681, color='red')
            if abs(delta_mz - 44.02621) <= tolerance:
                G.add_edge(peak_values.iloc[i], peak_values.iloc[j], weight=44.02621, color='green')
            if abs(delta_mz - 14.01565) <= tolerance:
                G.add_edge(peak_values.iloc[i], peak_values.iloc[j], weight=14.01565, color='silver')
            if abs(delta_mz - 99.99362) <= tolerance:
                dummy_node = round((peak_values.iloc[i] + peak_values.iloc[j]) / 2, 5)
                if not any(abs(dummy_node - peak) <= tolerance for peak in peak_values):
                    G.add_node(dummy_node, is_dummy=True)
                    dummy_nodes.append(dummy_node)
                    G.add_edge(peak_values.iloc[i], dummy_node, weight=49.99681, color='red', style='dotted')
                    G.add_edge(dummy_node, peak_values.iloc[j], weight=49.99681, color='red', style='dotted')
                
    isolated_nodes = [n for n in G.nodes() if G.degree(n) == 0]
    G.remove_nodes_from(isolated_nodes)
    
    plt.figure(figsize=(10, 10))
    print(f'{input_file_path}:')
    print(G)
    print(f'with {len(dummy_nodes)} dummy nodes.')
    print(nx.number_connected_components(G), 'connected components')
    
    # Get connected components and sort them based on their edge count
    connected_components = [G.subgraph(c).copy() for c in nx.connected_components(G)]
    sorted_components = sorted(connected_components, key=lambda x: x.size(), reverse=True)
    
    # _________________________Main_Plot____________________________
    # Create a new figure and axes for the main plot
    fig1, ax1 = plt.subplots()
    
    pos = nx.nx_agraph.graphviz_layout(G, prog='neato')

    for edge in G.edges():
        length = G.edges()[edge]['weight']
        color = G.edges()[edge]['color']
        style = G.edges()[edge].get('style', 'solid')
        nx.draw_networkx_edges(G, pos, edgelist=[edge], width=0.5, alpha=0.8, edge_color=color, style=style, ax=ax1)  

    nx.draw_networkx_labels(G, pos, font_size=2.5, font_color='black', font_family='arial', ax=ax1)

    dummy_nodes_main = [n for n in G.nodes() if G.nodes[n].get('is_dummy')]
    real_nodes_main = [n for n in G.nodes() if not G.nodes[n].get('is_dummy')]
    node_values = {node: node for node in real_nodes_main}
    vmin, vmax = min(node_values.values()), max(node_values.values())
    colors = [plt.cm.Blues((node_values[v] - vmin) / (vmax - vmin)) for v in real_nodes_main]
    nx.draw_networkx_nodes(G, pos, nodelist=real_nodes_main, node_size=5, node_color=colors, alpha=1, ax=ax1)
    nx.draw_networkx_nodes(G, pos, nodelist=dummy_nodes_main, node_shape='', alpha=1, ax=ax1)  

    ax1.axis('off')
    # Save the main plot
    output_file_name = input_file_name.replace('_series', '') + '_network.png'
    output_file_path = os.path.join(output_folder, output_file_name)

    plt.savefig(output_file_path, dpi=600, bbox_inches='tight')
    plt.close()

    # _________________________Sub_Plots____________________________
    # Create a new figure and axes for the most crowded subgraph plot
    # Change this value to control the number of top connected subgraphs to draw and save
    top_n_subgraphs = 6
    counter = 1
    for rank, subgraph in enumerate(sorted_components[:top_n_subgraphs], start=1):
        fig, ax = plt.subplots()

        pos = nx.nx_agraph.graphviz_layout(subgraph, prog='neato')
   
        for edge in subgraph.edges():
            length = subgraph.edges()[edge]['weight']
            color = subgraph.edges()[edge]['color']
            style = subgraph.edges()[edge].get('style', 'solid')
            nx.draw_networkx_edges(subgraph, pos, edgelist=[edge], width=2, alpha=0.8, edge_color=color, style=style, ax=ax)
        
        nx.draw_networkx_labels(subgraph, pos, font_size=7, font_color='black', font_family='arial', ax=ax)

        dummy_nodes = [n for n in subgraph.nodes() if subgraph.nodes[n].get('is_dummy')]
        real_nodes = [n for n in subgraph.nodes() if not subgraph.nodes[n].get('is_dummy')]
        node_values = {node: node for node in real_nodes}
        vmin, vmax = min(node_values.values()), max(node_values.values())
        colors = [plt.cm.Blues((node_values[v] - vmin) / (vmax - vmin)) for v in real_nodes]
        nx.draw_networkx_nodes(subgraph, pos, nodelist=real_nodes, node_size=20, node_color=colors, alpha=1, ax=ax)
        # nx.draw_networkx_nodes(subgraph, pos, nodelist=dummy_nodes, node_size=20, node_color='white', edgecolors='gray', alpha=1, node_shape='o', ax=ax)
        nx.draw_networkx_nodes(subgraph, pos, nodelist=dummy_nodes, alpha=1, node_shape='', ax=ax)
        
        Trademark = f'subnetwork {counter}: {subgraph} (with {len(dummy_nodes)} dummy nodes)'
        ax.text(0.0, 0.0, Trademark, 
                verticalalignment='bottom', 
                horizontalalignment='left', 
                transform=ax.transAxes, 
                color='black', fontsize=5)        

        # _________________________General formula of each subnetwork____________________________ 
        # Save the nodes list of each connected components        
        sub_df_all = pd.DataFrame(columns=df.columns)
        for each_node in real_nodes:
            each_df = df[df['exper_mz'].isin(real_nodes)]
            sub_df_all = pd.concat([sub_df_all, each_df])
        
        sub_df_all.drop_duplicates(subset='formula', inplace=True)
        sub_df_all.drop(columns=['stats','#_pks','#_assgns','#_series'],inplace=True)
    
        sub_df = sub_df_all.copy()
        
        mode_DBE = sub_df['DBE'].mode()[0]
        mode_N = sub_df['N'].mode()[0]
        mode_S = sub_df['S'].mode()[0]
        # We are not screening phosphorus here because there are so many false assignments containing P for level 3 PFAS.
        # Or we will mess up with real mode_P being the one from false assignments.
        
        sub_df = sub_df[(sub_df['DBE'] == mode_DBE) & (sub_df['N'] == mode_N) & (sub_df['S'] == mode_S)]
        
        if (sub_df['category'] == 'D').all():
            sub_df = pd.DataFrame(columns=df.columns)          
               
        # Get the common formula
        try:
            min_mass_row = sub_df[sub_df['exper_mz'] == sub_df['exper_mz'].min()]

            min_mass_formula = min_mass_row['formula'].iloc[0]
            min_mass_C = min_mass_row['C'].iloc[0]
            min_mass_H = min_mass_row['H'].iloc[0]
            min_mass_F = min_mass_row['F'].iloc[0]
            min_mass_O = min_mass_row['O'].iloc[0]

            sub_df['delta_C'] = sub_df['C'] - min_mass_C
            sub_df['delta_H'] = sub_df['H'] - min_mass_H
            sub_df['delta_F'] = sub_df['F'] - min_mass_F
            sub_df['delta_O'] = sub_df['O'] - min_mass_O
            sub_df['delta_CF2'] = sub_df['delta_F']/2
            sub_df['delta_CH2'] = (sub_df['delta_H'] - 4*sub_df['delta_O'])/2
            sub_df['delta_C2H4O'] = sub_df['delta_O']
            
            # This code is used to filter off phosphorus:
            sub_df = sub_df[sub_df['delta_CH2'] % 1 == 0]            
            
            max_count_CF2 = int(sub_df['delta_CF2'].max())
            max_count_CH2 = int(sub_df['delta_CH2'].max())
            max_count_C2H4O = int(sub_df['delta_C2H4O'].max())
            min_count_CF2 = int(sub_df['delta_CF2'].min())
            min_count_CH2 = int(sub_df['delta_CH2'].min())
            min_count_C2H4O = int(sub_df['delta_C2H4O'].min())
                        
            sub_df.reset_index(drop=True, inplace=True)

            subfile_name = input_file_name.replace('_series', '') + f'_subnetwork_{counter}.xlsx'
            subfile_path = os.path.join(output_folder, subfile_name)
            with pd.ExcelWriter(subfile_path, engine='xlsxwriter') as writer:
                sub_df.to_excel(writer, sheet_name='simplified', index=False)
                sub_df_all.to_excel(writer, sheet_name='all_assign', index=False)
        
            # Print out markers
            num_of_lv_up = str(len(sub_df)) + f'/' + str(len(real_nodes)) + f' nodes: '
            if min_count_C2H4O == max_count_C2H4O == 0:                      
                General_formula = num_of_lv_up + str(min_mass_formula) + f'(CF2)' + str(min_count_CF2) + f'~' + str(max_count_CF2) + f'(CH2)' + str(min_count_CH2) + f'~' + str(max_count_CH2) 
            else:
                General_formula = num_of_lv_up + str(min_mass_formula) + f'(CF2)' + str(min_count_CF2) + f'~' + str(max_count_CF2) + f'(CH2)' + str(min_count_CH2) + f'~' + str(max_count_CH2) + f'(C2H4O)' + str(min_count_C2H4O) + f'~' + str(max_count_C2H4O)
            
            ax.text(1.0, 0.0, General_formula, 
                    verticalalignment='bottom', 
                    horizontalalignment='right', 
                    transform=ax.transAxes, 
                    color='black', fontsize=5)
            
        except IndexError as e:
            if 'single positional indexer is out-of-bounds' in str(e):
                print(f'Warning: No general formula extracted from subnetwork {counter}.')
                General_formula = f'(no general formula)'
                ax.text(1.0, 0.0, General_formula, 
                    verticalalignment='bottom', 
                    horizontalalignment='right', 
                    transform=ax.transAxes, 
                    color='black', fontsize=5)        
            else:
                raise e
        
        counter += 1    
        ax.axis('off')
        #___________________________________________________________________________________
        
        # Save the plot
        output_file_name = input_file_name.replace('_series', '') + f'_network_sub{rank}.png'
        output_file_path = os.path.join(output_folder, output_file_name)
        
        plt.savefig(output_file_path, dpi=300, bbox_inches='tight')
        plt.close()
 

test/AFFF_PetersonBucket_3sigma+_recal_rev_series.xlsx:
Graph with 69 nodes and 63 edges
with 9 dummy nodes.
11 connected components


  sub_df_all = pd.concat([sub_df_all, each_df])
  sub_df_all = pd.concat([sub_df_all, each_df])
  sub_df_all = pd.concat([sub_df_all, each_df])
  sub_df_all = pd.concat([sub_df_all, each_df])
  sub_df_all = pd.concat([sub_df_all, each_df])
  sub_df_all = pd.concat([sub_df_all, each_df])


<Figure size 1000x1000 with 0 Axes>