In [2]:
import os
import numpy as np
import pandas as pd
import re
import seaborn as sns
import time

In [21]:
# Functions

def masses_from_metaboscape(csv_file_path, intensity_filter=5000, regex=r'([0-9]+\.[0-9]+) Da'):
    """Read a Metaboscape CSV export and keep only intensities above a threshold.

    The first CSV column is used as the row index (the bucket labels) and every
    remaining column is treated as one sample of numeric intensities.

    Parameters
    ----------
    csv_file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    intensity_filter : float, default 5000
        Intensities less than or equal to this value are replaced by NaN.
    regex : str
        Accepted for backward compatibility only. It is not applied: bucket
        labels are returned unchanged (mass and retention time included).

    Returns
    -------
    pandas.DataFrame
        One row per bucket label that has at least one sample intensity above
        ``intensity_filter``, one column per sample, NaN where a sample's
        intensity was missing or filtered out. Rows are sorted by bucket label.
    """
    raw = pd.read_csv(csv_file_path)
    # Make the first column the index
    intensities = raw.set_index(raw.columns[0])

    # Vectorised threshold: NaN compares False, so missing values stay NaN.
    # This replaces a per-item loop that wrote through integer positions on a
    # label-indexed Series, which newer pandas versions no longer support.
    filtered = intensities.where(intensities > intensity_filter)

    # Drop bucket labels that no sample retains, then sort so the row order is
    # deterministic (lexicographic by label).
    return filtered.dropna(how="all").sort_index()

def find_sodium_adducts(sample_df, RT_window=30):
    """Placeholder for a multi-sample sodium-adduct search (not implemented).

    The body consists of this docstring only, so calling the function does
    nothing and returns ``None``; ``sample_df`` and ``RT_window`` are accepted
    but never read.

    Intended behaviour, as described by the author:

    Take a pandas DataFrame of samples with intensities for each
    'Bucket Label', compute a subtraction matrix of the masses, and return the
    mass pairs that correspond to sodium adducts. The pairs are then filtered
    by a retention time window that defaults to 30 seconds but can be adjusted.
    The result is a list of deduplicated tuples with the mass pairs and their
    retention times.

    Example input (``print(sample_df)``)::

        ['Bucket Label', Sample1, Sample2]
        ['241.17878 Da 120.62 s', 20732.0, 4203.0]
        ['226.11049 Da 126.64 s', 7396.0, 10345.4]
        ['241.17878 Da 120.62 s', 20732.0, 6819.0]
        ['263.15878 Da 121.50 s', 7396.0, 9203.0]

    Example output::

        sodium_adducts = [['241.17878 Da 120.62 s', '263.15878 Da 121.50 s', 21.98999 Da, 0.88 s]]

    NOTE(review): the example figures are illustrative only.
    263.15878 - 241.17878 is 21.98000, not 21.98999, and
    return_sample_sodium_adducts searches for a difference of 22.98976 Da.
    Confirm the intended mass difference before implementing.
    """

def return_sample_sodium_adducts(sample_df, RT_window=30, adduct_mass_difference=22.98976, mass_tolerance=0.001):
    """Find pairs of bucket masses whose difference matches a sodium adduct.

    The mass is parsed from every bucket label of ``sample_df`` (text of the
    form ``'241.17878 Da 120.62 s'``), all pairwise differences are computed
    with ``compute_subtraction_matrix``, and every pair whose difference lies
    within ``adduct_mass_difference +/- mass_tolerance`` is reported.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Single-sample frame. Bucket labels are read from a ``'Bucket label'``
        column when one exists, otherwise from the index. The frame is not
        modified.
    RT_window : float, default 30
        Retention-time window in seconds. Currently NOT applied: no
        retention-time filtering is performed yet.
    adduct_mass_difference : float, default 22.98976
        Target mass difference between the two members of a pair.
    mass_tolerance : float, default 0.001
        Absolute tolerance around ``adduct_mass_difference``. A ppm-based
        tolerance may be more appropriate, since accuracy varies with mass.

    Returns
    -------
    pandas.DataFrame
        One row per matching pair, indexed by ``'<larger mass>_<smaller mass>'``
        with the mass difference in column ``0``. Empty when nothing matches.

    Raises
    ------
    ValueError
        If a bucket label does not contain a ``'<number> Da'`` mass.

    Side effects
    ------------
    Prints the elapsed time and the number of matching pairs.
    """
    start = time.time()

    # Take the labels from the argument itself. Previously the function relied
    # on a notebook-global ``bucket_labels`` and reset the caller's index in
    # place; both made the result depend on cell execution order.
    if "Bucket label" in sample_df.columns:
        bucket_labels = sample_df["Bucket label"].tolist()
    else:
        bucket_labels = sample_df.index.tolist()

    # Create mass list and extract the masses from the bucket labels
    mass_pattern = re.compile(r'([0-9]+\.[0-9]+) Da')
    mass_list = []
    for label in bucket_labels:
        match = mass_pattern.search(str(label))
        if match is None:
            raise ValueError(f"Bucket label {label!r} does not contain a mass of the form '<number> Da'")
        mass_list.append(float(match.group(1)))

    # Create a subtraction matrix (negative differences are NaN, so each
    # unordered pair is considered once)
    subtraction_matrix = compute_subtraction_matrix(mass_list)

    # Define the bounds for the sodium adducts
    low_bound = adduct_mass_difference - mass_tolerance
    high_bound = adduct_mass_difference + mass_tolerance

    # Locate the matrix cells inside the bounds; NaN cells compare False
    condition_mask = np.logical_and(subtraction_matrix >= low_bound, subtraction_matrix <= high_bound)
    row_indices, col_indices = np.where(condition_mask)
    sodium_adducts = {
        f"{mass_list[row_idx]}_{mass_list[col_idx]}": subtraction_matrix[row_idx, col_idx]
        for row_idx, col_idx in zip(row_indices, col_indices)
    }

    sodium_adducts_df = pd.DataFrame.from_dict(sodium_adducts, orient='index')
    stop = time.time()
    print("Time taken: ", stop - start)
    print("Number of matching values: ", len(sodium_adducts_df))
    return sodium_adducts_df

def compute_subtraction_matrix(mass_list):
    """Return the matrix of pairwise differences ``mass_list[i] - mass_list[j]``.

    Parameters
    ----------
    mass_list : sequence of numbers
        Masses to compare; converted to a float array, so integer input works.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, n)``. Entry ``[i, j]`` holds
        ``mass_list[i] - mass_list[j]`` when that difference is >= 0 and NaN
        otherwise. The diagonal is 0.
    """
    # Coerce to float explicitly: an integer matrix cannot store NaN.
    masses = np.asarray(mass_list, dtype=float)
    subtraction_matrix = np.subtract.outer(masses, masses)
    # NaN all negative values. They duplicate the positive difference of the
    # same pair with the operands swapped.
    subtraction_matrix[subtraction_matrix < 0] = np.nan
    return subtraction_matrix

def matrix_mask(mass_list, low_bound, high_bound):
    """Locate pairwise mass differences that fall inside a closed interval.

    Builds the difference matrix with ``compute_subtraction_matrix`` and
    returns the row and column index arrays of every cell whose value lies in
    ``[low_bound, high_bound]``. NaN cells never match.
    """
    differences = compute_subtraction_matrix(mass_list)
    within_bounds = (differences >= low_bound) & (differences <= high_bound)
    rows, cols = np.nonzero(within_bounds)
    return rows, cols

In [5]:
# Load the Metaboscape export with the default intensity filter.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a configurable location.
# Despite its name, `sample_dict` holds the DataFrame returned by
# masses_from_metaboscape, not a dict.
sample_dict = masses_from_metaboscape("C:/Users/nbrittin/Desktop/test/Metallophore_ID/test_masses.csv")
display(sample_dict)

Unnamed: 0_level_0,"MSMS_D937_A_2-E,1_01_20306","MSMS_D937_R_2-E,2_01_20308"
Bucket label,Unnamed: 1_level_1,Unnamed: 2_level_1
1000.54232 Da 675.77 s,10764.0,
1001.48144 Da 342.09 s,5912.0,
1001.48281 Da 342.09 s,9920.0,
1002.47756 Da 364.91 s,16472.0,
1002.58673 Da 560.75 s,5324.0,


Unnamed: 0_level_0,"MSMS_D937_A_2-E,1_01_20306","MSMS_D937_R_2-E,2_01_20308"
Bucket label,Unnamed: 1_level_1,Unnamed: 2_level_1
1000.54232 Da 675.77 s,10764.0,
1001.48144 Da 342.09 s,5912.0,
1001.48281 Da 342.09 s,9920.0,
1002.47756 Da 364.91 s,16472.0,
1002.58673 Da 560.75 s,5324.0,
...,...,...
996.56784 Da 592.33 s,17384.0,
997.53828 Da 582.79 s,10680.0,
998.59221 Da 697.17 s,125232.0,
998.62047 Da 693.24 s,,11128.0


In [22]:
# Build the working frame from the loaded intensities. The former
# `df.transpose()` call was removed: its result was discarded, so it had no
# effect on `df`.
df = pd.DataFrame.from_dict(sample_dict)
# Keep a single sample as a one-column DataFrame and search it for adduct pairs.
sample_df = df["MSMS_D937_A_2-E,1_01_20306"].to_frame()
sodium_adducts = return_sample_sodium_adducts(sample_df)
print(sodium_adducts)

Time taken:  0.36300182342529297
Number of matching values:  239
239
                              0
1017.51944_994.52943   22.99001
1017.51944_994.53016   22.98928
1028.62603_1005.63717  22.98886
1065.54567_1042.55604  22.98963
1069.50923_1046.51895  22.99028
...                         ...
898.48175_875.49185    22.98990
927.47199_904.48162    22.99037
930.43707_907.4467     22.99037
943.59694_920.60663    22.99031
968.5698_945.57905     22.99075

[239 rows x 1 columns]


In [23]:
# Show the single-sample frame before and after moving its index into a column.
display(sample_df.head())
# NOTE(review): this resets the index in place, so the cell is not idempotent.
# Re-running it resets the index again and presumably inserts another index
# column; verify before relying on the column layout downstream.
sample_df.reset_index(inplace=True)
display(sample_df.head())

Unnamed: 0_level_0,"MSMS_D937_A_2-E,1_01_20306"
Bucket label,Unnamed: 1_level_1
1000.54232 Da 675.77 s,10764.0
1001.48144 Da 342.09 s,5912.0
1001.48281 Da 342.09 s,9920.0
1002.47756 Da 364.91 s,16472.0
1002.58673 Da 560.75 s,5324.0


Unnamed: 0,Bucket label,"MSMS_D937_A_2-E,1_01_20306"
0,1000.54232 Da 675.77 s,10764.0
1,1001.48144 Da 342.09 s,5912.0
2,1001.48281 Da 342.09 s,9920.0
3,1002.47756 Da 364.91 s,16472.0
4,1002.58673 Da 560.75 s,5324.0


In [16]:
# Gather every bucket label of the combined frame and parse the mass that
# precedes " Da" in each one; the two printed counts should agree.
bucket_labels = df.index.to_list()
print(len(bucket_labels))
mass_list = [
    float(re.search(r'([0-9]+\.[0-9]+) Da', label).group(1))
    for label in bucket_labels
]
print(len(mass_list))

5504
5504
