In [3]:
import os
import numpy as np
import pandas as pd
import re
import seaborn as sns
import time

In [38]:
# Functions

def masses_from_metaboscape(csv_file_path, intensity_filter=5000, regex=r'([0-9]+\.[0-9]+) Da'):
    """Reads in a csv file from Metaboscape and returns a dictionary or samples and their respective masses. This also removes the retention time from the buceket label."""
    csv = pd.read_csv(csv_file_path)
    # Make the first column the index
    csv = csv.set_index(csv.columns[0])
    # Add each column to a dictionary with the column name as the key
    samples_dict = {}
    for column in csv.columns:
        samples_dict[column] = csv[column]  
    # Remove all values below the intensity filter  
    regex = re.compile(regex)
    for value in samples_dict.values():
        for index, item in enumerate(value):
            if item <= intensity_filter:
                value[index] = np.nan
        value.dropna(inplace=True)
    df = pd.DataFrame.from_dict(samples_dict)
    df.transpose()
    display(df.head())
    return df

def find_sodium_adducts(sample_dict, RT_window=30):
    "This function takes in a pandas DataFrame of samples with intensities for each 'Bucket Label' and computes a subtraction matrix of the masses. It then returns a list of the mass pairs correlated to sodium adducts. The pairs are then filtered by a retention time window that defaults to 30 seconds, but can be adjusted.\n\nThe function returns a list of deduplicated tuples with the mass pairs and their retention times.\n\nExample Input: \n\nprint(sample_df)\n\n['Bucket Label', Sample1, Sample2]\n\n['241.17878 Da 120.62 s', 20732.0, 4203.0]\n\n['226.11049 Da 126.64 s', 7396.0, 10345.4]\n\n['241.17878 Da 120.62 s', 20732.0, 6819.0]\n\n['263.15878 Da 121.50 s', 7396.0, 9203.0]\n\nExample Output: sodium_adducts=[['241.17878 Da 120.62 s' , '263.15878 Da 121.50 s', 21.98999 Da, 0.88 s]]"
    sample_names = list(sample_dict.keys())
    bucket_labels = sample_dict['Bucket label'].to_list()
    # print(len(bucket_labels))
    mass_list = []
    for i in bucket_labels:
        mass = re.search(r'([0-9]+\.[0-9]+) Da', i).group(1)
        mass_list.append(float(mass))
    # print("Number of Buckets: ", len(mass_list))
    np.array(mass_list)
    subtraction_matrix = np.absolute(np.subtract.outer(mass_list, mass_list))
    # print("Subtraction Matrix Shape: ", subtraction_matrix.shape)
    # print(subtraction_matrix)
    adduct_mass_difference = 22.9897692820
    low_bound = adduct_mass_difference - 0.001 #These can be improved with ppm instead of set values, maybe even specifying for mass ranges since accuracy improves with lower masses
    high_bound = adduct_mass_difference + 0.001 #These can be improved with ppm instead of set values, maybe even specifying for mass ranges since accuracy improves with lower masses
    data_array = subtraction_matrix
    start = time.time()
    column_names = mass_list
    row_names = mass_list
    condition_mask = np.logical_and(data_array >= low_bound, data_array <= high_bound)
    row_indices, col_indices = np.where(condition_mask)
    matching_values_dict = {}
    for row_idx, col_idx in zip(row_indices, col_indices):
        key = f"{row_names[row_idx]}_{column_names[col_idx]}"
        matching_values_dict[key] = data_array[row_idx, col_idx]
    stop = time.time()
    print("Time taken: ", stop - start)
    print("Number of matching values: ", len(matching_values_dict) / 2)
    return matching_values_dict

In [41]:
sample_dict = masses_from_metaboscape("C:/Users/nbrittin/Desktop/test/Metallophore_ID/test_masses.csv")
display(sample_dict)

{'MSMS_D937_A_2-E,1_01_20306': Bucket label
 241.17878 Da 120.62 s    20732.0
 226.11049 Da 126.64 s     7396.0
 323.13811 Da 130.67 s    14972.0
 342.18043 Da 132.69 s    28360.0
 301.19977 Da 134.98 s    11732.0
                           ...   
 465.34421 Da 719.45 s    14096.0
 758.49340 Da 717.24 s    28828.0
 354.27548 Da 717.51 s    17596.0
 264.17198 Da 719.45 s    22572.0
 322.19028 Da 719.45 s    39388.0
 Name: MSMS_D937_A_2-E,1_01_20306, Length: 3733, dtype: float64,
 'MSMS_D937_R_2-E,2_01_20308': Bucket label
 241.17878 Da 120.62 s     6504.0
 226.11049 Da 126.64 s     7396.0
 323.13811 Da 130.67 s    18124.0
 342.18043 Da 132.69 s    21716.0
 301.19977 Da 134.98 s    26852.0
                           ...   
 546.35368 Da 718.70 s     5604.0
 758.49340 Da 717.24 s    24272.0
 588.43732 Da 718.70 s     5376.0
 354.27548 Da 717.51 s     6948.0
 322.19028 Da 719.45 s    38076.0
 Name: MSMS_D937_R_2-E,2_01_20308, Length: 3943, dtype: float64}

In [55]:
df = pd.DataFrame.from_dict(sample_dict)
df.transpose()
display(df.head())

Unnamed: 0_level_0,"MSMS_D937_A_2-E,1_01_20306","MSMS_D937_R_2-E,2_01_20308"
Bucket label,Unnamed: 1_level_1,Unnamed: 2_level_1
1000.54232 Da 675.77 s,10764.0,
1001.48144 Da 342.09 s,5912.0,
1001.48281 Da 342.09 s,9920.0,
1002.47756 Da 364.91 s,16472.0,
1002.58673 Da 560.75 s,5324.0,
