In [1]:
import os
import numpy as np
import pandas as pd
import re
import seaborn as sns
import time

In [None]:
# Functions

def masses_from_metaboscape(csv_file_path, intensity_filter=10000, label_column="Bucket label"):
    """Read a Metaboscape CSV export and return the filtered masses of every sample.

    Parameters
    ----------
    csv_file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    intensity_filter : float, default 10000
        Only peaks whose intensity is strictly greater than this value are kept.
    label_column : str, default "Bucket label"
        Name given to the bucket-label column of every returned frame. If the
        CSV has a leading row-number column, the column with this name is used
        as the bucket labels instead of the first column.

    Returns
    -------
    dict
        Maps each sample column name to a DataFrame with an integer index and
        two columns, ``[label_column, <sample name>]``, holding the bucket
        labels and intensities of the peaks that passed the filter. A sample
        with no peak above the threshold maps to an empty two-column frame.
    """
    csv = pd.read_csv(csv_file_path)
    # Make the first column the index
    csv = csv.set_index(csv.columns[0])
    # If the file was written with a leading row-number column, the bucket
    # labels are still an ordinary column here; promote them to the index so
    # that every remaining column is a sample intensity column.
    if label_column in csv.columns:
        csv = csv.set_index(label_column)

    samples_dict = {}
    for sample in csv.columns:
        intensities = csv[sample]
        # Vectorised filter; NaN intensities compare False and are dropped.
        kept = intensities[intensities > intensity_filter]
        # Add integer index and column names to the filtered masses dataframe.
        sample_frame = kept.reset_index()
        sample_frame.columns = [label_column, sample]
        samples_dict[sample] = sample_frame
    return samples_dict

In [35]:
# Import the masses csv

# NOTE(review): absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
masses_csv_path = "C:/Users/nbrittin/Desktop/test/Metallophore_ID/test_masses.csv"
csv_masses = pd.read_csv(masses_csv_path)

print(csv_masses.shape)
display(csv_masses.head())

# Drop the second sample column, as it is not needed.
csv_masses = csv_masses.drop(columns=["MSMS_D937_R_2-E,2_01_20308"])
print("Masses shape: ", csv_masses.shape)

# Intensity filter: keep only peaks whose intensity is strictly above the
# threshold, as weaker peaks are less likely to be accurate masses or significant.
intensity_filter = 10000
sample_column = "MSMS_D937_A_2-E,1_01_20306"

# Vectorised boolean mask instead of a row-by-row loop: one output row per
# passing CSV row (repeated bucket labels are no longer silently collapsed),
# and an empty result is still a valid two-column frame.
above_threshold = csv_masses[sample_column] > intensity_filter
csv_masses_filtered = (
    csv_masses.loc[above_threshold, ["Bucket label", sample_column]]
    .reset_index(drop=True)  # fresh integer index for the filtered frame
)

print("Filtered Masses shape: ", csv_masses_filtered.shape)
display(csv_masses_filtered.head())

(12511, 3)


Unnamed: 0,Bucket label,"MSMS_D937_A_2-E,1_01_20306","MSMS_D937_R_2-E,2_01_20308"
0,625.30578 Da 120.43 s,1532.0,0.0
1,278.12929 Da 120.43 s,4772.0,0.0
2,644.30171 Da 120.43 s,2360.0,0.0
3,241.17878 Da 120.62 s,20732.0,6504.0
4,150.01609 Da 122.32 s,1780.0,2064.0


Masses shape:  (12511, 2)
Filtered Masses shape:  (2324, 2)


Unnamed: 0,Bucket label,"MSMS_D937_A_2-E,1_01_20306"
0,241.17878 Da 120.62 s,20732.0
1,323.13811 Da 130.67 s,14972.0
2,342.18043 Da 132.69 s,28360.0
3,301.19977 Da 134.98 s,11732.0
4,530.26933 Da 146.02 s,15108.0


In [36]:
bucket_labels = csv_masses_filtered['Bucket label'].to_list()
# print(len(bucket_labels))

# Each bucket label embeds the mass as "<number> Da"; extract that number.
masses = []
for label in bucket_labels:
    match = re.search(r'([0-9]+\.[0-9]+) Da', label)
    if match is None:
        # Fail with the offending label rather than an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"Could not parse a mass from bucket label: {label!r}")
    masses.append(float(match.group(1)))
print("Number of Buckets: ", len(masses))

# Make a subtraction matrix: entry [i, j] is |masses[i] - masses[j]|.
# `masses` itself stays a plain list; the array conversion is kept in its own
# name (the previous bare `np.array(masses)` discarded its result).
mass_array = np.asarray(masses, dtype=float)
subtraction_matrix = np.absolute(np.subtract.outer(mass_array, mass_array))
print("Subtraction Matrix Shape: ", subtraction_matrix.shape)
print(subtraction_matrix)


Number of Buckets:  2324
Subtraction Matrix Shape:  (2324, 2324)
[[  0.       81.95933 101.00165 ... 113.0967   22.9932   81.0115 ]
 [ 81.95933   0.       19.04232 ...  31.13737  58.96613   0.94783]
 [101.00165  19.04232   0.      ...  12.09505  78.00845  19.99015]
 ...
 [113.0967   31.13737  12.09505 ...   0.       90.1035   32.0852 ]
 [ 22.9932   58.96613  78.00845 ...  90.1035    0.       58.0183 ]
 [ 81.0115    0.94783  19.99015 ...  32.0852   58.0183    0.     ]]


In [37]:
# Mass difference searched for between every pair of buckets, with a
# +/- 0.001 window around it.
adduct_mass_difference = 54.92712
low_bound = adduct_mass_difference - 0.001
high_bound = adduct_mass_difference + 0.001
data_array = subtraction_matrix
row_names = masses
column_names = masses

start = time.time()
# Boolean mask of all matrix cells whose difference falls inside the window.
condition_mask = (data_array >= low_bound) & (data_array <= high_bound)
row_indices, col_indices = np.nonzero(condition_mask)
# Key each hit as "<row mass>_<column mass>" and store the matched difference.
matching_values_dict = {
    f"{row_names[r]}_{column_names[c]}": data_array[r, c]
    for r, c in zip(row_indices, col_indices)
}
stop = time.time()
print("Time taken: ", stop - start)
# The matrix of absolute differences is symmetric, so each pair shows up twice.
print("Number of matching values: ", len(matching_values_dict) / 2)
# print(matching_values_dict)

Time taken:  0.033011436462402344
Number of matching values:  13.0
