In [1]:
import pandas as pd

In [2]:
# Load the raw identifications table (tab-separated) into memory.
identifications_path = "all_identifications.tsv"
df = pd.read_csv(identifications_path, sep="\t")

In [3]:
# Keep only the two columns needed downstream: the file path and the spectrum ID.
# Take an explicit copy so that later column assignments on this frame operate
# on an independent object rather than on a selection of `df`
# (avoids pandas' SettingWithCopyWarning).
small_object_df = df[["full_CCMS_path", "SpectrumID"]].copy()

In [4]:
# Number of distinct (file path, spectrum ID) pairs.
small_object_df.drop_duplicates().shape[0]

2710199

In [33]:
# Prefix every file path with "f.".
# The frame is rebuilt from the untouched `df` instead of being modified in
# place, so re-running this cell never stacks a second "f." prefix, and no
# assignment is made into a selection of another frame.
small_object_df = df[["full_CCMS_path", "SpectrumID"]].assign(
    full_CCMS_path=lambda frame: "f." + frame["full_CCMS_path"]
)

In [34]:
# Preview the first five rows to confirm the prefixed paths.
small_object_df.head(5)

Unnamed: 0,full_CCMS_path,SpectrumID
0,f.MSV000082953/ccms_peak/raw data/P1_B6.mzML,CCMSLIB00003139594
1,f.MSV000080673/ccms_peak/2017.AmericanGut3K.mz...,CCMSLIB00000852361
2,f.MSV000083004/ccms_peak/Bile_acid_gavage_expe...,CCMSLIB00003125773
3,f.MSV000082312/ccms_peak/raw/Samples/SD_01-201...,CCMSLIB00004713744
4,f.MSV000080918/ccms_peak/Steatohepatitis progr...,CCMSLIB00003127388


In [35]:
# Persist the (file path, spectrum ID) pairs as a tab-separated file, without the index.
occurrences_path = "compound_occurrences.tsv"
small_object_df.to_csv(occurrences_path, sep="\t", index=False)

In [36]:
# Membership index: one "<file path>_<spectrum ID>" string per row of
# small_object_df. split_formatting builds keys with the same "%s_%s" format
# to look pairs up, so the format here must not change.
# Iterating the two columns directly avoids materialising one dict per row,
# which the previous to_dict(orient="records") loop did.
ones_set = {
    "%s_%s" % (filename, compound_name)
    for filename, compound_name in zip(
        small_object_df["full_CCMS_path"], small_object_df["SpectrumID"]
    )
}

In [37]:
# Distinct file paths (table columns) and spectrum IDs (table rows).
# Sorting makes the order reproducible across kernel restarts; a bare
# list(set(...)) yields an arbitrary order that can differ between runs.
# Missing values are dropped first so the sort never compares NaN with str.
all_filenames = sorted(set(small_object_df["full_CCMS_path"].dropna()))
all_compounds = sorted(set(small_object_df["SpectrumID"].dropna()))

In [38]:
import uuid
import os

def split_formatting(compounds_list):
    """Write a presence/absence table for the given compounds to a TSV file.

    For every compound in ``compounds_list`` one row is produced: the
    ``"#OTU ID"`` column holds the compound, followed by one column per entry
    of the module-level ``all_filenames`` containing the string ``"1"`` when
    ``"<filename>_<compound>"`` is in the module-level ``ones_set`` and
    ``"0"`` otherwise.

    The table is written tab-separated, without index, to
    ``output/<uuid4>.tsv``; the ``output`` directory is created if missing.

    Parameters
    ----------
    compounds_list : iterable
        Compounds (row identifiers) to include in this file.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # Fix the column layout up front so that even an empty chunk produces a
    # file with the full header.
    header = ["#OTU ID"] + list(all_filenames)

    all_records = []
    for compound in compounds_list:
        record_dict = {"#OTU ID": compound}
        for filename in all_filenames:
            # Same key format as the one used to populate ones_set.
            key = "%s_%s" % (filename, compound)
            record_dict[filename] = "1" if key in ones_set else "0"
        all_records.append(record_dict)

    # to_csv does not create missing parent directories.
    os.makedirs("output", exist_ok=True)
    # Random file name so that concurrent calls never write to the same path.
    output_filename = os.path.join("output", str(uuid.uuid4()) + ".tsv")
    pd.DataFrame(all_records, columns=header).to_csv(output_filename, sep="\t", index=False)
    return output_filename
    

In [21]:
# Smoke test on the first ten compounds.
# Note: split_formatting writes a uuid-named TSV into output/, the same
# directory the merge cell later reads with `output/*`.
smoke_test_compounds = all_compounds[:10]
split_formatting(smoke_test_compounds)

In [None]:
from joblib import Parallel, delayed

def divide_chunks(l, n):
    """Yield successive slices of ``l`` containing at most ``n`` items each.

    Parameters
    ----------
    l : sequence
        Any object supporting ``len()`` and slicing (list, tuple, str, ...).
    n : int
        Maximum chunk length; must be positive.

    Yields
    ------
    Slices ``l[0:n]``, ``l[n:2n]``, ...; the last one may be shorter.
    Nothing is yielded for an empty ``l``.

    Raises
    ------
    ValueError
        If ``n`` is not positive. Raised when iteration starts, because this
        is a generator function.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it explicitly (a zero step already raised ValueError).
    if n <= 0:
        raise ValueError("chunk size n must be positive, got %r" % (n,))
    for start in range(0, len(l), n):
        yield l[start:start + n]

# Split the compounds into chunks of 1000; each chunk becomes one output file.
compound_chunks = list(divide_chunks(all_compounds, 1000))

# The merge cell concatenates everything matching output/*, so TSV files left
# over from the smoke test or from an earlier run would end up in the merged
# table (duplicate rows, possibly a different column order). Start clean.
os.makedirs("output", exist_ok=True)
for stale_name in os.listdir("output"):
    if stale_name.endswith(".tsv"):
        os.remove(os.path.join("output", stale_name))

# Process the chunks with 8 parallel workers. The per-chunk return values are
# bound to a name so the cell does not display one entry per chunk.
chunk_results = Parallel(n_jobs=8)(
    delayed(split_formatting)(compound_chunk) for compound_chunk in compound_chunks
)


In [None]:
!awk '(NR == 1) || (FNR > 1)' output/* > merged/merged.tsv

In [None]:
!biom convert -i ./merged/merged.tsv -o ./merged/merged.biom --table-type="OTU table" --to-hdf5