In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
import shutil

DEFINE THE FOLLOWING PARAMETERS

In [2]:
# Define the user
user = 'LuisQ'  # Options: PaolaH, OlivierK, ArnaudG, or LuisQ. ATTENTION: must be spelled exactly like the user's folder name in the NMR 600 folder!

# Do not change the following lines #
repo_path = os.path.normpath(r'J:\FASIE_LAB\SINERGIA_PROJECT')  # Use raw string or double backslashes
raw_data_path = os.path.join(repo_path, r'pure_compounds_data\raw_data')  # Raw string for sub-paths
output_path = os.path.join(repo_path, r'pure_compounds_data\organized_data')  # Raw string for sub-paths
data_path = os.path.join(raw_data_path, user)  # The selected user's raw data folder
ms_data_local = os.path.join(data_path, r'ms_data_local')  # Folder the MS files are copied from
ecd_data_local = os.path.join(data_path, r'ecd_data_local')  # Folder the ECD files are copied from
uv_data_local = os.path.join(data_path, r'uv_data_local')  # Folder the UV files are copied from
nmr_repo = os.path.join(r'J:\COMMON FASIE-FATHO\NMR 600', user)  # The selected user's NMR folder

# Check if all paths exist. Every folder that later cells read from is listed,
# so a misspelled user name or a missing sub-folder is reported here instead of
# surfacing as a long list of "file does not exist" warnings during the copy.
required_paths = [
    repo_path,
    raw_data_path,
    output_path,
    data_path,
    ms_data_local,
    ecd_data_local,
    uv_data_local,
    nmr_repo,
]
for path in required_paths:
    if not os.path.exists(path):
        print(f"Warning: The path {path} does not exist. Please verify.")

In [3]:
## Automatically locate the TSV file
tsv_file = None

# os.listdir() returns entries in arbitrary order, so the candidates are sorted
# to make the choice reproducible when the folder holds more than one TSV file.
# The extension test is case-insensitive and directories are ignored.
tsv_candidates = sorted(
    name for name in os.listdir(data_path)
    if name.lower().endswith('.tsv') and os.path.isfile(os.path.join(data_path, name))
)

if not tsv_candidates:
    raise FileNotFoundError(f"No TSV file found in the folder: {data_path}")

if len(tsv_candidates) > 1:
    # Do not fail: keep the "take the first one" behaviour, but make the ambiguity visible.
    print(f"Warning: {len(tsv_candidates)} TSV files found in {data_path}: {tsv_candidates}. Using the first one.")

tsv_file = os.path.join(data_path, tsv_candidates[0])

print(f"Found TSV file: {tsv_file}")

Found TSV file: J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\raw_data\LuisQ\Luis_compounds.tsv


In [None]:
# Build the organized folder tree from the metadata TSV.
# This variant always (re)copies individual files, overwriting existing ones.
#
# Layout produced under output_path:
#   <pf_code>/<pf_code>_metadata.tsv
#   <pf_code>/ms_extracts_data/
#   <pf_code>/compounds/<compound_labsample>/{ms_data_compound, NMR data, ecd_data_compound, uv_data_compound}/

# Load the TSV file
data = pd.read_csv(tsv_file, sep='\t')

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Iterate over unique 'pf_code' values with a progress bar
for pf_code, group in tqdm(data.groupby('pf_code'), desc="Processing pf_codes"):
    # Create the root folder for the pf_code.
    # str() keeps os.path.join working if pandas parsed the codes as numbers.
    pf_folder = os.path.join(output_path, str(pf_code))
    os.makedirs(pf_folder, exist_ok=True)

    # Create the 'pf_code_metadata.tsv' file inside the folder
    metadata_file = os.path.join(pf_folder, f"{pf_code}_metadata.tsv")
    group.to_csv(metadata_file, sep='\t', index=False)

    # Create the 'ms_extracts_data' folder
    ms_folder = os.path.join(pf_folder, 'ms_extracts_data')
    os.makedirs(ms_folder, exist_ok=True)

    # Copy the files from 'ms_filename_extract_pos' and 'ms_filename_extract_neg' columns.
    # .unique() avoids copying the same extract file once per compound row.
    for ms_file_column in ['ms_filename_extract_pos', 'ms_filename_extract_neg']:
        for ms_file in tqdm(group[ms_file_column].dropna().unique(), desc=f"Copying MS Extracts for {pf_code}", leave=False):
            source_path = os.path.join(ms_data_local, ms_file)
            destination_path = os.path.join(ms_folder, ms_file)
            if os.path.exists(source_path):
                shutil.copy(source_path, destination_path)
            else:
                # tqdm.write prints without breaking the active progress bars
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")

    # Create the 'compounds' folder
    compounds_folder = os.path.join(pf_folder, 'compounds')
    os.makedirs(compounds_folder, exist_ok=True)

    # Iterate over each compound with a progress bar
    for _, compound_row in tqdm(group.iterrows(), total=len(group), desc=f"Processing compounds for {pf_code}", leave=False):
        compound_folder = os.path.join(compounds_folder, str(compound_row['compound_labsample']))
        os.makedirs(compound_folder, exist_ok=True)

        # Create the 'ms_data_compound' folder
        ms_data_compound_folder = os.path.join(compound_folder, 'ms_data_compound')
        os.makedirs(ms_data_compound_folder, exist_ok=True)

        # Copy files for the compound from specified columns
        for ms_compound_column in ['ms_filename_compound_pos', 'ms_filename_compound_neg', 'ms_filename_compound_dual']:
            ms_compound_file = compound_row[ms_compound_column]
            if pd.notna(ms_compound_file):
                source_path = os.path.join(ms_data_local, ms_compound_file)
                destination_path = os.path.join(ms_data_compound_folder, ms_compound_file)
                if os.path.exists(source_path):
                    shutil.copy(source_path, destination_path)
                else:
                    tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")

        # Create the 'NMR data' folder
        nmr_data_folder = os.path.join(compound_folder, 'NMR data')
        os.makedirs(nmr_data_folder, exist_ok=True)

        # Copy the folder for the compound_id_NMR
        compound_id_nmr = compound_row['compound_id_NMR']
        if pd.notna(compound_id_nmr):
            nmr_source_folder = os.path.join(nmr_repo, str(compound_id_nmr))
            nmr_destination_folder = os.path.join(nmr_data_folder, str(compound_id_nmr))
            if not os.path.exists(nmr_source_folder):
                tqdm.write(f"Warning: Folder {nmr_source_folder} does not exist and will be skipped.")
            elif os.path.exists(nmr_destination_folder):
                # shutil.copytree raises FileExistsError when the destination already
                # exists, which made every re-run of this cell crash here.
                # NOTE: a folder left incomplete by an interrupted run is NOT repaired;
                # delete it manually to force a fresh copy.
                tqdm.write(f"Folder {nmr_destination_folder} already exists, skipping.")
            else:
                shutil.copytree(nmr_source_folder, nmr_destination_folder)

        # Create the 'ecd_data_compound' folder
        ecd_data_compound_folder = os.path.join(compound_folder, 'ecd_data_compound')
        os.makedirs(ecd_data_compound_folder, exist_ok=True)

        # Copy the file listed in 'ecd_filename_compound' from 'ecd_data_local'
        # (.get tolerates a TSV without this column)
        ecd_filename_compound = compound_row.get('ecd_filename_compound', None)
        if pd.notna(ecd_filename_compound):
            source_path = os.path.join(ecd_data_local, ecd_filename_compound)
            destination_path = os.path.join(ecd_data_compound_folder, ecd_filename_compound)
            if os.path.exists(source_path):
                shutil.copy(source_path, destination_path)
            else:
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")

        # Create the 'uv_data_compound' folder
        uv_data_compound_folder = os.path.join(compound_folder, 'uv_data_compound')
        os.makedirs(uv_data_compound_folder, exist_ok=True)

        # Copy the file listed in 'uv_filename_compound' from 'uv_data_local'
        # (.get tolerates a TSV without this column)
        uv_filename_compound = compound_row.get('uv_filename_compound', None)
        if pd.notna(uv_filename_compound):
            source_path = os.path.join(uv_data_local, uv_filename_compound)
            destination_path = os.path.join(uv_data_compound_folder, uv_filename_compound)
            if os.path.exists(source_path):
                shutil.copy(source_path, destination_path)
            else:
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")

print(f"Folders and files have been successfully created in '{output_path}'.")

Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]







Processing pf_codes:   0%|          | 0/2 [13:39<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Build the organized folder tree from the metadata TSV, copying ONLY what is
# not yet present at the destination (existing files/folders are left untouched).

# Load the TSV file
data = pd.read_csv(tsv_file, sep='\t')

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Iterate over unique 'pf_code' values with a progress bar
for pf_code, group in tqdm(data.groupby('pf_code'), desc="Processing pf_codes"):
    # Create the root folder for the pf_code.
    # str() keeps os.path.join working if pandas parsed the codes as numbers.
    pf_folder = os.path.join(output_path, str(pf_code))
    os.makedirs(pf_folder, exist_ok=True)

    # Create the 'pf_code_metadata.tsv' file inside the folder
    metadata_file = os.path.join(pf_folder, f"{pf_code}_metadata.tsv")
    group.to_csv(metadata_file, sep='\t', index=False)

    # Create the 'ms_extracts_data' folder
    ms_folder = os.path.join(pf_folder, 'ms_extracts_data')
    os.makedirs(ms_folder, exist_ok=True)

    # Copy missing files from 'ms_filename_extract_pos' and 'ms_filename_extract_neg'.
    # .unique() avoids checking the same extract file once per compound row.
    for ms_file_column in ['ms_filename_extract_pos', 'ms_filename_extract_neg']:
        for ms_file in tqdm(group[ms_file_column].dropna().unique(), desc=f"Copying MS Extracts for {pf_code}", leave=False):
            source_path = os.path.join(ms_data_local, ms_file)
            destination_path = os.path.join(ms_folder, ms_file)
            if not os.path.exists(source_path):
                # Previously a missing source was skipped without any message, so the
                # final "successfully created" line could hide incomplete data.
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")
            elif not os.path.exists(destination_path):
                shutil.copy(source_path, destination_path)

    # Create the 'compounds' folder
    compounds_folder = os.path.join(pf_folder, 'compounds')
    os.makedirs(compounds_folder, exist_ok=True)

    # Iterate over each compound with a progress bar
    for _, compound_row in tqdm(group.iterrows(), total=len(group), desc=f"Processing compounds for {pf_code}", leave=False):
        compound_folder = os.path.join(compounds_folder, str(compound_row['compound_labsample']))
        os.makedirs(compound_folder, exist_ok=True)

        # Create and copy missing files to 'ms_data_compound'
        ms_data_compound_folder = os.path.join(compound_folder, 'ms_data_compound')
        os.makedirs(ms_data_compound_folder, exist_ok=True)
        for ms_compound_column in ['ms_filename_compound_pos', 'ms_filename_compound_neg', 'ms_filename_compound_dual']:
            ms_compound_file = compound_row[ms_compound_column]
            if pd.notna(ms_compound_file):
                source_path = os.path.join(ms_data_local, ms_compound_file)
                destination_path = os.path.join(ms_data_compound_folder, ms_compound_file)
                if not os.path.exists(source_path):
                    tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")
                elif not os.path.exists(destination_path):
                    shutil.copy(source_path, destination_path)

        # Copy missing NMR data folder
        nmr_data_folder = os.path.join(compound_folder, 'NMR data')
        os.makedirs(nmr_data_folder, exist_ok=True)
        compound_id_nmr = compound_row['compound_id_NMR']
        if pd.notna(compound_id_nmr):
            nmr_source_folder = os.path.join(nmr_repo, str(compound_id_nmr))
            nmr_destination_folder = os.path.join(nmr_data_folder, str(compound_id_nmr))
            if not os.path.exists(nmr_source_folder):
                tqdm.write(f"Warning: Folder {nmr_source_folder} does not exist and will be skipped.")
            elif not os.path.exists(nmr_destination_folder):
                # NOTE: only the folder's existence is tested; a folder left incomplete
                # by an interrupted run is not repaired.
                shutil.copytree(nmr_source_folder, nmr_destination_folder)

        # Copy missing ECD data files (.get tolerates a TSV without this column)
        ecd_data_compound_folder = os.path.join(compound_folder, 'ecd_data_compound')
        os.makedirs(ecd_data_compound_folder, exist_ok=True)
        ecd_filename_compound = compound_row.get('ecd_filename_compound', None)
        if pd.notna(ecd_filename_compound):
            source_path = os.path.join(ecd_data_local, ecd_filename_compound)
            destination_path = os.path.join(ecd_data_compound_folder, ecd_filename_compound)
            if not os.path.exists(source_path):
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")
            elif not os.path.exists(destination_path):
                shutil.copy(source_path, destination_path)

        # Copy missing UV data files (.get tolerates a TSV without this column)
        uv_data_compound_folder = os.path.join(compound_folder, 'uv_data_compound')
        os.makedirs(uv_data_compound_folder, exist_ok=True)
        uv_filename_compound = compound_row.get('uv_filename_compound', None)
        if pd.notna(uv_filename_compound):
            source_path = os.path.join(uv_data_local, uv_filename_compound)
            destination_path = os.path.join(uv_data_compound_folder, uv_filename_compound)
            if not os.path.exists(source_path):
                tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")
            elif not os.path.exists(destination_path):
                shutil.copy(source_path, destination_path)

print(f"Folders and files have been successfully created in '{output_path}' with only missing files copied.")


Processing pf_codes:   0%|          | 0/2 [04:10<?, ?it/s]


KeyboardInterrupt: 

In [9]:
from tqdm import tqdm
import pandas as pd
import os
import shutil

def _copy_file_if_missing(source_dir, destination_dir, filename):
    """Copy `filename` from `source_dir` into `destination_dir` unless it is already there.

    Nothing is raised for a missing source or an existing destination; each
    case is reported through tqdm.write so active progress bars stay intact.
    """
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)
    if not os.path.exists(source_path):
        tqdm.write(f"Warning: File {source_path} does not exist and will be skipped.")
    elif os.path.exists(destination_path):
        tqdm.write(f"File {destination_path} already exists, skipping.")
    else:
        shutil.copy(source_path, destination_path)


def _copy_folder_if_missing(source_folder, destination_folder):
    """Recursively copy `source_folder` to `destination_folder` unless the latter exists.

    Only the destination's existence is tested, so a folder left incomplete by
    an interrupted run is not repaired; delete it manually to force a new copy.
    """
    if not os.path.exists(source_folder):
        tqdm.write(f"Warning: Folder {source_folder} does not exist and will be skipped.")
    elif os.path.exists(destination_folder):
        tqdm.write(f"Folder {destination_folder} already exists, skipping.")
    else:
        shutil.copytree(source_folder, destination_folder)


# Load the TSV file
data = pd.read_csv(tsv_file, sep='\t')

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Iterate over unique 'pf_code' values with a progress bar
for pf_code, group in tqdm(data.groupby('pf_code'), desc="Processing pf_codes"):
    # Create the root folder for the pf_code.
    # str() keeps os.path.join working if pandas parsed the codes as numbers.
    pf_folder = os.path.join(output_path, str(pf_code))
    os.makedirs(pf_folder, exist_ok=True)

    # Create the 'pf_code_metadata.tsv' file inside the folder
    metadata_file = os.path.join(pf_folder, f"{pf_code}_metadata.tsv")
    group.to_csv(metadata_file, sep='\t', index=False)

    # Create the 'ms_extracts_data' folder
    ms_folder = os.path.join(pf_folder, 'ms_extracts_data')
    os.makedirs(ms_folder, exist_ok=True)

    # Copy the files from 'ms_filename_extract_pos' and 'ms_filename_extract_neg' columns
    # (.unique() so an extract shared by several compound rows is handled once)
    for ms_file_column in ['ms_filename_extract_pos', 'ms_filename_extract_neg']:
        for ms_file in tqdm(group[ms_file_column].dropna().unique(), desc=f"Copying MS Extracts for {pf_code}", leave=True):
            _copy_file_if_missing(ms_data_local, ms_folder, ms_file)

    # Create the 'compounds' folder
    compounds_folder = os.path.join(pf_folder, 'compounds')
    os.makedirs(compounds_folder, exist_ok=True)

    # Iterate over each compound with a progress bar
    for _, compound_row in tqdm(group.iterrows(), total=len(group), desc=f"Processing compounds for {pf_code}", leave=True):
        compound_folder = os.path.join(compounds_folder, str(compound_row['compound_labsample']))
        os.makedirs(compound_folder, exist_ok=True)

        # Announce the current subfolder. tqdm.write (not print) is used like
        # every other message in this cell, so the progress bars are not garbled.
        tqdm.write(f"\n===== Processing compound folder: {compound_folder} =====")

        # Create the 'ms_data_compound' folder
        ms_data_compound_folder = os.path.join(compound_folder, 'ms_data_compound')
        os.makedirs(ms_data_compound_folder, exist_ok=True)

        # Copy files for the compound from specified columns
        for ms_compound_column in ['ms_filename_compound_pos', 'ms_filename_compound_neg', 'ms_filename_compound_dual']:
            ms_compound_file = compound_row[ms_compound_column]
            if pd.notna(ms_compound_file):
                _copy_file_if_missing(ms_data_local, ms_data_compound_folder, ms_compound_file)

        # Create the 'NMR data' folder
        nmr_data_folder = os.path.join(compound_folder, 'NMR data')
        os.makedirs(nmr_data_folder, exist_ok=True)

        # Copy the folder for the compound_id_NMR
        compound_id_nmr = compound_row['compound_id_NMR']
        if pd.notna(compound_id_nmr):
            _copy_folder_if_missing(
                os.path.join(nmr_repo, str(compound_id_nmr)),
                os.path.join(nmr_data_folder, str(compound_id_nmr)),
            )

        # Create the 'ecd_data_compound' folder
        ecd_data_compound_folder = os.path.join(compound_folder, 'ecd_data_compound')
        os.makedirs(ecd_data_compound_folder, exist_ok=True)

        # Copy the file listed in 'ecd_filename_compound' from 'ecd_data_local'
        # (.get tolerates a TSV without this column)
        ecd_filename_compound = compound_row.get('ecd_filename_compound', None)
        if pd.notna(ecd_filename_compound):
            _copy_file_if_missing(ecd_data_local, ecd_data_compound_folder, ecd_filename_compound)

        # Create the 'uv_data_compound' folder
        uv_data_compound_folder = os.path.join(compound_folder, 'uv_data_compound')
        os.makedirs(uv_data_compound_folder, exist_ok=True)

        # Copy the file listed in 'uv_filename_compound' from 'uv_data_local'
        # (.get tolerates a TSV without this column)
        uv_filename_compound = compound_row.get('uv_filename_compound', None)
        if pd.notna(uv_filename_compound):
            _copy_file_if_missing(uv_data_local, uv_data_compound_folder, uv_filename_compound)

print(f"Folders and files have been successfully created in '{output_path}'.")


Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]
Copying MS Extracts for V113071: 100%|██████████| 1/1 [00:00<00:00, 60.84it/s]


File J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\ms_extracts_data\20230215_RMG_LQ_1_LQ019102_1_pos.mzML already exists, skipping.



Copying MS Extracts for V113071: 100%|██████████| 1/1 [00:00<00:00, 72.50it/s]


File J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\ms_extracts_data\20230215_RMG_LQ_3_LQ019102_1_neg.mzML already exists, skipping.



Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]              
Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]              
Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]              
Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]              
Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]              


===== Processing compound folder: J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020712 =====
File J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020712\ms_data_compound\20230327_LFX_LQ_72_LQ020712_1_dualPOSNEG.mzML already exists, skipping.
Folder J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020712\NMR data\LQ020712 already exists, skipping.
File J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020712\ecd_data_compound\070323-LQ020712.csv already exists, skipping.

===== Processing compound folder: J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020714 =====
Folder J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020714\NMR data\LQ020714 already exists, skipping.



Processing pf_codes:   0%|          | 0/2 [00:00<?, ?it/s]                      


===== Processing compound folder: J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\organized_data\V113071\compounds\LQ020716 =====


Processing compounds for V113071:  13%|█▎        | 2/15 [00:31<03:25, 15.81s/it]
Processing pf_codes:   0%|          | 0/2 [00:31<?, ?it/s]


KeyboardInterrupt: 