In [5]:
import pandas as pd
import shutil
import os

DEFINE THE FOLLOWING PARAMETERS (only `user` needs to be edited; all paths are derived from it)

In [23]:
# --- User-editable setting ---------------------------------------------------
# `user` selects whose data is organised. It is used as the sub-folder name of
# both the raw-data folder and the NMR 600 share, so it must match both.
user = 'LuisQ'  # Options: PaolaH, OlivierK, ArnaudG, or LuisQ #same as the folder name in the NMR 600 folder

# --- Derived paths: do not change the following lines -------------------------
# Raw strings keep the Windows backslashes literal.
repo_path = os.path.normpath(r'J:\FASIE_LAB\SINERGIA_PROJECT')
raw_data_path = os.path.join(repo_path, r'pure_compounds_data\raw_data')
output_path = os.path.join(repo_path, r'pure_compounds_data\organized_data')
data_path = os.path.join(raw_data_path, user)  # Raw data of the selected user
ms_data_local = os.path.join(data_path, r'ms_data_local')  # Path to MS data extracts

nmr_repo = os.path.join(r'J:\COMMON FASIE-FATHO\NMR 600', user)  # NMR folders of the selected user

# Check if all paths exist. The NMR share is included so that a wrong `user`
# or an unmounted drive is reported here instead of once per compound later on.
required_paths = [repo_path, raw_data_path, output_path, data_path, ms_data_local, nmr_repo]
for path in required_paths:
    if not os.path.exists(path):
        print(f"Warning: The path {path} does not exist. Please verify.")


In [24]:
## Automatically locate the TSV file
# os.listdir() returns entries in arbitrary order, so the candidates are sorted
# to make the selection reproducible when the folder holds several TSV files.
# The extension is matched case-insensitively and directories are ignored.
tsv_candidates = sorted(
    entry
    for entry in os.listdir(data_path)
    if entry.lower().endswith('.tsv')
    and os.path.isfile(os.path.join(data_path, entry))
)

if not tsv_candidates:
    raise FileNotFoundError(f"No TSV file found in the folder: {data_path}")

if len(tsv_candidates) > 1:
    print(
        f"Warning: {len(tsv_candidates)} TSV files found in {data_path}; "
        f"using the first one in alphabetical order."
    )

tsv_file = os.path.join(data_path, tsv_candidates[0])
print(f"Found TSV file: {tsv_file}")

Found TSV file: J:\FASIE_LAB\SINERGIA_PROJECT\pure_compounds_data\raw_data\LuisQ\LQ_compounds_metadata.tsv


In [25]:
# Organise the raw data into one folder per 'pf_code':
#
#   <output_path>/<pf_code>/
#       <pf_code>_metadata.tsv        rows of the metadata table for this pf_code
#       ms_extracts_data/             files named in the ms_filename_extract_* columns
#       compounds/<compound_labsample>/
#           ms_data_compound/         files named in the ms_filename_compound_* columns
#           NMR data/<compound_id_NMR>/   copy of the folder found in `nmr_repo`
#
# The cell can be re-run safely: existing folders are reused and files are
# overwritten with a fresh copy.

MS_EXTRACT_COLUMNS = ['ms_filename_extract_pos', 'ms_filename_extract_neg']
MS_COMPOUND_COLUMNS = ['ms_filename_compound_pos', 'ms_filename_compound_neg', 'ms_filename_compound_dual']
REQUIRED_COLUMNS = ['pf_code', 'compound_labsample', 'compound_id_NMR'] + MS_EXTRACT_COLUMNS + MS_COMPOUND_COLUMNS


def _copy_file_if_present(source_dir, file_name, destination_dir):
    """Copy `file_name` from `source_dir` into `destination_dir`; warn and skip if it is missing."""
    source_path = os.path.join(source_dir, file_name)
    if os.path.exists(source_path):
        shutil.copy(source_path, os.path.join(destination_dir, file_name))
    else:
        print(f"Warning: File {source_path} does not exist and will be skipped.")


# Load the TSV file
data = pd.read_csv(tsv_file, sep='\t')

# Fail early, before anything is written, if the table lacks an expected column.
missing_columns = [column for column in REQUIRED_COLUMNS if column not in data.columns]
if missing_columns:
    raise KeyError(f"The metadata file {tsv_file} is missing required column(s): {missing_columns}")

# groupby() leaves out rows whose key is missing; report them instead of dropping them silently.
rows_without_pf_code = int(data['pf_code'].isna().sum())
if rows_without_pf_code:
    print(f"Warning: {rows_without_pf_code} row(s) have no 'pf_code' and will be skipped.")

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Iterate over unique 'pf_code' values
for pf_code, group in data.groupby('pf_code'):
    # Create the root folder for the pf_code (str() so a numeric code also works)
    pf_folder = os.path.join(output_path, str(pf_code))
    os.makedirs(pf_folder, exist_ok=True)

    # Create the 'pf_code_metadata.tsv' file inside the folder
    metadata_file = os.path.join(pf_folder, f"{pf_code}_metadata.tsv")
    group.to_csv(metadata_file, sep='\t', index=False)

    # Create the 'ms_extracts_data' folder
    ms_folder = os.path.join(pf_folder, 'ms_extracts_data')
    os.makedirs(ms_folder, exist_ok=True)

    # Copy the extract files. The same file name can appear on several rows of
    # the group, so each distinct name is copied only once.
    for ms_file_column in MS_EXTRACT_COLUMNS:
        for ms_file in group[ms_file_column].dropna().unique():
            _copy_file_if_present(ms_data_local, ms_file, ms_folder)

    # Create the 'compounds' folder
    compounds_folder = os.path.join(pf_folder, 'compounds')
    os.makedirs(compounds_folder, exist_ok=True)

    # Create a folder for each 'compound_labsample' (the row index itself is not needed)
    for _, compound_row in group.iterrows():
        compound_folder = os.path.join(compounds_folder, str(compound_row['compound_labsample']))
        os.makedirs(compound_folder, exist_ok=True)

        # Create the 'ms_data_compound' folder
        ms_data_compound_folder = os.path.join(compound_folder, 'ms_data_compound')
        os.makedirs(ms_data_compound_folder, exist_ok=True)

        # Copy files for the compound from specified columns
        for ms_compound_column in MS_COMPOUND_COLUMNS:
            ms_compound_file = compound_row[ms_compound_column]
            if pd.notna(ms_compound_file):
                _copy_file_if_present(ms_data_local, ms_compound_file, ms_data_compound_folder)

        # Create the 'NMR data' folder
        nmr_data_folder = os.path.join(compound_folder, 'NMR data')
        os.makedirs(nmr_data_folder, exist_ok=True)

        # Copy the folder for the compound_id_NMR
        compound_id_nmr = compound_row['compound_id_NMR']
        if pd.notna(compound_id_nmr):
            nmr_source_folder = os.path.join(nmr_repo, compound_id_nmr)
            nmr_destination_folder = os.path.join(nmr_data_folder, compound_id_nmr)
            if os.path.exists(nmr_source_folder):
                # dirs_exist_ok=True (Python 3.8+) lets a re-run, or a run resumed
                # after an interruption, copy into an already existing destination
                # instead of raising FileExistsError.
                shutil.copytree(nmr_source_folder, nmr_destination_folder, dirs_exist_ok=True)
            else:
                print(f"Warning: Folder {nmr_source_folder} does not exist and will be skipped.")

print(f"Folders and files have been successfully created in '{output_path}'.")



KeyboardInterrupt: 