In [5]:
import pandas as pd

# Path to the CSV file with embeddings
embeddings_csv_path = 'outputs/FASTA_embeddings_20240213.csv'

# Load the CSV file. Assuming no header, specify column names manually
# The first column is 'TC_Number', followed by embedding dimensions
col_names = ['TC_Number'] + [f'dim{i}' for i in range(1, 1281)]
embeddings_df = pd.read_csv(embeddings_csv_path, header=None, names=col_names)

# Convert the dataframe to a dictionary {TC_Number: embedding_vector}
embeddings_dict = embeddings_df.set_index('TC_Number').T.to_dict('list')

In [8]:
import glob
import os

# Original directory containing your positive/negative CSV files
original_files_dir = 'Positive_Negative/'

# New directory to save updated CSV files
updated_files_dir = 'Positive_Negative_Mapping/'

# Ensure the updated_files_dir exists, create if it doesn't
if not os.path.exists(updated_files_dir):
    os.makedirs(updated_files_dir)

# Iterate over each CSV file in the original directory
for csv_file in glob.glob(f'{original_files_dir}*.csv'):
    # Load the positive/negative CSV file
    pos_neg_df = pd.read_csv(csv_file, header=None, names=['positive', 'negative'])

    # Map the 'positive' and 'negative' TC_Numbers to their embeddings
    # Here, just storing a placeholder for the embedding reference
    pos_neg_df['positive_embedding'] = pos_neg_df['positive'].map(
        lambda x: 'Embedding_Ref' if x in embeddings_dict else 'Not Available')
    pos_neg_df['negative_embedding'] = pos_neg_df['negative'].map(
        lambda x: 'Embedding_Ref' if x in embeddings_dict else 'Not Available')

    # Construct the new file path in the updated directory
    # os.path.basename(csv_file) gets the file name from the original path
    new_file_path = os.path.join(updated_files_dir, os.path.basename(csv_file))

    # Save the updated DataFrame to the new file path
    pos_neg_df.to_csv(new_file_path, index=False, header=False)

    print(f'Updated file saved: {new_file_path}')

Updated file saved: Positive_Negative_Mapping/CHEBI:26271 proline_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:61425 psicosyllysine_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:18070 cytochrome c_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:3371 capreomycin_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:17237 (E)-4-(trimethylammonio)but-2-enoate_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:2453 acyclovir_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:15741 succinic acid_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:15603 L-leucine_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:46898 cholesteryl oleate_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:16469 17beta-estradiol_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:22921 brassinosteroid_output.csv
Updated file saved: Positive_Negative_Mapping/CHEBI:84688 Fe(III)-comple