In [None]:
import pandas as pd

# Placeholder written into x_id / y_id when the real node ID is not yet known.
_PLACEHOLDER_ID = 'tbd'


def _load_mapping(csv_path: str, index_col: str, value_col: str) -> dict:
    """
    Build a ``{index_col value: cleaned value_col value}`` dictionary from a CSV file.

    Header names are stripped of surrounding spaces and quotes before the
    columns are looked up. Each value is converted to ``str`` and stripped of
    surrounding spaces and double quotes. Rows whose value is missing are
    skipped. If a key occurs more than once, the last non-missing value wins.

    :param csv_path: str. The CSV file to read.
    :param index_col: str. Column whose entries become the dictionary keys.
    :param value_col: str. Column whose entries become the dictionary values.
    :return: dict mapping keys to cleaned string IDs.
    """
    df = pd.read_csv(csv_path)
    df.columns = df.columns.str.strip(' "\'')
    mapping = {}
    for key, value in zip(df[index_col], df[value_col]):
        # A missing ID cannot resolve anything; leave the placeholder in place.
        if pd.isna(value):
            continue
        # pandas promotes an integer column containing gaps to float;
        # render 348.0 as '348' rather than '348.0'.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        # str() guards against ID columns that pandas parsed as numbers,
        # for which a bare .strip() would raise AttributeError.
        mapping[key] = str(value).strip(' "')
    return mapping


def _fill_placeholder_ids(df: pd.DataFrame, side: str, node_type: str, mapping: dict) -> None:
    """
    Replace placeholder IDs in place for one side ('x' or 'y') and one node type.

    A row is updated when ``{side}_type`` equals *node_type* and ``{side}_id``
    equals the placeholder; the new ID is looked up by ``{side}_name`` in
    *mapping*. Names absent from *mapping* keep the placeholder.
    """
    type_col, id_col, name_col = f'{side}_type', f'{side}_id', f'{side}_name'
    pending = (df[type_col] == node_type) & (df[id_col] == _PLACEHOLDER_ID)
    if pending.any():
        df.loc[pending, id_col] = [
            mapping.get(name, _PLACEHOLDER_ID) for name in df.loc[pending, name_col]
        ]


def postprocess_kg_conversion(cell_dict_csv_path: str, node_dict_csv_path: str,
                              input_csv_path: str, output_csv_path: str) -> str:
    """
    Resolve placeholder node IDs in a knowledge-graph edge CSV and mirror its edges.

    Two lookup dictionaries (name -> ID) are built from the dictionary CSVs.
    In the input CSV, every 'tbd' in x_id/y_id is replaced with the matching ID
    where x_type/y_type is 'celltype/state' (looked up in the cell dictionary)
    or 'gene/protein' (looked up in the node dictionary); names without a match
    keep 'tbd'. Each edge is then duplicated with its x_* and y_* columns
    (id, type, name, source) swapped, to match the PrimeKG format, and the
    original rows followed by the mirrored rows are written to a new CSV file.

    :param cell_dict_csv_path: str. The file path for the latest cell_data_dict.csv
        (columns 'cell_type_state_data_dict' and 'ID').
    :param node_dict_csv_path: str. The file path for the node CSV file from primekg
        (columns 'node_name' and 'node_id').
    :param input_csv_path: str. The alzkg file
    :param output_csv_path: str. The file path to save the updated CSV file.
    :return: str. A confirmation message containing the output path.
    :raises KeyError: if a required column is missing from one of the CSV files.
    """
    # 1. Read the dictionary CSV files and create the name -> ID mappings
    cell_type_dict = _load_mapping(cell_dict_csv_path, 'cell_type_state_data_dict', 'ID')
    node_dict = _load_mapping(node_dict_csv_path, 'node_name', 'node_id')

    # 2. Read the alzkg CSV file and replace 'tbd' on both ends of every edge
    updated_df = pd.read_csv(input_csv_path)
    for side in ('x', 'y'):
        _fill_placeholder_ids(updated_df, side, 'celltype/state', cell_type_dict)
        _fill_placeholder_ids(updated_df, side, 'gene/protein', node_dict)

    # 3. Mirror the edges: same rows with the x_* and y_* columns swapped.
    # Values are read from updated_df so the swap never sees a half-swapped column.
    mirrored_df = updated_df.copy()
    for col in ('id', 'type', 'name', 'source'):
        mirrored_df[f'x_{col}'] = updated_df[f'y_{col}']
        mirrored_df[f'y_{col}'] = updated_df[f'x_{col}']

    # Concatenate the original and mirrored dataframes
    final_df = pd.concat([updated_df, mirrored_df], ignore_index=True)

    # Save the final dataframe to the provided CSV file path
    final_df.to_csv(output_csv_path, index=False)

    return f"Updated file saved to {output_csv_path}"

# Example usage (Google Colab):
# Mount Google Drive through the Colab drive module so the input files are
# reachable, then run the post-processing on the files in the PrimeKG folder.
from google.colab import drive
drive.mount('/content/drive')
filepath = "/content/drive/My Drive/primekg_files/"
# Arguments, in order: cell dictionary, node dictionary, alzkg input, output file.
postprocess_kg_conversion(
    *(
        filepath + file_name
        for file_name in (
            'cell_data_dict.csv',
            'nodes_filtered.csv',
            'kgraw_with_mathys_lau.csv',
            'final.csv',
        )
    )
)
