In [None]:
import pandas as pd

def append_to_master_kg(master_csv_path, additional_csv_paths):
    """
    Appends a list of CSV files to a master AlzKG CSV file, mirroring each edge to match the PrimeKG format.

    Every row of the additional files is added twice: once as read, and once
    with the x_* and y_* node columns (id, type, name, source) swapped. All
    other columns are copied unchanged into the mirrored row. Rows of the
    master file are passed through as-is and are NOT mirrored here.

    The additional files are aligned to the master file by column POSITION:
    their own header line is skipped and the master's column names are
    applied, so they must have the same number and order of columns.

    Parameters:
    - master_csv_path: str, the path to the master AlzKG CSV file.
    - additional_csv_paths: list, a list of paths to additional CSV files to append.
      May be empty, in which case the master data is returned unchanged.

    Returns:
    - final_df: DataFrame, the combined DataFrame with mirrored edges.

    Raises:
    - ValueError: if the additional files do not have the same number of columns as the master file.
    - KeyError: if the master file lacks any of the x_/y_ id, type, name, source columns.
    """
    # Load the master AlzKG CSV file
    kgraw = pd.read_csv(master_csv_path)

    # Materialise the paths so an arbitrary iterable can be tested for emptiness
    extra_paths = list(additional_csv_paths)
    if not extra_paths:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # with nothing to append, the master graph is already the result.
        return kgraw

    # Load the additional CSV files, discarding their header line so that
    # columns are matched to the master file by position
    extra_frames = [pd.read_csv(path, header=None, skiprows=1) for path in extra_paths]
    combined_df = pd.concat(extra_frames, ignore_index=True)

    # Assuming kgraw and additional CSVs share the same column structure
    combined_df.columns = kgraw.columns

    # Build the mirrored edges: same rows with the x and y endpoints exchanged.
    # Values are always read from combined_df (never from the frame being
    # written), so the swap cannot overwrite a column before it is copied.
    interchanged_df = combined_df.copy()
    for col in ('id', 'type', 'name', 'source'):
        x_col, y_col = f'x_{col}', f'y_{col}'
        interchanged_df[x_col] = combined_df[y_col]
        interchanged_df[y_col] = combined_df[x_col]

    # Master rows first, then the new edges, then their mirrored counterparts
    final_df = pd.concat([kgraw, combined_df, interchanged_df], ignore_index=True)

    return final_df

# Example usage (Google Colab):
# Mount Google Drive so the knowledge-graph CSV files can be read and written
from google.colab import drive
drive.mount('/content/drive')

# Folder on Drive holding the master graph and the per-cell-type edge files
filepath = "/content/drive/My Drive/primekg_files"

# Master AlzKG file that the new edges are appended to
master_csv_path = f'{filepath}/kg_raw_orig_filtered.csv'

# One "<prefix>_kg.csv" edge file per cell-type prefix, in this order
additional_csv_paths = [
    f'{filepath}/{prefix}_kg.csv'
    for prefix in ('Ex', 'In', 'Oli', 'Opc', 'Mic', 'Ast')
]

# Combine the master graph with the additional edges and their mirrored copies
final_df = append_to_master_kg(master_csv_path, additional_csv_paths)

# Write the combined graph next to the inputs, without the DataFrame index
final_df.to_csv(f'{filepath}/kgraw_with_mathys.csv', index=False)


In [None]:
import pandas as pd

def update_celltype_node_ids(cell_dict_csv_path, node_dict_csv_path, input_csv_path, output_csv_path):
    """
    Replaces placeholder 'tbd' node IDs in an alzkg edge file with real IDs.

    Two name -> ID dictionaries are built, one from the cell type dictionary
    CSV and one from the nodes CSV. For every edge whose x_id / y_id equals
    'tbd', the ID is looked up by the matching x_name / y_name:
    'celltype/state' nodes use the cell type dictionary and 'gene/protein'
    nodes use the node dictionary. IDs that are not 'tbd', nodes of any other
    type, and names missing from the dictionary are left untouched.
    The updated data is saved to a new CSV file.
    This function can be called after processing a dataset from a single paper or after a set of papers.

    :param cell_dict_csv_path: str. The file path for the latest cell_data_dict.csv
        (columns 'cell_type_state_data_dict' and 'ID').
    :param node_dict_csv_path: str. The file path for the nodes CSV file from primekg
        (columns 'node_name' and 'node_id').
    :param input_csv_path: str. The alzkg file
        (columns x_id, x_type, x_name, y_id, y_type, y_name).
    :param output_csv_path: str. The file path to save the updated CSV file.
    :return: str. A confirmation message containing output_csv_path.
    """
    placeholder = 'tbd'

    # Function to create a dictionary from dict CSVs
    def create_dict_from_csv(csv_path, index_col, value_col):
        # Read everything as text: an all-numeric ID column would otherwise be
        # parsed as int/float, which has no .strip() and would lose the
        # original textual form of the ID.
        df = pd.read_csv(csv_path, dtype=str)
        # Header names may carry stray spaces/quotes from loosely formatted CSVs
        df.columns = df.columns.str.strip(' "\'')
        # Skip rows whose ID is missing (NaN) so a placeholder is never
        # replaced by an empty value; later duplicates of a name win.
        return {
            name: node_id.strip(' "')
            for name, node_id in zip(df[index_col], df[value_col])
            if isinstance(node_id, str)
        }

    # Function to replace 'tbd' on one side ('x' or 'y') for one node type
    def replace_tbd_with_id(df, side, node_type, mapping):
        id_col, type_col, name_col = f'{side}_id', f'{side}_type', f'{side}_name'
        needs_id = (df[type_col] == node_type) & (df[id_col] == placeholder)
        if not needs_id.any():
            return
        # Names without a dictionary entry keep the placeholder
        df.loc[needs_id, id_col] = [
            mapping.get(name, placeholder) for name in df.loc[needs_id, name_col]
        ]

    # 1. Read the CSV files and create dictionaries
    cell_type_dict = create_dict_from_csv(cell_dict_csv_path, 'cell_type_state_data_dict', 'ID')
    node_dict = create_dict_from_csv(node_dict_csv_path, 'node_name', 'node_id')

    # 2. Read the alzkg CSV file and fill in the placeholders column-wise.
    #    Masked assignment avoids calling a Python function once per row and
    #    avoids mutating the row objects handed out by DataFrame.apply.
    updated_df = pd.read_csv(input_csv_path)
    for side in ('x', 'y'):
        replace_tbd_with_id(updated_df, side, 'celltype/state', cell_type_dict)
        replace_tbd_with_id(updated_df, side, 'gene/protein', node_dict)

    # Save the updated dataframe to the provided CSV file path
    updated_df.to_csv(output_csv_path, index=False)

    return f"Updated file saved to {output_csv_path}"


# Example usage (Google Colab):
# Mount Google Drive so the dictionary and knowledge-graph CSV files are reachable
from google.colab import drive
drive.mount('/content/drive')

# Folder on Drive holding the files; note the trailing slash, the file names
# below are appended to it directly
filepath = "/content/drive/My Drive/primekg_files/"

# Resolve the 'tbd' placeholder IDs and write the result to final.csv
update_celltype_node_ids(
    cell_dict_csv_path=f'{filepath}cell_data_dict.csv',
    node_dict_csv_path=f'{filepath}nodes_filtered.csv',
    input_csv_path=f'{filepath}kgraw_with_mathys_lau.csv',
    output_csv_path=f'{filepath}final.csv',
)

