In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd

In [2]:
# Function to parse KGML file and create a gene relationship matrix
def parse_kgml(kgml_path) -> pd.DataFrame:
    """Build a directed gene-by-gene relation matrix from a KGML file.

    Every ``<entry type="gene">`` contributes the whitespace-separated
    identifiers found in its ``name`` attribute (e.g. ``hsa:1234``). For each
    ``<relation>`` whose ``entry1`` and ``entry2`` both resolve to gene
    entries, every gene of ``entry1`` is linked to every gene of ``entry2``.
    Relations that reference an unknown entry id or a non-gene entry are
    ignored.

    Parameters
    ----------
    kgml_path : str, path-like or file object
        KGML source; passed unchanged to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Square integer matrix whose rows and columns are both labelled with
        the sorted gene identifiers. A cell is 1 when at least one relation
        goes from the row gene to the column gene, and 0 otherwise.
    """
    root = ET.parse(kgml_path).getroot()

    def gene_names(entry):
        # A gene entry without a ``name`` attribute contributes no genes
        # instead of raising AttributeError on ``None.split()``.
        return (entry.get('name') or '').split()

    # Index every entry by id and collect the distinct gene identifiers.
    entries = {}
    gene_ids = set()
    for entry in root.findall('entry'):
        entries[entry.get('id')] = entry
        if entry.get('type') == 'gene':
            gene_ids.update(gene_names(entry))

    # Sort once; the same ordering drives the matrix indices and the labels.
    gene_labels = sorted(gene_ids)
    gene_to_index = {gene: idx for idx, gene in enumerate(gene_labels)}

    matrix_size = len(gene_labels)
    relation_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

    # Populate the matrix with gene-gene relationships.
    for relation in root.findall('relation'):
        source = entries.get(relation.get('entry1'))
        target = entries.get(relation.get('entry2'))
        if source is None or target is None:
            continue
        if source.get('type') != 'gene' or target.get('type') != 'gene':
            continue
        target_indices = [gene_to_index[gene] for gene in gene_names(target)]
        for source_gene in gene_names(source):
            row = gene_to_index[source_gene]
            for col in target_indices:
                # Only [row, col] is set: the relation is treated as directed.
                relation_matrix[row, col] = 1

    # Wrap the array in a labelled DataFrame for readability.
    return pd.DataFrame(relation_matrix, index=gene_labels, columns=gene_labels)

In [3]:
# Path to the local KGML file. It is a relative path, so it is resolved
# against the kernel's current working directory when the file is opened.
kgml_file_path = 'Pathway.xml'

In [4]:
# Parse the KGML file and build the gene relationship matrix; parse_kgml
# returns it as a pandas DataFrame labelled by gene identifier on both axes.
gene_relation_matrix_df = parse_kgml(kgml_file_path)

In [5]:
# Write the matrix as CSV; the output name is the KGML path with a fixed
# suffix appended, so the CSV is written to the same location as the input.
output_csv_path = f"{kgml_file_path}.gene_relation_network_matrix.csv"
gene_relation_matrix_df.to_csv(output_csv_path)