# Report for primary knowledge sources in the Matrix KG

This notebook drafts a useful high-level summary report for knowledge graphs (KGs), highlighting information such as the licensing of primary knowledge sources and their relevance for drug-repurposing modeling.

In [4]:
import pandas as pd
from pathlib import Path

# This contains all the code for this notebook
from pks_utils import (
    parse_infores,
    parse_reusabledata,
    parse_kgregistry,
    parse_matrixcurated,
    parse_matrixreviews,
    create_pks_subset_relevant_to_matrix,
    generate_list_of_pks_markdown_strings,
    generate_pks_markdown_documentation,
    load_json_file,
    load_yaml_file,
    save_markdown_file,
    save_yaml_file
)

# Directories holding the input files (all paths are relative to the working directory)
sources_dir = Path("sources")
mappings_dir = Path("mappings")

# Generated artefacts: the markdown report and its YAML counterpart
pks_md_file = Path("primary-knowledge-sources.md")
pks_yaml_file = Path("primary-knowledge-sources.yml")

# Upstream catalogues
infores_file = sources_dir / "infores_catalog.yaml"
reusabledata_file = sources_dir / "reusabledata.json"
kgregistry_file = sources_dir / "kgregistry.yml"

# SSSOM tables mapping reusabledata / kgregistry identifiers to infores
reusabledata_infores_mapping_file = mappings_dir / "reusabledata-infores.sssom.tsv"
kgregistry_infores_mapping_file = mappings_dir / "kgregistry-infores.sssom.tsv"

# Matrix-curated information
matrixcurated_file = sources_dir / "matrixcurated.tsv"
matrixreviews_file = sources_dir / "matrixreviews.tsv"

# Load infores_catalog.yml (YAML)
infores_d = load_yaml_file(infores_file)

# Load reusabledata.json (JSON)
reusabledata_d = load_json_file(reusabledata_file)

# Mapping: reusabledata -> infores
# Lines starting with '#' (presumably the SSSOM metadata header) are skipped via `comment`.
# NOTE(review): `comment` also truncates a data line at the first '#', so no field
# in the mapping file may contain that character.
# dtype=str keeps identifiers verbatim (no numeric inference), matching how the
# curated TSVs are read further down.
reusabledata_mapping_d = pd.read_csv(
    reusabledata_infores_mapping_file, sep="\t", comment="#", dtype=str
)
# Build the lookup column-wise rather than with iterrows(): no per-row Series is
# created, and a repeated subject_id still resolves to its last object_id.
reusabledata_mapping = dict(
    zip(reusabledata_mapping_d["subject_id"], reusabledata_mapping_d["object_id"])
)

# Load kgregistry.yml (YAML)
kgregistry_d = load_yaml_file(kgregistry_file)

# Mapping: kgregistry -> infores
# Lines starting with '#' (presumably the SSSOM metadata header) are skipped via `comment`.
# NOTE(review): `comment` also truncates a data line at the first '#', so no field
# in the mapping file may contain that character.
# dtype=str keeps identifiers verbatim (no numeric inference), matching how the
# curated TSVs are read further down.
kgregistry_mapping_d = pd.read_csv(
    kgregistry_infores_mapping_file, sep="\t", comment="#", dtype=str
)
# Build the lookup column-wise rather than with iterrows(): no per-row Series is
# created, and a repeated subject_id still resolves to its last object_id.
kgregistry_mapping = dict(
    zip(kgregistry_mapping_d["subject_id"], kgregistry_mapping_d["object_id"])
)

# Load manually curated data (TSV)
# dtype=str keeps every column as text; empty cells are still read as NaN.
matrixcurated_d = pd.read_csv(matrixcurated_file, sep="\t", dtype=str)
matrixreviews_d = pd.read_csv(matrixreviews_file, sep="\t", dtype=str)

# Create a list of relevant sources from the matrix curated data
# Note: this list might be overwritten depending on where we want it to come from
# Rows without a primary_knowledge_source are skipped: their value is a float NaN,
# which has no .replace() and would otherwise abort the cell with AttributeError.
relevant_sources = [
    src.replace("infores:", "")
    for src in matrixcurated_d["primary_knowledge_source"].dropna().unique()
]



In [5]:
# Parse all sources and combine into primary_knowledge_sources

# Rebound to a fresh dict on every run, so re-executing this cell starts from empty.
primary_knowledge_sources = {}
# Every parser is handed the same dict and its return value is not used here;
# presumably each one fills the dict in place — TODO confirm in pks_utils.
parse_infores(infores_d, primary_knowledge_sources)
# These two parsers also receive the subject_id -> object_id lookup built from
# the corresponding SSSOM mapping file.
parse_kgregistry(kgregistry_d, primary_knowledge_sources, kgregistry_mapping)
parse_reusabledata(reusabledata_d, primary_knowledge_sources, reusabledata_mapping)
# NOTE(review): the manually curated Matrix data is applied last; if later parsers
# overwrite fields set by earlier ones, this order matters — confirm before reordering.
parse_matrixcurated(matrixcurated_d, primary_knowledge_sources)
parse_matrixreviews(matrixreviews_d, primary_knowledge_sources)

In [6]:
# Produce final outputs (yaml file and markdown):

## Create subset of PKS relevant to the matrix and only output that
# `relevant_sources` holds the curated primary_knowledge_source values with the
# "infores:" text removed; how they are matched against the dict is decided inside
# create_pks_subset_relevant_to_matrix — TODO confirm in pks_utils.
matrix_subset_relevant_sources = create_pks_subset_relevant_to_matrix(primary_knowledge_sources, relevant_sources)
# Called with the subset and the YAML output path (primary-knowledge-sources.yml).
save_yaml_file(matrix_subset_relevant_sources, pks_yaml_file)

## Generate the documentation
# Presumably one markdown string per source, then assembled into a single
# document — TODO confirm in pks_utils.
pks_documentation_texts = generate_list_of_pks_markdown_strings(matrix_subset_relevant_sources)
documentation_md = generate_pks_markdown_documentation(pks_documentation_texts)
# Called with the document and the markdown output path (primary-knowledge-sources.md).
save_markdown_file(documentation_md, pks_md_file)