Add a new table for managing metadata.

open-prophetdb · Apr 2, 2024 · bea2f0c · bea2f0c
1 parent 91a0dce
commit bea2f0c
Show file tree

Hide file tree

Showing 11 changed files with 291 additions and 8 deletions.
diff --git a/data/README.md b/data/README.md
@@ -20,9 +20,10 @@ cargo install json2parquet
 Get additional data for each compound from [DrugBank](https://www.drugbank.ca/). You might need to request access to the DrugBank data. If you have access, download the DrugBank XML file and save it to the `data` directory. We assume the file is named `drugbank_5.1_2024-01-03.xml`.
 
 ```bash
-python3 data/drugbank.py tojson --input-file data/drugbank_5.1_2024-01-03.xml --output-dir data
+python3 data/drugbank.py tojson --input-file data/drugbank/drugbank_5.1_2024-01-03.xml --output-dir data/drugbank
 
-json2parquet data/drugbank_5.1_2024-01-03.json data/drugbank_5.1_2024-01-03.parquet
+python3 data/drugbank.py tojson --input-file data/drugbank/drugbank_5.1_2024-01-03.xml --output-dir data/drugbank --format linejson
+json2parquet data/drugbank/drugbank_5.1_2024-01-03.jsonl data/drugbank/drugbank_5.1_2024-01-03.parquet
 ```
 
 ### Gene
diff --git a/data/drugbank.py b/data/drugbank.py
@@ -239,6 +239,16 @@ def format_salts_drugbank_id(drug):
     for path, default in uncorrected_paths:
         drugs_data = [set_default_if_empty(drug, path, default) for drug in drugs_data]
 
+    # Rename the "type" and "state" fields, because these names might cause issues when we import the data into a database.
+    for drug in drugs_data:
+        if "type" in drug:
+            drug["compound_type"] = drug["type"]
+            del drug["type"]
+
+        if "state" in drug:
+            drug["compound_state"] = drug["state"]
+            del drug["state"]
+
     # Prepare the output JSON file path using version and exported-on attributes
     json_file_path = os.path.join(output_dir, f"drugbank_{version}_{exported_on}.json")
 
@@ -251,6 +261,7 @@ def format_salts_drugbank_id(drug):
 
     data = checktypes_wrapper(output_file, json_file_path)
     if format == "linejson":
+        json_file_path = json_file_path.replace(".json", ".jsonl")
         with open(json_file_path, "w", encoding="utf-8") as json_file:
             for drug in data:
                 json_file.write(json.dumps(drug, ensure_ascii=False) + "\n")

diff --git a/migrations/20240401_add_compound_metadata_table.down.sql b/migrations/20240401_add_compound_metadata_table.down.sql
@@ -0,0 +1,2 @@
+-- Rollback of the compound metadata migration: drop the biomedgps_compound_metadata table.
+-- IF EXISTS keeps the rollback from failing when the table is already absent.
+DROP TABLE IF EXISTS biomedgps_compound_metadata;
diff --git a/migrations/20240401_add_compound_metadata_table.up.sql b/migrations/20240401_add_compound_metadata_table.up.sql
@@ -0,0 +1,30 @@
+-- The biomedgps_compound_metadata table stores metadata for compounds, such as the compound name, the compound type, patents, etc. Array columns use the PostgreSQL TEXT[] type.
+CREATE TABLE
+    IF NOT EXISTS biomedgps_compound_metadata (
+        id BIGSERIAL PRIMARY KEY, -- The compound metadata ID
+        compound_type VARCHAR(64) NOT NULL, -- The type of the compound, such as drug, small molecule, etc.
+        created VARCHAR(16) NOT NULL, -- The created time of the compound metadata
+        updated VARCHAR(16) NOT NULL, -- The updated time of the compound metadata
+        drugbank_id TEXT[] NOT NULL, -- The DrugBank IDs of the compound
+        name VARCHAR(128) NOT NULL, -- The name of the compound
+        description TEXT NOT NULL, -- The description of the compound
+        cas_number VARCHAR(32) NOT NULL, -- The CAS number of the compound
+        unii VARCHAR(32) NOT NULL, -- The UNII of the compound
+        compound_state VARCHAR(32) NOT NULL, -- The state of the compound, such as solid, liquid, etc.
+        groups TEXT[] NOT NULL, -- The groups of the compound, such as approved, investigational, etc.
+        synthesis_reference TEXT NOT NULL, -- The synthesis reference of the compound
+        indication TEXT NOT NULL, -- The indication of the compound
+        pharmacodynamics TEXT NOT NULL, -- The pharmacodynamics of the compound
+        mechanism_of_action TEXT NOT NULL, -- The mechanism of action of the compound
+        toxicity TEXT NOT NULL, -- The toxicity of the compound
+        metabolism TEXT NOT NULL, -- The metabolism of the compound
+        absorption TEXT NOT NULL, -- The absorption of the compound
+        half_life TEXT NOT NULL, -- The half-life of the compound
+        protein_binding TEXT NOT NULL, -- The protein binding of the compound
+        route_of_elimination TEXT NOT NULL, -- The route of elimination of the compound
+        volume_of_distribution TEXT NOT NULL, -- The volume of distribution of the compound
+        clearance TEXT NOT NULL, -- The clearance of the compound
+        synonyms TEXT[] NOT NULL, -- The synonyms of the compound
+        categories JSONB NOT NULL, -- The categories of the compound
+        patents JSONB NOT NULL -- The patents of the compound
+    );
diff --git a/src/api/mod.rs b/src/api/mod.rs
@@ -3,4 +3,4 @@
 pub mod route;
 pub mod schema;
 pub mod auth;
-pub mod req;
+pub mod publication;
diff --git a/src/api/req.rs → src/api/publication.rs b/src/api/req.rs → src/api/publication.rs
diff --git a/src/api/route.rs b/src/api/route.rs
@@ -1,7 +1,7 @@
 //! This module defines the routes of the API.
 
 use crate::api::auth::{CustomSecurityScheme, USERNAME_PLACEHOLDER};
-use crate::api::req::Publication;
+use crate::api::publication::Publication;
 use crate::api::schema::{
     ApiTags, DeleteResponse, GetEntityColorMapResponse, GetGraphResponse, GetPromptResponse,
     GetPublicationsResponse, GetRecordsResponse, GetRelationCountResponse, GetStatisticsResponse,

diff --git a/src/api/schema.rs b/src/api/schema.rs
@@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
 use validator::Validate;
 use validator::ValidationErrors;
 
-use super::req::{PublicationDetail, PublicationRecords};
+use super::publication::{PublicationDetail, PublicationRecords};
 
 #[derive(Tags)]
 pub enum ApiTags {

diff --git a/src/bin/biomedgps-cli.rs b/src/bin/biomedgps-cli.rs
@@ -2,6 +2,7 @@ extern crate log;
 
 use biomedgps::model::init_db::create_kg_score_table;
 use biomedgps::model::kge::{init_kge_models, DEFAULT_MODEL_NAME};
+use biomedgps::model::metadata::CompoundMetadata;
 use biomedgps::model::{
     init_db::{
         create_score_table, get_kg_score_table_name, kg_entity_table2graphdb,
@@ -10,8 +11,7 @@ use biomedgps::model::{
     util::read_annotation_file,
 };
 use biomedgps::{
-    build_index, connect_graph_db, import_data, import_graph_data, import_kge, init_logger,
-    run_migrations,
+    build_index, connect_graph_db, import_data, import_kge, init_logger, run_migrations,
 };
 use log::*;
 use regex::Regex;
@@ -114,7 +114,7 @@ pub struct ImportDBArguments {
     ///
     /// In addition, if you upgrade the entity and relation tables, you need to ensure that the entity2d, relation_metadata, entity_metadata, knowledge_curation, subgraph tables are also upgraded. For the entity_metadata and relation_metadata, you can use the importdb command to upgrade after the entity and relation tables are upgraded.
     ///
-    /// The order of the tables to import is: entity, relation, entity_metadata, relation_metadata, knowledge_curation [Optional], subgraph [Optional], entity2d [Optional].
+    /// The order of the tables to import is: entity, relation, entity_metadata, relation_metadata, knowledge_curation [Optional], subgraph [Optional], entity2d [Optional], compound-metadata [Optional].
     #[structopt(name = "table", short = "t", long = "table")]
     table: String,
 
@@ -472,6 +472,28 @@ async fn main() {
                 return;
             };
 
+            // The compound-metadata table is loaded by its own routine
+            // (CompoundMetadata::sync2db), so it is handled here in full and must not fall
+            // through to the handling of the other tables that follows.
+            if arguments.table == "compound-metadata" {
+                let pool = match sqlx::PgPool::connect(&database_url).await {
+                    Ok(v) => v,
+                    Err(e) => {
+                        error!("Connect to database failed: {}", e);
+                        std::process::exit(1);
+                    }
+                };
+                let filepath = match &arguments.filepath {
+                    Some(v) => PathBuf::from(v),
+                    None => {
+                        error!("Please specify the file path for the compound-metadata table.");
+                        std::process::exit(1);
+                    }
+                };
+
+                match CompoundMetadata::sync2db(&pool, &filepath).await {
+                    Ok(_) => info!("Import compound-metadata table successfully."),
+                    Err(e) => {
+                        // Report the failure through the exit status, like the other
+                        // error branches of this block.
+                        error!("Import compound-metadata table failed: {}", e);
+                        std::process::exit(1);
+                    }
+                }
+
+                // Nothing else to do for this table.
+                return;
+            }
+
             // The annotation file is essential for relation table. We need the formatted_relation_type to annotate the relation_type.
             let relation_type_mappings = if arguments.table == "relation" {
                 if arguments.annotation_file.is_none() {

diff --git a/src/model/metadata.rs b/src/model/metadata.rs
@@ -0,0 +1,216 @@
+//! Add metadata to the entity and relationship model.
+use log::debug;
+use poem_openapi::Object;
+use serde::{Deserialize, Serialize};
+use serde_json;
+use std::{error::Error, path::PathBuf};
+use validator::{Validate, ValidationErrors};
+
+// One category entry of a compound; `CompoundMetadata::categories` holds a list of these.
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
+pub struct Category {
+    // The category label.
+    pub category: String,
+    // NOTE(review): presumably the MeSH identifier of the category — confirm against the source data.
+    pub mesh_id: String,
+}
+
+// One patent entry of a compound; `CompoundMetadata::patents` holds a list of these.
+// Every field is kept as a plain string.
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
+pub struct Patent {
+    // The patent number.
+    pub number: String,
+    // The country of the patent.
+    pub country: String,
+    // NOTE(review): presumably the approval date as text — confirm the format against the source data.
+    pub approved: String,
+    // NOTE(review): presumably the expiry date as text — confirm the format against the source data.
+    pub expires: String,
+    // NOTE(review): presumably a textual flag for a pediatric extension — confirm the allowed values.
+    pub pediatric_extension: String,
+}
+
+// Metadata record of a compound.
+//
+// The active fields carry the same names as the columns of the
+// `biomedgps_compound_metadata` table (except its `id` column). The commented-out
+// fields below are not part of the model yet.
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
+pub struct CompoundMetadata {
+    // The type of the compound (named `compound_type` rather than `type`).
+    pub compound_type: String,
+    // Creation and update times, kept as strings.
+    pub created: String,
+    pub updated: String,
+    // A compound can carry several DrugBank IDs.
+    pub drugbank_id: Vec<String>,
+    pub name: String,
+    pub description: String,
+    pub cas_number: String,
+    pub unii: String,
+    // The state of the compound (named `compound_state` rather than `state`).
+    pub compound_state: String,
+    pub groups: Vec<String>,
+    // pub general_references: GeneralReference,
+    pub synthesis_reference: String,
+    pub indication: String,
+    pub pharmacodynamics: String,
+    pub mechanism_of_action: String,
+    pub toxicity: String,
+    pub metabolism: String,
+    pub absorption: String,
+    pub half_life: String,
+    pub protein_binding: String,
+    pub route_of_elimination: String,
+    pub volume_of_distribution: String,
+    pub clearance: String,
+    // pub classification: Classification,
+    // pub salts: Vec<String>,
+    pub synonyms: Vec<String>,
+    // pub products: Vec<Product>,
+    // pub international_brands: Vec<InternationalBrand>,
+    // pub mixtures: Vec<Mixture>,
+    // pub packagers: Vec<Packager>,
+    // pub manufacturers: Vec<Manufacturer>,
+    // pub prices: Vec<Price>,
+    // Structured entries; the matching table column is JSONB.
+    pub categories: Vec<Category>,
+    // pub affected_organisms: Vec<String>,
+    // pub dosages: Vec<Dosage>,
+    // pub atc_codes: Vec<AtcCode>,
+    // pub ahfs_codes: Vec<AhfsCode>,
+    // pub pdb_entries: Vec<PdbEntry>,
+    // Structured entries; the matching table column is JSONB.
+    pub patents: Vec<Patent>,
+    // pub food_interactions: Vec<String>,
+    // pub drug_interactions: Vec<DrugInteraction>,
+    // pub sequences: Vec<Sequence>,
+    // pub experimental_properties: ExperimentalProperty,
+    // pub external_identifiers: Vec<ExternalIdentifier>,
+    // pub external_links: Vec<ExternalLink>,
+    // pub pathways: Vec<Pathway>,
+    // pub reactions: Vec<Reaction>,
+    // pub snp_effects: Vec<SnpEffect>,
+    // pub snp_adverse_drug_reactions: Vec<SnpAdverseDrugReaction>,
+    // pub targets: Vec<Target>,
+    // pub enzymes: Vec<Enzyme>,
+    // pub carriers: Vec<Carrier>,
+    // pub transporters: Vec<Transporter>,
+}
+
+impl CompoundMetadata {
+    /// Load compound metadata from `filepath` into `biomedgps_compound_metadata`.
+    ///
+    /// The file is copied into a temporary `staging` table, then every staged row whose
+    /// unique fields do not match an existing row is inserted into the target table.
+    /// Both steps run inside a single transaction.
+    ///
+    /// # Errors
+    ///
+    /// Returns the underlying error when the transaction cannot be opened, or when
+    /// creating the staging table, the `COPY`, the insert or the commit fails.
+    ///
+    /// NOTE(review): `COPY ... FROM '<file>'` is executed by the database server, so the
+    /// path presumably has to be readable from the server side. The PostgreSQL manual
+    /// lists only `text`, `csv` and `binary` as COPY formats — confirm that the target
+    /// server accepts `FORMAT JSON`.
+    pub async fn sync2db(pool: &sqlx::PgPool, filepath: &PathBuf) -> Result<(), Box<dyn Error>> {
+        // Best-effort removal of a leftover staging table; a failure is only logged.
+        match sqlx::query("DROP TABLE IF EXISTS staging")
+            .execute(pool)
+            .await
+        {
+            Ok(_) => debug!("Drop table staging successfully."),
+            Err(e) => debug!("Drop table staging failed: {:?}", e),
+        }
+
+        let mut tx = pool.begin().await?;
+        // A temporary table is only visible to the session that created it, so a later
+        // DROP issued through the pool may run on another connection and miss it.
+        // ON COMMIT DROP ties the cleanup to this transaction instead.
+        sqlx::query(
+            "CREATE TEMPORARY TABLE staging (LIKE biomedgps_compound_metadata INCLUDING DEFAULTS) ON COMMIT DROP",
+        )
+        .execute(&mut tx)
+        .await?;
+
+        let columns = Self::fields().join(", ");
+        // COPY expects the file name as an SQL string literal: wrap it in single quotes
+        // and double any embedded quote so the statement stays well-formed.
+        let file_literal = filepath.display().to_string().replace('\'', "''");
+        let query_str = format!(
+            "COPY staging ({}) FROM '{}' WITH (FORMAT JSON)",
+            columns, file_literal
+        );
+
+        debug!("Start to copy data to the staging table.");
+        sqlx::query(&query_str).execute(&mut tx).await?;
+
+        // A staged row counts as already present when all unique fields match.
+        let where_clause = Self::unique_fields()
+            .iter()
+            .map(|c| format!("biomedgps_compound_metadata.{} = staging.{}", c, c))
+            .collect::<Vec<String>>()
+            .join(" AND ");
+
+        sqlx::query(&format!(
+            "INSERT INTO biomedgps_compound_metadata ({})
+             SELECT {} FROM staging
+             WHERE NOT EXISTS (SELECT 1 FROM biomedgps_compound_metadata WHERE {})
+             ON CONFLICT DO NOTHING",
+            columns, columns, where_clause
+        ))
+        .execute(&mut tx)
+        .await?;
+
+        // Committing also drops the staging table (ON COMMIT DROP).
+        tx.commit().await?;
+
+        Ok(())
+    }
+}
+
+/// Validation and field-listing helpers for metadata models.
+pub trait CheckMetadata {
+    /// Check the JSON file at `filepath` and return the validation errors found.
+    fn check_json_is_valid(filepath: &PathBuf) -> Vec<Box<ValidationErrors>>;
+
+    /// Default checker: read `filepath` as a JSON array of `S` and collect the
+    /// validation errors of every record that fails `validate()`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when the file cannot be opened or does not deserialize into `Vec<S>`.
+    fn check_json_is_valid_default<S>(filepath: &PathBuf) -> Vec<Box<ValidationErrors>>
+    where
+        S: for<'de> serde::Deserialize<'de> + Validate + std::fmt::Debug,
+    {
+        let reader = std::io::BufReader::new(std::fs::File::open(filepath).unwrap());
+        let records: Vec<S> = serde_json::from_reader(reader).unwrap();
+
+        // Keep only the failures, boxed, in record order.
+        records
+            .iter()
+            .filter_map(|record| record.validate().err().map(Box::new))
+            .collect()
+    }
+
+    /// Names of the fields of the model.
+    fn fields() -> Vec<String>;
+
+    /// Names of the fields that identify a record.
+    fn unique_fields() -> Vec<String>;
+
+    /// Render the error of `r` as a string; an `Ok` result yields an empty string.
+    fn get_error_msg<S>(r: Result<Vec<S>, Box<ValidationErrors>>) -> String
+    where
+        S: for<'de> serde::Deserialize<'de> + Validate + std::fmt::Debug,
+    {
+        r.err().map(|e| e.to_string()).unwrap_or_default()
+    }
+}
+
+impl CheckMetadata for CompoundMetadata {
+    /// Validate a compound metadata JSON file with the trait's default checker.
+    fn check_json_is_valid(filepath: &PathBuf) -> Vec<Box<ValidationErrors>> {
+        Self::check_json_is_valid_default::<Self>(filepath)
+    }
+
+    /// A compound is identified by its `name`.
+    fn unique_fields() -> Vec<String> {
+        vec![String::from("name")]
+    }
+
+    /// Field names of the compound metadata model, in the order used when building
+    /// column lists.
+    fn fields() -> Vec<String> {
+        [
+            "compound_type",
+            "created",
+            "updated",
+            "drugbank_id",
+            "name",
+            "description",
+            "cas_number",
+            "unii",
+            "compound_state",
+            "groups",
+            "synthesis_reference",
+            "indication",
+            "pharmacodynamics",
+            "mechanism_of_action",
+            "toxicity",
+            "metabolism",
+            "absorption",
+            "half_life",
+            "protein_binding",
+            "route_of_elimination",
+            "volume_of_distribution",
+            "clearance",
+            "categories",
+            "patents",
+            "synonyms",
+        ]
+        .iter()
+        .map(|field| field.to_string())
+        .collect()
+    }
+}
diff --git a/src/model/mod.rs b/src/model/mod.rs
@@ -6,3 +6,4 @@ pub mod graph;
 pub mod llm;
 pub mod kge;
 pub mod init_db;
+pub mod metadata;