Skip to content

Commit

Permalink
Add a new table for managing metadata.
Browse files Browse the repository at this point in the history
  • Loading branch information
yjcyxky committed Apr 2, 2024
1 parent 91a0dce commit bea2f0c
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 8 deletions.
5 changes: 3 additions & 2 deletions data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ cargo install json2parquet
Get additional data for each compound from [DrugBank](https://www.drugbank.ca/). You might need to request access to the DrugBank data. If you have access, download the DrugBank XML file and save it to the `data/drugbank` directory. We assume the file is named `drugbank_5.1_2024-01-03.xml`.

```bash
python3 data/drugbank.py tojson --input-file data/drugbank_5.1_2024-01-03.xml --output-dir data
python3 data/drugbank.py tojson --input-file data/drugbank/drugbank_5.1_2024-01-03.xml --output-dir data/drugbank

json2parquet data/drugbank_5.1_2024-01-03.json data/drugbank_5.1_2024-01-03.parquet
python3 data/drugbank.py tojson --input-file data/drugbank/drugbank_5.1_2024-01-03.xml --output-dir data/drugbank --format linejson
json2parquet data/drugbank/drugbank_5.1_2024-01-03.jsonl data/drugbank/drugbank_5.1_2024-01-03.parquet
```

### Gene
11 changes: 11 additions & 0 deletions data/drugbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,16 @@ def format_salts_drugbank_id(drug):
for path, default in uncorrected_paths:
drugs_data = [set_default_if_empty(drug, path, default) for drug in drugs_data]

# We don't like the following fields, because they might cause issues when we import the data into a database.
for drug in drugs_data:
if "type" in drug:
drug["compound_type"] = drug["type"]
del drug["type"]

if "state" in drug:
drug["compound_state"] = drug["state"]
del drug["state"]

# Prepare the output JSON file path using version and exported-on attributes
json_file_path = os.path.join(output_dir, f"drugbank_{version}_{exported_on}.json")

Expand All @@ -251,6 +261,7 @@ def format_salts_drugbank_id(drug):

data = checktypes_wrapper(output_file, json_file_path)
if format == "linejson":
json_file_path = json_file_path.replace(".json", ".jsonl")
with open(json_file_path, "w", encoding="utf-8") as json_file:
for drug in data:
json_file.write(json.dumps(drug, ensure_ascii=False) + "\n")
Expand Down
2 changes: 2 additions & 0 deletions migrations/20240401_add_compound_metadata_table.down.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- Rollback for the compound metadata migration: remove the biomedgps_compound_metadata table.
-- IF EXISTS keeps the rollback from failing when the table is already absent.
DROP TABLE IF EXISTS biomedgps_compound_metadata;
30 changes: 30 additions & 0 deletions migrations/20240401_add_compound_metadata_table.up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
-- biomedgps_compound_metadata table is created to store metadata for compounds, such as the compound name, the compound type, patents, etc.
-- Array columns use the PostgreSQL `TEXT[]` type syntax, and the column list must not end with a trailing comma.
CREATE TABLE
    IF NOT EXISTS biomedgps_compound_metadata (
        id BIGSERIAL PRIMARY KEY, -- The entity metadata ID
        compound_type VARCHAR(64) NOT NULL, -- The type of the compound, such as drug, small molecule, etc.
        created VARCHAR(16) NOT NULL, -- The created time of the compound metadata
        updated VARCHAR(16) NOT NULL, -- The updated time of the compound metadata
        drugbank_id TEXT[] NOT NULL, -- The DrugBank IDs of the compound
        name VARCHAR(128) NOT NULL, -- The name of the compound
        description TEXT NOT NULL, -- The description of the compound
        cas_number VARCHAR(32) NOT NULL, -- The CAS number of the compound
        unii VARCHAR(32) NOT NULL, -- The UNII of the compound
        compound_state VARCHAR(32) NOT NULL, -- The state of the compound, such as solid, liquid, etc.
        groups TEXT[] NOT NULL, -- The groups of the compound, such as approved, investigational, etc.
        synthesis_reference TEXT NOT NULL, -- The synthesis reference of the compound
        indication TEXT NOT NULL, -- The indication of the compound
        pharmacodynamics TEXT NOT NULL, -- The pharmacodynamics of the compound
        mechanism_of_action TEXT NOT NULL, -- The mechanism of action of the compound
        toxicity TEXT NOT NULL, -- The toxicity of the compound
        metabolism TEXT NOT NULL, -- The metabolism of the compound
        absorption TEXT NOT NULL, -- The absorption of the compound
        half_life TEXT NOT NULL, -- The half-life of the compound
        protein_binding TEXT NOT NULL, -- The protein binding of the compound
        route_of_elimination TEXT NOT NULL, -- The route of elimination of the compound
        volume_of_distribution TEXT NOT NULL, -- The volume of distribution of the compound
        clearance TEXT NOT NULL, -- The clearance of the compound
        synonyms TEXT[] NOT NULL, -- The synonyms of the compound
        categories JSONB NOT NULL, -- The categories of the compound
        patents JSONB NOT NULL -- The patents of the compound
    );
2 changes: 1 addition & 1 deletion src/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
pub mod route;
pub mod schema;
pub mod auth;
pub mod req;
pub mod publication;
File renamed without changes.
2 changes: 1 addition & 1 deletion src/api/route.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! This module defines the routes of the API.

use crate::api::auth::{CustomSecurityScheme, USERNAME_PLACEHOLDER};
use crate::api::req::Publication;
use crate::api::publication::Publication;
use crate::api::schema::{
ApiTags, DeleteResponse, GetEntityColorMapResponse, GetGraphResponse, GetPromptResponse,
GetPublicationsResponse, GetRecordsResponse, GetRelationCountResponse, GetStatisticsResponse,
Expand Down
2 changes: 1 addition & 1 deletion src/api/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use serde::{Deserialize, Serialize};
use validator::Validate;
use validator::ValidationErrors;

use super::req::{PublicationDetail, PublicationRecords};
use super::publication::{PublicationDetail, PublicationRecords};

#[derive(Tags)]
pub enum ApiTags {
Expand Down
28 changes: 25 additions & 3 deletions src/bin/biomedgps-cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ extern crate log;

use biomedgps::model::init_db::create_kg_score_table;
use biomedgps::model::kge::{init_kge_models, DEFAULT_MODEL_NAME};
use biomedgps::model::metadata::CompoundMetadata;
use biomedgps::model::{
init_db::{
create_score_table, get_kg_score_table_name, kg_entity_table2graphdb,
Expand All @@ -10,8 +11,7 @@ use biomedgps::model::{
util::read_annotation_file,
};
use biomedgps::{
build_index, connect_graph_db, import_data, import_graph_data, import_kge, init_logger,
run_migrations,
build_index, connect_graph_db, import_data, import_kge, init_logger, run_migrations,
};
use log::*;
use regex::Regex;
Expand Down Expand Up @@ -114,7 +114,7 @@ pub struct ImportDBArguments {
///
/// In addition, if you upgrade the entity and relation tables, you need to ensure that the entity2d, relation_metadata, entity_metadata, knowledge_curation, subgraph tables are also upgraded. For the entity_metadata and relation_metadata, you can use the importdb command to upgrade after the entity and relation tables are upgraded.
///
/// The order of the tables to import is: entity, relation, entity_metadata, relation_metadata, knowledge_curation [Optional], subgraph [Optional], entity2d [Optional].
/// The order of the tables to import is: entity, relation, entity_metadata, relation_metadata, knowledge_curation [Optional], subgraph [Optional], entity2d [Optional], compound-metadata[Optional].
#[structopt(name = "table", short = "t", long = "table")]
table: String,

Expand Down Expand Up @@ -472,6 +472,28 @@ async fn main() {
return;
};

if arguments.table == "compound-metadata" {
let pool = match sqlx::PgPool::connect(&database_url).await {
Ok(v) => v,
Err(e) => {
error!("Connect to database failed: {}", e);
std::process::exit(1);
}
};
let filepath = match &arguments.filepath {
Some(v) => PathBuf::from(v),
None => {
error!("Please specify the file path for the compound-metadata table.");
std::process::exit(1);
}
};

match CompoundMetadata::sync2db(&pool, &filepath).await {
Ok(_) => info!("Import compound-metadata table successfully."),
Err(e) => error!("Import compound-metadata table failed: {}", e),
}
}

// The annotation file is essential for relation table. We need the formatted_relation_type to annotate the relation_type.
let relation_type_mappings = if arguments.table == "relation" {
if arguments.annotation_file.is_none() {
Expand Down
216 changes: 216 additions & 0 deletions src/model/metadata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
//! Add metadata to the entity and relationship model.
use log::debug;
use poem_openapi::Object;
use serde::{Deserialize, Serialize};
use serde_json;
use std::{error::Error, path::PathBuf};
use validator::{Validate, ValidationErrors};

/// One category entry of a compound; the element type of `CompoundMetadata::categories`.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
pub struct Category {
/// The category label. NOTE(review): presumably the DrugBank category name — confirm against the exported JSON.
pub category: String,
/// NOTE(review): presumably the MeSH identifier associated with the category — confirm against the exported JSON.
pub mesh_id: String,
}

/// One patent entry of a compound; the element type of `CompoundMetadata::patents`.
///
/// Every field is kept as a plain string exactly as it appears in the input data.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
pub struct Patent {
/// NOTE(review): presumably the patent number — confirm against the exported JSON.
pub number: String,
/// NOTE(review): presumably the country that issued the patent — confirm.
pub country: String,
/// NOTE(review): presumably the approval date, kept as text — confirm the format.
pub approved: String,
/// NOTE(review): presumably the expiry date, kept as text — confirm the format.
pub expires: String,
/// NOTE(review): presumably a textual flag for a pediatric extension — confirm the allowed values.
pub pediatric_extension: String,
}

/// Metadata record for one compound, mirroring the columns of the
/// `biomedgps_compound_metadata` table (all columns except the database-generated `id`).
///
/// The commented-out fields below are not part of the record yet.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Object, sqlx::FromRow, Validate)]
pub struct CompoundMetadata {
/// The type of the compound, such as drug, small molecule, etc.
pub compound_type: String,
/// The created time of the compound metadata, kept as text.
pub created: String,
/// The updated time of the compound metadata, kept as text.
pub updated: String,
/// The DrugBank IDs of the compound.
pub drugbank_id: Vec<String>,
/// The name of the compound.
pub name: String,
/// The description of the compound.
pub description: String,
/// The CAS number of the compound.
pub cas_number: String,
/// The UNII of the compound.
pub unii: String,
/// The state of the compound, such as solid, liquid, etc.
pub compound_state: String,
/// The groups of the compound, such as approved, investigational, etc.
pub groups: Vec<String>,
// pub general_references: GeneralReference,
/// The synthesis reference of the compound.
pub synthesis_reference: String,
/// The indication of the compound.
pub indication: String,
/// The pharmacodynamics of the compound.
pub pharmacodynamics: String,
/// The mechanism of action of the compound.
pub mechanism_of_action: String,
/// The toxicity of the compound.
pub toxicity: String,
/// The metabolism of the compound.
pub metabolism: String,
/// The absorption of the compound.
pub absorption: String,
/// The half-life of the compound.
pub half_life: String,
/// The protein binding of the compound.
pub protein_binding: String,
/// The route of elimination of the compound.
pub route_of_elimination: String,
/// The volume of distribution of the compound.
pub volume_of_distribution: String,
/// The clearance of the compound.
pub clearance: String,
// pub classification: Classification,
// pub salts: Vec<String>,
/// The synonyms of the compound.
pub synonyms: Vec<String>,
// pub products: Vec<Product>,
// pub international_brands: Vec<InternationalBrand>,
// pub mixtures: Vec<Mixture>,
// pub packagers: Vec<Packager>,
// pub manufacturers: Vec<Manufacturer>,
// pub prices: Vec<Price>,
/// The categories of the compound (a JSONB column in the table).
pub categories: Vec<Category>,
// pub affected_organisms: Vec<String>,
// pub dosages: Vec<Dosage>,
// pub atc_codes: Vec<AtcCode>,
// pub ahfs_codes: Vec<AhfsCode>,
// pub pdb_entries: Vec<PdbEntry>,
/// The patents of the compound (a JSONB column in the table).
pub patents: Vec<Patent>,
// pub food_interactions: Vec<String>,
// pub drug_interactions: Vec<DrugInteraction>,
// pub sequences: Vec<Sequence>,
// pub experimental_properties: ExperimentalProperty,
// pub external_identifiers: Vec<ExternalIdentifier>,
// pub external_links: Vec<ExternalLink>,
// pub pathways: Vec<Pathway>,
// pub reactions: Vec<Reaction>,
// pub snp_effects: Vec<SnpEffect>,
// pub snp_adverse_drug_reactions: Vec<SnpAdverseDrugReaction>,
// pub targets: Vec<Target>,
// pub enzymes: Vec<Enzyme>,
// pub carriers: Vec<Carrier>,
// pub transporters: Vec<Transporter>,
}

impl CompoundMetadata {
    /// Loads the compound metadata stored in `filepath` into the
    /// `biomedgps_compound_metadata` table.
    ///
    /// The rows are first copied into a temporary `staging` table and then
    /// inserted into the target table; a staged row is skipped when a row with
    /// the same unique fields (see `unique_fields`) already exists. All
    /// statements run inside a single transaction, so a failure at any step
    /// leaves the target table untouched.
    ///
    /// # Arguments
    ///
    /// * `pool` - The PostgreSQL connection pool the transaction is started on.
    /// * `filepath` - The file handed to `COPY ... FROM`. Because the path is
    ///   embedded in a server-side `COPY` statement, it is opened by the
    ///   database server, not by this process.
    ///
    /// # Errors
    ///
    /// Returns the underlying `sqlx` error when starting the transaction, any
    /// of the statements, or the commit fails.
    pub async fn sync2db(pool: &sqlx::PgPool, filepath: &PathBuf) -> Result<(), Box<dyn Error>> {
        let mut tx = pool.begin().await?;

        // A temporary table is only visible to the session that created it, so
        // the clean-up has to run on the transaction's own connection; a
        // statement sent through `pool` may land on a different session.
        sqlx::query("DROP TABLE IF EXISTS staging")
            .execute(&mut tx)
            .await?;
        debug!("Drop table staging successfully.");

        // ON COMMIT DROP removes the staging table together with the
        // transaction, so nothing is left behind on the pooled connection.
        sqlx::query(
            "CREATE TEMPORARY TABLE staging (LIKE biomedgps_compound_metadata INCLUDING DEFAULTS) ON COMMIT DROP",
        )
        .execute(&mut tx)
        .await?;

        let columns = Self::fields().join(", ");

        // The file name of COPY must be a quoted SQL string literal; embedded
        // single quotes are escaped by doubling them.
        let quoted_path = filepath.display().to_string().replace('\'', "''");

        // NOTE(review): the PostgreSQL documentation lists only the text, csv
        // and binary COPY formats — confirm that the target server accepts
        // `FORMAT JSON` before relying on this statement.
        let query_str = format!(
            "COPY staging ({}) FROM '{}' WITH (FORMAT JSON)",
            columns, quoted_path
        );

        debug!("Start to copy data to the staging table.");
        sqlx::query(&query_str).execute(&mut tx).await?;

        // Match staged rows against existing rows on every unique field.
        let where_clause = Self::unique_fields()
            .iter()
            .map(|c| format!("biomedgps_compound_metadata.{} = staging.{}", c, c))
            .collect::<Vec<String>>()
            .join(" AND ");

        sqlx::query(&format!(
            "INSERT INTO biomedgps_compound_metadata ({})
             SELECT {} FROM staging
             WHERE NOT EXISTS (SELECT 1 FROM biomedgps_compound_metadata WHERE {})
             ON CONFLICT DO NOTHING",
            columns, columns, where_clause
        ))
        .execute(&mut tx)
        .await?;

        tx.commit().await?;

        Ok(())
    }
}

pub trait CheckMetadata {
fn check_json_is_valid(filepath: &PathBuf) -> Vec<Box<ValidationErrors>>;

// Implement the check function
fn check_json_is_valid_default<
S: for<'de> serde::Deserialize<'de> + Validate + std::fmt::Debug,
>(
filepath: &PathBuf,
) -> Vec<Box<ValidationErrors>> {
let file = std::fs::File::open(filepath).unwrap();
let reader = std::io::BufReader::new(file);
let data: Vec<S> = serde_json::from_reader(reader).unwrap();
let mut errors: Vec<Box<ValidationErrors>> = Vec::new();
for d in data.iter() {
match d.validate() {
Ok(_) => {}
Err(e) => {
errors.push(Box::new(e));
}
}
}
errors
}

fn fields() -> Vec<String>;

fn unique_fields() -> Vec<String>;

fn get_error_msg<S: for<'de> serde::Deserialize<'de> + Validate + std::fmt::Debug>(
r: Result<Vec<S>, Box<ValidationErrors>>,
) -> String {
match r {
Ok(_) => "".to_string(),
Err(e) => {
return e.to_string();
}
}
}
}

impl CheckMetadata for CompoundMetadata {
    /// Validates a JSON file of compound metadata records by delegating to the
    /// trait's default implementation.
    fn check_json_is_valid(filepath: &PathBuf) -> Vec<Box<ValidationErrors>> {
        Self::check_json_is_valid_default::<Self>(filepath)
    }

    /// A compound record is identified by its `name` column.
    fn unique_fields() -> Vec<String> {
        vec![String::from("name")]
    }

    /// Column names filled from a record, in the order they are listed in
    /// generated SQL statements.
    fn fields() -> Vec<String> {
        const COLUMNS: &[&str] = &[
            "compound_type",
            "created",
            "updated",
            "drugbank_id",
            "name",
            "description",
            "cas_number",
            "unii",
            "compound_state",
            "groups",
            "synthesis_reference",
            "indication",
            "pharmacodynamics",
            "mechanism_of_action",
            "toxicity",
            "metabolism",
            "absorption",
            "half_life",
            "protein_binding",
            "route_of_elimination",
            "volume_of_distribution",
            "clearance",
            "categories",
            "patents",
            "synonyms",
        ];

        COLUMNS.iter().map(|column| column.to_string()).collect()
    }
}
1 change: 1 addition & 0 deletions src/model/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ pub mod graph;
pub mod llm;
pub mod kge;
pub mod init_db;
pub mod metadata;

0 comments on commit bea2f0c

Please sign in to comment.