Improve the auth process.

open-prophetdb · Mar 28, 2024 · 6369383 · 6369383
1 parent 4f1cd17
commit 6369383
Show file tree

Hide file tree

Showing 12 changed files with 434 additions and 140 deletions.
diff --git a/.gitignore b/.gitignore
@@ -30,4 +30,5 @@ assets/*
 .VSCodeCounter
 neo4j-data
 neo4j-import
-data
+data/*.json
+data/*.parquet
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,13 @@
+## Prepare additional data for each entity and relation
+
+### Compound
+
+Get additional data for each compound from [DrugBank](https://www.drugbank.ca/).
+
+```bash
+python3 data/drugbank.py tojson --input xxx.xml --output data
+
+python3 data/drugbank.py toparquet --input data/drugbank_5.1_2024-01-03.json --output data
+```
+
+### Gene
diff --git a/data/drugbank.py b/data/drugbank.py
@@ -0,0 +1,199 @@
+import click
+import os
+import xml.etree.ElementTree as ET
+import json
+from collections import defaultdict
+
+# Root click command group; the sub-commands defined in this file are attached to it.
+cli = click.Group()
+
+def remove_namespace_and_hyphen(tag):
+    """Remove the namespace from the tag name and replace hyphens with underscores."""
+    # Remove namespace
+    tag = tag.split('}')[-1]
+    # Replace hyphens with underscores
+    tag = tag.replace('-', '_')
+    return tag
+
+def parse_element(elem):
+    """Recursively parse an XML element into a dictionary or a list, ignoring attributes."""
+    # Check for text content directly in this element
+    if elem.text and elem.text.strip():
+        return elem.text.strip()
+    else:
+        result = {}
+
+    for name, value in elem.attrib.items():
+        result[remove_namespace_and_hyphen(name)] = value
+
+    # Group children by tag to decide on list or single item
+    children_by_tag = {}
+    for child in elem:
+        key = remove_namespace_and_hyphen(child.tag)
+        children_by_tag.setdefault(key, []).append(parse_element(child))
+
+    # Add children to result, converting to lists or dicts as appropriate
+    for key, children in children_by_tag.items():
+        if len(children) > 1:
+            result[key] = children  # Set as list if multiple children with same tag
+        else:
+            result[key] = children[0]  # Single item, not a list
+
+    return result or None
+
+def unify_to_array(value):
+    if value is None:
+        return []
+    elif isinstance(value, list):
+        return value
+    else:
+        return [value]
+
+def check_singular_plural(singular, plural):
+    if singular == "category" and plural == "categories":
+        return True
+    return singular + "s" == plural
+
+def set_default_if_empty(nested_dict, path, default):
+    """
+    Wrap the value stored under the last key of ``path`` in a list when ``default`` is a list.
+
+    NOTE(review): despite the name, ``default`` is never written into the data. It is only
+    inspected with ``isinstance(default, list)`` to decide whether a lone str/dict value gets
+    wrapped. A final key that is missing or ``None`` is left untouched.
+
+    :param nested_dict: A dict, or a list whose items are each processed with ``path[1:]``.
+    :param path: A list of keys representing the path to the target value.
+    :param default: Only its type is consulted; a list enables the wrapping described above.
+    :return: The (mutated) dict, a new list of processed items, or ``None`` when
+        ``nested_dict`` is neither a dict nor a list.
+    """
+    # A list is handled item by item, with the first key of the path dropped.
+    if isinstance(nested_dict, list):
+        return [set_default_if_empty(item, path[1:], default) for item in nested_dict]
+
+    if isinstance(nested_dict, dict):
+        # Navigate through the nested dictionary along the path, except for the last key
+        # NOTE(review): current_level is never reassigned below, so every key of path[:-1]
+        # (and final_key) is looked up on nested_dict itself; descent into deeper levels
+        # happens only through the recursive calls.
+        current_level = nested_dict
+
+        for key in path[:-1]:
+            if key in current_level:
+                # An empty intermediate value ends processing for this dict.
+                if current_level[key] is None or current_level[key] == "":
+                    return current_level
+                elif isinstance(current_level[key], list):
+                    # NOTE(review): items that are neither dict nor list make the recursive
+                    # call return None, so such items are replaced by None here - confirm
+                    # this is intended.
+                    current_level[key] = [set_default_if_empty(item, path[1:], default) for item in current_level[key]]
+                elif isinstance(current_level[key], dict):
+                    # The recursion always uses path[1:], whichever key the loop is on.
+                    current_level[key] = set_default_if_empty(current_level[key], path[1:], default)
+
+        final_key = path[-1]
+        if final_key in current_level and current_level[final_key] is not None:
+            # Wrap a lone str/dict so the field is list-typed; other values stay as they are.
+            if (isinstance(current_level[final_key], str) or 
+                isinstance(current_level[final_key], dict)) and isinstance(default, list):
+                current_level[final_key] = [current_level[final_key]]
+
+        return current_level
+
+
+def transform_json(obj):
+    if isinstance(obj, dict):
+        new_obj = {}
+        for key, value in obj.items():
+            if isinstance(value, dict) and len(value.keys()) == 1 and check_singular_plural(list(value.keys())[0], key):
+                subkey = list(value.keys())[0]
+                new_obj[key] = unify_to_array(
+                    transform_json(value[subkey])
+                )
+            else:
+                new_obj[key] = transform_json(value)
+
+        return new_obj
+    elif isinstance(obj, list):
+        return [transform_json(item) for item in obj]
+
+    return obj
+
+def check_types(data, parent_key='', path_types=defaultdict(list)):
+    if isinstance(data, dict):
+        for key, value in data.items():
+            if isinstance(value, (dict, list)):
+                check_types(value, f"{parent_key}.{key}" if parent_key else key, path_types)
+            else:
+                path_types[f"{parent_key}.{key}" if parent_key else key].append(type(value).__name__)
+    elif isinstance(data, list):
+        for i, item in enumerate(data):
+            check_types(item, f"{parent_key}[{i}]" if parent_key else str(i), path_types)
+    else:
+        path_types[parent_key].append(type(data).__name__)
+
+    return path_types
+
+def find_inconsistencies(path_types):
+    inconsistencies = {}
+    for path, types in path_types.items():
+        if len(set(types)) > 1:  # More than one unique type for the path
+            inconsistencies[path] = set(types)
+    return inconsistencies
+
+@cli.command(help="Converts a DrugBank XML file to a JSON file.")
+@click.option('--input', '-i', required=True, type=click.Path(exists=True, file_okay=True, dir_okay=False), help="Path to the DrugBank XML file.")
+@click.option('--output', '-o', required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), help="Path to the output directory for the JSON file.")
+def tojson(input, output):
+    # Load the XML file
+    print(f'Converting {input} to JSON...')
+    tree = ET.parse(input)
+    root = tree.getroot()
+
+    # Extract the version and export date from the root element for the filename
+    version = root.attrib['version']
+    exported_on = root.attrib['exported-on']
+
+    # Process all 'drug' elements
+    print('Processing drug elements...')
+    drugs_data = [parse_element(drug) for drug in root.findall('{http://www.drugbank.ca}drug')]
+    drugs_data = transform_json([drug for drug in drugs_data])
+
+    # You can add more processing here to keep the type of the data consistent
+    drugs_data = [set_default_if_empty(drug, ['drugbank_id'], []) for drug in drugs_data]
+    drugs_data = [set_default_if_empty(drug, ['targets', 'polypeptide'], []) for drug in drugs_data]
+    drugs_data = [set_default_if_empty(drug, ['pathways', 'enzymes', 'uniprot_id'], []) for drug in drugs_data]
+
+    # Prepare the output JSON file path using version and exported-on attributes
+    json_file_path = f'{output}/drugbank_{version}_{exported_on}.json'
+
+    # Save the processed data to a JSON file
+    print(f'Saving JSON file to {json_file_path}...')
+    with open(json_file_path, 'w', encoding='utf-8') as json_file:
+        json.dump(drugs_data, json_file, ensure_ascii=False, indent=4)
+
+
+@click.command(help="Converts a DrugBank json file to a parquet file.")
+@click.option('--input', '-i', required=True, type=click.Path(exists=True, file_okay=True, dir_okay=False), help="Path to the DrugBank JSON file.")
+@click.option('--output', '-o', required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), help="Path to the output directory for the Parquet file.")
+def toparquet(input, output):
+    import pandas as pd
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    # Load the JSON file
+    print(f'Converting {input} to Parquet...')
+    with open(input, 'r', encoding='utf-8') as json_file:
+        drugs_data = json.load(json_file)
+
+    # Convert the JSON data to a DataFrame
+    df = pd.json_normalize(drugs_data)
+
+    # 新增类型检查代码
+    path_types = check_types(drugs_data)
+    inconsistencies = find_inconsistencies(path_types)
+    if inconsistencies:
+        print("Found inconsistencies in the data types after json_normalize:")
+        for path, types in inconsistencies.items():
+            print(f"Path: {path}, Types: {types}")
+    else:
+        print("No inconsistencies found in the data types after json_normalize.")
+
+    # Prepare the output Parquet file path
+    parquet_file_path = f'{output}/{os.path.splitext(os.path.basename(input))[0]}.parquet'
+
+    # Save the DataFrame to a Parquet file
+    print(f'Saving Parquet file to {parquet_file_path}...')
+    table = pa.Table.from_pandas(df)
+    pq.write_table(table, parquet_file_path)
+
+# tojson is already attached to the group by its @cli.command decorator, so this call
+# registers it again under the same name; toparquet is declared with a plain
+# @click.command and is attached to the group only here.
+cli.add_command(tojson)
+cli.add_command(toparquet)
+
+# Run the command group when the file is executed as a script.
+if __name__ == '__main__':
+    cli()
diff --git a/data/test.ipynb b/data/test.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_default_if_empty(nested_dict, path, default):\n",
+    "    \"\"\"\n",
+    "    Sets a default value in a nested dictionary based on a given path if the final key is not set or empty.\n",
+    "    \n",
+    "    :param nested_dict: The nested dictionary to modify.\n",
+    "    :param path: A list of keys representing the path to the target value.\n",
+    "    :param default: The default value to set if the target is not set or empty.\n",
+    "    \"\"\"\n",
+    "    print(\"Set default if empty: \", path, default)\n",
+    "    if isinstance(nested_dict, list):\n",
+    "        return [set_default_if_empty(item, path[1:], default) for item in nested_dict]\n",
+    "    \n",
+    "    if isinstance(nested_dict, dict):\n",
+    "        # Navigate through the nested dictionary along the path, except for the last key\n",
+    "        current_level = nested_dict\n",
+    "\n",
+    "        for key in path[:-1]:\n",
+    "            print(\"Key: \", key, \"Isinstance: \", isinstance(current_level[key], list))\n",
+    "            if key in current_level:\n",
+    "                if current_level[key] is None or current_level[key] == \"\":\n",
+    "                    return current_level\n",
+    "                elif isinstance(current_level[key], list):\n",
+    "                    current_level[key] = [set_default_if_empty(item, path[1:], default) for item in current_level[key]]\n",
+    "                elif isinstance(current_level[key], dict):\n",
+    "                    current_level[key] = set_default_if_empty(current_level[key], path[1:], default)\n",
+    "\n",
+    "        final_key = path[-1]\n",
+    "        if final_key in current_level and current_level[final_key] is not None:\n",
+    "            if (isinstance(current_level[final_key], str) or \n",
+    "                isinstance(current_level[final_key], dict)) and isinstance(default, list):\n",
+    "                print(\"C: \", current_level[final_key])\n",
+    "                current_level[final_key] = [current_level[final_key]]\n",
+    "\n",
+    "        return current_level"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "def check_types(data, parent_key='', path_types=defaultdict(list)):\n",
+    "    if isinstance(data, dict):\n",
+    "        for key, value in data.items():\n",
+    "            if isinstance(value, (dict, list)):\n",
+    "                check_types(value, f\"{parent_key}.{key}\" if parent_key else key, path_types)\n",
+    "            else:\n",
+    "                path_types[f\"{parent_key}.{key}\" if parent_key else key].append(type(value).__name__)\n",
+    "    elif isinstance(data, list):\n",
+    "        for i, item in enumerate(data):\n",
+    "            check_types(item, f\"{parent_key}[{i}]\" if parent_key else str(i), path_types)\n",
+    "    else:\n",
+    "        path_types[parent_key].append(type(data).__name__)\n",
+    "\n",
+    "    return path_types\n",
+    "\n",
+    "def find_inconsistencies(path_types):\n",
+    "    inconsistencies = {}\n",
+    "    for path, types in path_types.items():\n",
+    "        if len(set(types)) > 1:  # More than one unique type for the path\n",
+    "            inconsistencies[path] = set(types)\n",
+    "    return inconsistencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No inconsistencies found in the data types.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "\n",
+    "data = json.load(open(\"/Users/jy006/Documents/Code/BioMedGPS/biomedgps/data/drugbank_5.1_2024-01-03.json\"))\n",
+    "\n",
+    "path_types = check_types(data)\n",
+    "inconsistencies = find_inconsistencies(path_types)\n",
+    "if inconsistencies:\n",
+    "    print(\"Found inconsistencies in the data types:\")\n",
+    "    for path, types in inconsistencies.items():\n",
+    "        print(f\"Path: {path}, Types: {types}\")\n",
+    "else:\n",
+    "    print(\"No inconsistencies found in the data types.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "network-medicine",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/model/graph.rs b/src/model/graph.rs
@@ -800,8 +800,9 @@ impl Graph {
     /// * `Result<&Vec<Edge>, ValidationError>` - If the strict_mode is true, it will return the missed nodes in the graph.
     ///
     pub fn get_edges(&mut self, strict_mode: Option<bool>) -> Result<&Vec<Edge>, ValidationError> {
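+        // Do not sort here: sorting by relid would destroy the edge order defined by the user.
+        // NOTE(review): without the sort, dedup_by below only removes *adjacent* edges with the
+        // same relid; confirm that non-adjacent duplicates cannot occur or are acceptable.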
+        // self.edges.sort_by(|a, b| a.relid.cmp(&b.relid));
         // Dedup the edges
-        self.edges.sort_by(|a, b| a.relid.cmp(&b.relid));
         self.edges.dedup_by(|a, b| a.relid == b.relid);
 
         self.nodes = self.get_nodes().to_vec();