Commit 6369383

Improve the auth process.

yjcyxky committed Mar 28, 2024
1 parent 4f1cd17 commit 6369383
Showing 12 changed files with 434 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -30,4 +30,5 @@ assets/*
.VSCodeCounter
neo4j-data
neo4j-import
data
data/*.json
data/*.parquet
13 changes: 13 additions & 0 deletions data/README.md
@@ -0,0 +1,13 @@
## Prepare additional data for each entity and relation

### Compound

Get additional data for each compound from [DrugBank](https://www.drugbank.ca/). Convert the downloaded XML release to JSON first, then convert the JSON file to Parquet:

```bash
python3 data/drugbank.py tojson --input xxx.xml --output data

python3 data/drugbank.py toparquet --input data/drugbank_5.1_2024-01-03.json --output data
```
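
The resulting Parquet file can then be inspected with pandas (a minimal sketch, assuming pandas is available; the file name follows the `drugbank_<version>_<exported-on>.parquet` pattern produced by the commands above):

```python
import pandas as pd

# File name assumed from the JSON example above; adjust to your release
df = pd.read_parquet("data/drugbank_5.1_2024-01-03.parquet")
print(df.shape)
```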

### Gene
199 changes: 199 additions & 0 deletions data/drugbank.py
@@ -0,0 +1,199 @@
import click
import os
import xml.etree.ElementTree as ET
import json
from collections import defaultdict

cli = click.Group()

def remove_namespace_and_hyphen(tag):
"""Remove the namespace from the tag name and replace hyphens with underscores."""
# Remove namespace
tag = tag.split('}')[-1]
# Replace hyphens with underscores
tag = tag.replace('-', '_')
return tag

def parse_element(elem):
"""Recursively parse an XML element into a dictionary or a list, ignoring attributes."""
# Check for text content directly in this element
if elem.text and elem.text.strip():
return elem.text.strip()
else:
result = {}

for name, value in elem.attrib.items():
result[remove_namespace_and_hyphen(name)] = value

# Group children by tag to decide on list or single item
children_by_tag = {}
for child in elem:
key = remove_namespace_and_hyphen(child.tag)
children_by_tag.setdefault(key, []).append(parse_element(child))

# Add children to result, converting to lists or dicts as appropriate
for key, children in children_by_tag.items():
if len(children) > 1:
result[key] = children # Set as list if multiple children with same tag
else:
result[key] = children[0] # Single item, not a list

return result or None
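
# A minimal illustration (editor's sketch, not part of the commit) of what
# parse_element returns; the tag names and values below are made up:
#
#   elem = ET.fromstring(
#       '<drug type="small molecule"><name>Aspirin</name>'
#       '<synonyms><synonym>ASA</synonym><synonym>Acetylsalicylic acid</synonym></synonyms>'
#       '</drug>'
#   )
#   parse_element(elem)
#   # -> {'type': 'small molecule', 'name': 'Aspirin',
#   #     'synonyms': {'synonym': ['ASA', 'Acetylsalicylic acid']}}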

def unify_to_array(value):
if value is None:
return []
elif isinstance(value, list):
return value
else:
return [value]

def check_singular_plural(singular, plural):
    # Accept the regular "+s" plural as well as the "y" -> "ies" form
    # (e.g. "category" -> "categories").
    if singular.endswith("y") and plural == singular[:-1] + "ies":
        return True
    return singular + "s" == plural

def set_default_if_empty(nested_dict, path, default):
"""
Sets a default value in a nested dictionary based on a given path if the final key is not set or empty.
:param nested_dict: The nested dictionary to modify.
:param path: A list of keys representing the path to the target value.
:param default: The default value to set if the target is not set or empty.
"""
if isinstance(nested_dict, list):
return [set_default_if_empty(item, path[1:], default) for item in nested_dict]

if isinstance(nested_dict, dict):
# Navigate through the nested dictionary along the path, except for the last key
current_level = nested_dict

for key in path[:-1]:
if key in current_level:
if current_level[key] is None or current_level[key] == "":
return current_level
elif isinstance(current_level[key], list):
current_level[key] = [set_default_if_empty(item, path[1:], default) for item in current_level[key]]
elif isinstance(current_level[key], dict):
current_level[key] = set_default_if_empty(current_level[key], path[1:], default)

final_key = path[-1]
if final_key in current_level and current_level[final_key] is not None:
if (isinstance(current_level[final_key], str) or
isinstance(current_level[final_key], dict)) and isinstance(default, list):
current_level[final_key] = [current_level[final_key]]

return current_level
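
# For instance (editor's sketch; field values are illustrative): a lone dict
# is wrapped in a list so every record carries the same type for the field:
#
#   set_default_if_empty({'targets': {'polypeptide': {'name': 'COX-1'}}},
#                        ['targets', 'polypeptide'], [])
#   # -> {'targets': {'polypeptide': [{'name': 'COX-1'}]}}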


def transform_json(obj):
if isinstance(obj, dict):
new_obj = {}
for key, value in obj.items():
if isinstance(value, dict) and len(value.keys()) == 1 and check_singular_plural(list(value.keys())[0], key):
subkey = list(value.keys())[0]
new_obj[key] = unify_to_array(
transform_json(value[subkey])
)
else:
new_obj[key] = transform_json(value)

return new_obj
elif isinstance(obj, list):
return [transform_json(item) for item in obj]

return obj
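
# For example (editor's sketch): DrugBank's singular wrapper elements are
# collapsed into plain lists:
#
#   transform_json({'synonyms': {'synonym': ['ASA', 'Aspirin']}})
#   # -> {'synonyms': ['ASA', 'Aspirin']}
#   transform_json({'categories': {'category': 'NSAID'}})
#   # -> {'categories': ['NSAID']}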

def check_types(data, parent_key='', path_types=None):
    # Avoid a mutable default argument: a shared defaultdict would leak
    # accumulated paths between separate top-level calls.
    if path_types is None:
        path_types = defaultdict(list)

    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                check_types(value, f"{parent_key}.{key}" if parent_key else key, path_types)
            else:
                path_types[f"{parent_key}.{key}" if parent_key else key].append(type(value).__name__)
    elif isinstance(data, list):
        for item in data:
            # Collapse list indices into "[]" so the same logical path is
            # aggregated across records; with per-index paths every leaf would
            # be unique and no type inconsistency could ever be reported.
            check_types(item, f"{parent_key}[]" if parent_key else "[]", path_types)
    else:
        path_types[parent_key].append(type(data).__name__)

return path_types

def find_inconsistencies(path_types):
inconsistencies = {}
for path, types in path_types.items():
if len(set(types)) > 1: # More than one unique type for the path
inconsistencies[path] = set(types)
return inconsistencies
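
# A small sketch (editor's addition) of the two helpers together; the records
# are illustrative, and set ordering in the output may vary:
#
#   path_types = check_types([{'score': '0.5'}, {'score': 0.5}])
#   find_inconsistencies(path_types)
#   # -> {'[].score': {'str', 'float'}}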

@click.command(help="Converts a DrugBank XML file to a JSON file.")
@click.option('--input', '-i', required=True, type=click.Path(exists=True, file_okay=True, dir_okay=False), help="Path to the DrugBank XML file.")
@click.option('--output', '-o', required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), help="Path to the output directory for the JSON file.")
def tojson(input, output):
# Load the XML file
print(f'Converting {input} to JSON...')
tree = ET.parse(input)
root = tree.getroot()

# Extract the version and export date from the root element for the filename
version = root.attrib['version']
exported_on = root.attrib['exported-on']

# Process all 'drug' elements
print('Processing drug elements...')
drugs_data = [parse_element(drug) for drug in root.findall('{http://www.drugbank.ca}drug')]
    drugs_data = transform_json(drugs_data)

    # Normalize fields whose type varies between records; add more of these
    # as needed to keep the data types consistent
drugs_data = [set_default_if_empty(drug, ['drugbank_id'], []) for drug in drugs_data]
drugs_data = [set_default_if_empty(drug, ['targets', 'polypeptide'], []) for drug in drugs_data]
drugs_data = [set_default_if_empty(drug, ['pathways', 'enzymes', 'uniprot_id'], []) for drug in drugs_data]

# Prepare the output JSON file path using version and exported-on attributes
json_file_path = f'{output}/drugbank_{version}_{exported_on}.json'

# Save the processed data to a JSON file
print(f'Saving JSON file to {json_file_path}...')
with open(json_file_path, 'w', encoding='utf-8') as json_file:
json.dump(drugs_data, json_file, ensure_ascii=False, indent=4)


@click.command(help="Converts a DrugBank json file to a parquet file.")
@click.option('--input', '-i', required=True, type=click.Path(exists=True, file_okay=True, dir_okay=False), help="Path to the DrugBank JSON file.")
@click.option('--output', '-o', required=True, type=click.Path(exists=True, file_okay=False, dir_okay=True), help="Path to the output directory for the Parquet file.")
def toparquet(input, output):
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Load the JSON file
print(f'Converting {input} to Parquet...')
with open(input, 'r', encoding='utf-8') as json_file:
drugs_data = json.load(json_file)

# Convert the JSON data to a DataFrame
df = pd.json_normalize(drugs_data)

    # Check for inconsistent data types across records
path_types = check_types(drugs_data)
inconsistencies = find_inconsistencies(path_types)
    if inconsistencies:
        print("Found type inconsistencies in the JSON data:")
        for path, types in inconsistencies.items():
            print(f"Path: {path}, Types: {types}")
    else:
        print("No type inconsistencies found in the JSON data.")

# Prepare the output Parquet file path
parquet_file_path = f'{output}/{os.path.splitext(os.path.basename(input))[0]}.parquet'

# Save the DataFrame to a Parquet file
print(f'Saving Parquet file to {parquet_file_path}...')
table = pa.Table.from_pandas(df)
pq.write_table(table, parquet_file_path)

cli.add_command(tojson)
cli.add_command(toparquet)

if __name__ == '__main__':
cli()
127 changes: 127 additions & 0 deletions data/test.ipynb
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def set_default_if_empty(nested_dict, path, default):\n",
" \"\"\"\n",
" Sets a default value in a nested dictionary based on a given path if the final key is not set or empty.\n",
" \n",
" :param nested_dict: The nested dictionary to modify.\n",
" :param path: A list of keys representing the path to the target value.\n",
" :param default: The default value to set if the target is not set or empty.\n",
" \"\"\"\n",
" print(\"Set default if empty: \", path, default)\n",
" if isinstance(nested_dict, list):\n",
" return [set_default_if_empty(item, path[1:], default) for item in nested_dict]\n",
" \n",
" if isinstance(nested_dict, dict):\n",
" # Navigate through the nested dictionary along the path, except for the last key\n",
" current_level = nested_dict\n",
"\n",
" for key in path[:-1]:\n",
" print(\"Key: \", key, \"Isinstance: \", isinstance(current_level[key], list))\n",
" if key in current_level:\n",
" if current_level[key] is None or current_level[key] == \"\":\n",
" return current_level\n",
" elif isinstance(current_level[key], list):\n",
" current_level[key] = [set_default_if_empty(item, path[1:], default) for item in current_level[key]]\n",
" elif isinstance(current_level[key], dict):\n",
" current_level[key] = set_default_if_empty(current_level[key], path[1:], default)\n",
"\n",
" final_key = path[-1]\n",
" if final_key in current_level and current_level[final_key] is not None:\n",
" if (isinstance(current_level[final_key], str) or \n",
" isinstance(current_level[final_key], dict)) and isinstance(default, list):\n",
" print(\"C: \", current_level[final_key])\n",
" current_level[final_key] = [current_level[final_key]]\n",
"\n",
" return current_level"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from collections import defaultdict\n",
"\n",
"def check_types(data, parent_key='', path_types=defaultdict(list)):\n",
" if isinstance(data, dict):\n",
" for key, value in data.items():\n",
" if isinstance(value, (dict, list)):\n",
" check_types(value, f\"{parent_key}.{key}\" if parent_key else key, path_types)\n",
" else:\n",
" path_types[f\"{parent_key}.{key}\" if parent_key else key].append(type(value).__name__)\n",
" elif isinstance(data, list):\n",
" for i, item in enumerate(data):\n",
" check_types(item, f\"{parent_key}[{i}]\" if parent_key else str(i), path_types)\n",
" else:\n",
" path_types[parent_key].append(type(data).__name__)\n",
"\n",
" return path_types\n",
"\n",
"def find_inconsistencies(path_types):\n",
" inconsistencies = {}\n",
" for path, types in path_types.items():\n",
" if len(set(types)) > 1: # More than one unique type for the path\n",
" inconsistencies[path] = set(types)\n",
" return inconsistencies"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No inconsistencies found in the data types.\n"
]
}
],
"source": [
"import json\n",
"\n",
"data = json.load(open(\"/Users/jy006/Documents/Code/BioMedGPS/biomedgps/data/drugbank_5.1_2024-01-03.json\"))\n",
"\n",
"path_types = check_types(data)\n",
"inconsistencies = find_inconsistencies(path_types)\n",
"if inconsistencies:\n",
" print(\"Found inconsistencies in the data types:\")\n",
" for path, types in inconsistencies.items():\n",
" print(f\"Path: {path}, Types: {types}\")\n",
"else:\n",
" print(\"No inconsistencies found in the data types.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "network-medicine",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 2 additions & 1 deletion src/model/graph.rs
@@ -800,8 +800,9 @@ impl Graph
/// * `Result<&Vec<Edge>, ValidationError>` - If the strict_mode is true, it will return the missed nodes in the graph.
///
pub fn get_edges(&mut self, strict_mode: Option<bool>) -> Result<&Vec<Edge>, ValidationError> {
        // Note: sorting discards any edge ordering defined by the user, but
        // `dedup_by` only removes consecutive duplicates, so the edges must
        // be sorted by `relid` before deduplication.
        self.edges.sort_by(|a, b| a.relid.cmp(&b.relid));
        self.edges.dedup_by(|a, b| a.relid == b.relid);

self.nodes = self.get_nodes().to_vec();