Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infer metadata from the data *.csv file #173

Merged
merged 17 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Authors
* Marie-Claire Gering
* Julian Endres
* Felix Maurer
* Pierre-Francois Duc
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Unreleased
----------

Features
* Improve the function to infer package metadata `#173 <https://github.com/oemof/oemof-tabular/pull/173>`_

Fixes

Expand Down
15 changes: 14 additions & 1 deletion docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,18 @@ This can also be done for sequences and geometries.
To create meta-data `json` file you can use the following code:


.. code-block:: python

from oemof.tabular.datapackage import building

building.infer_metadata_from_data(
package_name="my-datapackage",
path="/home/user/datapackages/my-datapackage"
)


Or, if you want to specify the foreign key relations manually, you can use this code:

.. code-block:: python

from oemof.tabular.datapackage import building
Expand Down Expand Up @@ -354,7 +366,8 @@ field names in the generators-profile resource.
.. note::

This usage breaks with the datapackage standard and creates
non-valid resources.**
non-valid resources.



Scripting
Expand Down
2 changes: 1 addition & 1 deletion src/oemof/tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.0.5"
__version__ = "0.0.6dev"
__project__ = "oemof.tabular"


Expand Down
5 changes: 5 additions & 0 deletions src/oemof/tabular/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
with open(FOREIGN_KEY_DESCRIPTORS_FILE, "r") as fk_descriptors_file:
FOREIGN_KEY_DESCRIPTORS = json.load(fk_descriptors_file)

# Reverse lookup built from FOREIGN_KEY_DESCRIPTORS: the "fields" value of
# every descriptor entry is mapped to the key under which the entry is listed.
# If the same "fields" value occurs several times, the last occurrence wins.
SPECIAL_FIELD_NAMES = {
    entry["fields"]: fk_name
    for fk_name, entries in FOREIGN_KEY_DESCRIPTORS.items()
    for entry in entries
}

supported_oemof_tabular_versions = [
None,
"0.0.1",
Expand Down
204 changes: 204 additions & 0 deletions src/oemof/tabular/datapackage/building.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sys
import tarfile
import urllib.request
import warnings
import zipfile
from ftplib import FTP
from urllib.parse import urlparse
Expand Down Expand Up @@ -59,6 +60,208 @@ def update_package_descriptor():
p.save("datapackage.json")


def map_sequence_profiles_to_resource_name(
    p, excluded_profiles=("timeindex",)
):
    """Map each profile of the sequence resources to its resource name

    Look in every resource located in a "sequences" folder and map each of
    its field names to the name of the resource containing it. Within this
    process the uniqueness of the field names across all sequence resources
    is checked, with the exception of the field "timeindex".

    Parameters
    ----------
    p: a :datapackage.Package: instance
        Only ``p.resources`` is read; each resource must provide
        ``descriptor["path"]``, ``descriptor["name"]`` and ``schema.fields``
    excluded_profiles: iterable of string
        Field names which are neither mapped nor checked for uniqueness

    Returns
    -------
    dict mapping each profile (field) name to the name of the sequence
    resource which contains it

    Raises
    ------
    ValueError
        If a profile name is used in more than one sequence resource

    """

    def check_sequences_labels_unicity(labels, new_labels):
        """Return the sorted labels of 'new_labels' already in 'labels'

        A warning is issued if there is any such label. The field
        "timeindex" is never reported as duplicate.
        """
        duplicates = sorted(
            set(labels).intersection(new_labels) - {"timeindex"}
        )
        if duplicates:
            warnings.warn(
                f"The labels of the profiles are not unique across all "
                f"files within 'sequences' folder: '{','.join(duplicates)}' "
                f"used more than once"
            )
        return duplicates

    sequences = {}
    sequence_labels = []
    duplicated_labels = []
    for r in p.resources:
        # The path of a resource can be written with "/" or with the
        # separator of the operating system (os.sep, i.e. "\" on Windows),
        # therefore it is normalised before looking for the folder name
        resource_path = r.descriptor["path"].replace("\\", "/")
        if "/sequences/" not in resource_path:
            continue

        field_labels = [
            f.name
            for f in r.schema.fields
            if f.name not in excluded_profiles
        ]
        sequences[r.descriptor["name"]] = field_labels
        duplicated_labels += check_sequences_labels_unicity(
            sequence_labels, field_labels
        )
        sequence_labels += field_labels

    if duplicated_labels:
        # a label present in more than two files is reported only once
        unique_duplicates = list(dict.fromkeys(duplicated_labels))
        raise ValueError(
            f"The following sequences labels are not unique"
            f" across all sequences files: "
            f"{', '.join(unique_duplicates)}"
        )

    # map each profile to its resource name
    sequences_mapping = {
        value: key for (key, values) in sequences.items() for value in values
    }
    return sequences_mapping


def infer_resource_foreign_keys(resource, sequences_profiles_to_resource):
    """Find out the foreign keys within the fields of a resource

    Look through all fields of a resource which are of type 'string' and
    check whether any of their values matches a profile header in any of
    the sequences resources. For each match a foreign key entry is added
    (once) to the "foreignKeys" of the resource schema. The "primaryKey" of
    the schema is set to "name".

    Parameters
    ----------
    resource: a :datapackage.Resource: instance
    sequences_profiles_to_resource: the mapping of sequence profile
        headers to their resource name

    Returns
    -------
    The :datapackage.Resource: instance with updated "foreignKeys" field

    """
    r = resource
    data = pd.DataFrame.from_records(r.read(keyed=True))
    schema_descriptor = r.descriptor["schema"]
    # TODO not sure this should be set here
    schema_descriptor["primaryKey"] = "name"
    foreign_keys = schema_descriptor.setdefault("foreignKeys", [])

    for field in r.schema.fields:
        if field.type != "string":
            continue
        # a resource without any row yields a DataFrame without columns,
        # in this case there is no value to compare with the profiles
        if field.name not in data.columns:
            continue

        for potential_fk in data[field.name].dropna().unique():
            if potential_fk not in sequences_profiles_to_resource:
                continue

            # this is actually a wrong format and should be
            # with a "fields" field under the "reference" fields
            fk = {
                "fields": field.name,
                "reference": {
                    "resource": sequences_profiles_to_resource[potential_fk],
                },
            }

            if fk not in foreign_keys:
                foreign_keys.append(fk)

    r.commit()
    return r


def infer_package_foreign_keys(package):
    """Infer the foreign keys from elements and sequences and update metadata

    The profiles of the sequence resources are mapped to their resource
    name, then every resource located in an "elements" folder gets its
    foreign keys inferred from this mapping and is replaced within the
    package by its updated version.

    Parameters
    ----------
    package: a :datapackage.Package: instance
        The package is modified in place

    Returns
    -------
    None

    """
    p = package
    sequences_profiles_to_resource = map_sequence_profiles_to_resource_name(p)

    # Iterate over a copy of the resources list, as resources are removed
    # from and added to the package within the loop
    for r in list(p.resources):
        # The path of a resource can be written with "/" or with the
        # separator of the operating system (os.sep, i.e. "\" on Windows)
        resource_path = r.descriptor["path"].replace("\\", "/")
        if "/elements/" not in resource_path:
            continue

        r = infer_resource_foreign_keys(r, sequences_profiles_to_resource)
        # sort foreign_key entries alphabetically by fields
        r.descriptor["schema"]["foreignKeys"].sort(
            key=lambda x: x["fields"]
        )
        p.remove_resource(r.name)
        p.add_resource(r.descriptor)


def infer_metadata_from_data(
    path,
    package_name="default-name",
    metadata_filename="datapackage.json",
):
    """Create the metadata .json file at the root-folder of a datapackage

    The fields are inferred from the csv files of the datapackage and the
    foreign keys are inferred from the csv files within the
    "data/elements" and "data/sequences" resources.

    This feature is provided as a separate function in order to keep it
    optional and to leave `infer_metadata` unmodified.

    Parameters
    ----------
    path: string
        Path to the root-folder of the datapackage (it is converted to an
        absolute path)
    package_name: string
        Name of the data package
    metadata_filename: string
        Name of the inferred metadata file

    Returns
    -------
    None, a json metadata file is saved at the root-folder of the
    datapackage under the provided path.
    """
    root_folder = os.path.abspath(path)
    metadata_path = os.path.join(root_folder, metadata_filename)

    # Infer the fields from the package data
    inferred_package = Package(base_path=root_folder)
    inferred_package.infer(os.path.join(root_folder, "**" + os.sep + "*.csv"))
    inferred_package.commit()
    inferred_package.save(metadata_path)

    foreign_keys = {}

    def collect_basic_foreign_keys(resource):
        """Fill the 'foreign_keys' dict expected by building.infer_metadata

        Each field name of the resource which is listed in
        config.SPECIAL_FIELD_NAMES registers the resource name (once) under
        the corresponding foreign key descriptor.
        """
        for field in resource.schema.fields:
            if field.name not in config.SPECIAL_FIELD_NAMES:
                continue
            fk_descriptor = config.SPECIAL_FIELD_NAMES[field.name]
            resource_names = foreign_keys.setdefault(fk_descriptor, [])
            if resource.name not in resource_names:
                resource_names.append(resource.name)

    elements_folder = os.sep + "elements" + os.sep
    for r in inferred_package.resources:
        if elements_folder in r.descriptor["path"]:
            collect_basic_foreign_keys(r)

    # infer_metadata already does part of the job when it is provided with
    # a foreign_keys dict, thus it is reused here; it also saves the metadata
    # of the package in json format
    infer_metadata(
        package_name=package_name,
        path=root_folder,
        foreign_keys=foreign_keys,
        metadata_filename=metadata_filename,
    )

    # reload the package from the saved json file
    p = Package(metadata_path)
    infer_package_foreign_keys(p)
    p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
    p.commit()
    p.save(metadata_path)


def infer_metadata(
package_name="default-name",
keep_resources=False,
Expand Down Expand Up @@ -231,6 +434,7 @@ def infer_metadata(
)
p.add_resource(r.descriptor)

p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
p.commit()
p.save(metadata_filename)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"profile": "tabular-data-package",
"name": "oemof-tabular-dispatch-example",
"name": "dispatch-example",
"oemof_tabular_version": "0.0.6dev",
"resources": [
{
Expand Down Expand Up @@ -435,4 +435,4 @@
}
}
]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
kwargs = {}

building.infer_metadata(
package_name="oemof-tabular-dispatch-example",
package_name="dispatch-example",
foreign_keys={
"bus": ["volatile", "dispatchable", "storage", "load"],
"profile": ["load", "volatile"],
Expand Down
Loading
Loading