Infer metadata from the data *.csv file #173

Merged (17 commits, Jun 4, 2024)
Changes from 3 commits
15 changes: 14 additions & 1 deletion docs/usage.rst
@@ -168,6 +168,18 @@ This can also be done for sequences and geometries.
To create the metadata `json` file you can use the following code:


.. code-block:: python

from datapackage_utilities import building

building.infer_metadata_from_data(
package_name="my-datapackage",
path="/home/user/datpackages/my-datapackage"
)


Or, if you want to manually specify the foreign key relations, you can use this code:

.. code-block:: python

from datapackage_utilities import building
@@ -354,7 +366,8 @@ field names in the generators-profile resource.
.. note::

This usage breaks with the datapackage standard and creates
non-valid resources.**
non-valid resources.



Scripting
5 changes: 5 additions & 0 deletions src/oemof/tabular/config/config.py
@@ -18,6 +18,11 @@
with open(FOREIGN_KEY_DESCRIPTORS_FILE, "r") as fk_descriptors_file:
FOREIGN_KEY_DESCRIPTORS = json.load(fk_descriptors_file)

SPECIAL_FIELD_NAMES = {}
for fk, descriptor in FOREIGN_KEY_DESCRIPTORS.items():
for el in descriptor:
SPECIAL_FIELD_NAMES[el["fields"]] = fk
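To illustrate the inversion this loop performs, a hypothetical sketch (descriptor and field names are made up; the real entries come from the foreign key descriptors file loaded above):

# hypothetical descriptors, mirroring the structure accessed above
descriptors = {
    "bus": [{"fields": "bus", "reference": {"resource": "bus", "fields": "name"}}],
    "profile": [{"fields": "profile", "reference": {"resource": "profiles"}}],
}
special_field_names = {
    el["fields"]: fk for fk, descriptor in descriptors.items() for el in descriptor
}
assert special_field_names == {"bus": "bus", "profile": "profile"}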

supported_oemof_tabular_versions = [
None,
"0.0.1",
173 changes: 173 additions & 0 deletions src/oemof/tabular/datapackage/building.py
@@ -6,6 +6,7 @@
import sys
import tarfile
import urllib.request
import warnings
import zipfile
from ftplib import FTP
from urllib.parse import urlparse
@@ -59,6 +60,178 @@ def update_package_descriptor():
p.save("datapackage.json")


def map_sequence_profiles_to_resource_name(
p, excluded_profiles=("timeindex",)
):
"""Look in every resource which is a sequence and map each of its fields to itself

Within this process the unicity of the field names will be checked, with the exception of the field "timeindex"

"""

def check_sequences_labels_unicity(labels, new_labels):
intersect = set(labels).intersection(new_labels)
if len(intersect) == 1:
intersect = intersect.pop()
if not intersect == "timeindex":
answer = [intersect]
else:
answer = []
else:
answer = list(intersect)

if answer:
warnings.warn(
f"The labels of the profiles are not unique across all files within 'sequences' folder: '{','.join(intersect)}' used more than once"
)
return answer

sequences = {}
sequence_labels = []
duplicated_labels = []
for r in p.resources:
if "/sequences/" in r.descriptor["path"]:
Collaborator Author:
Here one has to use os.sep instead of "/"
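A minimal sketch of the reviewer's suggestion (not part of this diff), with r being the resource from the loop above:

import os

# platform-independent check instead of the hard-coded "/"
if f"{os.sep}sequences{os.sep}" in r.descriptor["path"]:
    ...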

field_labels = [
f.name
for f in r.schema.fields
if f.name not in excluded_profiles
]
sequences[r.descriptor["name"]] = field_labels
duplicated_labels += check_sequences_labels_unicity(
sequence_labels, field_labels
)
sequence_labels += field_labels

if duplicated_labels:
# write an error message here
pass
Collaborator:

An error should be thrown here.

Collaborator Author:

Fixed in b8b4e64
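A hedged sketch of what raising an error at this point could look like; the actual change landed in b8b4e64 and may differ:

if duplicated_labels:
    raise ValueError(
        "Duplicated profile labels across sequences resources: "
        + ", ".join(duplicated_labels)
    )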

# map each profile to its resource name
sequences_mapping = {
value: key for (key, values) in sequences.items() for value in values
}
return sequences_mapping
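As a hypothetical illustration (resource and field names are made up), a package p with two sequences resources would be mapped like this:

# sequences resources and their fields, hypothetically:
#   "wind_profiles":  ["timeindex", "wind-profile-1"]
#   "solar_profiles": ["timeindex", "solar-profile-1"]
expected_mapping = {
    "wind-profile-1": "wind_profiles",
    "solar-profile-1": "solar_profiles",
}
# map_sequence_profiles_to_resource_name(p) == expected_mapping
# ("timeindex" is skipped via excluded_profiles)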


def infer_resource_foreign_keys(resource, sequences_profiles_to_resource):
"""Find out the foreign keys within a resource fields

Look through all field of a resource which are of type 'string' if any of their values are matching a profile header in any of the sequences resources


Parameters
----------
resource: a :datapackage.Resource: instance
sequences_profiles_to_resource: the mapping of sequence profile headers to their resource name

Returns
-------
The :datapackage.Resource: instance with updated "foreignKeys" field

"""
r = resource
data = pd.DataFrame.from_records(r.read(keyed=True))
# TODO not sure this should be set here
r.descriptor["schema"]["primaryKey"] = "name"
if "foreignKeys" not in r.descriptor["schema"]:
r.descriptor["schema"]["foreignKeys"] = []

for field in r.schema.fields:
if field.type == "string":
for potential_fk in data.dropna()[field.name].unique():

if potential_fk in sequences_profiles_to_resource:
# this is actually a wrong format: it should have a "fields" key under the "reference" key

fk = {
"fields": field.name,
"reference": {
"resource": sequences_profiles_to_resource[
potential_fk
],
},
}

if fk not in r.descriptor["schema"]["foreignKeys"]:
r.descriptor["schema"]["foreignKeys"].append(fk)
r.commit()
return r
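To make the effect concrete, a hypothetical element resource with a string field "profile" whose values match a header of the sequences resource "wind_profiles" would get an entry like this appended to its schema (names are assumptions):

# appended to r.descriptor["schema"]["foreignKeys"]
fk = {
    "fields": "profile",
    "reference": {"resource": "wind_profiles"},
}
# as the inline comment above notes, the datapackage spec would additionally
# expect a "fields" key inside "reference"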


def infer_package_foreign_keys(package):
"""Infer the foreign_keys from data/elements and data/sequences and update meta data

Parameters
----------
package

Returns
-------

"""
p = package
sequences_profiles_to_resource = map_sequence_profiles_to_resource_name(p)

for r in p.resources:
if "/elements/" in r.descriptor["path"]:
Collaborator Author:

Here one has to use os.sep instead of "/"

Collaborator Author:

Fixed in 37c5ad5

r = infer_resource_foreign_keys(r, sequences_profiles_to_resource)
p.remove_resource(r.name)
p.add_resource(r.descriptor)


def infer_metadata_from_data(
Collaborator Author:
The idea was to make this feature optional, therefore I did not want to modify the infer_metadata function.

package_name="default-name",
path=None,
Collaborator:

Default parameter "None" throws an error and makes no sense IMO.
I would make it mandatory and first argument.

Collaborator Author:

Fixed in 0c4279b

metadata_filename="datapackage.json",
):
"""

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small docstring would be good

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 0c4279b0c4279b

Returns
-------

"""

# Infer the fields from the package data
path = os.path.abspath(path)
p0 = Package(base_path=path)
p0.infer(os.path.join(path, "**/*.csv"))
p0.commit()
p0.save(os.path.join(path, metadata_filename))

foreign_keys = {}

def infer_resource_basic_foreign_keys(resource):
"""insert resource foreign_key into a dict formatted for building.infer_metadata

Compare the fields of a resource to a list of field names known to be foreign keys. If the field name is within the list, it is used to populate the dict 'foreign_keys'
"""
for field in resource.schema.fields:
if field.name in config.SPECIAL_FIELD_NAMES:
fk_descriptor = config.SPECIAL_FIELD_NAMES[field.name]
if fk_descriptor in foreign_keys:
if resource.name not in foreign_keys[fk_descriptor]:
foreign_keys[fk_descriptor].append(resource.name)
else:
foreign_keys[fk_descriptor] = [resource.name]

for r in p0.resources:
if "/elements/" in r.descriptor["path"]:
infer_resource_basic_foreign_keys(r)
# this function saves the metadata of the package in json format
infer_metadata(
Collaborator Author:
Because this function already does part of the job when provided a dict foreign_keys, I used it and therefore just wrote the function infer_resource_basic_foreign_keys to fill this foreign_keys dict.

package_name=package_name,
path=path,
foreign_keys=foreign_keys,
metadata_filename=metadata_filename,
)

# reload the package from the saved json file
p = Package(os.path.join(path, metadata_filename))
infer_package_foreign_keys(p)
p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
p.commit()
p.save(os.path.join(path, metadata_filename))
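For reference, a sketch of the intermediate foreign_keys dict that infer_resource_basic_foreign_keys assembles and infer_metadata consumes; the element resource names "volatile" and "load" are made up:

# descriptor name -> element resources containing the corresponding special field
foreign_keys = {
    "bus": ["load", "volatile"],
    "profile": ["volatile"],
}
# infer_metadata then writes the matching foreignKeys entries into datapackage.json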


def infer_metadata(
package_name="default-name",
keep_resources=False,