oemof · Bachibouzouk · Jun 4, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -14,3 +14,4 @@ Authors
 * Marie-Claire Gering
 * Julian Endres
 * Felix Maurer
+* Pierre-Francois Duc
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,7 @@ Unreleased
 ----------
 
 Features
+* Improve the function to infer package metadata `#173 <https://github.com/oemof/oemof-tabular/pull/173>`_
 
 Fixes
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -168,6 +168,18 @@ This can also be done for sequences and geometries.
 To create meta-data `json` file you can use the following code:
 
 
+.. code-block:: python
+
+	from oemof.tabular.datapackage import building
+
+	building.infer_metadata_from_data(
+		package_name="my-datapackage",
+		path="/home/user/datapackages/my-datapackage"
+	)
+
+
+Or, if you want to manually specify the foreign-key relations, you can use this code:
+
 .. code-block:: python
 
 	from datapackage_utilities import building
@@ -354,7 +366,8 @@ field names in the generators-profile resource.
 	.. note::
 
 		This usage breaks with the datapackage standard and creates
-		non-valid resources.**
+		non-valid resources.
+
 
 
 Scripting

diff --git a/src/oemof/tabular/__init__.py b/src/oemof/tabular/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.0.5"
+__version__ = "0.0.6dev"
 __project__ = "oemof.tabular"
 
 

diff --git a/src/oemof/tabular/config/config.py b/src/oemof/tabular/config/config.py
@@ -18,6 +18,11 @@
 with open(FOREIGN_KEY_DESCRIPTORS_FILE, "r") as fk_descriptors_file:
     FOREIGN_KEY_DESCRIPTORS = json.load(fk_descriptors_file)
 
+SPECIAL_FIELD_NAMES = {}  # field name -> key of FOREIGN_KEY_DESCRIPTORS
+for fk, descriptor in FOREIGN_KEY_DESCRIPTORS.items():
+    for el in descriptor:
+        SPECIAL_FIELD_NAMES[el["fields"]] = fk
+
 supported_oemof_tabular_versions = [
     None,
     "0.0.1",

diff --git a/src/oemof/tabular/datapackage/building.py b/src/oemof/tabular/datapackage/building.py
@@ -6,6 +6,7 @@
 import sys
 import tarfile
 import urllib.request
+import warnings
 import zipfile
 from ftplib import FTP
 from urllib.parse import urlparse
@@ -59,6 +60,208 @@ def update_package_descriptor():
     p.save("datapackage.json")
 
 
+def map_sequence_profiles_to_resource_name(
+    p, excluded_profiles=("timeindex",)
+):
+    """Map the field names of all sequence resources to their resource name
+
+    Sequence resources are the ones with "/sequences/" in their path (OS
+    specific separators are accepted). Their field names must be unique
+    across all of them, except "timeindex", else a ValueError is raised.
+
+    Parameters
+    ----------
+    p: a :datapackage.Package: instance
+    excluded_profiles: field names which are not profile labels
+
+    Returns
+    -------
+    dict mapping each profile label to the name of its resource
+    """
+
+    def check_sequences_labels_unicity(labels, new_labels):
+        # "timeindex" is always exempted from the uniqueness check
+        duplicates = sorted(
+            set(labels).intersection(new_labels) - {"timeindex"}
+        )
+        if duplicates:
+            warnings.warn(
+                "The labels of the profiles are not unique across all "
+                "files within 'sequences' folder: "
+                f"'{','.join(duplicates)}' used more than once"
+            )
+        return duplicates
+
+    sequences = {}
+    sequence_labels = []
+    duplicated_labels = []
+    for r in p.resources:
+        if "/sequences/" in r.descriptor["path"].replace(os.sep, "/"):
+            field_labels = [
+                f.name
+                for f in r.schema.fields
+                if f.name not in excluded_profiles
+            ]
+            sequences[r.descriptor["name"]] = field_labels
+            duplicated_labels += check_sequences_labels_unicity(
+                sequence_labels, field_labels
+            )
+            sequence_labels += field_labels
+
+    if duplicated_labels:
+        raise ValueError(
+            f"The following sequences labels are not unique"
+            f" across all sequences files: "
+            f"{', '.join(duplicated_labels)}"
+        )
+    return {
+        value: key for (key, values) in sequences.items() for value in values
+    }
+
+
+def infer_resource_foreign_keys(resource, sequences_profiles_to_resource):
+    """Find out the foreign keys within a resource fields
+
+    Look through all fields of a resource which are of type 'string':
+    if any of their values matches a profile header in any of the
+    sequences resources, a foreign key to this resource is added.
+    A resource without data rows gets no inferred foreign key.
+
+    Parameters
+    ----------
+    resource: a :datapackage.Resource: instance
+    sequences_profiles_to_resource: the mapping of sequence profile
+        headers to their resource name
+
+    Returns
+    -------
+    The :datapackage.Resource: instance with updated "foreignKeys" field
+
+    """
+    r = resource
+    data = pd.DataFrame.from_records(r.read(keyed=True))
+    # TODO not sure this should be set here
+    r.descriptor["schema"]["primaryKey"] = "name"
+    if "foreignKeys" not in r.descriptor["schema"]:
+        r.descriptor["schema"]["foreignKeys"] = []
+
+    for field in r.schema.fields:
+        # an empty resource yields a DataFrame without any column
+        if field.type == "string" and field.name in data:
+            for potential_fk in data[field.name].dropna().unique():
+                if potential_fk in sequences_profiles_to_resource:
+                    # this is actually a wrong format and should be
+                    # with a "fields" field under the "reference" fields
+
+                    fk = {
+                        "fields": field.name,
+                        "reference": {
+                            "resource": sequences_profiles_to_resource[
+                                potential_fk
+                            ],
+                        },
+                    }
+
+                    if fk not in r.descriptor["schema"]["foreignKeys"]:
+                        r.descriptor["schema"]["foreignKeys"].append(fk)
+    r.commit()
+    return r
+
+
+def infer_package_foreign_keys(package):
+    """Infer the foreign_keys from elements and sequences and update meta data
+
+    Parameters
+    ----------
+    package: a :datapackage.Package: instance, it is updated in place
+
+    Returns
+    -------
+    None
+    """
+    p = package
+    sequences_profiles_to_resource = map_sequence_profiles_to_resource_name(p)
+    # iterate over a copy as the resources are replaced within the loop
+    for r in list(p.resources):
+        if os.sep + "elements" + os.sep in r.descriptor["path"]:
+            r = infer_resource_foreign_keys(r, sequences_profiles_to_resource)
+            # sort the foreign keys entries alphabetically by their fields
+            r.descriptor["schema"]["foreignKeys"].sort(
+                key=lambda x: x["fields"]
+            )
+            p.remove_resource(r.name)
+            p.add_resource(r.descriptor)
+
+
+def infer_metadata_from_data(
+    path,
+    package_name="default-name",
+    metadata_filename="datapackage.json",
+):
+    """Creates a metadata .json file at the root-folder of datapackage
+
+    The foreign keys are inferred from the csv files within
+    "data/elements" and "data/sequences" resources.
+
+    Parameters
+    ----------
+    path: string
+        Absolute path to root-folder of the datapackage
+    package_name: string
+        Name of the data package
+    metadata_filename: string
+        Name of the inferred metadata file.
+
+    Returns
+    -------
+    None, a json metadata file is saved at the root-folder of the
+    datapackage under the provided path.
+    """
+
+    # Infer the fields from the package data
+    path = os.path.abspath(path)
+    p0 = Package(base_path=path)
+    p0.infer(os.path.join(path, "**" + os.sep + "*.csv"))
+    p0.commit()
+    p0.save(os.path.join(path, metadata_filename))
+
+    foreign_keys = {}
+
+    def infer_resource_basic_foreign_keys(resource):
+        """Prepare foreign_keys dict for building.infer_metadata
+
+        Compare the fields of a resource to a list of field names known
+        to be foreign keys. If the field name is within the list, it is
+        used to populate the dict 'foreign_keys'
+        """
+        for field in resource.schema.fields:
+            if field.name in config.SPECIAL_FIELD_NAMES:
+                fk_descriptor = config.SPECIAL_FIELD_NAMES[field.name]
+                if fk_descriptor in foreign_keys:
+                    if resource.name not in foreign_keys[fk_descriptor]:
+                        foreign_keys[fk_descriptor].append(resource.name)
+                else:
+                    foreign_keys[fk_descriptor] = [resource.name]
+
+    for r in p0.resources:
+        if os.sep + "elements" + os.sep in r.descriptor["path"]:
+            infer_resource_basic_foreign_keys(r)
+    # this function saves the metadata of the package in json format
+    infer_metadata(
+        package_name=package_name,
+        path=path,
+        foreign_keys=foreign_keys,
+        metadata_filename=metadata_filename,
+    )
+
+    # reload the package from the saved json file
+    p = Package(os.path.join(path, metadata_filename))
+    infer_package_foreign_keys(p)
+    p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
+    p.commit()
+    p.save(os.path.join(path, metadata_filename))
+
+
 def infer_metadata(
     package_name="default-name",
     keep_resources=False,
@@ -231,6 +434,7 @@ def infer_metadata(
             )
             p.add_resource(r.descriptor)
 
+    p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
     p.commit()
     p.save(metadata_filename)
 

diff --git a/src/oemof/tabular/examples/datapackages/dispatch/datapackage.json b/src/oemof/tabular/examples/datapackages/dispatch/datapackage.json
@@ -1,6 +1,6 @@
 {
     "profile": "tabular-data-package",
-    "name": "oemof-tabular-dispatch-example",
+    "name": "dispatch-example",
     "oemof_tabular_version": "0.0.6dev",
     "resources": [
         {
@@ -435,4 +435,4 @@
             }
         }
     ]
-}
+}
diff --git a/src/oemof/tabular/examples/datapackages/dispatch/scripts/infer.py b/src/oemof/tabular/examples/datapackages/dispatch/scripts/infer.py
@@ -11,7 +11,7 @@
     kwargs = {}
 
 building.infer_metadata(
-    package_name="oemof-tabular-dispatch-example",
+    package_name="dispatch-example",
     foreign_keys={
         "bus": ["volatile", "dispatchable", "storage", "load"],
         "profile": ["load", "volatile"],