This repository has been archived by the owner on Nov 1, 2023. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 6
Improve metadata handling #90
Closed
Closed
Changes from 7 commits
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
ad3d6d3
:construction: Let operations between variables handle metadata properly
pabloarosado db6e0be
:white_check_mark: Update tests
pabloarosado 33d8a99
:sparkles: Minor improvements in style and documentation
pabloarosado ee9e981
:white_check_mark: Update tests
pabloarosado d1be5e3
:tada: Add metadata handling for all common dunder methods
pabloarosado 26b31ff
:white_check_mark: Add tests
pabloarosado a99c0df
:tada: Add merge function that handles metadata (WIP)
pabloarosado 3c732ab
feat: Let processing log properly store variable name
pabloarosado d34a294
test: Update tests
pabloarosado 43ea392
feat: Add other existing pandas method to properly handle metadata, a…
pabloarosado 8865d24
feat: Add processing log when loading, creating, or saving a table
pabloarosado 38159ae
test: Update tests
pabloarosado 1621297
feat: Improve metadata handling when reading new files and merging ta…
pabloarosado 967f3c7
feat: Add processing log entry after renaming columns, and other mino…
pabloarosado 51b0112
test: Update tests
pabloarosado 789fd0a
feat: Get sources and licenses from dataset if not defined for each v…
pabloarosado 65f767e
feat: Implement logic to handle metadata for melt function
pabloarosado 61bc8b1
feat: Implement logic to handle metadata for concat function
pabloarosado 89e6cf2
Merge branch 'master' of github.com:owid/owid-catalog-py into improve…
pabloarosado 2821139
style: Improve format
pabloarosado File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
"""Common operations performed on tables and variables. | ||
|
||
""" | ||
from .tables import concat, melt, merge, pivot, read_csv, read_excel | ||
|
||
__all__ = ["concat", "melt", "merge", "pivot", "read_csv", "read_excel"] | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -125,15 +125,21 @@ def read(cls, path: Union[str, Path]) -> "Table": | |
path = path.as_posix() | ||
|
||
if path.endswith(".csv"): | ||
return cls.read_csv(path) | ||
table = cls.read_csv(path) | ||
|
||
elif path.endswith(".feather"): | ||
return cls.read_feather(path) | ||
table = cls.read_feather(path) | ||
|
||
elif path.endswith(".parquet"): | ||
return cls.read_parquet(path) | ||
table = cls.read_parquet(path) | ||
else: | ||
raise ValueError(f"could not detect a suitable format to read from: {path}") | ||
|
||
# # Add processing log to the metadata of each variable in the table. | ||
# TODO: Unit tests fail when adding processing log to each variable. | ||
# table = _add_processing_log_to_each_loaded_variable(table) | ||
|
||
raise ValueError(f"could not detect a suitable format to read from: {path}") | ||
return table | ||
|
||
# Mypy complains about this not matching the definition of NDFrame.to_csv but I don't understand why | ||
def to_csv(self, path: Any, **kwargs: Any) -> None: # type: ignore | ||
|
@@ -505,3 +511,72 @@ def reset_index(self, *args, **kwargs) -> Optional["Table"]: # type: ignore | |
# preserve metadata in _fields, calling reset_index() on a table drops it | ||
t._fields = self._fields | ||
return t # type: ignore | ||
|
||
def merge(self, right, *args, **kwargs) -> "Table":
    """Merge this table with ``right``, preserving variable metadata.

    Thin wrapper around the module-level ``merge`` function; all extra
    positional and keyword arguments are forwarded to it.
    """
    # Pass `self` positionally: the original `merge(left=self, ..., *args)`
    # form would bind any positional argument to `left` a second time and
    # raise "got multiple values for argument 'left'".
    return merge(self, right, *args, **kwargs)
def melt(self, *args, **kwargs) -> "Table":
    """Melt this table, preserving metadata.

    Thin wrapper around the module-level ``melt`` function; all extra
    positional and keyword arguments are forwarded to it.
    """
    # Pass `self` positionally: `melt(frame=self, *args)` would bind a
    # positional argument to `frame` a second time and raise a TypeError.
    return melt(self, *args, **kwargs)
def pivot(self, *args, **kwargs) -> "Table":
    """Pivot this table, preserving metadata.

    Thin wrapper around the module-level ``pivot`` function; all extra
    positional and keyword arguments are forwarded to it.
    """
    # Pass `self` positionally: `pivot(data=self, *args)` would bind a
    # positional argument to `data` a second time and raise a TypeError.
    return pivot(self, *args, **kwargs)
def merge(left, right, *args, **kwargs) -> Table:
    """Merge two tables and carry over per-column variable metadata.

    Columns present in ``right`` take their metadata from ``right``;
    remaining columns that came from ``left`` take it from ``left``.
    """
    # TODO: This function needs further logic. For example, to handle
    # "on"/"left_on"/"right_on" columns, or suffixes, or overlapping columns
    # (that will end in "_x" and "_y" by default), or indexes.
    result = Table(pd.merge(left, right, *args, **kwargs))

    right_columns = set(right.columns)
    left_columns = set(left.columns)
    for column in result.columns:
        # Right wins on overlap (matches the original assignment order,
        # where the right-hand loop ran last and overwrote left's metadata).
        if column in right_columns:
            result[column].metadata = right[column].metadata
        elif column in left_columns:
            result[column].metadata = left[column].metadata

    return result
# TODO: Handle metadata and processing info for each of the following functions.
def concat(*args, **kwargs) -> Table:
    """Concatenate tables; wraps ``pd.concat`` and returns a Table."""
    concatenated = pd.concat(*args, **kwargs)
    return Table(concatenated)
def melt(*args, **kwargs) -> Table:
    """Unpivot a table from wide to long format; wraps ``pd.melt``."""
    melted = pd.melt(*args, **kwargs)
    return Table(melted)
def pivot(*args, **kwargs) -> Table:
    """Reshape a table by given index/columns/values; wraps ``pd.pivot``."""
    pivoted = pd.pivot(*args, **kwargs)
    return Table(pivoted)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, perhaps the table level metadata should be copied (excl: primary key). |
||
|
||
|
||
def read_csv(*args, **kwargs) -> Table:
    """Read a CSV file into a Table; wraps ``pd.read_csv``."""
    frame = pd.read_csv(*args, **kwargs)
    return Table(frame)
def read_excel(*args, **kwargs) -> Table:
    """Read an Excel file into a Table; wraps ``pd.read_excel``."""
    frame = pd.read_excel(*args, **kwargs)
    return Table(frame)
def _add_processing_log_to_each_loaded_variable(table):
    """Append a "load" entry to the processing log of every variable in a table.

    Records, per column, that the variable was loaded from this table
    (identified by a URI built from the dataset URI and table short name).
    Returns the table with its original index restored.
    """
    # Add entry to processing log, specifying that each variable was loaded from this table.
    # Generate a URI for the table.
    table_uri = f"{table.metadata.dataset.uri}/{table.metadata.short_name}"
    # Get the names of the columns currently used for the index of the table (if any).
    index_columns = table.metadata.primary_key

    # If the table has an index, reset it so we have access to all variables in the table.
    if len(index_columns) > 0:
        table = table.reset_index(drop=False)
    for column in table.columns:
        # If no processing log is found for a variable, start a new one.
        if table[column].metadata.processing_log is None:
            table[column].metadata.processing_log = []
        # Append the entry dict itself (the original wrapped it in a one-element
        # list, which nested entries as [[{...}], ...] instead of keeping the
        # log a flat list of dicts).
        table[column].metadata.processing_log.append(
            {"variable": column, "parents": [table_uri], "operation": "load"}
        )
    # Restore the original index, if there was one.
    if len(index_columns) > 0:
        table = table.set_index(index_columns)

    return table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cherry on top would be monkey-patching pandas if we are in the
enable_tracing
context manager. It's not necessary though; it might do more harm than good.