Fix #738 #740

Merged (6 commits, Apr 15, 2024)

Changes from all commits

67 changes: 26 additions & 41 deletions doc/examples/prepare_timeseries.ipynb
@@ -8,13 +8,7 @@
"# Preprocessing user-provided time series\n",
"*Developed by D. Brakenhoff, Artesia, R Caljé, Artesia and R.A. Collenteur, Eawag, January (2021-2023)*\n",
"\n",
"This notebooks shows how to solve the most common errors that arise during the validation of the user provided time series. After showing how to deal with some of the easier errors, we will dive into the topic of making time series equidistant. For this purpose Pastas contains a lot of helper functions. \n",
"\n",
"<div class=\"alert alert-info\">\n",
"\n",
"<b>Note</b> \n",
To see the error messages returned by the `validate_series` method, you have to uncomment the code line. This is done to make sure the notebook still runs automatically for our documentation website.
"</div>\n"
"This notebooks shows how to solve the most common errors that arise during the validation of the user provided time series. After showing how to deal with some of the easier errors, we will dive into the topic of making time series equidistant. For this purpose Pastas contains a lot of helper functions."
]
},
{
@@ -703,7 +697,7 @@
"\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -739,7 +733,7 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1)\n",
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1, sort=True)\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -813,7 +807,7 @@
"s_pastas.plot(ax=ax, marker=\".\", label=\"get_equidistant_series_nearest\")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -831,7 +825,7 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1)\n",
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1, sort=True)\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -902,9 +896,9 @@
"# Create figure\n",
"plt.figure(figsize=(10, 6))\n",
"ax = series.plot(marker=\"o\", label=\"original\", ms=10)\n",
"s_pd2.plot(ax=ax, marker=\"x\", ms=10, label=\"pandas_equidistant_nearest\")\n",
"s_pd3.plot(ax=ax, marker=\"^\", ms=8, label=\"pandas_equidistant_asfreq\")\n",
"s_pd1.plot(ax=ax, marker=\"+\", ms=16, label=\"pandas_equidistant_sample\")\n",
"# s_pd2.plot(ax=ax, marker=\"x\", ms=10, label=\"pandas_equidistant_nearest\")\n",
"# s_pd3.plot(ax=ax, marker=\"^\", ms=8, label=\"pandas_equidistant_asfreq\")\n",
"# s_pd1.plot(ax=ax, marker=\"+\", ms=16, label=\"pandas_equidistant_sample\")\n",
"s_pastas1.plot(\n",
" ax=ax, marker=\".\", ms=6, label=\"get_equidistant_series_nearest (minimize data loss)\"\n",
")\n",
@@ -913,7 +907,7 @@
")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -939,7 +933,9 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1)\n",
"dfall = pd.concat(\n",
" [series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1, sort=True\n",
")\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -1019,7 +1015,7 @@
")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
Expand All @@ -1037,7 +1033,9 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1)\n",
"dfall = pd.concat(\n",
" [series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1, sort=True\n",
")\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -1061,9 +1059,7 @@
"cell_type": "code",
"execution_count": null,
"id": "950db6ec",
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"valueskept = dfall.apply(values_kept, args=(dfall[\"original\"],))\n",
@@ -1073,30 +1069,19 @@
"\n",
"pd.concat([valueskept, duplicates], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0de4e67b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"vscode": {
"interpreter": {
"hash": "dace5e1b41a98a8e52d2a8eebc3b981caf2c12e7a76736ebfb89a489e3b62e79"
}
"name": "python"
}
},
"nbformat": 4,
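
The notebook changes above do three things: drop the trailing semicolons that were only suppressing the text repr of the last matplotlib call, comment out the pandas-based plots in one figure, and pass sort=True to pd.concat so the union of the original and equidistant indices stays chronological. A minimal sketch of the comparison the notebook builds, reusing the public helpers exercised in the test below (the timestamps here are illustrative, not the notebook's data):

import numpy as np
import pandas as pd

import pastas as ps

# irregular input: a 2-hourly stretch followed by a 1-hourly stretch
idx = pd.date_range("2000-01-01 18:00", freq="2h", periods=3).tolist()
idx += pd.date_range("2000-01-02 01:30", freq="1h", periods=5).tolist()
series = pd.Series(np.arange(len(idx), dtype=float), index=idx)

freq = "2h"
s_pd1 = ps.ts.pandas_equidistant_sample(series, freq)
s_pd2 = ps.ts.pandas_equidistant_nearest(series, freq)
s_pd3 = ps.ts.pandas_equidistant_asfreq(series, freq)
s_pas = ps.ts.get_equidistant_series_nearest(series, freq)

# sort=True keeps the combined DatetimeIndex in chronological order
dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pas], axis=1, sort=True)
print(dfall)
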
9 changes: 8 additions & 1 deletion pastas/timeseries_utils.py
@@ -357,7 +357,14 @@ def get_equidistant_series_nearest(
"""

# build new equidistant index
idx = date_range(series.index[0], series.index[-1], freq=freq)
t_offset = _get_time_offset(series.index, freq).value_counts().idxmax()
    # use the most common time offset (t_offset) so the new index keeps as
    # much of the original data as possible without shifting it in time
idx = date_range(
series.index[0].floor(freq) + t_offset,
series.index[-1].ceil(freq) + t_offset,
freq=freq,
)

# get linear interpolated index from original series
fl = interpolate.interp1d(
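
This is the actual fix for #738: instead of anchoring the equidistant grid on the first timestamp, the new code floors/ceils the endpoints to the grid frequency and shifts the grid by the most common time offset in the index, so grid points land where most observations actually are. A sketch of that logic using only public pandas calls (the offset computation mimics what _get_time_offset is assumed to return, each timestamp's offset from its freq-floor; the private helper's internals may differ):

import pandas as pd

freq = "2h"
index = pd.DatetimeIndex(
    [
        "2000-01-01 01:30",
        "2000-01-01 03:30",
        "2000-01-01 05:30",
        "2000-01-01 06:00",
    ]
)

# offset of each timestamp within its 2-hour window; pick the modal one
offsets = pd.Series(index - index.floor(freq))
t_offset = offsets.value_counts().idxmax()  # Timedelta("0 days 01:30:00")

# grid aligned on the modal offset instead of on index[0]
idx = pd.date_range(
    index[0].floor(freq) + t_offset,
    index[-1].ceil(freq) + t_offset,
    freq=freq,
)
print(idx)  # 01:30, 03:30, 05:30, 07:30 -- three of four observations hit exactly
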
53 changes: 52 additions & 1 deletion tests/test_timeseries_utils.py
@@ -1,6 +1,9 @@
import pastas as ps
import numpy as np
import pandas as pd
import pytest

import pastas as ps


def test_frequency_is_supported():
ps.ts._frequency_is_supported("D")
@@ -14,3 +17,51 @@ def test_get_stress_dt():
assert ps.ts._get_stress_dt("7D") == 7.0
assert ps.ts._get_stress_dt("W") == 7.0
assert ps.ts._get_stress_dt("SMS") == 15.0


def test_time_series_sampling_methods():
    # helper functions to compare how well each method retains the original data
def values_kept(s, original):
diff = set(original.dropna().values) & set(s.dropna().values)
return len(diff)

def n_duplicates(s):
return (s.value_counts() >= 2).sum()

# Create timeseries
freq = "2h"
freq2 = "1h"
idx0 = pd.date_range("2000-01-01 18:00:00", freq=freq, periods=3).tolist()
idx1 = pd.date_range("2000-01-02 01:30:00", freq=freq2, periods=10).tolist()
idx0 = idx0 + idx1
idx0[3] = pd.Timestamp("2000-01-02 01:31:00")
series = pd.Series(index=idx0, data=np.arange(len(idx0), dtype=float))
series.iloc[8:10] = np.nan

# Create equidistant timeseries
s_pd1 = ps.ts.pandas_equidistant_sample(series, freq)
s_pd2 = ps.ts.pandas_equidistant_nearest(series, freq)
s_pd3 = ps.ts.pandas_equidistant_asfreq(series, freq)
s_pastas1 = ps.ts.get_equidistant_series_nearest(
series, freq, minimize_data_loss=False
)
s_pastas2 = ps.ts.get_equidistant_series_nearest(
series, freq, minimize_data_loss=True
)

dfall = pd.concat(
[series, s_pd1, s_pd2, s_pd3, s_pastas1, s_pastas2], axis=1, sort=True
)
dfall.columns = [
"original",
"pandas_equidistant_sample",
"pandas_equidistant_nearest",
"pandas_equidistant_asfreq",
"get_equidistant_series_nearest (default)",
"get_equidistant_series_nearest (minimize data loss)",
]
valueskept = dfall.apply(values_kept, args=(dfall["original"],))
duplicates = dfall.apply(n_duplicates)
print(valueskept)
assert np.all(valueskept.values == np.array([11, 4, 7, 8, 8, 9]))
assert np.all(duplicates.values == np.array([0, 0, 2, 0, 0, 0]))
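
The expected duplicate counts (2 for pandas_equidistant_nearest, 0 for the others) come from how nearest-reindexing works: a single observation can be the nearest value for several grid points, so it gets repeated. A toy illustration with plain pandas (illustrative data, not the test series; pandas_equidistant_nearest is presumably built on a reindex of this kind):

import pandas as pd

s = pd.Series(
    [1.0, 2.0],
    index=pd.DatetimeIndex(["2000-01-01 00:00", "2000-01-01 05:00"]),
)
grid = pd.date_range("2000-01-01 00:00", periods=6, freq="1h")
print(s.reindex(grid, method="nearest"))
# 00:00-02:00 all receive 1.0 and 03:00-05:00 all receive 2.0, so values
# repeat -- exactly what n_duplicates counts in the test above.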