Fix #738 #740

Merged (6 commits, Apr 15, 2024)

Changes from all commits

67 changes: 26 additions & 41 deletions doc/examples/prepare_timeseries.ipynb
@@ -8,13 +8,7 @@
"# Preprocessing user-provided time series\n",
"*Developed by D. Brakenhoff, Artesia, R Caljé, Artesia and R.A. Collenteur, Eawag, January (2021-2023)*\n",
"\n",
"This notebooks shows how to solve the most common errors that arise during the validation of the user provided time series. After showing how to deal with some of the easier errors, we will dive into the topic of making time series equidistant. For this purpose Pastas contains a lot of helper functions. \n",
"\n",
"<div class=\"alert alert-info\">\n",
"\n",
"<b>Note</b> \n",
To see the error messages returned by the `validate_series` method, you have to uncomment the code line. This is done to make sure the notebook still runs automatically for our documentation website.
"</div>\n"
"This notebooks shows how to solve the most common errors that arise during the validation of the user provided time series. After showing how to deal with some of the easier errors, we will dive into the topic of making time series equidistant. For this purpose Pastas contains a lot of helper functions."
]
},
{
@@ -703,7 +697,7 @@
"\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -739,7 +733,7 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1)\n",
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1, sort=True)\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -813,7 +807,7 @@
"s_pastas.plot(ax=ax, marker=\".\", label=\"get_equidistant_series_nearest\")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -831,7 +825,7 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1)\n",
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas], axis=1, sort=True)\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -902,9 +896,9 @@
"# Create figure\n",
"plt.figure(figsize=(10, 6))\n",
"ax = series.plot(marker=\"o\", label=\"original\", ms=10)\n",
"s_pd2.plot(ax=ax, marker=\"x\", ms=10, label=\"pandas_equidistant_nearest\")\n",
"s_pd3.plot(ax=ax, marker=\"^\", ms=8, label=\"pandas_equidistant_asfreq\")\n",
"s_pd1.plot(ax=ax, marker=\"+\", ms=16, label=\"pandas_equidistant_sample\")\n",
"# s_pd2.plot(ax=ax, marker=\"x\", ms=10, label=\"pandas_equidistant_nearest\")\n",
"# s_pd3.plot(ax=ax, marker=\"^\", ms=8, label=\"pandas_equidistant_asfreq\")\n",
"# s_pd1.plot(ax=ax, marker=\"+\", ms=16, label=\"pandas_equidistant_sample\")\n",
"s_pastas1.plot(\n",
" ax=ax, marker=\".\", ms=6, label=\"get_equidistant_series_nearest (minimize data loss)\"\n",
")\n",
@@ -913,7 +907,7 @@
")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
@@ -939,7 +933,9 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1)\n",
"dfall = pd.concat(\n",
" [series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1, sort=True\n",
")\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -1019,7 +1015,7 @@
")\n",
"ax.grid(True)\n",
"ax.legend(loc=\"best\")\n",
"ax.set_xlabel(\"\");"
"ax.set_xlabel(\"\")"
]
},
{
Expand All @@ -1037,7 +1033,9 @@
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1)\n",
"dfall = pd.concat(\n",
" [series, s_pd1, s_pd2, s_pd3, s_pastas2, s_pastas1], axis=1, sort=True\n",
")\n",
"dfall.columns = [\n",
" \"original\",\n",
" \"pandas_equidistant_sample\",\n",
@@ -1061,9 +1059,7 @@
"cell_type": "code",
"execution_count": null,
"id": "950db6ec",
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"valueskept = dfall.apply(values_kept, args=(dfall[\"original\"],))\n",
@@ -1073,30 +1069,19 @@
"\n",
"pd.concat([valueskept, duplicates], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0de4e67b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"vscode": {
"interpreter": {
"hash": "dace5e1b41a98a8e52d2a8eebc3b981caf2c12e7a76736ebfb89a489e3b62e79"
}
"name": "python"
}
},
"nbformat": 4,
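
The notebook changes above do three things: drop the trailing semicolons that were only suppressing the text repr of the last matplotlib call, comment out the pandas-based plots in one figure, and pass sort=True to pd.concat so the union of the original and equidistant indices stays chronological. A minimal sketch of the comparison the notebook builds, reusing the public helpers exercised in the test below (the timestamps here are illustrative, not the notebook's data):

import numpy as np
import pandas as pd

import pastas as ps

# irregular input: a 2-hourly stretch followed by a 1-hourly stretch
idx = pd.date_range("2000-01-01 18:00", freq="2h", periods=3).tolist()
idx += pd.date_range("2000-01-02 01:30", freq="1h", periods=5).tolist()
series = pd.Series(np.arange(len(idx), dtype=float), index=idx)

freq = "2h"
s_pd1 = ps.ts.pandas_equidistant_sample(series, freq)
s_pd2 = ps.ts.pandas_equidistant_nearest(series, freq)
s_pd3 = ps.ts.pandas_equidistant_asfreq(series, freq)
s_pas = ps.ts.get_equidistant_series_nearest(series, freq)

# sort=True keeps the combined DatetimeIndex in chronological order
dfall = pd.concat([series, s_pd1, s_pd2, s_pd3, s_pas], axis=1, sort=True)
print(dfall)
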
9 changes: 8 additions & 1 deletion pastas/timeseries_utils.py
@@ -357,7 +357,14 @@ def get_equidistant_series_nearest(
"""

# build new equidistant index
idx = date_range(series.index[0], series.index[-1], freq=freq)
t_offset = _get_time_offset(series.index, freq).value_counts().idxmax()
    # use the most common time offset (t_offset) so the new index keeps as
    # much of the original data as possible without shifting it in time
idx = date_range(
series.index[0].floor(freq) + t_offset,
series.index[-1].ceil(freq) + t_offset,
freq=freq,
)

# get linear interpolated index from original series
fl = interpolate.interp1d(
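
This is the actual fix for #738: instead of anchoring the equidistant grid on the first timestamp, the new code floors/ceils the endpoints to the grid frequency and shifts the grid by the most common time offset in the index, so grid points land where most observations actually are. A sketch of that logic using only public pandas calls (the offset computation mimics what _get_time_offset is assumed to return, each timestamp's offset from its freq-floor; the private helper's internals may differ):

import pandas as pd

freq = "2h"
index = pd.DatetimeIndex(
    [
        "2000-01-01 01:30",
        "2000-01-01 03:30",
        "2000-01-01 05:30",
        "2000-01-01 06:00",
    ]
)

# offset of each timestamp within its 2-hour window; pick the modal one
offsets = pd.Series(index - index.floor(freq))
t_offset = offsets.value_counts().idxmax()  # Timedelta("0 days 01:30:00")

# grid aligned on the modal offset instead of on index[0]
idx = pd.date_range(
    index[0].floor(freq) + t_offset,
    index[-1].ceil(freq) + t_offset,
    freq=freq,
)
print(idx)  # 01:30, 03:30, 05:30, 07:30 -- three of four observations hit exactly
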
53 changes: 52 additions & 1 deletion tests/test_timeseries_utils.py
@@ -1,6 +1,9 @@
import pastas as ps
import numpy as np
import pandas as pd
import pytest

import pastas as ps


def test_frequency_is_supported():
ps.ts._frequency_is_supported("D")
@@ -14,3 +17,51 @@ def test_get_stress_dt():
assert ps.ts._get_stress_dt("7D") == 7.0
assert ps.ts._get_stress_dt("W") == 7.0
assert ps.ts._get_stress_dt("SMS") == 15.0


def test_time_series_sampling_methods():
    # helper functions to compare how well each method retains the original data
def values_kept(s, original):
diff = set(original.dropna().values) & set(s.dropna().values)
return len(diff)

def n_duplicates(s):
return (s.value_counts() >= 2).sum()

# Create timeseries
freq = "2h"
freq2 = "1h"
idx0 = pd.date_range("2000-01-01 18:00:00", freq=freq, periods=3).tolist()
idx1 = pd.date_range("2000-01-02 01:30:00", freq=freq2, periods=10).tolist()
idx0 = idx0 + idx1
idx0[3] = pd.Timestamp("2000-01-02 01:31:00")
series = pd.Series(index=idx0, data=np.arange(len(idx0), dtype=float))
series.iloc[8:10] = np.nan

# Create equidistant timeseries
s_pd1 = ps.ts.pandas_equidistant_sample(series, freq)
s_pd2 = ps.ts.pandas_equidistant_nearest(series, freq)
s_pd3 = ps.ts.pandas_equidistant_asfreq(series, freq)
s_pastas1 = ps.ts.get_equidistant_series_nearest(
series, freq, minimize_data_loss=False
)
s_pastas2 = ps.ts.get_equidistant_series_nearest(
series, freq, minimize_data_loss=True
)

dfall = pd.concat(
[series, s_pd1, s_pd2, s_pd3, s_pastas1, s_pastas2], axis=1, sort=True
)
dfall.columns = [
"original",
"pandas_equidistant_sample",
"pandas_equidistant_nearest",
"pandas_equidistant_asfreq",
"get_equidistant_series_nearest (default)",
"get_equidistant_series_nearest (minimize data loss)",
]
valueskept = dfall.apply(values_kept, args=(dfall["original"],))
duplicates = dfall.apply(n_duplicates)
print(valueskept)
assert np.all(valueskept.values == np.array([11, 4, 7, 8, 8, 9]))
assert np.all(duplicates.values == np.array([0, 0, 2, 0, 0, 0]))
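
The expected duplicate counts (2 for pandas_equidistant_nearest, 0 for the others) come from how nearest-reindexing works: a single observation can be the nearest value for several grid points, so it gets repeated. A toy illustration with plain pandas (illustrative data, not the test series; pandas_equidistant_nearest is presumably built on a reindex of this kind):

import pandas as pd

s = pd.Series(
    [1.0, 2.0],
    index=pd.DatetimeIndex(["2000-01-01 00:00", "2000-01-01 05:00"]),
)
grid = pd.date_range("2000-01-01 00:00", periods=6, freq="1h")
print(s.reindex(grid, method="nearest"))
# 00:00-02:00 all receive 1.0 and 03:00-05:00 all receive 2.0, so values
# repeat -- exactly what n_duplicates counts in the test above.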