Merge pull request #458 from openeemeter/update-docker-and-dependencies

Updated Dockerfile and Pipfile.lock and fixed tests
openeemeter · Mar 11, 2023 · 6d31d34 · 6d31d34
2 parents be460bf + d90999c
commit 6d31d34
Show file tree

Hide file tree

Showing 18 changed files with 2,040 additions and 1,186 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,13 @@ Changelog
 Development
 -----------
 
-* Placeholder
+* Update python version in Dockerfile.
+* Update other dependencies (including adding rust) in Dockerfile.
+* Remove pinned dependencies in Pipfile.
+* Relock Pipfile (and do so inside of the docker image).
+* Update the pytest suite to account for changes in newer pandas, where categorical columns are no longer included in `df.sum().sum()`.
+
+
 
 3.1.1
 -----

diff --git a/Dockerfile b/Dockerfile
@@ -1,10 +1,10 @@
-FROM python:3.6.6
+FROM python:3.10
 
 RUN set -ex && pip install pip pipenv --upgrade
 
 # sphinxcontrib-spelling dependency
 RUN apt-get update \
-  && apt-get install -yqq libenchant-dev
+  && apt-get install -yqq libenchant-2-dev
 
 COPY Pipfile Pipfile
 COPY Pipfile.lock Pipfile.lock

diff --git a/Pipfile b/Pipfile
@@ -7,18 +7,18 @@ name = "pypi"
 
 [packages]
 
-click = "==7.0"
-eeweather = ">=0.3.12"
+click = "*"
 matplotlib = "*"
-statsmodels = "==0.11.1"
-scipy = "==1.4.1"
-sqlalchemy = "*"
-pandas = "==0.25.2"
+statsmodels = "*"
+scipy = "*"
+pandas = "*"
 
 
 [dev-packages]
 
-black = "==18.6b4"
+sqlalchemy = "*"
+eeweather = ">=0.3.12"
+black = "*"
 coverage = "*"
 jupyterlab = "*"
 nbsphinx = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/docs/Makefile b/docs/Makefile
@@ -16,7 +16,7 @@ help:
 
 # Custom target for autobuild (philngo)
 livehtml:
-	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)"/html -H 0.0.0.0 -p 8000 --poll -z ../eemeter
+	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)"/html --host 0.0.0.0 --port 8000 --watch ../eemeter
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).

diff --git a/docs/conf.py b/docs/conf.py
@@ -201,4 +201,4 @@
 
 
 def setup(app):
-    app.add_stylesheet("css/custom.css")  # may also be an URL
+    app.add_css_file("css/custom.css")  # may also be a URL
diff --git a/eemeter/cli.py b/eemeter/cli.py
@@ -65,13 +65,11 @@ def cli():
 def _get_data(
     sample, meter_file, temperature_file, heating_balance_points, cooling_balance_points
 ):
-
     if sample is not None:
         with resource_stream("eemeter.samples", "metadata.json") as f:
             metadata = json.loads(f.read().decode("utf-8"))
         if sample in metadata:
             click.echo("Loading sample: {}".format(sample))
-
             meter_file = resource_stream(
                 "eemeter.samples", metadata[sample]["meter_data_filename"]
             )
@@ -106,7 +104,10 @@ def _get_data(
         heating_balance_points=heating_balance_points,
         cooling_balance_points=cooling_balance_points,
     )
-    return merge_features([usage_per_day, temperature_features])
+    merged_features = merge_features([usage_per_day, temperature_features])
+    # usage column must be `meter_value` for model fitting to work
+    merged_features.rename(columns={"usage_per_day": "meter_value"}, inplace=True)
+    return merged_features
 
 
 @cli.command()

diff --git a/eemeter/derivatives.py b/eemeter/derivatives.py
@@ -37,7 +37,7 @@ def _compute_ols_error(
 ):
     ols_model_agg_error = (
         (t_stat * rmse_base_residuals * post_obs)
-        / (base_obs ** 0.5)
+        / (base_obs**0.5)
         * (1.0 + ((base_avg - post_avg) ** 2.0 / base_var)) ** 0.5
     )
 
@@ -46,7 +46,7 @@ def _compute_ols_error(
     )
 
     ols_total_agg_error = (
-        ols_model_agg_error ** 2.0 + ols_noise_agg_error ** 2.0
+        ols_model_agg_error**2.0 + ols_noise_agg_error**2.0
     ) ** 0.5
 
     return ols_total_agg_error, ols_model_agg_error, ols_noise_agg_error
@@ -75,7 +75,7 @@ def _compute_fsu_error(
 
     fsu_error_band = total_base_energy * (
         t_stat
-        * (a_coeff * months_reporting ** 2.0 + b_coeff * months_reporting + c_coeff)
+        * (a_coeff * months_reporting**2.0 + b_coeff * months_reporting + c_coeff)
         * (rmse_base_residuals / base_avg)
         * ((base_obs / nprime) * (1.0 + (2.0 / nprime)) * (1.0 / post_obs)) ** 0.5
     )
@@ -372,7 +372,7 @@ def _compute_error_bands_modeled_savings(
         "FSU Error Band: Baseline": fsu_error_band_baseline,
         "FSU Error Band: Reporting": fsu_error_band_reporting,
         "FSU Error Band": (
-            fsu_error_band_baseline ** 2.0 + fsu_error_band_reporting ** 2.0
+            fsu_error_band_baseline**2.0 + fsu_error_band_reporting**2.0
         )
         ** 0.5,
     }

diff --git a/eemeter/features.py b/eemeter/features.py
@@ -233,7 +233,6 @@ def _compute_columns(temps):
         def _compute_columns(temps):
             count = temps.shape[0]
             if count > 24:
-
                 day_groups = np.floor(np.arange(count) / 24)
                 daily_temps = temps.groupby(day_groups).agg(["mean", "count"])
                 n_limit_period = percent_hourly_coverage_per_billing_period * count
@@ -267,7 +266,6 @@ def _compute_columns(temps):
                     for bp in heating_balance_points
                 }
             else:  # faster route for daily case, should have same effect.
-
                 if count > n_limit_daily:
                     count_cols = {"n_days_kept": 1, "n_days_dropped": 0}
                     # CalTRACK 2.2.2.3
@@ -833,7 +831,6 @@ def compute_temperature_bin_features(temperatures, bin_endpoints):
     bins = {}
 
     for i, (left_bin, right_bin) in enumerate(zip(bin_endpoints, bin_endpoints[1:])):
-
         bin_name = "bin_{}".format(i)
 
         in_bin = (temperatures > left_bin) & (temperatures <= right_bin)

diff --git a/eemeter/metrics.py b/eemeter/metrics.py
@@ -384,12 +384,10 @@ def __init__(
             or self.degrees_of_freedom < 1
             or self.observed_length < self.num_parameters
         ):
-
             self.cvrmse_auto_corr_correction = None
             self.approx_factor_auto_corr_correction = None
             self.fsu_base_term = None
         else:
-
             # factor to correct cvrmse_adj for autocorrelation of inputs
             # i.e., divide by (n' - n_param) instead of by (n - n_param)
             self.cvrmse_auto_corr_correction = (
@@ -411,15 +409,18 @@ def __init__(
             )
 
     def __repr__(self):
-        return "ModelMetrics(merged_length={}, r_squared_adj={}, cvrmse_adj={}, " "mape_no_zeros={}, nmae={}, nmbe={}, autocorr_resid={}, confidence_level={})".format(
-            self.merged_length,
-            round(self.r_squared_adj, 3),
-            round(self.cvrmse_adj, 3),
-            round(self.mape_no_zeros, 3),
-            round(self.nmae, 3),
-            round(self.nmbe, 3),
-            round(self.autocorr_resid, 3),
-            round(self.confidence_level, 3),
+        return (
+            "ModelMetrics(merged_length={}, r_squared_adj={}, cvrmse_adj={}, "
+            "mape_no_zeros={}, nmae={}, nmbe={}, autocorr_resid={}, confidence_level={})".format(
+                self.merged_length,
+                round(self.r_squared_adj, 3),
+                round(self.cvrmse_adj, 3),
+                round(self.mape_no_zeros, 3),
+                round(self.nmae, 3),
+                round(self.nmbe, 3),
+                round(self.autocorr_resid, 3),
+                round(self.confidence_level, 3),
+            )
         )
 
     def json(self):

diff --git a/tests/test_caltrack_design_matrices.py b/tests/test_caltrack_design_matrices.py
@@ -30,7 +30,7 @@
 
 
 def test_create_caltrack_hourly_preliminary_design_matrix(
-    il_electricity_cdd_hdd_hourly
+    il_electricity_cdd_hdd_hourly,
 ):
     meter_data = il_electricity_cdd_hdd_hourly["meter_data"]
     temperature_data = il_electricity_cdd_hdd_hourly["temperature_data"]
@@ -47,6 +47,8 @@ def test_create_caltrack_hourly_preliminary_design_matrix(
         "n_hours_kept",
         "temperature_mean",
     ]
+    # In newer pandas, categorical columns (like hour_of_week) aren't included in DataFrame.sum()
+    design_matrix.hour_of_week = design_matrix.hour_of_week.astype(float)
     assert round(design_matrix.sum().sum(), 2) == 136352.61
 
 
@@ -386,6 +388,7 @@ def test_create_caltrack_hourly_segmented_design_matrices(
         "meter_value",
         "weight",
     ]
+    design_matrix.hour_of_week = design_matrix.hour_of_week.astype(float)
     assert round(design_matrix.sum().sum(), 2) == 126210.07
 
     design_matrix = design_matrices["mar-apr-may-weighted"]
@@ -397,11 +400,12 @@ def test_create_caltrack_hourly_segmented_design_matrices(
         "meter_value",
         "weight",
     ]
+    design_matrix.hour_of_week = design_matrix.hour_of_week.astype(float)
     assert round(design_matrix.sum().sum(), 2) == 167659.28
 
 
 def test_create_caltrack_billing_design_matrix_empty_temp(
-    il_electricity_cdd_hdd_billing_monthly
+    il_electricity_cdd_hdd_billing_monthly,
 ):
     meter_data = il_electricity_cdd_hdd_billing_monthly["meter_data"]
     temperature_data = il_electricity_cdd_hdd_billing_monthly["temperature_data"][:0]
@@ -412,7 +416,7 @@ def test_create_caltrack_billing_design_matrix_empty_temp(
 
 
 def test_create_caltrack_billing_design_matrix_partial_empty_temp(
-    il_electricity_cdd_hdd_billing_monthly
+    il_electricity_cdd_hdd_billing_monthly,
 ):
     meter_data = il_electricity_cdd_hdd_billing_monthly["meter_data"]
     temperature_data = il_electricity_cdd_hdd_billing_monthly["temperature_data"][:200]

diff --git a/tests/test_caltrack_hourly.py b/tests/test_caltrack_hourly.py
@@ -99,6 +99,7 @@ def test_caltrack_hourly_fit_feature_processor(
         "weight",
     ]
     assert result.shape == (24, 10)
+    result.hour_of_week = result.hour_of_week.astype(float)
     assert round(result.sum().sum(), 2) == 5916.0
 
 
@@ -127,6 +128,7 @@ def test_caltrack_hourly_prediction_feature_processor(
         "weight",
     ]
     assert result.shape == (24, 9)
+    result.hour_of_week = result.hour_of_week.astype(float)
     assert round(result.sum().sum(), 2) == 4956.0
 
 
@@ -458,7 +460,7 @@ def segmented_design_matrices_single_mode(
 
 
 def test_fit_caltrack_hourly_model_segment_single_mode(
-    segmented_design_matrices_single_mode
+    segmented_design_matrices_single_mode,
 ):
     segment_name = "dec-jan-feb-weighted"
     segment_data = segmented_design_matrices_single_mode[segment_name]

diff --git a/tests/test_caltrack_usage_per_day.py b/tests/test_caltrack_usage_per_day.py
@@ -674,7 +674,7 @@ def cdd_hdd_h54_c67_billing_monthly_totals(il_electricity_cdd_hdd_billing_monthl
 
 
 def test_caltrack_predict_design_matrix_input_avg_false_output_avg_true(
-    cdd_hdd_h54_c67_billing_monthly_totals
+    cdd_hdd_h54_c67_billing_monthly_totals,
 ):
     data = cdd_hdd_h54_c67_billing_monthly_totals
     prediction = _caltrack_predict_design_matrix(
@@ -694,7 +694,7 @@ def test_caltrack_predict_design_matrix_input_avg_false_output_avg_true(
 
 
 def test_caltrack_predict_design_matrix_input_avg_false_output_avg_false(
-    cdd_hdd_h54_c67_billing_monthly_totals
+    cdd_hdd_h54_c67_billing_monthly_totals,
 ):
     data = cdd_hdd_h54_c67_billing_monthly_totals
     prediction = _caltrack_predict_design_matrix(
@@ -730,7 +730,7 @@ def cdd_hdd_h54_c67_billing_monthly_avgs(il_electricity_cdd_hdd_billing_monthly)
 
 
 def test_caltrack_predict_design_matrix_input_avg_true_output_avg_false(
-    cdd_hdd_h54_c67_billing_monthly_avgs
+    cdd_hdd_h54_c67_billing_monthly_avgs,
 ):
     data = cdd_hdd_h54_c67_billing_monthly_avgs
     prediction = _caltrack_predict_design_matrix(
@@ -750,7 +750,7 @@ def test_caltrack_predict_design_matrix_input_avg_true_output_avg_false(
 
 
 def test_caltrack_predict_design_matrix_input_avg_true_output_avg_true(
-    cdd_hdd_h54_c67_billing_monthly_avgs
+    cdd_hdd_h54_c67_billing_monthly_avgs,
 ):
     data = cdd_hdd_h54_c67_billing_monthly_avgs
     prediction = _caltrack_predict_design_matrix(
@@ -792,7 +792,7 @@ def test_caltrack_predict_design_matrix_n_days(cdd_hdd_h54_c67_billing_monthly_t
 
 
 def test_caltrack_predict_design_matrix_no_days_fails(
-    cdd_hdd_h54_c67_billing_monthly_totals
+    cdd_hdd_h54_c67_billing_monthly_totals,
 ):
     # This makes sure that the method fails if neither n_days nor
     # a DatetimeIndex is available.
@@ -1443,7 +1443,9 @@ def test_select_best_candidate_ok(
     assert best_candidate.r_squared_adj == 1
 
 
-def test_select_best_candidate_none(candidate_model_disqualified,):
+def test_select_best_candidate_none(
+    candidate_model_disqualified,
+):
     candidates = [candidate_model_disqualified]
 
     best_candidate, warnings = select_best_candidate(candidates)