# Há correlação entre a temperatura e o consumo de água?

In [17]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
pd.set_option('display.max_columns', None)

In [2]:
# Capacity constant used to turn the percentage level into a volume
# (presumably liters, going by the derived column name — TODO confirm the unit).
RESERVOIR_TOTAL_CAPACITY = 1_000_000

df = pd.read_parquet("../data/preprocessed_data/water_consumption_cleaned_0.parquet")

# Stored volume implied by the percentage level reading.
df["reservoir_level_liters"] = (df["reservoir_level_(%)"] / 100) * RESERVOIR_TOTAL_CAPACITY
# Seconds elapsed since the previous row (NaN on the first row).
df["time_passed_seconds"] = df["timestamp"].diff().dt.total_seconds()
# Row-to-row change in stored volume; negative when the level dropped.
df["liters_entered"] = df["reservoir_level_liters"].diff()

# Outflow = inflow minus the rate of change of the stored volume.
df["flow_out_(l/s)"] = df["flow_in_(l/s)"] - df["liters_entered"] / df["time_passed_seconds"]

df.head()

Unnamed: 0,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_entered,flow_out_(l/s)
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,60000.0,49.877493
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,3000.0,45.64
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,3400.0,42.973333
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,3000.0,45.64


In [3]:
# The merged file names three sensor columns differently from `df`;
# rename them so both frames share the same column labels.
df_weather = pd.read_parquet(
    "../data/preprocessed_data/water_consumption_merged_0.parquet"
).rename(
    columns={
        "flow_in_l_s": "flow_in_(l/s)",
        "pressure_mca": "pressure_(mca)",
        "reservoir_level_percentage": "reservoir_level_(%)",
    }
)
df_weather.head()

Unnamed: 0,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,total_precip_mm,station_pressure_mb,max_pressure_last_hour_mb,min_pressure_last_hour_mb,global_radiation_kj_m2,air_temp_c,dew_point_temp_c,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,min_dew_point_last_hour_c,max_humidity_last_hour_percentage,min_humidity_last_hour_percentage,relative_humidity_percentage,wind_direction_deg,max_wind_gust_m_s,wind_speed_m_s
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,0.0,1014.6,1014.6,1013.7,626.7,26.0,22.6,26.0,22.4,22.6,21.1,92.0,81.0,82.0,93.0,1.8,0.4
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8


In [4]:
# Show both column lists, one per line, to see which columns the frames share.
print(list(df_weather.columns), list(df.columns), sep="\n")

['timestamp', 'flow_in_(l/s)', 'reservoir_level_(%)', 'pressure_(mca)', 'gmb_1_is_on', 'gmb_2_is_on', 'total_precip_mm', 'station_pressure_mb', 'max_pressure_last_hour_mb', 'min_pressure_last_hour_mb', 'global_radiation_kj_m2', 'air_temp_c', 'dew_point_temp_c', 'max_temp_last_hour_c', 'min_temp_last_hour_c', 'max_dew_point_last_hour_c', 'min_dew_point_last_hour_c', 'max_humidity_last_hour_percentage', 'min_humidity_last_hour_percentage', 'relative_humidity_percentage', 'wind_direction_deg', 'max_wind_gust_m_s', 'wind_speed_m_s']
['timestamp', 'flow_in_(l/s)', 'reservoir_level_(%)', 'pressure_(mca)', 'gmb_1_is_on', 'gmb_2_is_on', 'reservoir_level_liters', 'time_passed_seconds', 'liters_entered', 'flow_out_(l/s)']


In [5]:
# Inner-join the sensor frame with the weather frame on every column they share.
df_merged = df.merge(
    df_weather,
    how="inner",
    on=[
        "timestamp",
        "flow_in_(l/s)",
        "reservoir_level_(%)",
        "pressure_(mca)",
        "gmb_1_is_on",
        "gmb_2_is_on",
    ],
)
df_merged.head()

Unnamed: 0,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_entered,flow_out_(l/s),total_precip_mm,station_pressure_mb,max_pressure_last_hour_mb,min_pressure_last_hour_mb,global_radiation_kj_m2,air_temp_c,dew_point_temp_c,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,min_dew_point_last_hour_c,max_humidity_last_hour_percentage,min_humidity_last_hour_percentage,relative_humidity_percentage,wind_direction_deg,max_wind_gust_m_s,wind_speed_m_s
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,,0.0,1014.6,1014.6,1013.7,626.7,26.0,22.6,26.0,22.4,22.6,21.1,92.0,81.0,82.0,93.0,1.8,0.4
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,60000.0,49.877493,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,3000.0,45.64,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,3400.0,42.973333,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,3000.0,45.64,0.0,1015.1,1015.1,1014.6,1413.7,27.9,22.0,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8


In [7]:
# Weather features are the merged columns that did not come from the sensor
# frame `df`. Selecting them by name (instead of a hard-coded positional
# offset) keeps the selection correct if columns are added or removed upstream.
weather_variables = [column for column in df_merged.columns if column not in df.columns]

# Print the correlation (Series.corr default method) of each weather variable
# with the derived outflow column.
for variable in weather_variables:
    correlation_with_flow_out = df_merged[variable].corr(df_merged["flow_out_(l/s)"])
    print(f"Correlation between {variable} and flow_out_(l/s): {correlation_with_flow_out}")

Correlation between total_precip_mm and flow_out_(l/s): -0.0010198629728348752
Correlation between station_pressure_mb and flow_out_(l/s): 0.010533330991995011
Correlation between max_pressure_last_hour_mb and flow_out_(l/s): 0.010187994239547788
Correlation between min_pressure_last_hour_mb and flow_out_(l/s): 0.01028066643837175
Correlation between global_radiation_kj_m2 and flow_out_(l/s): 0.024448767621066222
Correlation between air_temp_c and flow_out_(l/s): 0.0024020431904104014
Correlation between dew_point_temp_c and flow_out_(l/s): -0.0014179148685445093
Correlation between max_temp_last_hour_c and flow_out_(l/s): 0.00162815735516202
Correlation between min_temp_last_hour_c and flow_out_(l/s): -0.0017765168873905596
Correlation between max_dew_point_last_hour_c and flow_out_(l/s): -0.0008455481589044989
Correlation between min_dew_point_last_hour_c and flow_out_(l/s): -0.004530875272484287
Correlation between max_humidity_last_hour_percentage and flow_out_(l/s): -0.00408212840