In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [None]:
# Folder that holds the data files, relative to the notebook's location.
data_path = '../data'

# Read the Parquet file into `df`.
# An alternative CSV load is kept here for reference only:
# df = pd.read_csv("../data/consum_total_agregat.csv")
parquet_file = os.path.join(data_path, 'parquet/1_ Consum total agregat.parquet')
df = pd.read_parquet(parquet_file)

# Preview the first rows of the loaded frame.
display(df.head())

In [None]:
# RENAME COLUMNS
# The rename is positional: the names below are applied to the frame's columns
# in their existing order, so the list length must match the column count.
english_column_names = [
    "CensusSection",
    "District",
    "Municipality",
    "Date",
    "Use",
    "NumMeters",
    "Consumption_L_day",
]
df.columns = english_column_names

# Preview the frame with its new headers.
display(df.head())

In [None]:
# Dataset dimensions as (rows, columns).
# Author's note: there are 963419 rows and 7 columns in the dataset.
print(df.shape)

# NUMBER OF DUPLICATE ROWS
# duplicated() flags every row that repeats an earlier one; summing the flags counts them.
repeated_row_flags = df.duplicated()
duplicate_rows = repeated_row_flags.sum()
print(f"Number of duplicate rows: {duplicate_rows}")

## Data Discovery

In [None]:
# CHECKING DATA TYPES AND NON-NULL COUNTS
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would append a stray "None" to the output.
df.info()

In [None]:
# SUMMARY STATISTICS
# Compute both summaries first, then show each under its own heading.
descriptive_stats = df.describe()
distinct_value_counts = df.nunique()

print("\nDescriptive statistics:")
display(descriptive_stats)

print("\nUnique values per column:")
display(distinct_value_counts)

In [None]:
# DATA TRANSFORMATION

# Date formatting: parse to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(df["Date"], errors='coerce')
df["Date"] = parsed_dates

# Standardizing 'Use' column: each combined multi-language label is replaced by
# its short English name; any other value is left untouched by replace().
use_label_map = {
    "Domèstic/Doméstico/Domestic": "Domestic",
    "Comercial/Comercial/Commercial": "Commercial",
    "Industrial/Industrial/Industrial": "Industrial",
}
df["Use"] = df["Use"].replace(use_label_map)

display(df.head())

In [None]:
# District normalization
# Raw distribution first: it shows that some districts are formatted differently.
print(df["District"].value_counts())

# 1) turn the ">" placeholder into a missing value,
# 2) coerce everything else to a number (unparseable entries become NaN),
# 3) keep only codes in the inclusive range 1..10; anything else becomes NaN.
district_codes = df["District"].replace(">", np.nan)
district_codes = pd.to_numeric(district_codes, errors="coerce")
df["District"] = district_codes.where(district_codes.between(1, 10))

# Distribution after cleaning (value_counts() omits NaN by default).
print(df["District"].value_counts())


In [None]:
# Lookup from numeric district code (1-10) to district name.
district_map = {
    1: "Ciutat Vella",
    2: "L'Eixample",
    3: "Sants-Montjuïc",
    4: "Les Corts",
    5: "Sarrià-Sant Gervasi",
    6: "Gràcia",
    7: "Horta-Guinardó",
    8: "Nou Barris",
    9: "Sant Andreu",
    10: "Sant Martí"
}

# Codes without an entry in the lookup (including NaN) map to NaN.
df["District_Name"] = df["District"].map(district_map)

# Spot-check the mapping on a few rows. The fixed seed keeps this preview
# identical on every run instead of changing with each execution.
print(df[["District", "District_Name"]].sample(5, random_state=42))

In [None]:
# ONLY KEEP RELEVANT COLUMNS
# .copy() makes the subset an independent frame. Without it pandas treats the
# result as a possible view of the original, and adding a column to `df`
# afterwards raises SettingWithCopyWarning.
relevant_columns = [
    "CensusSection",
    "District",
    "District_Name",
    "Date",
    "Use",
    "NumMeters",
    "Consumption_L_day",
]
df = df[relevant_columns].copy()

In [None]:
# HANDLING MISSING VALUES
# Count the missing entries in each column.
missing_per_column = df.isna().sum()

print("\nMissing values per column:")
print(missing_per_column)

# Do nothing for now, just report missing values

## EDA

In [None]:
# Use pie chart distribution / distribution of consumption by use type

# Fixed colour per use type.
use_colors = {
    "Domestic": "skyblue",
    "Commercial": "green",
    "Industrial": "orange"
}

# Reindexing to a fixed order makes both pies list the use types identically.
# A use type absent from the data is filled with 0 so pie() never receives NaN.
uses_order = ["Domestic", "Commercial", "Industrial"]
use_count = df["Use"].value_counts().reindex(uses_order, fill_value=0)
use_share = (
    df.groupby("Use")["Consumption_L_day"].sum().reindex(uses_order, fill_value=0)
)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One (axes, values, title) entry per pie; both pies share the same styling.
pie_panels = [
    (axes[0], use_count, "Distribution of Records by Use Type"),
    (axes[1], use_share, "Share of Total Water Consumption by Use Type"),
]
for ax, values, title in pie_panels:
    ax.pie(
        values,
        labels=values.index,
        autopct='%1.1f%%',
        startangle=90,
        counterclock=False,
        # Colours are looked up from the series actually being drawn, so each
        # wedge always matches its own label.
        colors=[use_colors[u] for u in values.index],
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# District distribution
# One tab10 colour per district, assigned in district_map order.
district_colors = {
    name: plt.cm.tab10(i)  # use matplotlib tab10 colormap
    for i, name in enumerate(district_map.values())
}

# Display order of the districts (same order as district_map).
districts_order = list(district_map.values())

# Record count and summed consumption per district, in display order.
district_count = df["District_Name"].value_counts().reindex(districts_order)
district_share = (
    df.groupby("District_Name")["Consumption_L_day"].sum().reindex(districts_order)
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One (axes, values, title) entry per bar chart; both share the same styling.
bar_panels = [
    (axes[0], district_count, "Distribution of Records by District"),
    (axes[1], district_share, "Total Water Consumption by District"),
]
for ax, values, title in bar_panels:
    ax.bar(
        values.index,
        values.values,
        color=[district_colors[d] for d in values.index],
    )
    ax.set_title(title)
    # Restyle the tick labels the bar plot already created rather than calling
    # set_xticklabels(), which is discouraged without fixed tick positions and
    # emits a warning on categorical axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# Consumption per meter for each record.
# Rows with zero meters get NaN instead of +/-inf from the division, so mean()
# skips them rather than turning a whole district's average into inf.
# NOTE(review): whether NumMeters == 0 actually occurs in this data is not
# visible here; this is a guard.
meters_nonzero = df["NumMeters"].where(df["NumMeters"] != 0)
df["Consumption_per_meter"] = df["Consumption_L_day"] / meters_nonzero

# Mean per-meter consumption by district, in the shared display order.
district_avg = df.groupby("District_Name")["Consumption_per_meter"].mean().reindex(districts_order)

plt.figure(figsize=(10,6))
sns.barplot(x=district_avg.index, y=district_avg.values, palette=district_colors)
plt.title("Average Daily Consumption per Meter by District")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Liters per meter per day")
plt.show()


In [None]:
# Share of each use type within every district.
# Total consumption per (district, use) pair.
use_totals = df.groupby(["District_Name", "Use"])["Consumption_L_day"].sum()

# transform("sum") broadcasts each district's total back onto its own rows, so
# the division keeps the original (District_Name, Use) index. The previous
# groupby(level=0).apply(lambda ...) form prepends the group key a second time
# on pandas >= 2.0, which leaves a duplicated index level after unstack().
district_totals = use_totals.groupby(level="District_Name").transform("sum")

# Rows: districts; columns: use types; values: fraction of the district total.
use_district_share = (use_totals / district_totals).unstack()

use_district_share.plot(kind="bar", stacked=True, figsize=(12,6), color=use_colors)
plt.title("Share of Consumption by Use Type within Each District")
plt.xticks(rotation=45, ha="right")
plt.xlabel("District")
plt.ylabel("Share of total consumption")
plt.legend(title="Use Type")
plt.tight_layout()
plt.show()


In [None]:
# Consumption over time
# Aggregate to one mean per (Date, Use) before plotting. The mean is seaborn's
# default line estimator, so the lines are the same as when plotting the raw
# frame, but seaborn no longer bootstraps a confidence band over every raw row,
# which is expensive on a frame of this size. The band is therefore not drawn.
daily_mean_by_use = (
    df.groupby(["Date", "Use"], as_index=False)["Consumption_L_day"].mean()
)

# Keep the hue order seaborn would infer from the raw frame (order of first
# appearance) so each use type keeps the same line colour.
use_hue_order = list(df["Use"].dropna().unique())

plt.figure(figsize=(12,6))
sns.lineplot(
    data=daily_mean_by_use,
    x="Date",
    y="Consumption_L_day",
    hue="Use",
    hue_order=use_hue_order,
)
plt.title("Water Consumption Over Time by Use Type")
plt.xlabel("Date")
plt.ylabel("Consumption (L/day)")
plt.legend(title="Use Type")
plt.tight_layout()
plt.show()

In [None]:
# Total consumption per district (rows) and date (columns).
pivot = df.pivot_table(index="District_Name", columns="Date", values="Consumption_L_day", aggfunc="sum")

# Format the date columns as YYYY-MM-DD strings. The heatmap uses the column
# labels as tick text, and raw timestamps would render as long nanosecond-
# precision ISO strings.
pivot.columns = pd.to_datetime(pivot.columns).strftime("%Y-%m-%d")

plt.figure(figsize=(14,6))
sns.heatmap(pivot, cmap="YlGnBu")
plt.title("Daily Water Consumption by District")
plt.xlabel("Date")
plt.ylabel("District")
plt.show()


In [None]:
# Sum meters and consumption over all records of each census section.
agg = (
    df.groupby("CensusSection", as_index=False)[["NumMeters", "Consumption_L_day"]]
    .sum()
)

# One point per census section.
ax = sns.scatterplot(x="NumMeters", y="Consumption_L_day", data=agg)
ax.set_title("Relationship between Number of Meters and Total Consumption")
ax.set_xlabel("Number of Meters")
ax.set_ylabel("Total Daily Consumption (L)")
plt.show()

# Correlation between the two measures on the record-level frame (not the
# per-section sums); shown as the cell's output.
df[["NumMeters", "Consumption_L_day"]].corr()