In [None]:
# Necessary imports to read the datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# We have to properly select a dataset for testing.
# The CSV lives on Google Drive, so Drive is mounted into the Colab
# filesystem first (Colab-only import; this cell will not run elsewhere).
from google.colab import drive
drive.mount('/content/drive')

# Google Colab path of the dataset under the mounted Drive
file_path = "/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/df_domestic_cleaned_modified.csv"
# low_memory=False: let pandas infer each column's dtype from the whole file
# instead of chunk by chunk (avoids mixed-type columns).
df = pd.read_csv(file_path, low_memory=False)

# 'Date' is converted to datetime64 so that the date-based sorting and
# idxmin()/idxmax() calls further down operate on real dates.
df['Date'] = pd.to_datetime(df['Date'])


Mounted at /content/drive


In [None]:
def plot_consumption_by_id(dataframe, id_value, column='Accumulated consumption (L/day)'):
    """
    Draw one column of a single ID as a line chart over time.

    The rows whose 'id' equals `id_value` are selected and ordered by 'Date'
    before plotting. When no row matches, a message is printed and nothing
    is drawn.

    Parameters:
    - dataframe: DataFrame containing at least 'id', 'Date' and `column`
    - id_value: the ID whose rows are plotted
    - column: name of the column shown on the y-axis
      (default is 'Accumulated consumption (L/day)')

    Returns:
    - None; the figure is displayed with plt.show()
    """
    matches_id = dataframe['id'] == id_value
    series = dataframe.loc[matches_id].sort_values('Date')

    # Guard clause: unknown ID -> report it and skip the plot entirely
    if series.empty:
        print(f"No data found for ID: {id_value}")
        return

    # Wide, short canvas suits a long daily series
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(series['Date'], series[column], linestyle='-', linewidth=1.2)
    ax.set_title(f"{column} for: {id_value}")
    ax.set_xlabel("Date")
    ax.set_ylabel(column)
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()

In [None]:
# Preview of the loaded frame (the notebook renders a truncated head/tail view)
df

Unnamed: 0,id,Date,District,Use,Number of meters,Accumulated consumption (L/day),Consumption per meter,Year
0,0801501001_Badalona,2022-01-01,1,Domestic,250,16242,64.97,2022
1,0801501001_Badalona,2022-01-02,1,Domestic,251,17477,69.63,2022
2,0801501001_Badalona,2022-01-03,1,Domestic,251,16540,65.90,2022
3,0801501001_Badalona,2022-01-04,1,Domestic,251,17153,68.34,2022
4,0801501001_Badalona,2022-01-05,1,Domestic,251,18619,74.18,2022
...,...,...,...,...,...,...,...,...
706635,0819403001_Barcelona,2023-12-27,3,Domestic,41,254,6.20,2023
706636,0819403001_Barcelona,2023-12-28,3,Domestic,41,246,6.00,2023
706637,0819403001_Barcelona,2023-12-29,3,Domestic,41,229,5.59,2023
706638,0819403001_Barcelona,2023-12-30,3,Domestic,41,258,6.29,2023


In [None]:
# Build the strata for the stratified random selection of test time series.
# Each ID is bucketed by how much its "Number of meters" grew between its
# first and its last recorded day; the sampling itself happens in a later cell.

# Group by ID and count entries
series_counts = df.groupby('id').size()

# Keep only IDs with exactly 730 entries (complete series)
valid_ids = series_counts[series_counts == 730].index

print(f"Number of valid time series with exactly 730 entries: {len(valid_ids)}")

# Get first and last "Number of meters" per id.
# The rows are sorted chronologically inside each ID first, so first()/last()
# really are the earliest and latest readings and do not depend on the row
# order of the CSV.
meters_by_id = df.sort_values(['id', 'Date']).groupby('id')['Number of meters']
first_meters = meters_by_id.first()
last_meters = meters_by_id.last()

# Compute increase, restricted to the valid (complete) series so that an
# incomplete series can never end up in a stratum and be sampled for testing.
increase = (last_meters - first_meters).loc[valid_ids]

# Define ID groups based on increase:
#   low: < 100 meters, medium: 100-199 meters, high: >= 200 meters
low_increase_ids = increase[increase < 100].index
medium_increase_ids = increase[(increase >= 100) & (increase < 200)].index
high_increase_ids = increase[increase >= 200].index

# Create three DataFrames, one per stratum, holding all rows of the IDs in it
df_low_increase = df[df['id'].isin(low_increase_ids)]
df_medium_increase = df[df['id'].isin(medium_increase_ids)]
df_high_increase = df[df['id'].isin(high_increase_ids)]


Number of valid time series with exactly 730 entries: 968


In [None]:
# Number of distinct series (IDs) that fell into each increase stratum
n_low_series = df_low_increase['id'].nunique()
n_medium_series = df_medium_increase['id'].nunique()
n_high_series = df_high_increase['id'].nunique()

print(f"Series with <100 increase: {n_low_series}")
print(f"Series with 100-199 increase: {n_medium_series}")
print(f"Series with ≥200 increase: {n_high_series}")


Series with <100 increase: 334
Series with 100-199 increase: 386
Series with ≥200 increase: 248


In [None]:
# Candidate IDs of each stratum, ordered low -> medium -> high increase
group_ids = [
    stratum['id'].unique()
    for stratum in (df_low_increase, df_medium_increase, df_high_increase)
]

# How many series to draw from each stratum, aligned with group_ids.
# These are absolute sample sizes (30 + 20 + 10 = 60), not probabilities.
weights = [30, 20, 10]

# Fixed seed so the same IDs are drawn on every run
np.random.seed(10)

# Accumulates the IDs drawn from all strata
selected_ids = []

# Draw without replacement from one stratum at a time
for ids_group, n_samples in zip(group_ids, weights):
    sampled = np.random.choice(ids_group, size=n_samples, replace=False)
    selected_ids += list(sampled)

# selected_ids now holds every sampled ID, stratum after stratum
print(f"Total selected IDs: {len(selected_ids)}")
print(selected_ids)


Total selected IDs: 60
['0810102007_Hospitalet', '0810102017_Hospitalet', '0801906002_Barcelona', '0801910092_Barcelona', '0801908049_Barcelona', '0810104040_Hospitalet', '0801903048_Barcelona', '0810102045_Hospitalet', '0801908046_Barcelona', '0810101040_Hospitalet', '0810106012_Hospitalet', '0810105032_Hospitalet', '0810106013_Hospitalet', '0801903040_Barcelona', '0801503008_Badalona', '0810106023_Hospitalet', '0801903035_Barcelona', '0801908043_Barcelona', '0801903060_Barcelona', '0801903006_Barcelona', '0810102048_Hospitalet', '0801910054_Barcelona', '0801903081_Barcelona', '0801904030_Barcelona', '0810101041_Hospitalet', '0801906010_Barcelona', '0810105003_Hospitalet', '0810102008_Hospitalet', '0801903067_Barcelona', '0801910053_Barcelona', '0801902037_Barcelona', '0801907011_Barcelona', '0801910119_Barcelona', '0801907013_Barcelona', '0810102033_Hospitalet', '0801907115_Barcelona', '0801901042_Barcelona', '0801905091_Barcelona', '0801908111_Barcelona', '0801906041_Barcelona', '08

In [None]:
# Every distinct ID present in the full DataFrame
all_ids_df = df['id'].unique()

# IDs that were not sampled = all IDs minus the selected ones
selected_id_set = set(selected_ids)
unselected_ids = list(set(all_ids_df) - selected_id_set)

# The three counts should satisfy: total = selected + unselected
print(f"Total IDs in df: {len(all_ids_df)}")
print(f"Selected IDs: {len(selected_ids)}")
print(f"Unselected IDs: {len(unselected_ids)}")


Total IDs in df: 968
Selected IDs: 60
Unselected IDs: 908


In [None]:
# Visual check of every selected series: total consumption first, then the
# per-meter consumption, one figure each.
for series_id in selected_ids:
    for plotted_column in ('Accumulated consumption (L/day)', 'Consumption per meter'):
        plot_consumption_by_id(df, series_id, column=plotted_column)


Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Spot-check a single series: how much did its number of meters change
# between its first and its last recorded day?

# Define your specific id
target_id = '0801901042_Barcelona'  # change this as needed

# Filter the df for that id
df_id = df[df['id'] == target_id]

# Fail early with a readable message; idxmax()/idxmin() on an empty
# selection would otherwise raise a far less helpful error.
if df_id.empty:
    raise ValueError(f"No data found for ID: {target_id}")

# Get the row with the latest date
latest_row = df_id.loc[df_id['Date'].idxmax()]
latest_date = latest_row['Date']
latest_meters = latest_row['Number of meters']

# Get the row with the earliest date
earliest_row = df_id.loc[df_id['Date'].idxmin()]
earliest_date = earliest_row['Date']
earliest_meters = earliest_row['Number of meters']

# Calculate the change.
# Kept under its own name: `increase` already holds the per-ID Series built
# when the strata were defined, and rebinding it to a scalar here would
# silently change what that name means for any cell run afterwards.
meters_change = latest_meters - earliest_meters

# Print the results
print(f"ID: {target_id}")
print(f"First day ({earliest_date}): {earliest_meters} meters")
print(f"Last day ({latest_date}): {latest_meters} meters")
print(f"Change in number of meters: {meters_change} meters")


ID: 0801901042_Barcelona
First day (2022-01-01 00:00:00): 567 meters
Last day (2023-12-31 00:00:00): 747 meters
Change in number of meters: 180 meters


In [None]:
# Split by series, not by row: every row of a selected ID goes to the test
# set, every other row to the training set.
is_test_row = df['id'].isin(selected_ids)
test_df = df[is_test_row]
train_df = df[~is_test_row]


In [None]:
# Persist both splits as CSV on the mounted Drive; index=False leaves the
# DataFrame index out of the files.
# NOTE(review): the train_test/ folder presumably has to exist already, since
# to_csv is not asked to create it here — confirm before a fresh run.
test_df.to_csv('/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/test.csv', index=False)
train_df.to_csv('/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/train.csv', index=False)