In [1]:
# Install darts (published on PyPI as `u8darts`) together with its `torch` extra.
# `--quiet` suppresses most of pip's output.
# NOTE(review): the version is not pinned, so a fresh run may pull a different
# darts release than the one this notebook was written against — consider pinning.
!pip install u8darts[torch] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.6/200.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.1/823.1 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.4/354.4 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Let's scale and store all the train data for the models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Mount Google Drive (Colab only) so the dataset files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# CSV holding the training data to be scaled. The "reduced" version of the dataset
# is the active one; the other path is kept for reference:
#   "/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/train.csv"
file_path = "/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/train_reduced.csv"

# Read the file and parse the Date column into datetimes in one expression.
df = pd.read_csv(file_path, low_memory=False).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


Mounted at /content/drive


In [4]:
# Display the loaded training frame to check its columns and date range.
df

Unnamed: 0,id,Date,District,Use,Number of meters,Accumulated consumption (L/day),Consumption per meter,Year,Month
0,0801501001_Badalona,2022-01-01,1,Domestic,250,16242,64.97,2022,1
1,0801501001_Badalona,2022-01-02,1,Domestic,251,17477,69.63,2022,1
2,0801501001_Badalona,2022-01-03,1,Domestic,251,16540,65.90,2022,1
3,0801501001_Badalona,2022-01-04,1,Domestic,251,17153,68.34,2022,1
4,0801501001_Badalona,2022-01-05,1,Domestic,251,18619,74.18,2022,1
...,...,...,...,...,...,...,...,...,...
605895,0819403001_Barcelona,2023-12-27,3,Domestic,41,254,6.20,2023,12
605896,0819403001_Barcelona,2023-12-28,3,Domestic,41,246,6.00,2023,12
605897,0819403001_Barcelona,2023-12-29,3,Domestic,41,229,5.59,2023,12
605898,0819403001_Barcelona,2023-12-30,3,Domestic,41,258,6.29,2023,12


In [5]:
# Split every meter's history, by row position after sorting on Date, into
# train / validation / calibration+test parts.
# NOTE(review): a positional split only lines up with calendar dates if every
# meter has exactly one row per day from the same start date — confirm.
TRAIN_END = 365   # rows [0, 365)   -> train
VAL_END = 485     # rows [365, 485) -> validation; rows [485, ...) -> calibration/test

# Collect the per-meter slices in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated rows
# on every iteration and starts from an empty frame, which recent pandas
# versions warn about.
train_parts = []
val_parts = []
calib_test_parts = []

# A single groupby pass replaces one full boolean scan of `df` per meter.
# sort=False keeps the meters in order of first appearance, the same order
# that df['id'].unique() produces.
for meter_id, df_id in df.groupby('id', sort=False):
    df_id = df_id.sort_values('Date')

    train_parts.append(df_id.iloc[:TRAIN_END])
    val_parts.append(df_id.iloc[TRAIN_END:VAL_END])
    calib_test_parts.append(df_id.iloc[VAL_END:])

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_df = pd.concat(val_parts) if val_parts else pd.DataFrame()
calib_test_df = pd.concat(calib_test_parts) if calib_test_parts else pd.DataFrame()

# Print first and last dates for each split as a sanity check
print("Train set:")
print(f"Start: {train_df['Date'].min()}, End: {train_df['Date'].max()}")
print("\nValidation set:")
print(f"Start: {val_df['Date'].min()}, End: {val_df['Date'].max()}")
print("\nCalibration and test set:")
print(f"Start: {calib_test_df['Date'].min()}, End: {calib_test_df['Date'].max()}")

Train set:
Start: 2022-01-01 00:00:00, End: 2022-12-31 00:00:00

Validation set:
Start: 2023-01-01 00:00:00, End: 2023-04-30 00:00:00

Calibration and test set:
Start: 2023-05-01 00:00:00, End: 2023-12-31 00:00:00


In [6]:
from darts.dataprocessing.transformers import Scaler
from darts import TimeSeries
import pandas as pd
import joblib

# Build one scaled darts TimeSeries per meter for the train and validation
# splits, keeping the fitted scaler of each meter so the scaling can be
# reproduced or inverted later.

# Date is already parsed when the CSV is loaded; this is a cheap safeguard
# in case the split frames were rebuilt some other way.
for sub_df in [train_df, val_df]:
    sub_df['Date'] = pd.to_datetime(sub_df['Date'])

# Per-meter results, keyed by meter id
scaled_train_series = {}
scaled_val_series = {}
scalers = {}

# Derive the progress total from the data instead of hard-coding it, so the
# progress line stays truthful if the dataset changes.
meter_ids = df['id'].unique()
n_meters = len(meter_ids)

for i, meter_id in enumerate(meter_ids, start=1):
    # Train rows of this meter, in chronological order, as a TimeSeries
    train_id_df = train_df[train_df['id'] == meter_id].sort_values('Date')
    train_series = TimeSeries.from_dataframe(train_id_df, time_col='Date', value_cols='Consumption per meter')

    # Validation rows of this meter, in chronological order, as a TimeSeries
    val_id_df = val_df[val_df['id'] == meter_id].sort_values('Date')
    val_series = TimeSeries.from_dataframe(val_id_df, time_col='Date', value_cols='Consumption per meter')

    # Fit the scaler on the train split only; the validation split is then
    # transformed with those same parameters.
    # The default Scaler() is a min-max scaler between 0 and 1. Alternatives:
    #   scaler = Scaler(scaler=StandardScaler())
    #   scaler = Scaler(scaler=MinMaxScaler(feature_range=(1, 2)))
    scaler = Scaler()
    scaler.fit(train_series)

    # Transform the series
    train_scaled = scaler.transform(train_series)
    val_scaled = scaler.transform(val_series)

    # Store results
    scaled_train_series[meter_id] = train_scaled
    scaled_val_series[meter_id] = val_scaled
    scalers[meter_id] = scaler

    # Print quick summary for progress
    print(f"Processed meter: {i}/{n_meters}")


Processed meter: 1/908
Processed meter: 2/908
Processed meter: 3/908
Processed meter: 4/908
Processed meter: 5/908
Processed meter: 6/908
Processed meter: 7/908
Processed meter: 8/908
Processed meter: 9/908
Processed meter: 10/908
Processed meter: 11/908
Processed meter: 12/908
Processed meter: 13/908
Processed meter: 14/908
Processed meter: 15/908
Processed meter: 16/908
Processed meter: 17/908
Processed meter: 18/908
Processed meter: 19/908
Processed meter: 20/908
Processed meter: 21/908
Processed meter: 22/908
Processed meter: 23/908
Processed meter: 24/908
Processed meter: 25/908
Processed meter: 26/908
Processed meter: 27/908
Processed meter: 28/908
Processed meter: 29/908
Processed meter: 30/908
Processed meter: 31/908
Processed meter: 32/908
Processed meter: 33/908
Processed meter: 34/908
Processed meter: 35/908
Processed meter: 36/908
Processed meter: 37/908
Processed meter: 38/908
Processed meter: 39/908
Processed meter: 40/908
Processed meter: 41/908
Processed meter: 42/908
P

In [7]:
# Bundle the scaled train/validation series and the fitted per-meter scalers
# into one dictionary and persist it as a single joblib file.
save_data = dict(
    train=scaled_train_series,
    validation=scaled_val_series,
    scalers=scalers,
)

# Alternative output path (not used):
#   '/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/scaled_train_increased_minmax.pkl'
joblib.dump(
    save_data,
    '/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/scaled_train_reduced.pkl',
)


['/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/scaled_train_reduced.pkl']

In [8]:
# Repeat the loading step for the test file. The "reduced" version of the
# dataset is the active one; the other path is kept for reference:
#   "/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/test.csv"
file_path = "/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/test_reduced.csv"

# Read the file and parse the Date column into datetimes in one expression.
df_test = pd.read_csv(file_path, low_memory=False).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [9]:
# Split every test meter's history, by row position after sorting on Date,
# into train / buffer / test parts.
# NOTE(review): a positional split only lines up with calendar dates if every
# meter has exactly one row per day from the same start date — confirm.
TRAIN_END = 365   # rows [0, 365)   -> train (same train length as for the training file, to keep it coherent)
VAL_END = 485     # rows [365, 485) -> buffer; rows [485, ...) -> test

# Collect the per-meter slices in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated rows
# on every iteration and starts from an empty frame, which recent pandas
# versions warn about.
train_parts = []
val_parts = []
calib_test_parts = []

# A single groupby pass replaces one full boolean scan of `df_test` per meter.
# sort=False keeps the meters in order of first appearance, the same order
# that df_test['id'].unique() produces.
for meter_id, df_id in df_test.groupby('id', sort=False):
    df_id = df_id.sort_values('Date')

    train_parts.append(df_id.iloc[:TRAIN_END])
    val_parts.append(df_id.iloc[TRAIN_END:VAL_END])
    calib_test_parts.append(df_id.iloc[VAL_END:])

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_df = pd.concat(val_parts) if val_parts else pd.DataFrame()
calib_test_df = pd.concat(calib_test_parts) if calib_test_parts else pd.DataFrame()

# Print first and last dates for each split as a sanity check
print("Train set:")
print(f"Start: {train_df['Date'].min()}, End: {train_df['Date'].max()}")
print("\nBuffer set:")
print(f"Start: {val_df['Date'].min()}, End: {val_df['Date'].max()}")
print("\nTest set:")
print(f"Start: {calib_test_df['Date'].min()}, End: {calib_test_df['Date'].max()}")

Train set:
Start: 2022-01-01 00:00:00, End: 2022-12-31 00:00:00

Buffer set:
Start: 2023-01-01 00:00:00, End: 2023-04-30 00:00:00

Test set:
Start: 2023-05-01 00:00:00, End: 2023-12-31 00:00:00


In [10]:
# Build one scaled darts TimeSeries per test meter for the train, buffer
# (validation) and calibration/test splits, keeping the fitted scaler of each
# meter so the scaling can be reproduced or inverted later.

# Date is already parsed when the CSV is loaded; this is a cheap safeguard
# in case the split frames were rebuilt some other way.
for sub_df in [train_df, val_df, calib_test_df]:
    sub_df['Date'] = pd.to_datetime(sub_df['Date'])

# Per-meter results, keyed by meter id
scaled_train_series = {}
scaled_val_series = {}
scaled_calib_test_series = {}
scalers = {}

# Derive the progress total from the data instead of hard-coding it, so the
# progress line stays truthful if the dataset changes.
meter_ids = df_test['id'].unique()
n_meters = len(meter_ids)

for i, meter_id in enumerate(meter_ids, start=1):
    # Train rows of this meter, in chronological order, as a TimeSeries
    train_id_df = train_df[train_df['id'] == meter_id].sort_values('Date')
    train_serie = TimeSeries.from_dataframe(train_id_df, time_col='Date', value_cols='Consumption per meter')

    # Validation (buffer) rows of this meter
    val_id_df = val_df[val_df['id'] == meter_id].sort_values('Date')
    val_serie = TimeSeries.from_dataframe(val_id_df, time_col='Date', value_cols='Consumption per meter')

    # Calibration/test rows of this meter
    calib_test_id_df = calib_test_df[calib_test_df['id'] == meter_id].sort_values('Date')
    calib_test_serie = TimeSeries.from_dataframe(calib_test_id_df, time_col='Date', value_cols='Consumption per meter')

    # Fit the scaler on the train split only; the other splits are then
    # transformed with those same parameters.
    # The default Scaler() is a min-max scaler between 0 and 1. Alternatives
    # (both must still be fitted on `train_serie`, this cell's train series):
    #   scaler = Scaler(scaler=StandardScaler())
    #   scaler = Scaler(scaler=MinMaxScaler(feature_range=(1, 2)))
    scaler = Scaler()
    scaler.fit(train_serie)

    # Transform the series
    train_scaled = scaler.transform(train_serie)
    val_scaled = scaler.transform(val_serie)
    calib_test_scaled = scaler.transform(calib_test_serie)

    # Store results
    scaled_train_series[meter_id] = train_scaled
    scaled_val_series[meter_id] = val_scaled
    scaled_calib_test_series[meter_id] = calib_test_scaled
    scalers[meter_id] = scaler

    # Print quick summary for progress
    print(f"Processed meter: {i}/{n_meters}")


Processed meter: 1/60
Processed meter: 2/60
Processed meter: 3/60
Processed meter: 4/60
Processed meter: 5/60
Processed meter: 6/60
Processed meter: 7/60
Processed meter: 8/60
Processed meter: 9/60
Processed meter: 10/60
Processed meter: 11/60
Processed meter: 12/60
Processed meter: 13/60
Processed meter: 14/60
Processed meter: 15/60
Processed meter: 16/60
Processed meter: 17/60
Processed meter: 18/60
Processed meter: 19/60
Processed meter: 20/60
Processed meter: 21/60
Processed meter: 22/60
Processed meter: 23/60
Processed meter: 24/60
Processed meter: 25/60
Processed meter: 26/60
Processed meter: 27/60
Processed meter: 28/60
Processed meter: 29/60
Processed meter: 30/60
Processed meter: 31/60
Processed meter: 32/60
Processed meter: 33/60
Processed meter: 34/60
Processed meter: 35/60
Processed meter: 36/60
Processed meter: 37/60
Processed meter: 38/60
Processed meter: 39/60
Processed meter: 40/60
Processed meter: 41/60
Processed meter: 42/60
Processed meter: 43/60
Processed meter: 44/

In [11]:
# Bundle the scaled train/validation/calibration-test series and the fitted
# per-meter scalers into one dictionary and persist it as a single joblib file.
save_data = dict(
    train=scaled_train_series,
    validation=scaled_val_series,
    calib_test=scaled_calib_test_series,
    scalers=scalers,
)

# Alternative output path (not used):
#   '/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test/scaled_test_increased_minmax.pkl'
joblib.dump(
    save_data,
    '/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/scaled_test_reduced.pkl',
)


['/content/drive/MyDrive/Colab Notebooks/TFG/data/01_datasets/train_test_reduced/scaled_test_reduced.pkl']