In [4]:
%pip install pandas


# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Load the combined data set from disk into a DataFrame
data = pd.read_csv("/workspaces/TeamCPH/data/combined_df_corrected.csv")

# Show the first few rows to verify the load. A bare `data.head()` is only
# rendered when it is the LAST statement of a cell; in the middle of a cell its
# value is silently discarded, so the preview is displayed explicitly.
display(data.head())

# Write the loaded frame back out unchanged (without the index column)
output_path = "/workspaces/TeamCPH/LP_model_df.csv"

data.to_csv(output_path, index=False)

print(f"LP_model_df exported to: {output_path}")

Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m57.7 MB/s[0m  [33m0:00:00[0m6m0:00:01[0m
[?25hDownloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m82.0 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Installing collected packages: pytz, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m

In [5]:
# Keep only complete rows: every row holding at least one missing value is dropped
prepared_data = data.dropna(axis=0, how="any")

# Report the (rows, columns) shape of the cleaned frame
print(prepared_data.shape)
# Preview the top of the cleaned frame (last expression -> rendered as the cell output)
prepared_data.head(n=5)

(10560, 22)


Unnamed: 0,Date,Holiday,NextDayHoliday,IsWeekend,Month,KielerWeek,IsNewYearsEve,IsHalloween,t,lag_1,...,year_sin1,year_cos1,year_sin2,year_cos2,Revenue,Product_2,Product_3,Product_4,Product_5,Product_6
28,2013-07-29,True,True,False,7,False,False,False,28,1755.80259,...,0.463258,0.886224,0.8211,0.570784,183.564164,False,False,False,False,False
29,2013-07-30,True,True,False,7,False,False,False,29,1581.050773,...,0.478434,0.878124,0.840248,0.542202,132.11164,False,False,False,False,False
30,2013-07-31,True,True,False,7,False,False,False,30,1600.847329,...,0.493468,0.869764,0.858402,0.512978,144.745432,False,False,False,False,False
31,2013-08-01,True,True,False,8,False,False,False,31,1560.649381,...,0.508356,0.861147,0.875539,0.483147,157.854023,False,False,False,False,False
32,2013-08-02,True,True,False,8,False,False,False,32,1512.753817,...,0.523094,0.852275,0.89164,0.452745,117.359342,False,False,False,False,False


In [6]:
# Render the cleaned frame with the notebook's rich display instead of print():
# print() emits a plain-text dump whose wide rows wrap across several lines,
# while display() shows the same truncated preview as a single table.
display(prepared_data)

             Date  Holiday  NextDayHoliday  IsWeekend  Month  KielerWeek  \
28     2013-07-29     True            True      False      7       False   
29     2013-07-30     True            True      False      7       False   
30     2013-07-31     True            True      False      7       False   
31     2013-08-01     True            True      False      8       False   
32     2013-08-02     True            True      False      8       False   
...           ...      ...             ...        ...    ...         ...   
10891  2018-07-27     True            True      False      7       False   
10892  2018-07-28     True            True       True      7       False   
10893  2018-07-29     True            True       True      7       False   
10894  2018-07-30     True            True      False      7       False   
10895  2018-07-31     True            True      False      7       False   

       IsNewYearsEve  IsHalloween    t        lag_1  ...  year_sin1  \
28             F

In [8]:
# Set the global NumPy seed for reproducibility: DataFrame.sample() below is
# called without `random_state` and therefore draws from NumPy's global RNG.
np.random.seed(42)

# Share of rows used for training and validation; the test set receives the
# remaining rows (roughly 10%).
TRAIN_FRACTION = 0.7
VALIDATION_FRACTION = 0.20

# Shuffle all rows into a NEW frame. `prepared_data` is deliberately left
# untouched: overwriting it would make this cell non-idempotent, because a
# re-run would shuffle the already shuffled frame and yield a different split.
# NOTE(review): the frame carries Date, t and lag_1 columns, which suggests
# ordered time-series rows; a random shuffle spreads neighbouring dates across
# the three splits - confirm this is intended.
shuffled_data = prepared_data.sample(frac=1).reset_index(drop=True)

# Calculate the number of rows for each dataset
n_total = len(shuffled_data)
n_training = int(TRAIN_FRACTION * n_total)
n_validation = int(VALIDATION_FRACTION * n_total)

# Slice the shuffled rows into consecutive training, validation and test parts
training_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Sanity check: the three parts must cover every row exactly once
assert len(training_data) + len(validation_data) + len(test_data) == n_total

# Separating features (every column except 'Revenue') and labels ('Revenue')
training_features = training_data.drop('Revenue', axis=1)
validation_features = validation_data.drop('Revenue', axis=1)
test_features = test_data.drop('Revenue', axis=1)

# Double brackets keep the labels as single-column DataFrames rather than Series
training_labels = training_data[['Revenue']]
validation_labels = validation_data[['Revenue']]
test_labels = test_data[['Revenue']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (7391, 21)
Validation features dimensions: (2112, 21)
Test features dimensions: (1057, 21)

Training labels dimensions: (7391, 1)
Validation labels dimensions: (2112, 1)
Test labels dimensions: (1057, 1)


In [9]:
# Target folder for the pickle files (created on demand, reused if present)
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Every split frame keyed by the file stem it is stored under;
# the mapping order is the order in which the files are written.
frames_to_export = {
    "training_features": training_features,
    "validation_features": validation_features,
    "test_features": test_features,
    "training_labels": training_labels,
    "validation_labels": validation_labels,
    "test_labels": test_labels,
}

# Write each frame to <subdirectory>/<stem>.pkl
for split_name, split_frame in frames_to_export.items():
    split_frame.to_pickle(f"{subdirectory}/{split_name}.pkl")