### Import Libraries and Data

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Location of the cleaned, model-ready dataset
DATA_PATH = '/workspaces/TeamCPH/data/combined_df_corrected_clean_for_nn.csv'

# Read the CSV into a DataFrame and preview the first rows as a quick sanity check
data = pd.read_csv(DATA_PATH)
data.head()


Unnamed: 0,Date,Holiday,NextDayHoliday,IsWeekend,Month,KielerWeek,IsNewYearsEve,IsHalloween,t,lag_1,...,year_sin1,year_cos1,year_sin2,year_cos2,Revenue,Product_2,Product_3,Product_4,Product_5,Product_6
0,2013-07-01,1,1,0,7,0,0,0,0,1269.2491,...,0.0,1.0,0.0,1.0,148.82835,0,0,0,0,0
1,2013-07-01,1,1,0,7,0,0,0,0,1269.2491,...,0.0,1.0,0.0,1.0,535.85626,1,0,0,0,0
2,2013-07-01,1,1,0,7,0,0,0,0,1269.2491,...,0.0,1.0,0.0,1.0,201.19843,0,1,0,0,0
3,2013-07-01,1,1,0,7,0,0,0,0,1269.2491,...,0.0,1.0,0.0,1.0,0.0,0,0,0,0,1
4,2013-07-01,1,1,0,7,0,0,0,0,1269.2491,...,0.0,1.0,0.0,1.0,317.47586,0,0,0,1,0


### Data Preparation

In [4]:
# No categorical columns are encoded in this notebook; the list is kept as a
# placeholder. Should such columns be added later, one-hot encode them with
# pd.get_dummies(features, columns=categorical_features, drop_first=True).
categorical_features = []

# Remove the Date column when it exists (it is not a numeric model input);
# errors="ignore" makes this a no-op if the column is already gone.
data = data.drop(columns=['Date'], errors='ignore')

# Target: Revenue, kept as a one-column DataFrame
labels = data.loc[:, ['Revenue']]

# Inputs: every remaining column
features = data.drop(columns='Revenue')

# Re-assemble a single frame with the target as the last column
prepared_data = pd.concat([features, labels], axis='columns')
prepared_data.head()


Unnamed: 0,Holiday,NextDayHoliday,IsWeekend,Month,KielerWeek,IsNewYearsEve,IsHalloween,t,lag_1,roll7_mean,...,year_sin1,year_cos1,year_sin2,year_cos2,Product_2,Product_3,Product_4,Product_5,Product_6,Revenue
0,1,1,0,7,0,0,0,0,1269.2491,1338.3954,...,0.0,1.0,0.0,1.0,0,0,0,0,0,148.82835
1,1,1,0,7,0,0,0,0,1269.2491,1338.3954,...,0.0,1.0,0.0,1.0,1,0,0,0,0,535.85626
2,1,1,0,7,0,0,0,0,1269.2491,1338.3954,...,0.0,1.0,0.0,1.0,0,1,0,0,0,201.19843
3,1,1,0,7,0,0,0,0,1269.2491,1338.3954,...,0.0,1.0,0.0,1.0,0,0,0,0,1,0.0
4,1,1,0,7,0,0,0,0,1269.2491,1338.3954,...,0.0,1.0,0.0,1.0,0,0,0,1,0,317.47586


### Selection of Training, Validation and Test Data

In [5]:
# Split configuration: 70% training, 20% validation, the remaining rows are the test set
SEED = 42
TRAIN_FRACTION = 0.7
VALIDATION_FRACTION = 0.20

# Seed NumPy's global generator as before
np.random.seed(SEED)

# Shuffle into a NEW frame with an explicit random_state instead of overwriting
# `prepared_data`. Overwriting made this cell non-idempotent: each re-run shuffled
# the already-shuffled rows again, so the split depended on how often the cell ran.
# NOTE(review): the rows carry lag/rolling features (lag_1, roll7_mean) and several
# rows share one date, so a random split may leak information between the subsets;
# consider a chronological split instead - TODO confirm with the modelling goals.
shuffled_data = prepared_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate the number of rows for each dataset
n_total = len(shuffled_data)
n_training = int(TRAIN_FRACTION * n_total)
n_validation = int(VALIDATION_FRACTION * n_total)

# Slice the shuffled rows into consecutive, non-overlapping parts
training_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Every row must end up in exactly one subset
assert len(training_data) + len(validation_data) + len(test_data) == n_total

# Separate features (all columns except Revenue) from labels (Revenue only)
training_features = training_data.drop('Revenue', axis=1)
validation_features = validation_data.drop('Revenue', axis=1)
test_features = test_data.drop('Revenue', axis=1)

training_labels = training_data[['Revenue']]
validation_labels = validation_data[['Revenue']]
test_labels = test_data[['Revenue']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (7627, 20)
Validation features dimensions: (2179, 20)
Test features dimensions: (1090, 20)

Training labels dimensions: (7627, 1)
Validation labels dimensions: (2179, 1)
Test labels dimensions: (1090, 1)


#### Data Export

In [6]:
# Target folder for the pickled splits; created on demand
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# File stem -> DataFrame to store under that name (features first, then labels)
frames_to_export = {
    "training_features": training_features,
    "validation_features": validation_features,
    "test_features": test_features,
    "training_labels": training_labels,
    "validation_labels": validation_labels,
    "test_labels": test_labels,
}

# Write each frame to <subdirectory>/<file stem>.pkl
for file_stem, frame in frames_to_export.items():
    frame.to_pickle(f"{subdirectory}/{file_stem}.pkl")