### Import Libraries and Data

In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
# Directory that holds the input CSV and receives the exported copy
DATA_DIR = "/workspaces/TeamCPH"

# Keep the raw frame under its own name; `data` is the reduced frame used below
raw_data = pd.read_csv(f"{DATA_DIR}/model_df.csv")

# Remove the Warengruppe and log_Umsatz columns in a single call
data = raw_data.drop(columns=["Warengruppe", "log_Umsatz"])

# Export the reduced data set (without the index column)
output_path = f"{DATA_DIR}/model_df1.csv"
data.to_csv(output_path, index=False)

print(f"model_df1 exported to: {output_path}")

# Must be the LAST expression of the cell: a bare `data.head()` in the middle
# of a cell is evaluated and discarded, so nothing would be shown.
data.head()

model_df1 exported to: /workspaces/TeamCPH/model_df1.csv


### Data Preparation

In [8]:
# Discard every row that has at least one missing value
prepared_data = data.dropna(axis="index", how="any")

# Report the dimensions of the cleaned data set
print(prepared_data.shape)
# Preview the top rows of the cleaned data set
prepared_data.head(5)


(9292, 16)


Unnamed: 0,Umsatz,holiday,IsWeekend,IsNewYears,Easter,KielerWoche,sin_1y,cos_1y,sin_2y,Revenue_lag1,Revenue_lag7,WG_2,WG_3,WG_4,WG_5,WG_6
0,135.500244,1,0,0,0,0,-0.109446,-0.993993,0.217577,92.637755,148.828353,0,0,0,0,0
1,136.048383,1,0,0,0,0,-0.126528,-0.991963,0.251022,135.500244,159.793757,0,0,0,0,0
2,135.132314,1,0,0,0,0,-0.143572,-0.98964,0.28417,136.048383,111.885594,0,0,0,0,0
3,138.549393,1,0,0,0,0,-0.160575,-0.987024,0.316982,135.132314,168.864941,0,0,0,0,0
4,101.447511,1,0,0,0,0,-0.177529,-0.984116,0.349418,138.549393,171.280754,0,0,0,0,0


In [9]:
# Bare last expression -> rich notebook display instead of the line-wrapped
# plain-text dump produced by print(); long frames are still abbreviated.
prepared_data

          Umsatz  holiday  IsWeekend  IsNewYears  Easter  KielerWoche  \
0     135.500244        1          0           0       0            0   
1     136.048383        1          0           0       0            0   
2     135.132314        1          0           0       0            0   
3     138.549393        1          0           0       0            0   
4     101.447511        1          0           0       0            0   
...          ...      ...        ...         ...     ...          ...   
9287   87.471228        1          0           0       0            0   
9288   71.911652        1          0           0       0            0   
9289   84.062223        1          1           0       0            0   
9290   60.981969        1          1           0       0            0   
9291   34.972644        1          0           0       0            0   

        sin_1y    cos_1y    sin_2y  Revenue_lag1  Revenue_lag7  WG_2  WG_3  \
0    -0.109446 -0.993993  0.217577     92.637

### Selection of Training, Validation and Test Data

In [10]:
# Fixed seed so the shuffle (and therefore the split) is reproducible
SEED = 42
np.random.seed(SEED)

# Shuffle into a NEW frame instead of rebinding `prepared_data`. The cell is
# then idempotent: re-running it shuffles the same input again rather than an
# already shuffled frame. `prepared_data` itself is left untouched.
# Passing `random_state` makes the permutation independent of anything else
# that may have consumed the global NumPy random state.
# NOTE(review): the columns include Revenue_lag1 / Revenue_lag7, which suggests
# time-ordered data. A random split may then leak information across time;
# consider a chronological split instead. TODO confirm.
shuffled_data = prepared_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split sizes: 70 % training, 20 % validation, the remainder is the test set
TRAIN_FRACTION = 0.7
VALIDATION_FRACTION = 0.20
n_total = len(shuffled_data)
n_training = int(TRAIN_FRACTION * n_total)
n_validation = int(VALIDATION_FRACTION * n_total)

# Slice consecutive, non-overlapping row ranges
training_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Every row must end up in exactly one of the three splits
assert len(training_data) + len(validation_data) + len(test_data) == n_total

# Separating features and labels (labels stay one-column DataFrames)
TARGET = 'Umsatz'
training_features = training_data.drop(columns=TARGET)
validation_features = validation_data.drop(columns=TARGET)
test_features = test_data.drop(columns=TARGET)

training_labels = training_data[[TARGET]]
validation_labels = validation_data[[TARGET]]
test_labels = test_data[[TARGET]]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (6504, 15)
Validation features dimensions: (1858, 15)
Test features dimensions: (930, 15)

Training labels dimensions: (6504, 1)
Validation labels dimensions: (1858, 1)
Test labels dimensions: (930, 1)


#### Data Export

In [11]:
# Make sure the target folder for the pickle files exists
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Map each output file stem to the frame it stores; dict order is write order
frames_to_export = {
    "training_features": training_features,
    "validation_features": validation_features,
    "test_features": test_features,
    "training_labels": training_labels,
    "validation_labels": validation_labels,
    "test_labels": test_labels,
}

# Write every split to <subdirectory>/<stem>.pkl
for file_stem, frame in frames_to_export.items():
    frame.to_pickle(f"{subdirectory}/{file_stem}.pkl")