In [None]:
# Importing the libraries
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Dense, GRU, Input, LSTM, SimpleRNN
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

plt.style.use('fivethirtyeight')

In [None]:
# Mount Google Drive at /content/drive so the CSV stored there can be read
# by the following cell (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the raw weekly sales CSV from the mounted Drive and print a summary
# of its columns, non-null counts and dtypes.
DATA_CSV_PATH = '/content/drive/MyDrive/Marek/Uni/Seminar_25/time-series_data.csv'
dataset = pd.read_csv(DATA_CSV_PATH)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150150 entries, 0 to 150149
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   record_ID        150150 non-null  int64  
 1   week             150150 non-null  object 
 2   store_id         150150 non-null  int64  
 3   sku_id           150150 non-null  int64  
 4   total_price      150149 non-null  float64
 5   base_price       150150 non-null  float64
 6   is_featured_sku  150150 non-null  int64  
 7   is_display_sku   150150 non-null  int64  
 8   units_sold       150150 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 10.3+ MB


In [None]:
# Drop every row containing a missing value, modifying `dataset` in place.
# The info() summary of the raw file reports a single null, in 'total_price'.
dataset.dropna(inplace=True)

In [None]:
# Convert the 'week' column from day/month/two-digit-year strings to datetime
dataset['week'] = pd.to_datetime(dataset['week'], format='%d/%m/%y')

In [None]:
# Feature preparation

# Continuous columns (the ones that get min-max scaled) and the two ID columns
numeric_features = ['units_sold', 'total_price', 'base_price']
categorical_features = ['store_id', 'sku_id']

# Label-encode each ID column into integer codes, keeping one encoder per column
le_store = LabelEncoder()
le_sku = LabelEncoder()
store_codes = le_store.fit_transform(dataset['store_id'])
sku_codes = le_sku.fit_transform(dataset['sku_id'])
dataset['store_id_encoded'] = store_codes
dataset['sku_id_encoded'] = sku_codes

# Columns used to build the sequences: numeric columns first (so 'units_sold'
# sits at position 0), followed by the binary flags and the encoded IDs
features = [
    *numeric_features,
    'is_featured_sku',
    'is_display_sku',
    'store_id_encoded',
    'sku_id_encoded',
]

In [None]:
# Build the training set: the chronologically first 75% of the calendar weeks

# Step 1: extract the distinct 'week' values in chronological order.
# Use the 'week' column, not dataset.index: the index is still the default
# row index at this point, so splitting on it would cut by row position and
# could place rows of one calendar week on both sides of the boundary.
unique_weeks = pd.Index(dataset['week'].unique()).sort_values()
n_weeks = len(unique_weeks)

weeks_train = int(0.75 * n_weeks)

# Step 2: define the week boundaries of the two sets
train_weeks = unique_weeks[:weeks_train]
rest_weeks = unique_weeks[weeks_train:]

# Step 3: assign every row to a set according to its week
train_set = dataset[dataset['week'].isin(train_weeks)]
print("Trainingsset:")
print(f"Anzahl der Reihen: {train_set.shape[0]}, mögliche Länge der Sequenzen: {train_set['week'].nunique()}")

rest_set = dataset[dataset['week'].isin(rest_weeks)]
print("Rest-Set:")
print(f"Anzahl der Reihen: {rest_set.shape[0]}, mögliche Länge der Sequenzen: {rest_set['week'].nunique()}")

Trainingsset:
Anzahl der Reihen: 112611, mögliche Länge der Sequenzen: 112611
Rest-Set:
Anzahl der Reihen: 37538, mögliche Länge der Sequenzen: 37538


In [None]:
# Build the validation and test sets from the held-out weeks

# Step 1: group the held-out rows by (store_id, sku_id)
rest_grouped = list(rest_set.groupby(['store_id', 'sku_id']))

# Step 2: shuffle the groups so they are distributed over the two sets at random
np.random.seed(42)  # for reproducibility
np.random.shuffle(rest_grouped)

# Step 3: first half of the shuffled groups -> validation, second half -> test
n_groups = len(rest_grouped)
split = int(0.5 * n_groups)

val_groups = rest_grouped[:split]
test_groups = rest_grouped[split:]

# Step 4: concatenate the groups of each half back into one DataFrame
val_set = pd.concat([group for _, group in val_groups])
test_set = pd.concat([group for _, group in test_groups])

# Size check. The series length is counted on the 'week' column: the frames
# still carry the row index here, so index.nunique() would only repeat the
# number of rows.
print("Validierungsset:")
print(f"Anzahl der Reihen: {val_set.shape[0]}, Anzahl der Gruppen: {len(val_groups)}, Länge der Reihen: {val_set['week'].nunique()}")
print("Test-Set:")
print(f"Anzahl der Reihen: {test_set.shape[0]}, Anzahl der Gruppen: {len(test_groups)}, Länge der Reihen: {test_set['week'].nunique()}")

Validierungsset:
Anzahl der Reihen: 18739, Anzahl der Gruppen: 577, Länge der Reihen: 18739
Test-Set:
Anzahl der Reihen: 18799, Anzahl der Gruppen: 578, Länge der Reihen: 18799


In [None]:
# Re-index the three sets by week

# For each set: make the datetime 'week' column the index, then order the
# rows chronologically by that index.
train_set = train_set.set_index('week').sort_index()
val_set = val_set.set_index('week').sort_index()
test_set = test_set.set_index('week').sort_index()

In [None]:
# Report row counts, group counts and number of distinct index values
# (weeks, once the sets are indexed by 'week') for the three sets.
set_summaries = [
    ("Trainingsset:",
     f"Anzahl der Reihen: {train_set.shape[0]}, mögliche Länge der Sequenzen: {train_set.index.nunique()}"),
    ("Validierungsset:",
     f"Anzahl der Reihen: {val_set.shape[0]}, Anzahl der Gruppen: {len(val_groups)}, Länge der Reihen: {val_set.index.nunique()}"),
    ("Test-Set:",
     f"Anzahl der Reihen: {test_set.shape[0]}, Anzahl der Gruppen: {len(test_groups)}, Länge der Reihen: {test_set.index.nunique()}"),
]

for position, (heading, details) in enumerate(set_summaries):
    if position > 0:
        print()  # blank line between two summaries
    print(heading)
    print(details)

Trainingsset:
Anzahl der Reihen: 112611, mögliche Länge der Sequenzen: 98

Validierungsset:
Anzahl der Reihen: 18739, Anzahl der Gruppen: 577, Länge der Reihen: 33

Test-Set:
Anzahl der Reihen: 18799, Anzahl der Gruppen: 578, Länge der Reihen: 33


In [None]:
# One MinMaxScaler per numeric feature (the binary flags are not scaled),
# each fitted on the training set only.
scalers = {}
for feature in numeric_features:
    training_column = train_set[feature].to_numpy().reshape(-1, 1)
    scalers[feature] = MinMaxScaler().fit(training_column)

In [None]:
# Helper that turns a set into sliding-window sequences
def create_sequences(dataset, features, numeric_features, scalers, timesteps, target_index=0):
    """Build sliding-window samples for every (store_id, sku_id) series.

    The rows of each group are ordered by the frame's index, the numeric
    features are scaled with the supplied scalers, and every run of
    `timesteps` consecutive rows becomes one input sample whose target is
    the `target_index` feature of the row that follows the window.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain 'store_id', 'sku_id' and every column in `features`.
    features : list of str
        Columns that make up one timestep, in model input order.
    numeric_features : list of str
        Subset of `features` to be scaled.
    scalers : dict
        Maps each numeric feature name to an already fitted object with a
        `transform` method that accepts an (n, 1) array.
    timesteps : int
        Number of rows per input window.
    target_index : int, optional
        Position within `features` of the column to predict. Defaults to 0,
        i.e. the first feature.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        X has shape (n_samples, timesteps, len(features)) and y has shape
        (n_samples,). Both keep these ranks when no group is longer than
        `timesteps` (n_samples == 0).
    """
    X_all = []
    y_all = []

    for _, group in dataset.groupby(['store_id', 'sku_id']):
        group = group.sort_index()

        # Work on a writable float copy. With integer-only feature columns a
        # plain `.values` array would be integer-typed and silently truncate
        # the scaled values written back below.
        data = group[features].to_numpy(dtype=float, copy=True)

        # Scale the numeric features column by column
        for feature in numeric_features:
            column = features.index(feature)
            scaled = scalers[feature].transform(data[:, [column]])
            data[:, column] = np.asarray(scaled).ravel()

        # Categorical and binary features stay as their (float-cast) codes

        for end in range(timesteps, len(data)):
            X_all.append(data[end - timesteps:end])
            y_all.append(data[end, target_index])

    if not X_all:
        # Keep the 3-D / 1-D layout even when there is nothing to return
        return np.empty((0, timesteps, len(features))), np.empty((0,))

    return np.array(X_all), np.array(y_all)

In [None]:
# Window lengths (timesteps per input sequence) to generate
window_sizes = [30]

# Generated data sets, keyed by window length
train_sets = {}
val_sets = {}
test_sets = {}

for window in window_sizes:
    # Sequence each split with the same features and the same fitted scalers
    X_train, y_train = create_sequences(train_set, features, numeric_features, scalers, window)
    X_val, y_val = create_sequences(val_set, features, numeric_features, scalers, window)
    X_test, y_test = create_sequences(test_set, features, numeric_features, scalers, window)

    train_sets[window] = dict(X=X_train, y=y_train)
    val_sets[window] = dict(X=X_val, y=y_val)
    test_sets[window] = dict(X=X_test, y=y_test)

    print(f"Fenstergröße {window}:")
    print(f"Trainings-Sequenzen: {X_train.shape}, Validierungs-Sequenzen: {X_val.shape}, Test-Sequenzen: {X_test.shape}.")

Fenstergröße 30:
Trainings-Sequenzen: (77961, 30, 7), Validierungs-Sequenzen: (1429, 30, 7), Test-Sequenzen: (1459, 30, 7).


In [None]:
# Short names for the arrays generated with the 30-step window
X_tr, y_tr = train_sets[30]['X'], train_sets[30]['y']
X_v, y_v = val_sets[30]['X'], val_sets[30]['y']
X_t, y_t = test_sets[30]['X'], test_sets[30]['y']

In [None]:
# Build the LSTM model: three stacked LSTM layers of 64 units each, followed
# by a single linear output unit. The input shape (timesteps, features) is
# declared with an explicit Input layer instead of the `input_shape=` keyword
# on the first recurrent layer.
model_LSTM = Sequential()
model_LSTM.add(Input(shape=(X_tr.shape[1], X_tr.shape[2])))
model_LSTM.add(LSTM(units=64, return_sequences=True))
model_LSTM.add(LSTM(units=64, return_sequences=True))
model_LSTM.add(LSTM(units=64))
model_LSTM.add(Dense(1, activation='linear'))

# Compile the model
model_LSTM.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error')

# Early stopping on the validation loss; the best weights are restored
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Learning-rate reduction on a validation-loss plateau.
# min_delta is set explicitly: the callback's default threshold of 1e-4 is of
# the same size as the validation losses of this model (about 3e-4 in the
# recorded run), so genuine improvements were not counted and the rate was
# halved while val_loss was still decreasing every epoch.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                              min_delta=0.0, min_lr=1e-8)

# Run the training
history = model_LSTM.fit(X_tr, y_tr, epochs=50, batch_size=64,
                    validation_data=(X_v, y_v), callbacks=[early_stop, reduce_lr])

# Report the lowest validation loss reached during training
val_loss = np.min(history.history['val_loss'])
print(f"Validierungsverlust: {val_loss:.6f}")

  super().__init__(**kwargs)


Epoch 1/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 72ms/step - loss: 0.0012 - val_loss: 4.1909e-04 - learning_rate: 0.0010
Epoch 2/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 68ms/step - loss: 3.4263e-04 - val_loss: 4.1432e-04 - learning_rate: 0.0010
Epoch 3/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 69ms/step - loss: 2.9061e-04 - val_loss: 3.8550e-04 - learning_rate: 0.0010
Epoch 4/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 76ms/step - loss: 2.7543e-04 - val_loss: 3.1963e-04 - learning_rate: 0.0010
Epoch 5/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 73ms/step - loss: 2.3823e-04 - val_loss: 3.0087e-04 - learning_rate: 5.0000e-04
Epoch 6/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 73ms/step - loss: 2.3559e-04 - val_loss: 2.9594e-04 - learning_rate: 5.0000e-04
Epoch 7/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [None]:
# Build the SimpleRNN model: three stacked ReLU SimpleRNN layers of 64 units
# each, followed by a single linear output unit. The input shape
# (timesteps, features) is declared with an explicit Input layer instead of
# the `input_shape=` keyword on the first recurrent layer.
model_RNN = Sequential()
model_RNN.add(Input(shape=(X_tr.shape[1], X_tr.shape[2])))
model_RNN.add(SimpleRNN(units=64, activation='relu', return_sequences=True))
model_RNN.add(SimpleRNN(units=64, activation='relu', return_sequences=True))
model_RNN.add(SimpleRNN(units=64, activation='relu'))
model_RNN.add(Dense(1, activation='linear'))

# Compile the model
model_RNN.compile(optimizer=Adam(learning_rate=0.01),
              loss='mean_squared_error')

# Early stopping on the validation loss; the best weights are restored
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Learning-rate reduction on a validation-loss plateau.
# min_delta is set explicitly: the callback's default threshold of 1e-4 is of
# the same size as the validation losses of this model (about 3e-4 to 5e-4 in
# the recorded run), so the rate was cut tenfold right after the best epoch
# so far because that improvement fell just short of the threshold.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                              min_delta=0.0, min_lr=1e-8)

# Run the training
history = model_RNN.fit(X_tr, y_tr, epochs=50, batch_size=64,
                    validation_data=(X_v, y_v), callbacks=[early_stop, reduce_lr])

# Report the lowest validation loss reached during training
val_loss = np.min(history.history['val_loss'])
print(f"Validierungsverlust: {val_loss:.6f}")

Epoch 1/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 31ms/step - loss: 25.1884 - val_loss: 4.6214e-04 - learning_rate: 0.0100
Epoch 2/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 31ms/step - loss: 4.2092e-04 - val_loss: 5.2083e-04 - learning_rate: 0.0100
Epoch 3/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 31ms/step - loss: 3.4390e-04 - val_loss: 8.5950e-04 - learning_rate: 0.0100
Epoch 4/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 32ms/step - loss: 3.2389e-04 - val_loss: 3.6363e-04 - learning_rate: 0.0100
Epoch 5/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 31ms/step - loss: 2.7068e-04 - val_loss: 3.3248e-04 - learning_rate: 1.0000e-03
Epoch 6/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 30ms/step - loss: 2.4635e-04 - val_loss: 3.3094e-04 - learning_rate: 1.0000e-03
Epoch 7/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
# Build the GRU model: three stacked GRU layers of 32 units each, followed by
# a single linear output unit. The input shape (timesteps, features) is
# declared with an explicit Input layer instead of the `input_shape=` keyword
# on the first recurrent layer.
model_GRU = Sequential()
model_GRU.add(Input(shape=(X_tr.shape[1], X_tr.shape[2])))
model_GRU.add(GRU(units=32, return_sequences=True))
model_GRU.add(GRU(units=32, return_sequences=True))
model_GRU.add(GRU(units=32))
model_GRU.add(Dense(1, activation='linear'))

# Compile the model
model_GRU.compile(optimizer=Adam(learning_rate=0.01),
              loss='mean_squared_error')

# Early stopping on the validation loss; the best weights are restored
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Learning-rate reduction on a validation-loss plateau.
# min_delta is set explicitly: the callback's default threshold of 1e-4 is of
# the same size as the validation losses of this model (about 3e-4 to 5e-4 in
# the recorded run), so improvements smaller than that were not counted and
# the rate was halved although the loss had improved two epochs earlier.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                              min_delta=0.0, min_lr=1e-8)

# Run the training
history = model_GRU.fit(X_tr, y_tr, epochs=50, batch_size=64,
                    validation_data=(X_v, y_v), callbacks=[early_stop, reduce_lr])

# Report the lowest validation loss reached during training
val_loss = np.min(history.history['val_loss'])
print(f"Validierungsverlust: {val_loss:.6f}")

Epoch 1/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 61ms/step - loss: 0.0233 - val_loss: 4.0503e-04 - learning_rate: 0.0100
Epoch 2/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 58ms/step - loss: 3.3549e-04 - val_loss: 4.7743e-04 - learning_rate: 0.0100
Epoch 3/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 58ms/step - loss: 2.8450e-04 - val_loss: 3.4256e-04 - learning_rate: 0.0100
Epoch 4/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 58ms/step - loss: 2.4866e-04 - val_loss: 4.4771e-04 - learning_rate: 0.0100
Epoch 5/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 58ms/step - loss: 2.3877e-04 - val_loss: 3.1659e-04 - learning_rate: 0.0050
Epoch 6/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 59ms/step - loss: 2.2508e-04 - val_loss: 3.7645e-04 - learning_rate: 0.0050
Epoch 7/50
[1m1219/1219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Predict the test sequences with each of the three trained models
pred_LSTM = model_LSTM.predict(X_t)
pred_RNN = model_RNN.predict(X_t)
pred_GRU = model_GRU.predict(X_t)

# Mean squared error on the test set. y_t holds the min-max-scaled
# 'units_sold' values, so the errors are in scaled units, not in units sold.
mse_LSTM = mean_squared_error(y_t, pred_LSTM)
mse_RNN = mean_squared_error(y_t, pred_RNN)
mse_GRU = mean_squared_error(y_t, pred_GRU)

# Print the results, all with six decimal places (the LSTM line previously
# used the width spec ':6f' instead of the precision spec ':.6f')
print(f"LSTM Modell Test-MSE: {mse_LSTM:.6f}")
print(f"RNN Modell Test-MSE: {mse_RNN:.6f}")
print(f"GRU Modell Test-MSE: {mse_GRU:.6f}")

[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
LSTM Modell Test-MSE: 0.000295
RNN Modell Test-MSE: 0.000299
GRU Modell Test-MSE: 0.000311
