In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from collections import Counter

2024-07-25 13:16:25.931811: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Read and Prepare data

In [2]:
df = pd.read_csv('data/processed/train_1_processed.csv', index_col=None)

#### Select language

In [3]:
df = df[df['language'] == 'fr']

#### Select only time series columns

In [4]:
# Regular expression to match date format "YYYY-MM-DD"
date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

# Filter columns based on the regular expression
filtered_columns = [col for col in df.columns if date_pattern.match(col)]

# Create a new DataFrame with only the filtered columns
df = df[filtered_columns]

In [5]:
df = df.fillna(0)

In [6]:
df.shape

(17802, 550)

#### Split training and test data

In [21]:
train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)

In [22]:
print(train_data.shape)
print(test_data.shape)

(14241, 550)
(3561, 550)


#### Normalize

In [27]:
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

In [28]:
print(train_data.shape)
print(test_data.shape)

(14241, 550)
(3561, 550)


#### Prepare data for LSTM

In [29]:
def create_multivariate_dataset(data, timesteps=7):
    X, y = [], []
    for i in range(len(data) - timesteps):
        X.append(data[i:i + timesteps])  # Shape: (timesteps, num_pages)
        y.append(data[i + timesteps])    # Shape: (num_pages,)
    return np.array(X), np.array(y)

In [30]:
timesteps = 7
X_train, y_train = create_multivariate_dataset(train_data, timesteps=timesteps)
X_test, y_test = create_multivariate_dataset(test_data, timesteps=timesteps)

# Data Modeling

#### Define Multivariate LSTM

In [31]:
model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(timesteps, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50, activation='relu'))
model.add(Dense(X_train.shape[2]))
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

  super().__init__(**kwargs)


#### Train model

In [32]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Extract training and validation loss and accuracy from the history object
loss = history.history['loss']
val_loss = history.history.get('val_loss')
val_accuracy = history.history.get('val_accuracy')

# Plot training and validation loss
plt.figure(figsize=(12, 5))

plt.plot(loss, label='Training Loss')
if val_loss:
    plt.plot(val_loss, label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Epoch 1/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 1.5475e-04 - val_loss: 4.9101e-07
Epoch 2/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 1.4710e-04 - val_loss: 2.9544e-07
Epoch 3/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 1.3171e-04 - val_loss: 1.4608e-05
Epoch 4/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - loss: 1.3781e-04 - val_loss: 5.4790e-07
Epoch 5/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 3.0032e-05 - val_loss: 1.3904e-05
Epoch 6/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 1.0112e-04 - val_loss: 8.7223e-07
Epoch 7/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 1.3276e-04 - val_loss: 3.6521e-07
Epoch 8/50
[1m401/401[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 3

KeyboardInterrupt: 

# Data Evaluation

In [None]:
y_pred = model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Example: Plot predictions for a specific location (e.g., first location)
location_idx = 5

plt.figure(figsize=(15, 5))
plt.plot(y_test[:, location_idx], label='Actual')
plt.plot(y_pred[:, location_idx], label='Predicted')
plt.title(f'Traffic Demand Prediction for Location {location_idx}')
plt.legend()
plt.show()