# Phase 1/3: Team Assignment

## Step 0: Import Packages

In [89]:
# Import essential libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib
import kfp
from kfp import dsl
import h5py

## Step 1: Import Data

In [90]:
# Load and inspect the contents of the 'df' group
def load_data():
    """Open 'metr-la.h5' read-only, list the keys of its 'df' group, and return the group.

    If the group has a 'data' entry, it is read fully and its shape is printed.

    Returns:
        The h5py object stored under the 'df' key.

    NOTE(review): the return happens inside the ``with`` block, so the file is
    closed by the time the caller receives the group; the returned handle is
    presumably not readable afterwards. Confirm before relying on it.
    """
    with h5py.File('metr-la.h5', 'r') as h5_file:
        group = h5_file['df']

        # Show which datasets are available under 'df'
        group_keys = list(group.keys())
        print("Keys inside 'df' group:", group_keys)

        # Optional: report the shape of a 'data' dataset when one is present
        if 'data' in group:
            data = group['data'][:]
            print("Loaded data shape:", data.shape)

        return group

# Inspect the 'df' group
df_group = load_data()

Keys inside 'df' group: ['axis0', 'axis1', 'block0_items', 'block0_values']


In [91]:
# Load and reconstruct the DataFrame from the 'df' group
def load_data():
    """Rebuild a pandas DataFrame from the 'df' group of 'metr-la.h5'.

    Only 'block0_items' (used as column labels, converted to str) and
    'block0_values' (used as the cell values) are read; the frame therefore
    gets pandas' default integer index.

    Returns:
        pd.DataFrame: one column per entry in 'block0_items'.
    """
    with h5py.File('metr-la.h5', 'r') as h5_file:
        group = h5_file['df']

        # Column labels come from 'block0_items'
        raw_labels = group['block0_items'][:]
        column_names = list(raw_labels.astype(str))

        # The value matrix comes from 'block0_values'; [:] reads it into memory
        values = group['block0_values'][:]

        frame = pd.DataFrame(values, columns=column_names)
        return frame

# Load the data and report its dimensions
df = load_data()
print("Data shape:", df.shape)

Data shape: (34272, 207)


In [92]:
# Preview the first rows of the reconstructed frame (rich display as the cell's last expression)
df.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
0,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
1,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Step 1.1: Checking for Null Values

In [93]:
# Count NaN entries per column.
# NOTE(review): the df.head() preview shows rows that are 0.0 in every displayed
# column, so missing readings may be encoded as zeros rather than NaN; isnull()
# would not detect those — confirm against the dataset description.
print("Checking for missing values...")
print(df.isnull().sum())

Checking for missing values...
773869    0
767541    0
767542    0
717447    0
717446    0
         ..
717592    0
717595    0
772168    0
718141    0
769373    0
Length: 207, dtype: int64


### Step 1.2: Checking df Stats

In [94]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
print("\nStatistical summary:")
df.describe()


Statistical summary:


Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
count,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,...,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0,34272.0
mean,54.631359,60.452789,60.72612,49.524287,46.079798,50.952003,54.471684,57.255095,56.068044,52.871841,...,37.803342,58.156679,51.217523,59.795754,59.329923,56.915083,62.484679,54.697381,58.92021,51.197504
std,22.619199,15.970239,18.313353,15.843261,19.350345,16.68176,17.984761,18.751065,18.240361,23.343805,...,13.525743,20.690411,22.224997,16.126225,19.84995,18.260438,16.959238,16.303651,19.080474,21.239354
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,60.364583,63.0,65.444444,50.333333,34.666667,49.555556,55.75,61.0,58.222222,43.428571,...,30.444444,64.111111,53.444444,61.714286,63.666667,60.222222,65.888889,50.125,62.888889,54.125
50%,64.888889,65.0,67.375,53.875,46.0,56.111111,62.111111,63.333333,62.444444,65.875,...,43.222222,67.111111,61.777778,64.875,66.777778,63.0,67.625,61.125,66.125,62.0
75%,66.875,66.375,68.444444,58.125,64.5,60.333333,65.0,65.0,64.888889,67.625,...,46.625,68.444444,64.375,66.5,68.25,64.75,68.625,64.444444,67.75,63.444444
max,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,...,65.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0


## Step 2: Data Preprocessing

### Step 2.1: Feature Engineering (BEFORE THE PROCESSING)

In [95]:
# Feature Engineering (optimized to avoid fragmentation)
def feature_engineering(df, window=3):
    """Append rolling-mean and rolling-std features for every column of ``df``.

    For each input column ``c`` two new columns are created, ``c_rolling_mean``
    and ``c_rolling_std``, computed over a trailing window of ``window`` rows.
    All new columns are built as whole frames and joined with a single
    ``pd.concat`` so the result is not assembled column by column.

    Args:
        df (pd.DataFrame): numeric frame, one column per series. Not modified.
        window (int): number of rows in the rolling window. Defaults to 3,
            the value previously hard-coded. With ``window=1`` the rolling
            std is NaN for every row, so every row would be dropped.

    Returns:
        pd.DataFrame: original columns, then the ``_rolling_mean`` columns,
        then the ``_rolling_std`` columns, with every row containing a NaN
        removed (this includes the first ``window - 1`` rows, where the
        window is incomplete).
    """
    # Build the rolling object once and reuse it for both statistics
    rolling = df.rolling(window=window)
    rolling_means = rolling.mean().add_suffix('_rolling_mean')
    rolling_stds = rolling.std().add_suffix('_rolling_std')

    # Combine the original dataframe with the new rolling features in one step
    df_combined = pd.concat([df, rolling_means, rolling_stds], axis=1)

    # Drop rows with NaN values (introduced by the incomplete leading windows)
    return df_combined.dropna()

# Apply feature engineering
# NOTE(review): this rebinds `df` to the engineered frame, so the cell is not
# idempotent — running it a second time would compute rolling features of the
# already-added rolling columns. Re-run the data-loading cell first if needed.
df = feature_engineering(df)

# Check the updated dataframe
print(f"Data shape after feature engineering: {df.shape}")


Data shape after feature engineering: (34270, 621)


### Step 2.2: Using MinMaxScaler to Preprocess Data 

In [96]:
def preprocess_data(data):
    """Scale ``data`` with a freshly fitted MinMaxScaler (default settings).

    Args:
        data: array-like passed to the scaler; it is used both to fit the
            scaler and as the input that gets transformed.

    Returns:
        tuple: ``(scaled_data, fitted_scaler)``. The scaler is returned so the
        caller can reuse it.
    """
    fitted_scaler = MinMaxScaler().fit(data)
    scaled = fitted_scaler.transform(data)
    return scaled, fitted_scaler

In [97]:
# Apply preprocessing to the entire feature-engineered dataset
# NOTE(review): `data_scaled` and `scaler` are assigned again in the
# sequence-creation cell (Step 3), so the values produced here are overwritten.
data_scaled, scaler = preprocess_data(df.values)

## Step 3: Create Sequences for Time-Series Forecasting

In [98]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Step 2: Create sequences from the scaled data
def create_sequences(data, time_steps=10, target_col=0):
    """Turn a 2-D series into supervised (window, next-value) pairs.

    Sample ``i`` uses rows ``i .. i + time_steps - 1`` (all columns) as input
    and the value of column ``target_col`` at row ``i + time_steps`` as target.

    Args:
        data: 2-D array-like of shape (n_rows, n_features).
        time_steps (int): number of consecutive rows per input window.
        target_col (int): column whose next value is predicted. Defaults to 0,
            the column previously hard-coded.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape
        (n_rows - time_steps, time_steps, n_features) and ``y`` has shape
        (n_rows - time_steps,). When ``data`` has no more than ``time_steps``
        rows, empty arrays with those same trailing dimensions are returned
        instead of shapeless ``(0,)`` arrays.

    Raises:
        ValueError: if ``time_steps`` is negative.
    """
    if time_steps < 0:
        raise ValueError("time_steps must be non-negative")

    data = np.asarray(data)
    n_sequences = len(data) - time_steps

    # Not enough rows for even one window: keep the rank callers expect
    if n_sequences <= 0:
        empty_X = np.empty((0, time_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_X, empty_y

    # One window per start index; stack into (n_sequences, time_steps, n_features)
    X = np.stack([data[start:start + time_steps] for start in range(n_sequences)])

    # Targets are the rows that follow each window; copy so y owns its data
    y = data[time_steps:, target_col].copy()
    return X, y

# Apply preprocessing and sequence creation.
# The scaler is fitted ONLY on rows that belong to the training portion, so
# min/max statistics of the held-out data do not leak into training.
TIME_STEPS = 10        # rows per input window
TRAIN_FRACTION = 0.8   # chronological share of sequences used for training

raw_values = df.values
n_sequences = len(raw_values) - TIME_STEPS
train_size = int(n_sequences * TRAIN_FRACTION)

# Training sequences 0 .. train_size-1 touch rows 0 .. train_size+TIME_STEPS-1
# (their input windows plus the last target row); fit on exactly those rows.
scaler = MinMaxScaler()
scaler.fit(raw_values[:train_size + TIME_STEPS])

# Transform the whole series with the train-fitted scaler. Held-out values
# outside the training min/max will fall outside [0, 1]; that is expected.
data_scaled = scaler.transform(raw_values)

X, y = create_sequences(data_scaled, time_steps=TIME_STEPS)

# Step 3: Split into train and test (chronological, no shuffling)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Ensure y_train and y_test are 2D arrays
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Check the shapes
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (27408, 10, 621), y_train shape: (27408, 1)
X_test shape: (6852, 10, 621), y_test shape: (6852, 1)


## Step 4: Save the Preprocessed Data (Pickle)

In [99]:
# Persist the full (unsplit) sequence arrays as one (X, y) tuple.
# NOTE(review): neither the fitted scaler nor the train/test split is written
# here — confirm the training scripts do not need them (e.g. to invert scaling).
joblib.dump((X, y), 'preprocessed_data.pkl')

['preprocessed_data.pkl']

In [100]:
# Record the installed Kubeflow Pipelines SDK version for reproducibility.
# (kfp is also imported in the import cell at the top of the notebook.)
import kfp

print(kfp.__version__)


2.9.0


## Step 5: Building LSTM & GRU

### Step 5.1: LSTM Model Architecture
##### Just for reference. Model being built in `train_LSTM.py`

In [49]:
def build_lstm_model(input_shape, output_dim=1):
    """Build and compile a two-layer stacked LSTM regressor.

    Architecture: LSTM(64, returning the full sequence) -> LSTM(32) ->
    Dense(output_dim). Compiled with the Adam optimizer, MSE loss and MAE as
    an additional metric.

    Args:
        input_shape (tuple): shape of one sample passed to the first layer,
            e.g. ``X_train.shape[1:]``.
        output_dim (int): number of output units. Previously read from the
            notebook-global ``y_train.shape[1]``; it is now an explicit
            argument so the function works on a fresh kernel. The default of 1
            matches the single-column targets built in this notebook.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # First recurrent layer returns the whole sequence so the next LSTM can consume it
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(32))
    model.add(Dense(output_dim))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

### Step 5.2: GRU Model Architecture
##### Just for reference. Model being built in `train_GRU.py`

In [50]:
def build_gru_model(input_shape, output_dim=1):
    """Build and compile a two-layer stacked GRU regressor.

    Architecture: GRU(64, returning the full sequence) -> GRU(32) ->
    Dense(output_dim). Compiled with the Adam optimizer, MSE loss and MAE as
    an additional metric.

    Args:
        input_shape (tuple): shape of one sample passed to the first layer,
            e.g. ``X_train.shape[1:]``.
        output_dim (int): number of output units. Previously read from the
            notebook-global ``y_train.shape[1]``; it is now an explicit
            argument so the function works on a fresh kernel. The default of 1
            matches the single-column targets built in this notebook.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # First recurrent layer returns the whole sequence so the next GRU can consume it
    model.add(GRU(64, input_shape=input_shape, return_sequences=True))
    model.add(GRU(32))
    model.add(Dense(output_dim))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

## Step 6: Train LSTM & GRU

The LSTM and GRU models are trained using Kubeflow, and the metrics (MAE, MSE) are logged. 
Training is executed via two scripts:
- `train_LSTM.py` for LSTM model training.
- `train_GRU.py` for GRU model training.

The Kubeflow pipeline executes these scripts and tracks the results.

## Step 7: Results from Kubeflow

In [None]:
# Assuming you load the Kubeflow logs and metrics back into the notebook;
# the two numbers below are hard-coded example values, not measured results.
lstm_mae = 2.34  # Example values loaded from Kubeflow logs
gru_mae = 2.56

models = ['LSTM', 'GRU']
mae_values = [lstm_mae, gru_mae]

# Bar chart comparing the MAE of both models (explicit figure/axes interface)
fig, ax = plt.subplots()
ax.bar(models, mae_values)
ax.set_title('Model MAE Comparison')
ax.set_ylabel('MAE')
plt.show()
