# RAN Slice PRB Prediction rApp Model Generator

This notebook generates an LSTM model for predicting Physical Resource Block (PRB) usage in RAN slices.

## Overview
- Fetches NSSAI performance data from InfluxDB
- Preprocesses data with encoders and scalers
- Creates time series sequences for LSTM training
- Trains and evaluates an LSTM model
- Saves model artifacts for deployment

In [None]:
# Imports and Configuration
import os
import sys
import argparse
import json
from datetime import datetime, timezone
from typing import Tuple, List, Dict, Optional

import numpy as np
import pandas as pd

from influxdb_client import InfluxDBClient
from influxdb_client.client.flux_table import FluxStructureEncoder
from influxdb_client.client.write_api import SYNCHRONOUS

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from joblib import dump, load

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Attention
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import matplotlib.pyplot as plt
from typing import List, Optional

In [None]:
# InfluxDB Configuration
# Connection settings may be overridden through the INFLUXDB_V2_* environment
# variables (the names influxdb-client's `from_env_properties()` also reads).
# When a variable is unset, the notebook's original default is used.
# NOTE(review): the default token below is a credential committed to source
# control -- rotate it and supply the real value via INFLUXDB_V2_TOKEN.
influx_url = os.environ.get("INFLUXDB_V2_URL", "http://localhost:8086")
influx_org = os.environ.get("INFLUXDB_V2_ORG", "srib")
influx_token = os.environ.get(
    "INFLUXDB_V2_TOKEN",
    "mRVPMAc6Wa9npA-mvJY2t3ttYS4Sr5JXRflkk81pt_edEz05aGujt2cnNBm3kITix6qRwLCr7HejEV6ufskOcA==",
)
bucket = "nssi_pm_bucket"
measurement = "nssi_pm_bucket"  # same name as the bucket
# Interpolated as-is into the Flux `range(start: ...)` call.
# NOTE(review): "-0" presumably means "from the beginning" -- confirm.
start = "-0"

# Field and Tag Definitions
# InfluxDB field keys that become the numeric model inputs.
field_prb_dl = "RRU.PrbDl.SNSSAI"
field_data_dl = "DRB.PdcpSduVolumeDL.SNSSAI"
field_rrc_succ = "RRC.ConnEstabSucc.Cause"
# InfluxDB tag keys that identify a series; renamed to slice_type / nssi_id after fetching.
tag_slice_type = "sliceType"
tag_nssi_id = "measObjLdn"

# Model Parameters
# Number of consecutive rows in each input sequence.
# NOTE(review): 672 would be one week at 15-minute sampling -- confirm the interval.
window = 672
# How many rows after the end of the window the target is taken from.
horizon = 1

## Data Fetching Functions

In [None]:
def build_flux_query() -> str:
    """
    Assemble the Flux query that pulls the training data.

    Reads the module-level settings (`bucket`, `start`, `measurement`, the
    three field names and the two tag names). The resulting pipeline:
    - selects the bucket and time range
    - filters on the measurement, then on the three configured fields
    - pivots `_field` values into columns keyed by `_time`
    - keeps `_time`, both tag columns and the three field columns
    - sorts by `_time`

    Returns:
        The query text; it starts and ends with a newline.
    """
    wanted_fields = (field_prb_dl, field_data_dl, field_rrc_succ)

    # One equality test per field, OR-ed together inside a single filter.
    field_predicate = " or ".join(f'r["_field"] == "{f}"' for f in wanted_fields)
    # Field names are joined without a space after the comma, as Flux accepts either.
    quoted_fields = '","'.join(wanted_fields)

    stages = [
        f'from(bucket: "{bucket}")',
        f"  |> range(start: {start})",
        f'  |> filter(fn: (r) => r["_measurement"] == "{measurement}")',
        f"  |> filter(fn: (r) => {field_predicate})",
        '  |> pivot(rowKey: ["_time"], columnKey: ["_field"], valueColumn: "_value")',
        f'  |> keep(columns: ["_time", "{tag_slice_type}", "{tag_nssi_id}", "{quoted_fields}"])',
        '  |> sort(columns: ["_time"])',
    ]
    return "\n" + "\n".join(stages) + "\n"

In [None]:
def fetch_from_influx() -> pd.DataFrame:
    """
    Run the Flux query against InfluxDB and return a cleaned DataFrame.

    Connection settings come from the module-level `influx_url`,
    `influx_token` and `influx_org`; the query text comes from
    `build_flux_query()`.

    Returns:
        A DataFrame with columns `time` (UTC datetimes), `slice_type`,
        `nssi_id`, `prb_dl`, `data_dl` and `rrc_succ`, sorted by
        (`slice_type`, `nssi_id`, `time`) with rows containing a missing
        value in any of those columns removed.

    Raises:
        RuntimeError: if the query returns no data.
    """
    client = InfluxDBClient(url=influx_url, token=influx_token, org=influx_org, timeout=60_000)
    try:
        tables = client.query_api().query_data_frame(query=build_flux_query())
    finally:
        # Release the connection even when the query itself raises.
        client.close()

    # query_data_frame may hand back one DataFrame or a list of them.
    # An empty list is treated as "no data" rather than passed on as a list.
    if isinstance(tables, list):
        df = pd.concat(tables, ignore_index=True) if tables else None
    else:
        df = tables

    if df is None or df.empty:
        raise RuntimeError("No data returned from InfluxDB. Check your query parameters.")

    # Standardize column names
    df = df.rename(columns={
        "_time": "time",
        tag_slice_type: "slice_type",
        tag_nssi_id: "nssi_id",  # Renaming measObjLdn to nssi_id
        field_prb_dl: "prb_dl",
        field_data_dl: "data_dl",
        field_rrc_succ: "rrc_succ",
    })

    # Ensure types
    df["time"] = pd.to_datetime(df["time"], utc=True)
    df = df.sort_values(["slice_type", "nssi_id", "time"]).reset_index(drop=True)

    # Drop rows with any NA in core columns
    core_columns = ["time", "slice_type", "nssi_id", "prb_dl", "data_dl", "rrc_succ"]
    df = df.dropna(subset=core_columns)

    return df[core_columns]

## Data Preparation Functions

In [None]:
def prepare_encoders_and_scalers(df: pd.DataFrame) -> Tuple[OneHotEncoder, OneHotEncoder, Dict[str, MinMaxScaler]]:
    """
    Fit the categorical encoders and numeric scalers used to build model inputs.

    Everything is fitted on all rows of the `df` passed in.

    Args:
        df: frame with `slice_type`, `nssi_id`, `prb_dl`, `data_dl` and
            `rrc_succ` columns.

    Returns:
        `(slice_enc, nssi_enc, scalers)`: one-hot encoders for `slice_type`
        and `nssi_id`, and a dict of MinMaxScalers keyed `"prb"`, `"data"`,
        `"rrc"` and `"y"`. The `"y"` scaler is fitted on `prb_dl`, the same
        column as `"prb"`.
    """
    def _fit_one_hot(column: str) -> OneHotEncoder:
        # Dense output; categories unseen at fit time are ignored on transform.
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit(df[[column]])

    slice_enc = _fit_one_hot("slice_type")
    nssi_enc = _fit_one_hot("nssi_id")

    # scaler key -> source column; "y" (the target) shares prb_dl with "prb".
    scaler_sources = {
        "prb": "prb_dl",
        "data": "data_dl",
        "rrc": "rrc_succ",
        "y": "prb_dl",
    }
    scalers = {key: MinMaxScaler().fit(df[[column]]) for key, column in scaler_sources.items()}

    return slice_enc, nssi_enc, scalers

In [None]:
def make_sequences_per_slice(
    df: pd.DataFrame,
    window: int,
    horizon: int,
    target_column: str,
    slice_enc: OneHotEncoder,
    nssi_enc: OneHotEncoder,
    scalers: Dict[str, MinMaxScaler]
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn each (slice_type, nssi_id) series into sliding-window training samples.

    Per-timestep feature layout, in column order:
    one-hot(slice_type), one-hot(nssi_id), scaled prb_dl, scaled data_dl,
    scaled rrc_succ.

    Args:
        df: frame with `time`, `slice_type`, `nssi_id`, `prb_dl`, `data_dl`,
            `rrc_succ` and `target_column`.
        window: number of consecutive rows in each input sequence.
        horizon: the target is taken `horizon` rows after the window's last row.
        target_column: column scaled with `scalers["y"]` to produce targets.
        slice_enc: fitted encoder for `slice_type`.
        nssi_enc: fitted encoder for `nssi_id`.
        scalers: fitted scalers under the keys "prb", "data", "rrc" and "y".

    Returns:
        `(X, y)` as float32 arrays: X has shape (n_sequences, window, n_features)
        and y has shape (n_sequences, 1). Sequences appear group by group, in
        time order within each group.

    Raises:
        RuntimeError: if no group has at least `window + horizon` rows.
    """
    min_rows = window + horizon
    numeric_inputs = (("prb", "prb_dl"), ("data", "data_dl"), ("rrc", "rrc_succ"))
    sequences, targets = [], []

    for (slice_value, nssi_value), group in df.groupby(["slice_type", "nssi_id"]):
        n_rows = len(group)
        if n_rows < min_rows:
            # Too short to yield even one (window, target) pair.
            continue

        ordered = group.sort_values("time").reset_index(drop=True)

        # The group key is constant, so encode it once and repeat it per row.
        categorical = [
            np.repeat(encoder.transform(np.array([[value]])), n_rows, axis=0)
            for encoder, value in ((slice_enc, slice_value), (nssi_enc, nssi_value))
        ]
        numeric = [scalers[key].transform(ordered[[column]]) for key, column in numeric_inputs]

        # (n_rows, k_slice + k_nssi + 3)
        features = np.concatenate(categorical + numeric, axis=1)
        scaled_target = scalers["y"].transform(ordered[[target_column]])

        # Window starting at s covers rows [s, s + window); its target sits at
        # row s + window + horizon - 1.
        starts = range(n_rows - min_rows + 1)
        sequences.extend(features[s:s + window, :] for s in starts)
        targets.extend(scaled_target[s + min_rows - 1, 0] for s in starts)

    if not sequences:
        raise RuntimeError("Not enough data to build sequences for any (slice_type, nssi_id) group. Try reducing --window or --horizon, or check data density per group.")

    X = np.stack(sequences).astype(np.float32)
    y = np.array(targets).astype(np.float32).reshape(-1, 1)
    return X, y

## Model Building Functions

In [None]:
def build_model(input_shape: Tuple[int, int]) -> tf.keras.Model:
    """
    Build and compile the stacked-LSTM regression network.

    Layers, in order: LSTM(64, returning the full sequence), Dropout(0.2),
    LSTM(32), Dropout(0.2), Dense(16, ReLU), Dense(1).

    Args:
        input_shape: shape passed to the first LSTM layer as `input_shape`.

    Returns:
        The model compiled with the Adam optimizer and a Huber loss.
    """
    network = Sequential()
    network.add(LSTM(64, return_sequences=True, input_shape=input_shape))
    network.add(Dropout(0.2))
    network.add(LSTM(32))
    network.add(Dropout(0.2))
    network.add(Dense(16, activation="relu"))
    network.add(Dense(1))  # single linear output for regression

    # NOTE(review): delta=100 is large if targets are min-max scaled; the loss
    # would then presumably stay in its quadratic region -- confirm intent.
    # An Adam(1e-3) + MSE + MAE-metric setup was previously tried here.
    network.compile(optimizer="adam", loss=tf.keras.losses.Huber(delta=100))
    return network

## Training and Utility Functions

In [None]:
def train_val_split_time(
    df: pd.DataFrame,
    val_split: float
) -> pd.Timestamp:
    """
    Compute a time threshold so that approximately val_split of the data by time goes to validation.

    The threshold is chosen among the sorted unique values of `df["time"]`
    at index `int(n * (1 - val_split))`. With fewer than 10 unique
    timestamps the index is moved one step earlier. The index is always
    clamped into the valid range, so `val_split=0` yields the latest
    timestamp instead of an out-of-range lookup.

    Args:
        df: frame with a `time` column.
        val_split: fraction of timestamps intended for validation, in [0, 1].

    Returns:
        The cutoff as a UTC timestamp.

    Raises:
        ValueError: if `df` has no timestamps or `val_split` is outside [0, 1].
    """
    if not 0.0 <= val_split <= 1.0:
        raise ValueError(f"val_split must be between 0 and 1, got {val_split!r}")

    times = df["time"].sort_values().unique()
    n_times = len(times)
    if n_times == 0:
        raise ValueError("Cannot compute a validation cutoff: the 'time' column is empty.")

    cutoff_idx = int(n_times * (1 - val_split))
    if n_times < 10:
        # Small-data fallback: step one timestamp earlier.
        cutoff_idx -= 1
    # Keep the index in bounds at both extremes of val_split.
    cutoff_idx = min(max(cutoff_idx, 0), n_times - 1)

    return pd.to_datetime(times[cutoff_idx], utc=True)

In [None]:
def fit_model(
    X_train: np.ndarray, y_train: np.ndarray,
    X_val: np.ndarray, y_val: np.ndarray,
    model_dir: str,
    epochs: int,
    batch_size: int
) -> tf.keras.Model:
    """
    Build, train and save the PRB model.

    Creates `model_dir` if needed. During training the best model by
    `val_loss` is checkpointed to `best_prb_lstm.keras`; after training the
    model is also saved to `final_prb_lstm.keras` in the same directory.

    Args:
        X_train, y_train: training inputs and targets; the model input shape
            is taken from dimensions 1 and 2 of `X_train`.
        X_val, y_val: validation inputs and targets.
        model_dir: directory receiving both saved model files.
        epochs: maximum number of training epochs.
        batch_size: training batch size.

    Returns:
        The trained model.
    """
    os.makedirs(model_dir, exist_ok=True)
    best_path = os.path.join(model_dir, "best_prb_lstm.keras")
    final_path = os.path.join(model_dir, "final_prb_lstm.keras")

    # All three callbacks watch the validation loss.
    stop_early = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
    shrink_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3)
    keep_best = ModelCheckpoint(best_path, monitor="val_loss", save_best_only=True)

    net = build_model(input_shape=(X_train.shape[1], X_train.shape[2]))
    net.summary()
    net.fit(
        x=X_train,
        y=y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[stop_early, shrink_lr, keep_best],
        verbose=1,
    )

    # Persist the end-of-training model alongside the best checkpoint.
    net.save(final_path)
    return net

In [None]:
def save_artifacts(model_dir: str, slice_enc: OneHotEncoder, nssi_enc: OneHotEncoder, scalers: Dict[str, MinMaxScaler],
                   meta: Dict):
    """
    Write the preprocessing objects and run metadata into `model_dir`.

    Files written (the directory is created if missing):
    - `slice_onehot.joblib` and `nssi_onehot.joblib` for the two encoders
    - `scaler_<key>.joblib` for every entry of `scalers`
    - `meta.json` holding `meta` as indented JSON

    Args:
        model_dir: destination directory.
        slice_enc: encoder stored as `slice_onehot.joblib`.
        nssi_enc: encoder stored as `nssi_onehot.joblib`.
        scalers: scalers keyed by the name used in their file name.
        meta: JSON-serializable metadata.
    """
    os.makedirs(model_dir, exist_ok=True)

    # (file name, object) pairs, written in this order.
    joblib_files = [
        ("slice_onehot.joblib", slice_enc),
        ("nssi_onehot.joblib", nssi_enc),
    ]
    joblib_files += [(f"scaler_{k}.joblib", sc) for k, sc in scalers.items()]
    for filename, artifact in joblib_files:
        dump(artifact, os.path.join(model_dir, filename))

    meta_path = os.path.join(model_dir, "meta.json")
    with open(meta_path, "w") as handle:
        json.dump(meta, handle, indent=2)

## Execution Pipeline

### Step 1: Pull Data from InfluxDB

In [None]:
# 1) Pull data
# Query InfluxDB and keep the cleaned result in `df`, which every later step reads.
df = fetch_from_influx()
# Sanity summary: row/column counts, covered time span, and number of distinct series keys.
print(f"Data shape: {df.shape}")
print(f"Date range: {df['time'].min()} to {df['time'].max()}")
print(f"Unique slice types: {df['slice_type'].nunique()}")
print(f"Unique NSSI IDs: {df['nssi_id'].nunique()}")
# Last expression of the cell: shows the first rows when run in a notebook.
df.head()

### Step 2: Prepare Encoders and Scalers

In [None]:
# 2) Prepare encoders/scalers
# Fitted on the full `df`, i.e. before the train/validation split made in step 4.
slice_enc, nssi_enc, scalers = prepare_encoders_and_scalers(df)
# Each encoder has one input column, so categories_[0] lists its categories.
print(f"Slice encoder categories: {slice_enc.categories_[0]}")
print(f"NSSI encoder categories: {len(nssi_enc.categories_[0])} unique IDs")

### Step 3: Build Sequences

In [None]:
# 3) Build sequences
# Uses the module-level `window` and `horizon`; the target is prb_dl.
X, y = make_sequences_per_slice(
        df, window, horizon,
        target_column="prb_dl",
        slice_enc=slice_enc, nssi_enc=nssi_enc, scalers=scalers
    )
print(f"Sequences shape: X={X.shape}, y={y.shape}")
# Axis 2 of X is the per-timestep feature vector.
print(f"Feature dimension: {X.shape[2]}")

### Step 4: Train/Validation Split by Time

In [None]:
# 4) Train/val split by time: compute cutoff on original df time
cutoff_time = train_val_split_time(df, val_split=0.2)
print(f"Validation cutoff time: {cutoff_time}")

# Map each sequence to the timestamp of the last row of its input window.
# The grouping, time ordering and minimum-length filter mirror
# make_sequences_per_slice, so entry k is meant to correspond to X[k].
end_times = []

for _, g in df.groupby(["slice_type", "nssi_id"]):
    total_len = len(g)
    if total_len < window + horizon:
        continue
    g_times = g["time"].sort_values()
    # Sequence i ends at sorted row i + window - 1 for
    # i in range(total_len - window - horizon + 1); taken here as one slice.
    window_ends = g_times.iloc[window - 1: total_len - horizon]
    end_times.extend(ts.value for ts in window_ends)  # int ns since epoch

end_times = np.array(end_times)

use_index_split = len(end_times) != len(X)
if use_index_split:
    print("[WARN] Mismatch in sequence count for time-based split. Using sequential 80/20 split by sequence index.")
else:
    mask_train = end_times < pd.Timestamp(cutoff_time).value
    if not mask_train.any() or mask_train.all():
        print("[WARN] Time-based split resulted in empty train/val set. Using sequential 80/20 split by sequence index.")
        use_index_split = True

if use_index_split:
    # Deterministic fallback, not a random shuffle: the first 80% of
    # sequences in X order train, the remainder validate.
    split_idx = int(0.8 * len(X))
    mask_train = np.zeros(len(X), dtype=bool)
    mask_train[:split_idx] = True

mask_val = ~mask_train

X_train, y_train = X[mask_train], y[mask_train]
X_val, y_val = X[mask_val], y[mask_val]

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Train/Val ratio: {len(X_train)/(len(X_train)+len(X_val)):.2f}/{len(X_val)/(len(X_train)+len(X_val)):.2f}")

### Step 5: Train Model

In [None]:
# 5) Train
# Trains for at most 20 epochs with batch size 64.
# fit_model writes both saved model files into the "models" directory.
model = fit_model(
        X_train, y_train, X_val, y_val,
        model_dir="models",
        epochs=20,
        batch_size=64
    )

### Step 6: Save Artifacts

In [None]:
# 6) Save artifacts
# Metadata records the sequence settings, feature layout and data source
# used for this run, plus the validation cutoff as an ISO-8601 string.
meta = {
        "window": window,
        "horizon": horizon,
        "features": ["onehot(slice_type)", "onehot(nssi_id)", "prb_dl", "data_dl", "rrc_succ"],
        "target": "prb_dl",
        "measurement": measurement,
        "bucket": bucket,
        "start": start,
        "val_cutoff_time": pd.to_datetime(cutoff_time).isoformat()
    }
# Same "models" directory that step 5 passes to fit_model.
save_artifacts("models", slice_enc, nssi_enc, scalers, meta)
print("Model artifacts saved to 'models' directory")

### Step 7: Evaluate Model

In [None]:
# 7) Evaluate (denormalized MAE)
# Predict on the validation set, then undo the target scaling on both
# predictions and ground truth so the metrics are in the original prb_dl units.
y_val_pred_scaled = model.predict(X_val).reshape(-1, 1)
y_val_pred = scalers["y"].inverse_transform(y_val_pred_scaled)
y_val_true = scalers["y"].inverse_transform(y_val)

# MAE computed by hand with NumPy ...
mae = np.mean(np.abs(y_val_true - y_val_pred))
print(f"Validation MAE (in PRB units): {mae:.4f}")

rmse = root_mean_squared_error(y_val_true, y_val_pred)
print("RMSE:", rmse)

# ... and again with scikit-learn; this assignment overwrites the `mae` above.
mae = mean_absolute_error(y_val_true, y_val_pred)
print("MAE:", mae)

r2 = r2_score(y_val_true, y_val_pred)
print("R2 Score:", r2)

In [None]:
# Visualization
# Plot the first 200 validation targets against their predictions and write
# the chart to disk.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_val_true[:200], label="Actual PRB Usage", color="green")
ax.plot(y_val_pred[:200], label="Forecasted PRB Usage", color="red")
ax.set_title("LSTM Prediction of PRB Usage")
# The x axis is the sample position within the validation set.
ax.set_xlabel("Time")
ax.set_ylabel("PRB Usage (%)")
ax.legend()
ax.grid(True)

fig.savefig('lstm_forecast.jpeg', format='jpeg')
# Close the figure once it has been saved.
plt.close(fig)
print("Forecast plot saved as 'lstm_forecast.jpeg'")

## Summary

Once every cell above has run without errors, the notebook has:
1. ✅ Fetched NSSAI performance data from InfluxDB
2. ✅ Prepared encoders and scalers for data preprocessing
3. ✅ Created time series sequences for LSTM training
4. ✅ Split data into training and validation sets using a time-based split
5. ✅ Trained the LSTM model with early stopping and learning-rate scheduling
6. ✅ Saved model artifacts (encoders, scalers, metadata) for deployment
7. ✅ Evaluated model performance and generated visualization

The trained model and all preprocessing artifacts are now available in the `models` directory for use in production.