In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from segrnn import SegRNNModel
from pytorch_lightning import Trainer

In [2]:
# Load the dataset
# Replace 'your_dataset.csv' with your actual data file
df = pd.read_csv('JRB.csv', low_memory=False)

# Step 1: Ensure 'valid' column is datetime and sort the DataFrame
df['valid'] = pd.to_datetime(df['valid'])
df = df.sort_values(by=['station', 'valid']).reset_index(drop=True)

In [3]:
df.head()

Unnamed: 0,station,valid,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,...,wxcodes,ice_accretion_1hr,ice_accretion_3hr,ice_accretion_6hr,peak_wind_gust,peak_wind_drct,peak_wind_time,feel,metar,snowdepth
0,JRB,2016-07-21 08:15:00,73.4,66.2,78.19,240.0,5.0,0.0,30.17,M,...,M,M,M,M,M,M,M,73.4,KJRB 211215Z 24005KT 10SM CLR 23/19 A3017 RMK AO1,M
1,JRB,2016-07-21 08:35:00,73.4,66.2,78.19,240.0,5.0,0.0,30.17,M,...,M,M,M,M,M,M,M,73.4,KJRB 211235Z 24005KT 10SM CLR 23/19 A3017 RMK AO1,M
2,JRB,2016-07-21 08:55:00,75.2,66.2,73.61,240.0,6.0,0.0,30.17,M,...,M,M,M,M,M,M,M,75.2,KJRB 211255Z 24006KT 10SM CLR 24/19 A3017 RMK AO1,M
3,JRB,2016-07-21 09:15:00,75.2,66.2,73.61,240.0,5.0,0.0,30.17,M,...,M,M,M,M,M,M,M,75.2,KJRB 211315Z 24005KT 10SM CLR 24/19 A3017 RMK AO1,M
4,JRB,2016-07-21 09:35:00,78.8,66.2,65.33,240.0,5.0,0.0,30.16,M,...,M,M,M,M,M,M,M,80.82,KJRB 211335Z 24005KT 10SM CLR 26/19 A3016 RMK AO1,M


In [4]:
# Step 2: Replace placeholders with np.nan in continuous columns
continuous_cols = ['tmpf', 'dwpf', 'relh', 'feel', 'drct', 'sknt', 'gust',
                   'peak_wind_gust', 'peak_wind_drct', 'alti', 'mslp', 'vsby',
                   'p01i', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',
                   'skyl1', 'skyl2', 'skyl3', 'skyl4', 'snowdepth', 'peak_wind_time']

# List of placeholders to replace
placeholders = ['M', 'T', '', 'NaN', 'NULL', 'None']

# Replace placeholders with np.nan
df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)

# Convert continuous columns to numeric, coercing errors to np.nan
for col in continuous_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


In [5]:
print("Missing values in continuous columns before processing:")
print(df[continuous_cols].isnull().sum())

Missing values in continuous columns before processing:
tmpf                   6865
dwpf                   7015
relh                   7112
feel                   7125
drct                  25052
sknt                   5024
gust                 101714
peak_wind_gust       108345
peak_wind_drct       108345
alti                  12732
mslp                  37235
vsby                   9941
p01i                   7730
ice_accretion_1hr    114664
ice_accretion_3hr    114664
ice_accretion_6hr    114664
skyl1                 54223
skyl2                 93650
skyl3                107296
skyl4                114664
snowdepth            114664
peak_wind_time       114664
dtype: int64


In [6]:
# Step 3: Handle missing values in continuous variables


# Identify columns to drop due to high NaN count
nan_threshold = df.shape[0] / 2                     # Remove columns with more than 50% missing values
print(f"nan thresh is {nan_threshold}")
bad_columns = [col for col in df.columns if df[col].isnull().sum() >= nan_threshold]
print(f"bad columns are {bad_columns}")

# Add less relevant and irrelevant features to the removal list
irrelevant_features = [
    'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',  # Fully NaN
    'skyl1', 'skyl2', 'skyl3', 'skyl4',  # Less relevant (Sky level altitudes)
    'skyc1', 'skyc2', 'skyc3', 'skyc4',  # Less relevant (Sky coverage)
    'wxcodes',  # Categorical, redundant with precipitation/visibility
    'metar'  # Text format, unusable directly
]

# Combine both lists and ensure no duplicates
columns_to_remove = list(set(bad_columns + irrelevant_features))
df.drop(columns=columns_to_remove, inplace=True)

# Update continuous columns to exclude removed columns
continuous_cols = list(set(continuous_cols) - set(columns_to_remove))
print(f"Remaining continuous columns: {continuous_cols}")


# Apply linear interpolation within each station group using transform
df[continuous_cols] = df.groupby('station')[continuous_cols].transform(
    lambda group: group.interpolate(method='linear')
)

# Handle any remaining missing values with forward and backward fill using transform
df[continuous_cols] = df.groupby('station')[continuous_cols].transform(
    lambda group: group.ffill().bfill()
)


# Verify missing values are filled
print("Missing values in continuous columns after processing:")
print(df[continuous_cols].isnull().sum())

nan thresh is 57332.0
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
Remaining continuous columns: ['sknt', 'vsby', 'tmpf', 'mslp', 'drct', 'feel', 'alti', 'relh', 'dwpf', 'p01i']
Missing values in continuous columns after processing:
sknt    0
vsby    0
tmpf    0
mslp    0
drct    0
feel    0
alti    0
relh    0
dwpf    0
p01i    0
dtype: int64


In [7]:
# Step 5: Feature scaling
# List of all features (excluding 'valid' and 'metar')
feature_cols = continuous_cols #+ categorical_cols
print(df[feature_cols].shape)

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the features
df[feature_cols] = scaler.fit_transform(df[feature_cols])


(114664, 10)


In [8]:
# Step 6: Prepare sequences for LSTM input
# Assuming we are predicting 'tmpf' (temperature) as the target variable
# and using previous 24 time steps/8 hours (n_steps_in) to predict the next time step/20 minutes from now (n_steps_out)
# create sliding window sequences X: (114640, 24, 10), y: (114640, 10)

n_steps_in = 24  # Number of past time steps
n_steps_out = 1  # Number of future time steps to predict

# We'll create sequences for each station separately
def create_sequences(data, n_steps_in, n_steps_out):
    X, y = [], []
    for i in range(len(data) - n_steps_in - n_steps_out + 1):
        X.append(data[i:(i + n_steps_in), :])
        y.append(data[(i + n_steps_in):(i + n_steps_in + n_steps_out), :])
    return np.array(X), np.array(y)

# Prepare data for each station
X_list = []
y_list = []
stations = df['station'].unique()

for station in stations:
    station_data = df[df['station'] == station]
    station_data = station_data.reset_index(drop=True)
    data_values = station_data[feature_cols].values
    # target_col_index = feature_cols.index('tmpf')  # Index of target variable in features

    X_station, y_station = create_sequences(data_values, n_steps_in, n_steps_out)
    X_list.append(X_station)
    y_list.append(y_station)


# Concatenate data from all stations
X = np.concatenate(X_list, axis=0)
y = np.concatenate(y_list, axis=0)


if n_steps_out == 1:
    y = y.squeeze(1)  # Shape becomes (num_samples, num_features) = (114640, 10) for JRB


print(X.shape)
print(y.shape)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

(114640, 24, 10)
(114640, 10)


In [10]:
# Step 7: Split the data into training and testing sets
# Since it's time-series data, we'll use the first 80% for training and the rest for testing
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Now the data is ready for training the LSTM model

# Define a PyTorch Dataset
class WeatherDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [11]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

# Create DataLoaders
batch_size = 32
train_dataset = WeatherDataset(X_train, y_train)
test_dataset = WeatherDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Hyperparameters for SegRNN
input_size = X.shape[2]  # Number of features
hidden_size = 512  # Based on the SEGRNN paper
output_size = X.shape[2]  # Predict all features
segment_length = 8  # Based on the SEGRNN paper
learning_rate = 0.001

# Initialize SegRNNModel
model = SegRNNModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    segment_length=segment_length,
    learning_rate=learning_rate
)

# Logger
logger = TensorBoardLogger("logs", name="segrnn_experiment")

# Checkpoint callback
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints/",
    filename="segrnn-{epoch:02d}-{val_loss:.4f}",
    save_top_k=1,
    monitor="val_loss",
    mode="min"
)

# Trainer with logging and checkpointing
trainer = Trainer(
    max_epochs=4,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    logger=logger,
    callbacks=[checkpoint_callback]
)

# Train the model
trainer.fit(model, train_loader)

# Optional: Evaluate on the test set
trainer.test(model, test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/global/common/software/nersc9/pytorch/2.3.1/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type    | Params
--------------------------------------
0 | model     | SegRNN  | 2.4 M 
1 | criterion | MSELoss | 0     
--------------------------------------
2.4 M     Trainable params
0         Non-trainab

Training: |          | 0/? [00:00<?, ?it/s]

Train Loss: 0.9447845816612244
Train Loss: 0.9561176300048828
Train Loss: 0.9602943658828735
Train Loss: 0.7648112177848816
Train Loss: 0.5991136431694031
Train Loss: 0.290162056684494
Train Loss: 0.3903270959854126
Train Loss: 0.24228759109973907
Train Loss: 0.7545002698898315
Train Loss: 0.38825684785842896
Train Loss: 0.2472890466451645
Train Loss: 0.31164565682411194
Train Loss: 0.47263866662979126
Train Loss: 0.6484565734863281
Train Loss: 0.29444652795791626
Train Loss: 0.22517013549804688
Train Loss: 0.4742286205291748
Train Loss: 0.3288589119911194
Train Loss: 0.19025753438472748
Train Loss: 0.36365360021591187
Train Loss: 0.28349801898002625
Train Loss: 0.38309264183044434
Train Loss: 0.30847975611686707
Train Loss: 0.4357635974884033
Train Loss: 0.20580950379371643
Train Loss: 0.3819229304790497
Train Loss: 0.28686487674713135
Train Loss: 0.19210803508758545
Train Loss: 0.15714730322360992
Train Loss: 0.6124169826507568
Train Loss: 0.31789615750312805
Train Loss: 0.4533916115

/global/common/software/nersc9/pytorch/2.3.1/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:383: `ModelCheckpoint(monitor='val_loss')` could not find the monitored key in the returned metrics: ['train_loss', 'epoch', 'step']. HINT: Did you call `log('val_loss', value)` in the `LightningModule`?


Train Loss: 0.11743336170911789
Train Loss: 0.15199284255504608
Train Loss: 0.06366278976202011
Train Loss: 0.19907379150390625
Train Loss: 0.3182869553565979
Train Loss: 0.17663787305355072
Train Loss: 0.18975253403186798
Train Loss: 0.12797127664089203
Train Loss: 0.5500736236572266
Train Loss: 0.11451206356287003
Train Loss: 0.09223281592130661
Train Loss: 0.09154974669218063
Train Loss: 0.087398461997509
Train Loss: 0.16638368368148804
Train Loss: 0.0649772435426712
Train Loss: 0.1584843099117279
Train Loss: 0.36171528697013855
Train Loss: 0.15791568160057068
Train Loss: 0.06776218861341476
Train Loss: 0.3353234827518463
Train Loss: 0.12725265324115753
Train Loss: 0.17687301337718964
Train Loss: 0.1509089469909668
Train Loss: 0.1365426629781723
Train Loss: 0.10375374555587769
Train Loss: 0.13433238863945007
Train Loss: 0.31988391280174255
Train Loss: 0.4362429678440094
Train Loss: 0.22620956599712372
Train Loss: 0.17686139047145844
Train Loss: 0.2797509729862213
Train Loss: 0.09097

`Trainer.fit` stopped: `max_epochs=4` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
SLURM auto-requeueing enabled. Setting signal handlers.


Train Loss: 0.11891057342290878
Train Loss: 0.10402374714612961
Train Loss: 0.057672351598739624
Train Loss: 0.3073763847351074


/global/common/software/nersc9/pytorch/2.3.1/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=127` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.11884204298257828
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.11884204298257828}]

In [19]:
import nersc_tensorboard_helper
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [20]:
%tensorboard --logdir logs --port=6006

Reusing TensorBoard on port 6006 (pid 2118084), started 0:08:04 ago. (Use '!kill 2118084' to kill it.)

In [21]:
from tensorboard import notebook
notebook.list() # View open TensorBoard instances

Known TensorBoard instances:
  - port 6006: logdir logs (started 0:08:12 ago; pid 2118084)


In [17]:
nersc_tensorboard_helper.tb_address()

Selecting TensorBoard with logdir logs (started 0:06:13 ago; port 6006, pid 2118084).
