In [94]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from segrnn import SegRNNModel
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
import os
from utils import preprocess_and_save_data

In [113]:
preprocess_and_save_data('./csvs/ROC.csv', normalize=True)
preprocess_and_save_data('./csvs/JRB.csv', normalize=True)
preprocess_and_save_data('./csvs/BGM.csv', normalize=True)
preprocess_and_save_data('./csvs/PEO.csv', normalize=True)
preprocess_and_save_data('./csvs/RME.csv', normalize=True)
preprocess_and_save_data('./csvs/MSS.csv', normalize=True)

Missing values in continuous columns before processing:
tmpf                    1
dwpf                    1
relh                    1
feel                    2
drct                  132
sknt                    3
gust                 8162
peak_wind_gust       9053
peak_wind_drct       9053
alti                    0
mslp                 1907
vsby                    0
p01i                 1739
ice_accretion_1hr    9886
ice_accretion_3hr    9890
ice_accretion_6hr    9886
skyl1                 723
skyl2                4099
skyl3                6972
skyl4                9526
snowdepth            9774
peak_wind_time       9892
dtype: int64
nan thresh is 4946.0
bad columns are ['gust', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in continuous columns after processi

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


Missing values in continuous columns before processing:
tmpf                  158
dwpf                  185
relh                  185
feel                  185
drct                 2076
sknt                   77
gust                 8500
peak_wind_gust       8819
peak_wind_drct       8819
alti                    4
mslp                 1937
vsby                    5
p01i                    0
ice_accretion_1hr    9699
ice_accretion_3hr    9699
ice_accretion_6hr    9699
skyl1                4189
skyl2                7621
skyl3                8931
skyl4                9699
snowdepth            9699
peak_wind_time       9699
dtype: int64
nan thresh is 4849.5
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in continuous columns after

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


Missing values in continuous columns before processing:
tmpf                    17
dwpf                    17
relh                    17
feel                    21
drct                   294
sknt                    49
gust                  9340
peak_wind_gust       10857
peak_wind_drct       10857
alti                     1
mslp                  3709
vsby                    19
p01i                  1628
ice_accretion_1hr    11542
ice_accretion_3hr    11575
ice_accretion_6hr    11574
skyl1                 3157
skyl2                 7148
skyl3                 9487
skyl4                11580
snowdepth            11580
peak_wind_time       11580
dtype: int64
nan thresh is 5790.0
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in co

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


Missing values in continuous columns before processing:
tmpf                     1
dwpf                     1
relh                     1
feel                    16
drct                   714
sknt                    49
gust                  8960
peak_wind_gust       10299
peak_wind_drct       10299
alti                     0
mslp                  2995
vsby                     2
p01i                  1782
ice_accretion_1hr    10941
ice_accretion_3hr    10951
ice_accretion_6hr    10950
skyl1                 3412
skyl2                 7151
skyl3                 9214
skyl4                10953
snowdepth            10953
peak_wind_time       10953
dtype: int64
nan thresh is 5476.5
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in co

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


Missing values in continuous columns before processing:
tmpf                     0
dwpf                     0
relh                     0
feel                     1
drct                   272
sknt                    17
gust                  9605
peak_wind_gust       10334
peak_wind_drct       10334
alti                     0
mslp                  2916
vsby                     3
p01i                  1374
ice_accretion_1hr    10841
ice_accretion_3hr    10843
ice_accretion_6hr    10841
skyl1                 3226
skyl2                 6727
skyl3                 8968
skyl4                10844
snowdepth            10844
peak_wind_time       10844
dtype: int64
nan thresh is 5422.0
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in co

  df[continuous_cols] = df[continuous_cols].replace(placeholders, np.nan).astype(str)


Missing values in continuous columns before processing:
tmpf                     2
dwpf                    40
relh                    40
feel                    46
drct                   182
sknt                    39
gust                  9526
peak_wind_gust       10533
peak_wind_drct       10533
alti                     0
mslp                  3289
vsby                     2
p01i                  1748
ice_accretion_1hr    11114
ice_accretion_3hr    11196
ice_accretion_6hr    11185
skyl1                 3700
skyl2                 7756
skyl3                 9777
skyl4                11205
snowdepth            11205
peak_wind_time       11205
dtype: int64
nan thresh is 5602.5
bad columns are ['gust', 'skyl2', 'skyl3', 'skyl4', 'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr', 'peak_wind_gust', 'peak_wind_drct', 'peak_wind_time', 'snowdepth']
10 remaining continuous columns: ['p01i', 'drct', 'alti', 'relh', 'feel', 'dwpf', 'sknt', 'mslp', 'vsby', 'tmpf']
Missing values in co

['p01i',
 'drct',
 'alti',
 'relh',
 'feel',
 'dwpf',
 'sknt',
 'mslp',
 'vsby',
 'tmpf']

In [114]:
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

In [115]:
nylocations = './csvs/_nylocations.csv'
latlongs = pd.read_csv(nylocations)
latlongs

Unnamed: 0,stid,station_name,lat,lon,elev,begints,endts,iem_network
0,6B9,Skaneateles,42.914,-76.4408,304.32452,2016-07-22 00:00,,NY_ASOS
1,ALB,ALBANY COUNTY ARPT,42.7576,-73.8036,89.0,1945-01-01 00:00,,NY_ASOS
2,ART,WATERTOWN INTL ARPT,43.9888,-76.0262,99.0,1949-04-30 00:00,,NY_ASOS
3,BGM,BINGHAMTON/BROOME,42.2086,-75.9797,497.0,1948-01-01 00:00,,NY_ASOS
4,BUF,BUFFALO INTL ARPT,42.9408,-78.7358,215.0,1942-01-31 00:00,,NY_ASOS
5,DKK,DUNKIRK AIRPORT,42.4933,-79.272,203.0,1948-12-31 00:00,,NY_ASOS
6,DSV,DANSVILLE MUNICIPAL,42.5709,-77.713,209.0,1948-12-31 00:00,,NY_ASOS
7,ELM,Elmira / Corning,42.1571,-76.8994,287.12537,1949-02-01 00:00,,NY_ASOS
8,ELZ,Wellsville Municipal,42.1078,-77.9842,639.0,1978-06-13 00:00,,NY_ASOS
9,FOK,WESTHAMPTON BEACH,40.8436,-72.6318,20.0,1943-07-18 00:00,,NY_ASOS


In [116]:
roc = './csvs/ROC_processed.csv'
roc_p = pd.read_csv(roc)
roc_p

Unnamed: 0,station,valid,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,vsby,feel
0,ROC,2024-01-01 00:11:00,-1.187030,-0.927562,0.803586,-0.980217,-0.300193,-0.008612,0.087353,0.155196,0.143572,-1.168495
1,ROC,2024-01-01 00:54:00,-1.187030,-0.927562,0.803586,-0.980217,-0.086351,-0.008612,0.087353,0.155196,0.143572,-1.204129
2,ROC,2024-01-01 01:54:00,-1.130031,-0.927562,0.590945,-1.074090,-0.727875,-0.008612,0.087353,0.168066,0.143572,-1.023110
3,ROC,2024-01-01 02:54:00,-1.187030,-0.927562,0.803586,-0.980217,-0.727875,-0.008612,0.087353,0.180937,-1.944840,-1.078698
4,ROC,2024-01-01 03:04:00,-1.187030,-0.927562,0.803586,-1.074090,-0.514034,-0.008612,0.131318,0.180937,-2.292908,-1.127160
...,...,...,...,...,...,...,...,...,...,...,...,...
9887,ROC,2024-11-28 19:54:00,-0.788042,-0.505809,0.831515,1.178877,0.341331,-0.008612,-1.143678,-1.119017,0.491640,-0.850642
9888,ROC,2024-11-28 20:54:00,-0.788042,-0.505809,0.831515,1.085004,-0.514034,-0.115499,-1.011782,-0.977438,0.491640,-0.732338
9889,ROC,2024-11-28 21:54:00,-0.845040,-0.566059,0.827072,0.615635,-0.300193,-0.222385,-0.879886,-0.822988,0.491640,-0.824986
9890,ROC,2024-11-28 22:54:00,-0.845040,-0.566059,0.827072,0.427888,-0.086351,-0.222385,-0.791955,-0.758634,0.491640,-0.856819


In [117]:
required_stations = ['JRB', 'ROC', 'BGM', 'MSS', 'PEO', 'RME']

### first get the locations of stations
# Filter the DataFrame for the required stations
stations_df = latlongs[latlongs['stid'].isin(required_stations)]
# Create a dictionary from the filtered DataFrame
stations_latlong = stations_df.set_index('stid')[['lat', 'lon']].T.to_dict()

print(stations_latlong)

### now get station data itself
processed_data_paths = {station:f'./csvs/{station}_processed.csv' for station in required_stations}
print(processed_data_paths)

station_features = []
normalized_latlongs = []
for stid in required_stations:
   station_data = pd.read_csv(processed_data_paths[stid])
   features = torch.tensor(station_data.drop(columns=['station', 'valid']).values, dtype=torch.float)
   mean_features = features.mean(dim=0)  # Mean of all the features to get "average weather"

   # Append latitude and longitude to the feature vector
   lat, long = stations_latlong[stid]['lat'], stations_latlong[stid]['lon']
   nlat = (lat+90)/ (180)
   nlong = (long+180)/ (360) 
   lat_long = torch.tensor([nlat,nlong], dtype=torch.float)
   print(f"latlong is {lat_long}")
   combined_features = torch.cat((mean_features, lat_long))  # Concatenate features with lat/lon

   station_features.append(combined_features)
   normalized_latlongs.append([nlat, nlong])

print(f'n latlongs: {normalized_latlongs}')

node_features = torch.stack(station_features)
node_features += torch.randn_like(node_features) * 0.01  # noise
print(node_features.shape)
print(node_features)
print(torch.var(node_features, dim=0))

def calculate_distances(latlongs):
   num_stations = len(latlongs)
   distances = np.zeros((num_stations, num_stations))
   for i, coord1 in enumerate(latlongs):
      for j, coord2 in enumerate(latlongs):
         # Calculate Euclidean distance for normalized coordinates
         distances[i, j] = np.linalg.norm(np.array(coord1) - np.array(coord2))
   return distances

distances = calculate_distances(normalized_latlongs)
print("Distance Matrix:")
print(distances)

distance_threshold = 0.015  # Adjust this threshold based on your scale and data
edges = []

for i in range(len(distances)):
   for j in range(len(distances)):
      if i != j and distances[i, j] <= distance_threshold:
         edges.append((i, j))

# Define edges 
edge_index = torch.tensor(edges, dtype=torch.long).T  # Transpose to match edge_index format
print("Edge Index:")
print(edge_index)

# Create the graph data
data = Data(x=node_features, edge_index=edge_index)
print(data)

{'BGM': {'lat': 42.2086, 'lon': -75.9797}, 'JRB': {'lat': 40.7012, 'lon': -74.009}, 'MSS': {'lat': 44.9358, 'lon': -74.8456}, 'PEO': {'lat': 42.6441, 'lon': -77.0529}, 'RME': {'lat': 43.2239, 'lon': -75.3953}, 'ROC': {'lat': 43.1167, 'lon': -77.6767}}
{'JRB': './csvs/JRB_processed.csv', 'ROC': './csvs/ROC_processed.csv', 'BGM': './csvs/BGM_processed.csv', 'MSS': './csvs/MSS_processed.csv', 'PEO': './csvs/PEO_processed.csv', 'RME': './csvs/RME_processed.csv'}
latlong is tensor([0.7261, 0.2944])
latlong is tensor([0.7395, 0.2842])
latlong is tensor([0.7345, 0.2889])
latlong is tensor([0.7496, 0.2921])
latlong is tensor([0.7369, 0.2860])
latlong is tensor([0.7401, 0.2906])
n latlongs: [[0.7261177777777778, 0.29441944444444446], [0.7395372222222223, 0.2842313888888889], [0.7344922222222222, 0.2889452777777778], [0.7496433333333333, 0.29209555555555555], [0.7369116666666667, 0.2859641666666667], [0.7401327777777779, 0.2905686111111111]]
torch.Size([6, 12])
tensor([[ 2.6720e-03,  1.4051e-02,

In [134]:
class SimpleGNN(torch.nn.Module):
   def __init__(self, input_dim, hidden_dim, output_dim):
      super(SimpleGNN, self).__init__()
      self.conv1 = GCNConv(input_dim, hidden_dim)
      self.conv2 = GCNConv(hidden_dim, output_dim)

   def forward(self, data):
      x, edge_index = data.x, data.edge_index
      x = self.conv1(x, edge_index)
      x = F.relu(x)
      x = self.conv2(x, edge_index)
      return x  # Embeddings for each node

# Initialize the GNN
input_dim = node_features.shape[1]  # Latitude and longitude plus features
hidden_dim = 14
output_dim = 12  # Embedding size
gnn = SimpleGNN(input_dim, hidden_dim, output_dim)

In [135]:
embeddings = gnn(data)

# Print embeddings
print("initial embeddings for each station:")
print(embeddings)

initial embeddings for each station:
tensor([[ 0.0684,  0.0852,  0.1688,  0.2278,  0.0418,  0.0956, -0.1624,  0.0038,
          0.1482, -0.0409,  0.0031,  0.0531],
        [ 0.0751,  0.0930,  0.1893,  0.2534,  0.0484,  0.1060, -0.1807,  0.0052,
          0.1662, -0.0466,  0.0058,  0.0577],
        [ 0.0759,  0.0943,  0.1887,  0.2539,  0.0473,  0.1064, -0.1810,  0.0046,
          0.1657, -0.0459,  0.0042,  0.0587],
        [ 0.0670,  0.0830,  0.1700,  0.2271,  0.0437,  0.0949, -0.1619,  0.0049,
          0.1492, -0.0421,  0.0057,  0.0514],
        [ 0.0824,  0.1021,  0.2068,  0.2771,  0.0525,  0.1160, -0.1977,  0.0056,
          0.1815, -0.0506,  0.0056,  0.0635],
        [ 0.0824,  0.1021,  0.2068,  0.2771,  0.0525,  0.1160, -0.1977,  0.0056,
          0.1815, -0.0506,  0.0056,  0.0635]], grad_fn=<AddBackward0>)


Now train the gnn

In [139]:
from torch.nn.functional import cosine_similarity

# Compute similarity matrix from node features
similarity_matrix = torch.mm(node_features, node_features.T)
print(f'similarity matrix is {similarity_matrix}')

# Define unsupervised loss function with regularization
def contrastive_loss(embeddings, similarity_matrix, distance_matrix, lambda_diversity=0.1, lambda_distance=0.1):
   pred_similarity = torch.mm(embeddings, embeddings.T)
   mse_loss = torch.nn.functional.mse_loss(pred_similarity, similarity_matrix)

   diversity_loss = -torch.var(embeddings, dim=0).mean()  # Penalize low variance

   pred_distance_matrix = torch.cdist(embeddings, embeddings, p=2)
   distance_loss = torch.nn.functional.mse_loss(pred_distance_matrix, distance_matrix)

   total_loss = mse_loss + lambda_diversity * diversity_loss + lambda_distance * distance_loss
   return total_loss

# Training loop
epochs = 50
optimizer = torch.optim.Adam(gnn.parameters(), lr=0.001)

gnn.train()
for epoch in range(epochs):
   optimizer.zero_grad()  # Reset gradients
   embeddings = gnn(data)  # Forward pass

   # Compute contrastive loss
   loss = contrastive_loss(embeddings, similarity_matrix, torch.Tensor(distances))
   
   # Backward pass and optimization
   loss.backward()
   optimizer.step()

   # Print loss and gradient information
   print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
   # for name, param in gnn.named_parameters():
   #    if param.grad is not None:
   #       print(f"Gradient for {name}: {param.grad.abs().mean().item():.6f}")
   #    else:
   #       print(f"No gradient for {name}.")


similarity matrix is tensor([[0.6343, 0.6253, 0.6198, 0.6477, 0.6313, 0.6271],
        [0.6253, 0.6177, 0.6117, 0.6398, 0.6239, 0.6195],
        [0.6198, 0.6117, 0.6076, 0.6346, 0.6179, 0.6137],
        [0.6477, 0.6398, 0.6346, 0.6640, 0.6464, 0.6422],
        [0.6313, 0.6239, 0.6179, 0.6464, 0.6306, 0.6261],
        [0.6271, 0.6195, 0.6137, 0.6422, 0.6261, 0.6239]])
Epoch 1, Loss: 0.0073
Epoch 2, Loss: 0.0060
Epoch 3, Loss: 0.0055
Epoch 4, Loss: 0.0055
Epoch 5, Loss: 0.0058
Epoch 6, Loss: 0.0060
Epoch 7, Loss: 0.0060
Epoch 8, Loss: 0.0058
Epoch 9, Loss: 0.0056
Epoch 10, Loss: 0.0055
Epoch 11, Loss: 0.0054
Epoch 12, Loss: 0.0054
Epoch 13, Loss: 0.0055
Epoch 14, Loss: 0.0056
Epoch 15, Loss: 0.0056
Epoch 16, Loss: 0.0056
Epoch 17, Loss: 0.0055
Epoch 18, Loss: 0.0054
Epoch 19, Loss: 0.0053
Epoch 20, Loss: 0.0053
Epoch 21, Loss: 0.0053
Epoch 22, Loss: 0.0054
Epoch 23, Loss: 0.0054
Epoch 24, Loss: 0.0054
Epoch 25, Loss: 0.0053
Epoch 26, Loss: 0.0053
Epoch 27, Loss: 0.0053
Epoch 28, Loss: 0.

In [140]:
learned_embeddings = gnn(data)  # Get final embeddings
print("Learned embeddings:")
print(learned_embeddings)

Learned embeddings:
tensor([[ 0.1667,  0.1649,  0.2706,  0.3670,  0.1308,  0.1921, -0.2734,  0.1110,
          0.2683, -0.1271,  0.0816,  0.1544],
        [ 0.1802,  0.1772,  0.2979,  0.4051,  0.1424,  0.2087, -0.3003,  0.1195,
          0.2960, -0.1381,  0.0875,  0.1659],
        [ 0.1807,  0.1784,  0.2975,  0.4050,  0.1414,  0.2093, -0.3004,  0.1188,
          0.2952, -0.1371,  0.0858,  0.1669],
        [ 0.1658,  0.1628,  0.2714,  0.3674,  0.1326,  0.1913, -0.2733,  0.1124,
          0.2699, -0.1289,  0.0845,  0.1529],
        [ 0.1935,  0.1903,  0.3223,  0.4398,  0.1515,  0.2248, -0.3251,  0.1263,
          0.3201, -0.1464,  0.0899,  0.1781],
        [ 0.1935,  0.1903,  0.3223,  0.4398,  0.1515,  0.2248, -0.3251,  0.1263,
          0.3201, -0.1464,  0.0899,  0.1781]], grad_fn=<AddBackward0>)


In [141]:
for r in required_stations:
   df = pd.read_csv(f'./csvs/{r}_processed.csv')
   print(r)
   print(df.shape)

JRB
(9699, 12)
ROC
(9892, 12)
BGM
(11580, 12)
MSS
(11205, 12)
PEO
(10953, 12)
RME
(10844, 12)


for each station, add on the static embedding that we just learned

In [142]:
all_data = []

for idx, (station, path) in enumerate(processed_data_paths.items()):
   # Load the CSV into a DataFrame
   df = pd.read_csv(path)
   print(df.shape)
   
   # Get the corresponding embedding for this station
   embedding = learned_embeddings[idx].detach().numpy()
   
   # Add the embedding as new columns to the DataFrame
   for i, value in enumerate(embedding):
      df[f'embedding_{i}'] = value

   print(df.shape)
   
   # Append to the list of all data
   all_data.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(all_data, ignore_index=True)

# Display the result
print(combined_df.shape)
combined_df.head()


(9699, 12)
(9699, 24)
(9892, 12)
(9892, 24)
(11580, 12)
(11580, 24)
(11205, 12)
(11205, 24)
(10953, 12)
(10953, 24)
(10844, 12)
(10844, 24)
(64173, 24)


Unnamed: 0,station,valid,tmpf,dwpf,relh,drct,sknt,p01i,alti,mslp,...,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,embedding_11
0,JRB,2024-01-01 00:56:00,-1.145898,-1.25443,-0.58024,1.168285,0.073238,0.0,0.084109,0.070262,...,0.270572,0.367036,0.130773,0.192087,-0.273424,0.110965,0.268316,-0.127137,0.081605,0.154424
1,JRB,2024-01-01 01:56:00,-1.145898,-1.191784,-0.44919,1.168285,-0.198038,0.0,0.084109,0.070262,...,0.270572,0.367036,0.130773,0.192087,-0.273424,0.110965,0.268316,-0.127137,0.081605,0.154424
2,JRB,2024-01-01 02:56:00,-1.145898,-1.129138,-0.312656,1.070846,-0.198038,0.0,0.084109,0.070262,...,0.270572,0.367036,0.130773,0.192087,-0.273424,0.110965,0.268316,-0.127137,0.081605,0.154424
3,JRB,2024-01-01 03:56:00,-1.145898,-1.066491,-0.171735,1.168285,-0.198038,0.0,0.084109,0.083014,...,0.270572,0.367036,0.130773,0.192087,-0.273424,0.110965,0.268316,-0.127137,0.081605,0.154424
4,JRB,2024-01-01 04:56:00,-1.145898,-1.066491,-0.171735,1.070846,0.615791,0.0,0.084109,0.108517,...,0.270572,0.367036,0.130773,0.192087,-0.273424,0.110965,0.268316,-0.127137,0.081605,0.154424


In [143]:
# Step 6: Prepare sequences for LSTM input
# Assuming we are predicting 'tmpf' (temperature) as the target variable
# and using previous 24 time steps/8 hours (n_steps_in) to predict the next time step/20 minutes from now (n_steps_out)
# create sliding window sequences X: (114640, 24, 10), y: (114640, 10)
feature_cols = list(set(df.columns) - set(['station', 'valid']))

n_steps_in = 24  # Number of past time steps
n_steps_out = 1  # Number of future time steps to predict

# We'll create sequences for each station separately
def create_sequences(data, n_steps_in, n_steps_out):
   X, y = [], []
   for i in range(len(data) - n_steps_in - n_steps_out + 1):
      X.append(data[i:(i + n_steps_in), :])
      y.append(data[(i + n_steps_in):(i + n_steps_in + n_steps_out), :])
   return np.array(X), np.array(y)

# Prepare data for each station
X_list = []
y_list = []
stations = df['station'].unique()

for station in stations:
   station_data = df[df['station'] == station]
   station_data = station_data.reset_index(drop=True)
   data_values = station_data[feature_cols].values
   # target_col_index = feature_cols.index('tmpf')  # Index of target variable in features

   X_station, y_station = create_sequences(data_values, n_steps_in, n_steps_out)
   X_list.append(X_station)
   y_list.append(y_station)


# Concatenate data from all stations
X = np.concatenate(X_list, axis=0)
y = np.concatenate(y_list, axis=0)


if n_steps_out == 1:
   y = y.squeeze(1)  # Shape becomes (num_samples, num_features) = (114640, 10) for JRB


print(X.shape)
print(y.shape)

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

(10820, 24, 22)
(10820, 22)


In [130]:
# Step 7: Split the data into training and testing sets
# Since it's time-series data, we'll use the first 80% for training and the rest for testing
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Now the data is ready for training the LSTM model

# Define a PyTorch Dataset
class WeatherDataset(Dataset):
   def __init__(self, X, y):
      self.X = X
      self.y = y
   def __len__(self):
      return len(self.X)
   def __getitem__(self, idx):
      return self.X[idx], self.y[idx]

In [131]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

# Create DataLoaders
batch_size = 32
train_dataset = WeatherDataset(X_train, y_train)
test_dataset = WeatherDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Hyperparameters for SegRNN
input_size = X.shape[2]  # Number of features
hidden_size = 512  # Based on the SEGRNN paper
output_size = X.shape[2]  # Predict all features
segment_length = 8  # Based on the SEGRNN paper
learning_rate = 0.001

# Initialize SegRNNModel
model = SegRNNModel(
   input_size=input_size,
   hidden_size=hidden_size,
   output_size=output_size,
   segment_length=segment_length,
   learning_rate=learning_rate
)

# Logger
logger = TensorBoardLogger("logs", name="segrnn_experiment")

# Checkpoint callback
checkpoint_callback = ModelCheckpoint(
   dirpath="checkpoints/",
   filename="segrnn-{epoch:02d}-{val_loss:.4f}",
   save_top_k=1,
   monitor="val_loss",
   mode="min"
)

# Trainer with logging and checkpointing
trainer = Trainer(
   max_epochs=3,
   accelerator="gpu" if torch.cuda.is_available() else "cpu",
   devices=1,
   logger=logger,
   callbacks=[checkpoint_callback]
)

# Train the model
trainer.fit(model, train_loader)

# Optional: Evaluate on the test set
trainer.test(model, test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\neela\anaconda3\envs\torch_projects\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | model     | SegRNN  | 2.4 M  | train
1 | criterion | MSELoss | 0      | train
----------------------------------------------
2.4 M     Trainable params
0         Non-trainab

Training: |          | 0/? [00:00<?, ?it/s]

Train Loss: 0.493105947971344
Train Loss: 0.4144764542579651
Train Loss: 0.34679508209228516
Train Loss: 0.25418820977211
Train Loss: 0.2302674949169159
Train Loss: 0.17468594014644623
Train Loss: 0.17893333733081818
Train Loss: 0.17039082944393158
Train Loss: 0.17804650962352753
Train Loss: 0.15777702629566193
Train Loss: 0.1397632658481598
Train Loss: 0.16257230937480927
Train Loss: 0.16951799392700195
Train Loss: 0.14495372772216797
Train Loss: 0.14141404628753662
Train Loss: 0.12969093024730682
Train Loss: 0.14675498008728027
Train Loss: 0.12434204667806625
Train Loss: 0.12029087543487549
Train Loss: 0.12323443591594696
Train Loss: 0.11878649890422821
Train Loss: 0.11114782840013504
Train Loss: 0.09644042700529099
Train Loss: 0.10359539091587067
Train Loss: 0.2443937212228775
Train Loss: 0.10298041999340057
Train Loss: 0.09228543937206268
Train Loss: 0.10065064579248428
Train Loss: 0.11743097007274628
Train Loss: 0.073731429874897
Train Loss: 0.12144314497709274
Train Loss: 0.08308

c:\Users\neela\anaconda3\envs\torch_projects\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:384: `ModelCheckpoint(monitor='val_loss')` could not find the monitored key in the returned metrics: ['train_loss', 'epoch', 'step']. HINT: Did you call `log('val_loss', value)` in the `LightningModule`?


Train Loss: 0.07047335058450699
Train Loss: 0.05333167314529419
Train Loss: 0.07986544072628021
Train Loss: 0.07371044158935547
Train Loss: 0.03721301257610321
Train Loss: 0.07225953787565231
Train Loss: 0.05510362237691879
Train Loss: 0.03936866670846939
Train Loss: 0.05492657050490379
Train Loss: 0.08432330936193466
Train Loss: 0.03691588342189789
Train Loss: 0.05276528000831604
Train Loss: 0.051505718380212784
Train Loss: 0.042562615126371384
Train Loss: 0.036678314208984375
Train Loss: 0.07124004513025284
Train Loss: 0.04386189952492714
Train Loss: 0.04809940978884697
Train Loss: 0.0901201069355011
Train Loss: 0.058993931859731674
Train Loss: 0.10471037775278091
Train Loss: 0.07012633234262466
Train Loss: 0.0696418359875679
Train Loss: 0.16390573978424072
Train Loss: 0.07807870209217072
Train Loss: 0.023498108610510826
Train Loss: 0.05143878981471062
Train Loss: 0.06038970872759819
Train Loss: 0.03757967799901962
Train Loss: 0.06288834661245346
Train Loss: 0.08170296251773834
Train

`Trainer.fit` stopped: `max_epochs=3` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Train Loss: 0.057691533118486404
Train Loss: 0.04575655981898308
Train Loss: 0.0483672209084034
Train Loss: 0.10465351492166519
Train Loss: 0.05177747830748558
Train Loss: 0.03623504564166069
Train Loss: 0.12023012340068817
Train Loss: 0.05959544703364372


c:\Users\neela\anaconda3\envs\torch_projects\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.053827907890081406}]

In [132]:
%load_ext tensorboard

In [133]:
%tensorboard --logdir logs --port=6006

Reusing TensorBoard on port 6006 (pid 23700), started 5:38:35 ago. (Use '!kill 23700' to kill it.)