In [20]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
warnings.filterwarnings("ignore")

In [21]:
df=pd.read_csv("weather_cleaned.csv")

In [22]:
import pandas as pd
import numpy as np
import torch

# Convert boolean columns to integers (0 and 1)
df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)

# Encode the city names as integers
df['city_idx'] = df['location_city'].astype('category').cat.codes  # Numeric index for each city
features = ['humidity_2m', 'dew_point_2m', 'precip', 'snowfall', 'snow_depth', 
            'msl_pressure', 'surface_pressure', 'cloud_cover_low', 'cloud_cover_mid',
            'cloud_cover_high', 'et0', 'vpd', 'wind_speed_10m', 'wind_dir_10m', 
            'soil_temp_0_7cm', 'soil_moisture_0_7cm', 'latitude', 'longitude'] + \
           [col for col in df.columns if col.startswith('weather_')]

target = 'temp_2m'
T = 3  # Number of timesteps
input_dim = len(features)

# Lists to hold training and testing data across all cities
X_train_list, y_train_list, city_train_list = [], [], []
X_test_list, y_test_list, city_test_list = [], [], []

# To compute normalization statistics
all_train_data = []

# Process each city independently
for city_id, city_data in df.groupby('city_idx'):
    city_data = city_data.sort_values(by='time').reset_index(drop=True)
    input_data = city_data[features].values
    targets = city_data[target].values
    
    # Determine train size for the current city
    N = len(input_data) - T  # Total number of sequences
    train_size = int(0.90 * N)
    
    # Split data for normalization calculation (only on training set)
    train_data = input_data[:train_size]
    all_train_data.append(train_data)  # Collect all training data for global normalization
    
    # Convert to torch tensors
    input_data_tensor = torch.tensor(input_data, dtype=torch.float32)
    target_tensor = torch.tensor(targets, dtype=torch.float32)
    
    # Initialize train and test tensors for this city
    X_train_city = torch.zeros((train_size, T, input_dim), dtype=torch.float32)
    y_train_city = torch.zeros((train_size, 1), dtype=torch.float32)
    X_test_city = torch.zeros((N - train_size, T, input_dim), dtype=torch.float32)
    y_test_city = torch.zeros((N - train_size, 1), dtype=torch.float32)
    
    # Prepare training sequences
    for t in range(train_size):
        X_train_city[t] = input_data_tensor[t:t + T]
        y_train_city[t] = target_tensor[t + T]
    
    # Prepare testing sequences
    for i in range(N - train_size):
        t = i + train_size
        X_test_city[i] = input_data_tensor[t:t + T]
        y_test_city[i] = target_tensor[t + T]
    
    # Prepare city indices for train and test
    city_train_city = torch.full((train_size,), city_id, dtype=torch.long)
    city_test_city = torch.full((N - train_size,), city_id, dtype=torch.long)
    
    # Append city-specific data to the main lists
    X_train_list.append(X_train_city)
    y_train_list.append(y_train_city)
    city_train_list.append(city_train_city)
    
    X_test_list.append(X_test_city)
    y_test_list.append(y_test_city)
    city_test_list.append(city_test_city)

# Concatenate all training data for normalization calculation
all_train_data = np.vstack(all_train_data)
train_mean = all_train_data.mean(axis=0)
train_std = all_train_data.std(axis=0)

# Concatenate all citiesâ€™ data to form the final tensors
X_train = torch.cat(X_train_list, dim=0)
y_train = torch.cat(y_train_list, dim=0)
city_train = torch.cat(city_train_list, dim=0)

X_test = torch.cat(X_test_list, dim=0)
y_test = torch.cat(y_test_list, dim=0)
city_test = torch.cat(city_test_list, dim=0)

# Normalize X_train and X_test
train_mean_tensor = torch.tensor(train_mean, dtype=torch.float32)
train_std_tensor = torch.tensor(train_std, dtype=torch.float32)

# Normalize along the feature dimension (last dimension)
X_train = (X_train - train_mean_tensor) / train_std_tensor
X_test = (X_test - train_mean_tensor) / train_std_tensor

# Print shapes to confirm the results
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("city_train shape:", city_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("city_test shape:", city_test.shape)


X_train shape: torch.Size([977490, 3, 31])
y_train shape: torch.Size([977490, 1])
city_train shape: torch.Size([977490])
X_test shape: torch.Size([108615, 3, 31])
y_test shape: torch.Size([108615, 1])
city_test shape: torch.Size([108615])


In [11]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


class CNNModelWithEmbedding(nn.Module):
    def __init__(self, input_dim, num_channels1, num_channels2, embedding_dim, num_cities, dropout_rate):
        super(CNNModelWithEmbedding, self).__init__()
        
        # Embedding for city index
        self.city_embedding = nn.Embedding(num_embeddings=num_cities, embedding_dim=embedding_dim)
        
        # CNN layers
        self.conv1 = nn.Conv1d(in_channels=input_dim + embedding_dim, out_channels=num_channels1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=num_channels1, out_channels=num_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        
        # Fully connected layers
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(num_channels2 * (T // 2), 64)  # Adjust based on sequence length (T)
        self.fc2 = nn.Linear(64, 16)
        self.output = nn.Linear(16, 1)
        
    def forward(self, x, city_idx):
        # Add city embeddings
        city_emb = self.city_embedding(city_idx).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, city_emb], dim=2)
        
        # Permute for CNN (channels-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through CNN layers
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        
        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)
        
        # Pass through fully connected layers
        x = self.dropout(self.fc1(x))
        x = self.relu(x)
        x = self.dropout(self.fc2(x))
        x = self.relu(x)
        x = self.output(x)
        
        return x


# Optuna objective function
def objective(trial):
    # Sample hyperparameters
    embedding_dim = trial.suggest_int("embedding_dim", 4, 16)
    num_channels1 = trial.suggest_int("num_channels1", 32, 128, step=16)
    num_channels2 = trial.suggest_int("num_channels2", 64, 256, step=16)
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.5)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])

    n_splits_per_city = 5
    tscv = TimeSeriesSplit(n_splits=n_splits_per_city)

    # Store city-specific splits
    city_splits = []

    for city in city_train.unique().tolist():  # Get unique city IDs
        city_mask = (city_train == city)
        city_X = X_train[city_mask]
        city_y = y_train[city_mask]
        
        # Subsample 50% of the data for faster tuning
        subsample_size = int(0.1 * len(city_X))
        subsample_indices = torch.arange(len(city_X))[:subsample_size]
        city_X = city_X[subsample_indices]
        city_y = city_y[subsample_indices]

        # Split city data temporally
        for train_idx, val_idx in tscv.split(city_X):
            city_splits.append((
                city_X[train_idx], city_X[val_idx],
                city_y[train_idx], city_y[val_idx],
                torch.full((len(train_idx),), city, dtype=torch.long),
                torch.full((len(val_idx),), city, dtype=torch.long)
            ))

    # Combine city-specific splits into global folds
    n_global_folds = 5
    global_folds = [[] for _ in range(n_global_folds)]

    for i, split in enumerate(city_splits):
        global_folds[i % n_global_folds].append(split)

    # Combine splits within each global fold
    combined_folds = []
    for fold in global_folds:
        X_train_fold = torch.cat([f[0] for f in fold])
        X_val_fold = torch.cat([f[1] for f in fold])
        y_train_fold = torch.cat([f[2] for f in fold])
        y_val_fold = torch.cat([f[3] for f in fold])
        city_train_fold = torch.cat([f[4] for f in fold])
        city_val_fold = torch.cat([f[5] for f in fold])

        combined_folds.append((X_train_fold, X_val_fold, y_train_fold, y_val_fold, city_train_fold, city_val_fold))

    # Perform training and validation
    fold_val_losses = []

    for X_train_fold, X_val_fold, y_train_fold, y_val_fold, city_train_fold, city_val_fold in combined_folds:
        train_dataset = torch.utils.data.TensorDataset(X_train_fold, y_train_fold, city_train_fold)
        val_dataset = torch.utils.data.TensorDataset(X_val_fold, y_val_fold, city_val_fold)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model
        model = CNNModelWithEmbedding(
            input_dim=X_train.shape[2],
            num_channels1=num_channels1,
            num_channels2=num_channels2,
            embedding_dim=embedding_dim,
            num_cities=city_train.max().item() + 1,
            dropout_rate=dropout_rate
        ).to(device)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        loss_fn = nn.MSELoss()

        # Train
        for epoch in range(5):  # Use fewer epochs for tuning
            model.train()
            for X_batch, y_batch, city_batch in train_loader:
                X_batch, y_batch, city_batch = X_batch.to(device), y_batch.to(device), city_batch.to(device)
                optimizer.zero_grad()
                output = model(X_batch, city_batch).squeeze()
                loss = loss_fn(output, y_batch)
                loss.backward()
                optimizer.step()

        # Validate
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch, city_batch in val_loader:
                X_batch, y_batch, city_batch = X_batch.to(device), y_batch.to(device), city_batch.to(device)
                output = model(X_batch, city_batch).squeeze()
                loss = loss_fn(output, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_loader.dataset)
        fold_val_losses.append(val_loss)

    avg_val_loss = np.mean(fold_val_losses)
    return avg_val_loss


# Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, n_jobs=4)

# Best hyperparameters
print("Best Hyperparameters:", study.best_params)
print("Best Validation Loss:", study.best_value)


[I 2024-11-30 22:50:47,179] A new study created in memory with name: no-name-358798cb-e4c3-4345-a8f4-93e7503feb9b


Using device: cuda


[I 2024-11-30 22:52:04,473] Trial 0 finished with value: 508.5619718593085 and parameters: {'embedding_dim': 14, 'num_channels1': 112, 'num_channels2': 64, 'dropout_rate': 0.33380184650556455, 'learning_rate': 0.0004517897758154489, 'batch_size': 256}. Best is trial 0 with value: 508.5619718593085.
[I 2024-11-30 22:52:09,717] Trial 2 finished with value: 524.1092224299067 and parameters: {'embedding_dim': 7, 'num_channels1': 128, 'num_channels2': 160, 'dropout_rate': 0.43173772319519815, 'learning_rate': 0.0004391997894515235, 'batch_size': 128}. Best is trial 0 with value: 508.5619718593085.
[I 2024-11-30 22:52:21,621] Trial 1 finished with value: 479.9785383203177 and parameters: {'embedding_dim': 12, 'num_channels1': 96, 'num_channels2': 64, 'dropout_rate': 0.30258056667255423, 'learning_rate': 0.0009312602462542639, 'batch_size': 128}. Best is trial 1 with value: 479.9785383203177.
[I 2024-11-30 22:52:59,435] Trial 3 finished with value: 463.97075144037717 and parameters: {'embeddi

Best Hyperparameters: {'embedding_dim': 9, 'num_channels1': 48, 'num_channels2': 80, 'dropout_rate': 0.260359973377371, 'learning_rate': 0.007897233987265898, 'batch_size': 64}
Best Validation Loss: 408.8404535665214


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


class CNNModelWithEmbedding(nn.Module):
    def __init__(self, input_dim, num_channels1, num_channels2, embedding_dim, num_cities, dropout_rate):
        super(CNNModelWithEmbedding, self).__init__()
        
        # Embedding for city index
        self.city_embedding = nn.Embedding(num_embeddings=num_cities, embedding_dim=embedding_dim)
        
        # CNN layers
        self.conv1 = nn.Conv1d(in_channels=input_dim + embedding_dim, out_channels=num_channels1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=num_channels1, out_channels=num_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        
        # Fully connected layers
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(num_channels2 * (T // 2), 64)  # Adjust based on sequence length (T)
        self.fc2 = nn.Linear(64, 16)
        self.output = nn.Linear(16, 1)
        
    def forward(self, x, city_idx):
        # Add city embeddings
        city_emb = self.city_embedding(city_idx).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, city_emb], dim=2)
        
        # Permute for CNN (channels-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through CNN layers
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        
        # Flatten for fully connected layers
        x = x.view(x.size(0), -1)
        
        # Pass through fully connected layers
        x = self.dropout(self.fc1(x))
        x = self.relu(x)
        x = self.dropout(self.fc2(x))
        x = self.relu(x)
        x = self.output(x)
        
        return x
    
    
    

# Hyperparameters
input_dim = X_train.shape[2]  # Number of features
num_cities = city_train.max().item() + 1  # Total number of unique cities
epochs = 1000
batch_size = 64
embedding_dim = 9 # Dimension of the embedding layer
num_channels1=48
num_channels2=80
dropout_rate=0.25

# Initialize model, criterion, and optimizer
model =  model = CNNModelWithEmbedding(
            input_dim=X_train.shape[2],
            num_channels1=num_channels1,
            num_channels2=num_channels2,
            embedding_dim=embedding_dim,
            num_cities=city_train.max().item() + 1,
            dropout_rate=dropout_rate
        ).to(device).to(device)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=7e-3, weight_decay=1e-5)

# Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
city_train_tensor = torch.tensor(city_train, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)
city_test_tensor = torch.tensor(city_test, dtype=torch.long).to(device)

# DataLoader for batching
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor, city_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor, city_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


Using device: cuda


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Initialize variables to track the best model
best_val_loss = np.inf  # Set to infinity initially
best_checkpoint_path = "/home/research/a.naveen/denoise40/weather/cnn/best_model.pth"

train_losses=[]
val_losses=[]
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch, city_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch, city_batch)
        loss = loss_fn(output.squeeze(), y_batch.squeeze())
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * X_batch.size(0)

    # Validation step
    model.eval()
    val_loss = 0
    
    if epoch%5==0:
        with torch.no_grad():
            for X_batch, y_batch, city_batch in test_loader:
                output = model(X_batch, city_batch)
                loss = loss_fn(output.squeeze(), y_batch.squeeze())
                val_loss += loss.item() * X_batch.size(0)

        # Calculate average losses
        train_loss /= len(train_loader.dataset)
        val_loss /= len(test_loader.dataset)

        # Append losses for plotting
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Save the model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss
            }, best_checkpoint_path)
            print(f"Best model saved at epoch {epoch+1} with validation loss: {val_loss:.4f}")


Epoch 1/1000, Train Loss: 86.2398, Validation Loss: 7.9235
Best model saved at epoch 1 with validation loss: 7.9235
Epoch 6/1000, Train Loss: 23.0107, Validation Loss: 8.6021
Epoch 11/1000, Train Loss: 22.5063, Validation Loss: 6.1876
Best model saved at epoch 11 with validation loss: 6.1876
Epoch 16/1000, Train Loss: 22.1113, Validation Loss: 3.4444
Best model saved at epoch 16 with validation loss: 3.4444
Epoch 21/1000, Train Loss: 21.9548, Validation Loss: 4.5751
Epoch 26/1000, Train Loss: 22.1569, Validation Loss: 4.3184
Epoch 31/1000, Train Loss: 21.8042, Validation Loss: 5.8662
Epoch 36/1000, Train Loss: 21.2109, Validation Loss: 4.4837
Epoch 41/1000, Train Loss: 19.3400, Validation Loss: 3.9116
Epoch 46/1000, Train Loss: 19.2736, Validation Loss: 3.4771
Epoch 51/1000, Train Loss: 19.3179, Validation Loss: 3.0226
Best model saved at epoch 51 with validation loss: 3.0226
Epoch 56/1000, Train Loss: 19.2099, Validation Loss: 4.8454
Epoch 61/1000, Train Loss: 19.3394, Validation Loss

# LSTM_CNN

In [27]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


class HybridCNNLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, lstm_hidden_dim, num_channels1, num_channels2, num_cities, dropout_rate):
        super(HybridCNNLSTM, self).__init__()
        
        # Embedding for city index
        self.city_embedding = nn.Embedding(num_embeddings=num_cities, embedding_dim=embedding_dim)
        
        # CNN layers
        self.conv1 = nn.Conv1d(in_channels=input_dim + embedding_dim, out_channels=num_channels1,  kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=num_channels1, out_channels=num_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=num_channels2, hidden_size=lstm_hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        
        # Fully connected layers
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(lstm_hidden_dim * 2, 64)  # Bidirectional LSTM output
        self.fc2 = nn.Linear(64, 16)
        self.output = nn.Linear(16, 1)
        
    def forward(self, x, city_idx):
        # Add city embeddings
        city_emb = self.city_embedding(city_idx).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, city_emb], dim=2)
        
        # Permute for CNN (channels-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through CNN layers
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        
        # Permute back for LSTM (batch-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output from the last timestep
        
        # Pass through fully connected layers
        x = self.dropout(self.fc1(x))
        x = self.relu(x)
        x = self.dropout(self.fc2(x))
        x = self.relu(x)
        x = self.output(x)
        
        return x


def objective(trial):
    # Sample hyperparameters
    embedding_dim = trial.suggest_int("embedding_dim", 4, 16)
    lstm_hidden_dim = trial.suggest_categorical("lstm_hidden_dim", [64, 128, 256])
    num_channels1 = trial.suggest_int("num_channels1", 32, 128, step=16)
    num_channels2 = trial.suggest_int("num_channels2", 64, 256, step=16)
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.5)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])

    n_splits_per_city = 5
    tscv = TimeSeriesSplit(n_splits=n_splits_per_city)

    # Store city-specific splits
    city_splits = []

    for city in city_train.unique().tolist():  # Get unique city IDs
        city_mask = (city_train == city)
        city_X = X_train[city_mask]
        city_y = y_train[city_mask]
        
        # Subsample 50% of the data for faster tuning
        subsample_size = int(0.1 * len(city_X))
        subsample_indices = torch.arange(len(city_X))[:subsample_size]
        city_X = city_X[subsample_indices]
        city_y = city_y[subsample_indices]

        # Split city data temporally
        for train_idx, val_idx in tscv.split(city_X):
            city_splits.append((
                city_X[train_idx], city_X[val_idx],
                city_y[train_idx], city_y[val_idx],
                torch.full((len(train_idx),), city, dtype=torch.long),
                torch.full((len(val_idx),), city, dtype=torch.long)
            ))

    # Combine city-specific splits into global folds
    n_global_folds = 5
    global_folds = [[] for _ in range(n_global_folds)]

    for i, split in enumerate(city_splits):
        global_folds[i % n_global_folds].append(split)

    # Combine splits within each global fold
    combined_folds = []
    for fold in global_folds:
        X_train_fold = torch.cat([f[0] for f in fold])
        X_val_fold = torch.cat([f[1] for f in fold])
        y_train_fold = torch.cat([f[2] for f in fold])
        y_val_fold = torch.cat([f[3] for f in fold])
        city_train_fold = torch.cat([f[4] for f in fold])
        city_val_fold = torch.cat([f[5] for f in fold])

        combined_folds.append((X_train_fold, X_val_fold, y_train_fold, y_val_fold, city_train_fold, city_val_fold))

    # Perform training and validation
    fold_val_losses = []

    for X_train_fold, X_val_fold, y_train_fold, y_val_fold, city_train_fold, city_val_fold in combined_folds:
        train_dataset = torch.utils.data.TensorDataset(X_train_fold, y_train_fold, city_train_fold)
        val_dataset = torch.utils.data.TensorDataset(X_val_fold, y_val_fold, city_val_fold)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model
        model = HybridCNNLSTM(
            input_dim=X_train.shape[2],
            embedding_dim=embedding_dim,
            lstm_hidden_dim=lstm_hidden_dim,
            num_channels1=num_channels1,
            num_channels2=num_channels2,
            num_cities=city_train.max().item() + 1,
            dropout_rate=dropout_rate
        ).to(device)

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        loss_fn = nn.MSELoss()

        # Train
        for epoch in range(5):  # Use fewer epochs for tuning
            model.train()
            for X_batch, y_batch, city_batch in train_loader:
                X_batch, y_batch, city_batch = X_batch.to(device), y_batch.to(device), city_batch.to(device)
                optimizer.zero_grad()
                output = model(X_batch, city_batch).squeeze()
                loss = loss_fn(output, y_batch)
                loss.backward()
                optimizer.step()

        # Validate
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch, city_batch in val_loader:
                X_batch, y_batch, city_batch = X_batch.to(device), y_batch.to(device), city_batch.to(device)
                output = model(X_batch, city_batch).squeeze()
                loss = loss_fn(output, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_loader.dataset)
        fold_val_losses.append(val_loss)

    avg_val_loss = np.mean(fold_val_losses)
    return avg_val_loss


# Optuna study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, n_jobs=4)

# Best hyperparameters
print("Best Hyperparameters:", study.best_params)
print("Best Validation Loss:", study.best_value)


[I 2024-12-01 14:23:55,474] A new study created in memory with name: no-name-e31246dd-7e35-4ff1-a481-26d3faaddeed


Using device: cuda


[I 2024-12-01 14:26:27,714] Trial 2 finished with value: 1584.5580678185095 and parameters: {'embedding_dim': 15, 'lstm_hidden_dim': 64, 'num_channels1': 128, 'num_channels2': 240, 'dropout_rate': 0.3737043635986238, 'learning_rate': 0.0002572020852429956, 'batch_size': 256}. Best is trial 2 with value: 1584.5580678185095.
[I 2024-12-01 14:26:34,919] Trial 1 finished with value: 340.3819314415565 and parameters: {'embedding_dim': 11, 'lstm_hidden_dim': 128, 'num_channels1': 128, 'num_channels2': 112, 'dropout_rate': 0.2707011995697526, 'learning_rate': 0.0087541747148776, 'batch_size': 64}. Best is trial 1 with value: 340.3819314415565.
[I 2024-12-01 14:26:42,699] Trial 3 finished with value: 328.6687864896334 and parameters: {'embedding_dim': 5, 'lstm_hidden_dim': 256, 'num_channels1': 128, 'num_channels2': 256, 'dropout_rate': 0.2577330083500772, 'learning_rate': 0.0028869231158946958, 'batch_size': 128}. Best is trial 3 with value: 328.6687864896334.
[I 2024-12-01 14:26:59,498] Tria

Best Hyperparameters: {'embedding_dim': 5, 'lstm_hidden_dim': 64, 'num_channels1': 80, 'num_channels2': 64, 'dropout_rate': 0.20078875098427415, 'learning_rate': 0.002703238330335273, 'batch_size': 128}
Best Validation Loss: 299.3201611961952


In [42]:
import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


class HybridCNNLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, lstm_hidden_dim, num_channels1, num_channels2, num_cities, dropout_rate):
        super(HybridCNNLSTM, self).__init__()
        
        # Embedding for city index
        self.city_embedding = nn.Embedding(num_embeddings=num_cities, embedding_dim=embedding_dim)
        
        # CNN layers
        self.conv1 = nn.Conv1d(in_channels=input_dim + embedding_dim, out_channels=num_channels1,  kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=num_channels1, out_channels=num_channels2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=num_channels2, hidden_size=lstm_hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        
        # Fully connected layers
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(lstm_hidden_dim * 2, 64)  # Bidirectional LSTM output
        self.fc2 = nn.Linear(64, 16)
        self.output = nn.Linear(16, 1)
        
    def forward(self, x, city_idx):
        # Add city embeddings
        city_emb = self.city_embedding(city_idx).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, city_emb], dim=2)
        
        # Permute for CNN (channels-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through CNN layers
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        
        # Permute back for LSTM (batch-first format)
        x = x.permute(0, 2, 1)
        
        # Pass through LSTM
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output from the last timestep
        
        # Pass through fully connected layers
        x = self.dropout(self.fc1(x))
        x = self.relu(x)
        x = self.dropout(self.fc2(x))
        x = self.relu(x)
        x = self.output(x)
        
        return x
    

# Hyperparameters
input_dim = X_train.shape[2]  # Number of features
num_cities = city_train.max().item() + 1  # Total number of unique cities
epochs = 1000
batch_size = 128
embedding_dim = 5 # Dimension of the embedding layer
num_channels1=48
num_channels2=80
dropout_rate=0.2
lstm_hidden_dim=128
lr=2e-3

# Initialize model, criterion, and optimizer
model = HybridCNNLSTM(
            input_dim=X_train.shape[2],
            embedding_dim=embedding_dim,
            lstm_hidden_dim=lstm_hidden_dim,
            num_channels1=num_channels1,
            num_channels2=num_channels2,
            num_cities=city_train.max().item() + 1,
            dropout_rate=dropout_rate
        ).to(device)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

# Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
city_train_tensor = torch.tensor(city_train, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)
city_test_tensor = torch.tensor(city_test, dtype=torch.long).to(device)

# DataLoader for batching
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor, city_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor, city_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


Using device: cuda


In [43]:
import matplotlib.pyplot as plt
import numpy as np

# Initialize variables to track the best model
best_val_loss = np.inf  # Set to infinity initially
best_checkpoint_path = "/home/research/a.naveen/denoise40/weather/cnn/best_lstmcnn_model.pth"

train_losses=[]
val_losses=[]
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch, city_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch, city_batch)
        loss = loss_fn(output.squeeze(), y_batch.squeeze())
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * X_batch.size(0)

    # Validation step
    model.eval()
    val_loss = 0
    
    if epoch%5==0:
        with torch.no_grad():
            for X_batch, y_batch, city_batch in test_loader:
                output = model(X_batch, city_batch)
                loss = loss_fn(output.squeeze(), y_batch.squeeze())
                val_loss += loss.item() * X_batch.size(0)

        # Calculate average losses
        train_loss /= len(train_loader.dataset)
        val_loss /= len(test_loader.dataset)

        # Append losses for plotting
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Save the model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss
            }, best_checkpoint_path)
            print(f"Best model saved at epoch {epoch+1} with validation loss: {val_loss:.4f}")


Epoch 1/1000, Train Loss: 142.7550, Validation Loss: 7.4418
Best model saved at epoch 1 with validation loss: 7.4418
Epoch 6/1000, Train Loss: 17.2835, Validation Loss: 2.8043
Best model saved at epoch 6 with validation loss: 2.8043
Epoch 11/1000, Train Loss: 14.9947, Validation Loss: 2.4815
Best model saved at epoch 11 with validation loss: 2.4815
Epoch 16/1000, Train Loss: 14.9034, Validation Loss: 2.2871
Best model saved at epoch 16 with validation loss: 2.2871
Epoch 21/1000, Train Loss: 14.7778, Validation Loss: 2.5742
Epoch 26/1000, Train Loss: 14.6351, Validation Loss: 2.5518
Epoch 31/1000, Train Loss: 14.7007, Validation Loss: 2.3480
Epoch 36/1000, Train Loss: 14.7339, Validation Loss: 2.1894
Best model saved at epoch 36 with validation loss: 2.1894
Epoch 41/1000, Train Loss: 14.7130, Validation Loss: 2.4523
Epoch 46/1000, Train Loss: 14.5241, Validation Loss: 2.1677
Best model saved at epoch 46 with validation loss: 2.1677
Epoch 51/1000, Train Loss: 14.5378, Validation Loss: 2.

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# LSTM Model with Attention and Embedding
class LSTMModelWithAttention(nn.Module):
    def __init__(self, input_dim, num_cities, embedding_dim, hidden_size, dropout):
        super().__init__()
        self.city_embedding = nn.Embedding(num_embeddings=num_cities, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=input_dim + embedding_dim,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)
        
        # Attention Layer
        self.attention = nn.Linear(hidden_size * 2, 1)  # Bidirectional LSTM output size is `hidden_size * 2`
        
        # Fully connected layers
        self.linear1 = nn.Linear(hidden_size * 2, 64)
        self.linear2 = nn.Linear(64, 8)
        self.output_linear = nn.Linear(8, 1)

    def forward(self, x, city_idx):
        # City embedding
        city_emb = self.city_embedding(city_idx).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, city_emb], dim=2)
        
        # LSTM
        lstm_output, _ = self.lstm(x)  # Shape: (batch_size, seq_len, hidden_size * 2)
        
        # Attention Mechanism
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)  # Shape: (batch_size, seq_len, 1)
        context_vector = torch.sum(attention_weights * lstm_output, dim=1)  # Weighted sum over timesteps
        
        # Fully connected layers
        x = self.dropout(context_vector)
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.output_linear(x)
        return x

# Hyperparameters
input_dim = X_train.shape[2]  # Number of features
num_cities = city_train.max().item() + 1  # Total number of unique cities
epochs = 1000
hidden_size = 128
dropout = 0.39
batch_size = 64
embedding_dim = 7  # Dimension of the embedding layer

# Initialize model, criterion, and optimizer
model = LSTMModelWithAttention(input_dim=input_dim, num_cities=num_cities, embedding_dim=embedding_dim, hidden_size=hidden_size, dropout=dropout).to(device)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# Convert data to PyTorch tensors and move to device
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
city_train_tensor = torch.tensor(city_train, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)
city_test_tensor = torch.tensor(city_test, dtype=torch.long).to(device)

# DataLoader for batching
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor, city_train_tensor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor, city_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


Using device: cuda


In [41]:
import matplotlib.pyplot as plt
import numpy as np

# Initialize variables to track the best model
best_val_loss = np.inf  # Set to infinity initially
best_checkpoint_path = "/home/research/a.naveen/denoise40/weather/lstm/best_attention_model.pth"

train_losses=[]
val_losses=[]
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch, city_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch, city_batch)
        loss = loss_fn(output.squeeze(), y_batch.squeeze())
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * X_batch.size(0)

    # Validation step
    model.eval()
    val_loss = 0
    
    if epoch%5==0:
        with torch.no_grad():
            for X_batch, y_batch, city_batch in test_loader:
                output = model(X_batch, city_batch)
                loss = loss_fn(output.squeeze(), y_batch.squeeze())
                val_loss += loss.item() * X_batch.size(0)

        # Calculate average losses
        train_loss /= len(train_loader.dataset)
        val_loss /= len(test_loader.dataset)

        # Append losses for plotting
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Save the model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss
            }, best_checkpoint_path)
            print(f"Best model saved at epoch {epoch+1} with validation loss: {val_loss:.4f}")


Epoch 1/1000, Train Loss: 52.4353, Validation Loss: 2.3194
Best model saved at epoch 1 with validation loss: 2.3194
Epoch 6/1000, Train Loss: 2.1198, Validation Loss: 1.8095
Best model saved at epoch 6 with validation loss: 1.8095
Epoch 11/1000, Train Loss: 1.9548, Validation Loss: 1.7551
Best model saved at epoch 11 with validation loss: 1.7551
Epoch 16/1000, Train Loss: 1.8474, Validation Loss: 1.5817
Best model saved at epoch 16 with validation loss: 1.5817
Epoch 21/1000, Train Loss: 1.7777, Validation Loss: 1.5027
Best model saved at epoch 21 with validation loss: 1.5027
Epoch 26/1000, Train Loss: 1.7433, Validation Loss: 1.5193
Epoch 31/1000, Train Loss: 1.7130, Validation Loss: 1.4482
Best model saved at epoch 31 with validation loss: 1.4482
Epoch 36/1000, Train Loss: 1.6881, Validation Loss: 1.4296
Best model saved at epoch 36 with validation loss: 1.4296
Epoch 41/1000, Train Loss: 1.6746, Validation Loss: 1.4128
Best model saved at epoch 41 with validation loss: 1.4128
Epoch 46

KeyboardInterrupt: 

In [26]:
!df -h


Filesystem                                           Size  Used Avail Use% Mounted on
devtmpfs                                             378G     0  378G   0% /dev
tmpfs                                                378G  5.8M  378G   1% /dev/shm
tmpfs                                                378G  4.0G  374G   2% /run
tmpfs                                                378G     0  378G   0% /sys/fs/cgroup
/dev/sda2                                             64G   44G   21G  69% /
/dev/nvme0n1p1                                       3.5T   25G  3.5T   1% /scratch
/dev/sda1                                            2.0G  437M  1.6G  22% /boot
/dev/sda5                                            373G  2.7G  370G   1% /tmp
nfs.seas.wustl.edu:/seaslab/compute                 1000G  606G  395G  61% /project/compute
nfs.seas.wustl.edu:/seaslab/home-compute              20T   12T  7.6T  62% /home/research
cigserver3.engr.wustl.edu:/export1/project           3.7T  2.8T  

In [25]:
import torch

# Check PyTorch version
print("PyTorch version:", torch.__version__)

# Check CUDA availability and version
if torch.cuda.is_available():
    print("CUDA is available")
    print("CUDA version:", torch.version.cuda)
else:
    print("CUDA is not available")


PyTorch version: 2.1.0
CUDA is available
CUDA version: 11.8
The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.
