# AI Model for Predicting Sea Surface Currents and Wind Conditions

This notebook outlines the steps to develop a CNN-LSTM hybrid model for predicting future sea surface currents and wind conditions. We'll go through data preprocessing, model definition, training, and evaluation.


## 1. Data Preprocessing

### Import Necessary Libraries

In [1]:
import xarray as xr
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Conv1D, MaxPooling1D, Flatten, LSTM, Dense
from tqdm.keras import TqdmCallback




### Load the Datasets

In [2]:
ssc_ds = xr.open_dataset('Data/Processed_SSC_Data.nc', engine='h5netcdf')
wind_ds = xr.open_dataset('Data/Processed_Wind_Data.nc', engine='h5netcdf')

### Normalize the Data

In [3]:
variables_to_normalize = ['u', 'v', 'u10', 'v10'] 

# Initialize the scaler
scaler = StandardScaler()

# Normalize variables in the SSC dataset
ssc_variables_to_normalize = ['u', 'v']  
for var in ssc_variables_to_normalize:
    data = ssc_ds[var].values.flatten()
    scaled_data = scaler.fit_transform(data.reshape(-1, 1)).reshape(ssc_ds[var].shape)
    ssc_ds[var].values = scaled_data

# Normalize variables in the Wind dataset
wind_variables_to_normalize = ['u10', 'v10']  
for var in wind_variables_to_normalize:
    data = wind_ds[var].values.flatten()
    scaled_data = scaler.fit_transform(data.reshape(-1, 1)).reshape(wind_ds[var].shape)
    wind_ds[var].values = scaled_data

### Interpolating Wind Data

In [4]:
# Interpolate wind data to match the SSC dataset's spatial grid
latitude_new = ssc_ds['lat']  # Latitudes from the SSC dataset
longitude_new = ssc_ds['lon']  # Longitudes from the SSC dataset

wind_ds_interpolated = wind_ds.interp(latitude=latitude_new, longitude=longitude_new, method='linear')

### Combine SSC and Wind Data for Sequences

In [5]:
# Correct approach to flatten and combine after ensuring matching spatial dimensions
ssc_flat = np.stack([ssc_ds['u'].values.flatten(), ssc_ds['v'].values.flatten()], axis=-1)
wind_flat = np.stack([wind_ds_interpolated['u10'].values.flatten(), wind_ds_interpolated['v10'].values.flatten()], axis=-1)

# Ensure this by verifying their shapes
print(ssc_flat.shape, wind_flat.shape)

# If the shapes match, proceed to concatenate
combined_data = np.concatenate([ssc_flat, wind_flat], axis=1)

(1150050, 2) (1150050, 2)


### Sequence Creation for Model Input
Here we convert the dataset into sequences that the model can use for training. Each sequence contains data from a series of consecutive time steps.

In [6]:
def create_sequences(input_data, sequence_length=30):
    sequences = []
    output_data = []
    for i in range(len(input_data) - sequence_length):
        sequences.append(input_data[i:i+sequence_length])
        output_data.append(input_data[i+sequence_length])
    return np.array(sequences), np.array(output_data)

sequence_length = 30  # Number of time steps for each input sequence

# Assuming combined_data is shaped correctly and represents your entire dataset's flattened spatial points across all time steps
X, y = create_sequences(combined_data, sequence_length)

### Split Data into Training and Validation Sets

In [8]:
# Define width and channels
width = 2  # Number of spatial dimensions (latitude and longitude)
channels = 4  # Number of variables (u, v, u10, v10)

# Split the sequences into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Original X_train shape: {X_train.shape}")
print(f"Original X_val shape: {X_val.shape}")
total_elements_train = np.prod(X_train.shape)
total_elements_val = np.prod(X_val.shape)
print(f"Total elements in X_train: {total_elements_train}")
print(f"Total elements in X_val: {total_elements_val}")

expected_elements = sequence_length * width * channels
print(f"Expected number of elements after reshape: {expected_elements}")

Original X_train shape: (920016, 30, 4)
Original X_val shape: (230004, 30, 4)
Total elements in X_train: 110401920
Total elements in X_val: 27600480
Expected number of elements after reshape: 240


### Model Architecture Definition

In [9]:
def build_model(sequence_length, width, channels):
    model = Sequential([
        TimeDistributed(Conv1D(64, 3, activation='relu'), input_shape=(sequence_length, width, channels)),
        TimeDistributed(MaxPooling1D(2)),
        TimeDistributed(Flatten()),
        LSTM(50, activation='relu'),
        Dense(4, activation='linear')  # Output features: u, v, u10, v10
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

### Model Training

In [12]:
# Define model parameters
width = 22 * 25  # Total spatial points when flattened
channels = 4  # Features per spatial point

X_train_reshaped = X_train.reshape((-1, sequence_length, width, channels))
X_val_reshaped = X_val.reshape((-1, sequence_length, width, channels))

# Build the model
model = build_model(sequence_length, width, channels)

history = model.fit(
    X_train_reshaped, y_train, 
    epochs=20, 
    validation_data=(X_val_reshaped, y_val),
    verbose=0,  # Turn off the default verbose output
    callbacks=[TqdmCallback(verbose=2)]  # Adjust verbosity for TqdmCallback
)

ValueError: cannot reshape array of size 27600480 into shape (30,920016,4)

### Model Evaluation

In [None]:
val_loss = model.evaluate(X_val_reshaped, y_val)
print(f'Validation loss: {val_loss}')