### Data Preparation

In [2]:
import os
import pandas as pd
from PIL import Image,UnidentifiedImageError
import numpy as np

# Path to the folder containing subfolders for each day
image_folder = './dataset/006_Banqiao'

# Load numerical data
numerical_data = pd.read_csv('./dataset/Banqiao_2022_old.csv')
#drop the Station column
numerical_data = numerical_data.drop(columns=['Station'])
# Ensure the 'date' column in numerical_data is in datetime format
print(numerical_data.head())
numerical_data['date'] = pd.to_datetime(numerical_data['date'],format="%d-%m-%Y %H:%M",dayfirst=True)

# Function to load images for a specific date
def load_images_for_date(date):
    date_str = date.strftime('%Y%m%d')
    images = []
    for hour in range(24):
        img_name=f"006-{date_str}{hour:02d}00.jpg"
        img_path = os.path.join(image_folder, date_str, img_name)
        try: 
            if os.path.exists(img_path):
                with Image.open(img_path) as img:
                    images.append(img.copy())
            else:
                images.append(None)
        except(OSError,UnidentifiedImageError):
            images.append(None)
    return images

# Create a dictionary to store images by date
image_data = {date: load_images_for_date(date) for date in numerical_data['date'].dt.date.unique()}


               date measurement     0     1     2     3     4     5     6  \
0  01-02-2022 00:00    AMB_TEMP    16    16    16  16.4  16.4  16.7  16.8   
1  01-02-2022 00:00         CH4     2  2.02  2.05  2.04  2.07  2.09  2.08   
2  01-02-2022 00:00          CO  0.24  0.25  0.23   0.2   0.2  0.21  0.21   
3  01-02-2022 00:00        NMHC  0.03  0.04  0.02  0.02  0.03  0.03  0.02   
4  01-02-2022 00:00          NO   0.2   0.6   0.6   0.6   0.5   0.6   0.5   

      7  ...    14    15    16    17    18    19    20    21    22    23  
0  16.9  ...    17  16.6  16.3  16.3  16.3  16.1  16.2  16.5  16.5  16.4  
1  2.11  ...  2.14  2.09  2.04  2.04  2.08  2.09  2.08  2.04  2.01  1.99  
2  0.28  ...  0.51  0.41   0.3  0.34  0.41  0.34  0.35  0.28  0.24  0.22  
3  0.05  ...  0.14   0.1  0.05  0.07  0.12  0.07  0.09  0.04  0.02     0  
4     1  ...   2.7   1.8   1.1   0.9   4.6   0.8   0.8   0.9   0.5   0.5  

[5 rows x 26 columns]


In [3]:
print(numerical_data)

           date measurement     0     1     2     3     4     5     6     7  \
0    2022-02-01    AMB_TEMP    16    16    16  16.4  16.4  16.7  16.8  16.9   
1    2022-02-01         CH4     2  2.02  2.05  2.04  2.07  2.09  2.08  2.11   
2    2022-02-01          CO  0.24  0.25  0.23   0.2   0.2  0.21  0.21  0.28   
3    2022-02-01        NMHC  0.03  0.04  0.02  0.02  0.03  0.03  0.02  0.05   
4    2022-02-01          NO   0.2   0.6   0.6   0.6   0.5   0.6   0.5     1   
...         ...         ...   ...   ...   ...   ...   ...   ...   ...   ...   
1057 2022-03-31         THC  2.11  2.03  1.98  1.99  1.98  2.07     2  2.04   
1058 2022-03-31       WD_HR    58    86    78    73    91    70   112    64   
1059 2022-03-31  WIND_DIREC    62    98    73    97    77    31   103    60   
1060 2022-03-31  WIND_SPEED   1.3   2.1   2.7     2   1.7   0.6   2.2   1.3   
1061 2022-03-31       WS_HR   1.1   1.6   1.9   1.6   1.3   0.8   1.7   1.5   

      ...    14    15    16    17    18    19    20

### Data Preprocessing

In [4]:
import torchvision.transforms as transforms
import torch
# Preprocess images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224 directly
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Extract features from images
def extract_image_features(images):
    features = []
    for img in images:
        if img is not None:
            img = transform(img)  # Resize and preprocess image
            features.append(img)
        else:
            # Handle missing images by appending zeros or any placeholder value
            features.append(torch.zeros(3, 224, 224))  # Assuming RGB images
    return features

# Extract image features for each date
image_features = {date: extract_image_features(images) for date, images in image_data.items()}


  from .autonotebook import tqdm as notebook_tqdm


### Combining features

In [5]:
print(image_features)

{datetime.date(2022, 2, 1): [tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]), tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
      

In [6]:
print(numerical_data.head(100))
# numerical_data['date']=numerical_data['date'].dt.month

         date measurement     0     1     2     3     4     5     6     7  \
0  2022-02-01    AMB_TEMP    16    16    16  16.4  16.4  16.7  16.8  16.9   
1  2022-02-01         CH4     2  2.02  2.05  2.04  2.07  2.09  2.08  2.11   
2  2022-02-01          CO  0.24  0.25  0.23   0.2   0.2  0.21  0.21  0.28   
3  2022-02-01        NMHC  0.03  0.04  0.02  0.02  0.03  0.03  0.02  0.05   
4  2022-02-01          NO   0.2   0.6   0.6   0.6   0.5   0.6   0.5     1   
..        ...         ...   ...   ...   ...   ...   ...   ...   ...   ...   
95 2022-02-06         NO2   7.5   4.7   4.2     5   4.8   4.8     7    10   
96 2022-02-06         NOx   8.4   5.1   4.7   5.6   5.4   5.2   7.5  10.7   
97 2022-02-06          O3  46.5    51  50.9  48.5  47.2  45.2  40.9  36.9   
98 2022-02-06        PM10    25    28    26    29    23    16    17    11   
99 2022-02-06       PM2.5    22    20    21    21    20    13     9     9   

    ...    14    15    16    17    18    19    20    21    22    23  
0   .

In [7]:
print(numerical_data.shape)

(1062, 26)


In [8]:
# Reshape numerical data to match image data
#drop the Station column
# print(numerical_data.head())
# numerical_data = numerical_data.drop(columns=['Station'])
numerical_data = numerical_data.melt(id_vars=['date', 'measurement'], var_name='hour', value_name='value')
numerical_data['hour'] = numerical_data['hour'].astype(int)
numerical_data=numerical_data.pivot(index=['date','hour'],columns='measurement',values='value').reset_index()
# numerical_data['date']=numerical_data['date'].dt.month
numerical_data['month']=numerical_data['date'].dt.month
print(numerical_data.head(100))
# Create combined features for each date and hour
combined_features = []
targets = []
#instead of each row, we take 18 rows at a time
for idx, row in numerical_data.iterrows():
    print(row)
    date = row['date'].date()
    print(date)
    hour = row['hour']
    numerical_features = row.drop(['date']).values
    # print(hour)
    # print(numerical_features)
    # print(image_features[date][23])
    img_features = image_features[date][hour]
    print(img_features)
    print(numerical_features)
    #we can consider adding latitute and longitude as well to the input features
    # numerical_features=np.array(numerical_features).reshape(1,-1)
    # combined_feature=
    # combined_feature=np.array()
    combined_feature = np.concatenate((numerical_features, img_features),axis=None)
    combined_features.append(combined_feature)
    # targets.append(row['value'])  # Assuming the target is the value for that hour
    targets.append(row.drop(['date','hour','month']).values)  # Assuming the target is the value for that hour

combined_features = np.array(combined_features)
targets = np.array(targets)


measurement       date  hour AMB_TEMP   CH4    CO  NMHC   NO  NO2  NOx    O3  \
0           2022-02-01     0       16     2  0.24  0.03  0.2  7.1  7.4  37.9   
1           2022-02-01     1       16  2.02  0.25  0.04  0.6  6.2  6.8  36.9   
2           2022-02-01     2       16  2.05  0.23  0.02  0.6  5.6  6.3  35.8   
3           2022-02-01     3     16.4  2.04   0.2  0.02  0.6    4  4.6  37.5   
4           2022-02-01     4     16.4  2.07   0.2  0.03  0.5  3.7  4.3  37.3   
..                 ...   ...      ...   ...   ...   ...  ...  ...  ...   ...   
95          2022-02-04    23     13.5  2.02  0.28  0.01  0.6    9  9.6  36.6   
96          2022-02-05     0     13.5  2.01  0.25  0.01  0.4  7.1  7.6  39.1   
97          2022-02-05     1     13.6  2.01  0.24     0  0.8  5.4  6.3  40.6   
98          2022-02-05     2     13.6  2.02  0.24     0  0.8  4.6  5.4  41.7   
99          2022-02-05     3     13.5  2.02  0.24     0  0.9  3.6  4.5  43.3   

measurement  ... PM2.5 RAINFALL  RH  SO

In [9]:
print(numerical_data)
# targets=np.array(targets).reshape(-1)
print(targets)
# targets=targets[:,np.newaxis]
# targets=targets.squeeze()
# targets.reshape(-1,1)
print(targets.shape)

measurement       date  hour AMB_TEMP   CH4    CO  NMHC   NO   NO2   NOx  \
0           2022-02-01     0       16     2  0.24  0.03  0.2   7.1   7.4   
1           2022-02-01     1       16  2.02  0.25  0.04  0.6   6.2   6.8   
2           2022-02-01     2       16  2.05  0.23  0.02  0.6   5.6   6.3   
3           2022-02-01     3     16.4  2.04   0.2  0.02  0.6     4   4.6   
4           2022-02-01     4     16.4  2.07   0.2  0.03  0.5   3.7   4.3   
...                ...   ...      ...   ...   ...   ...  ...   ...   ...   
1411        2022-03-31    19     20.1     2  0.41  0.08  1.3  19.3  20.6   
1412        2022-03-31    20     19.9  2.01   0.4  0.05  1.1    17  18.2   
1413        2022-03-31    21     19.2  2.01  0.38  0.03  1.1  12.3  13.4   
1414        2022-03-31    22     18.8  2.01  0.36  0.02  0.9  12.7  13.7   
1415        2022-03-31    23     18.3     2  0.29     0  0.8   9.1    10   

measurement    O3  ... PM2.5 RAINFALL  RH  SO2   THC WD_HR WIND_DIREC  \
0            3

In [10]:
print(numerical_data.shape)
print(numerical_data.head(100))

(1416, 21)
measurement       date  hour AMB_TEMP   CH4    CO  NMHC   NO  NO2  NOx    O3  \
0           2022-02-01     0       16     2  0.24  0.03  0.2  7.1  7.4  37.9   
1           2022-02-01     1       16  2.02  0.25  0.04  0.6  6.2  6.8  36.9   
2           2022-02-01     2       16  2.05  0.23  0.02  0.6  5.6  6.3  35.8   
3           2022-02-01     3     16.4  2.04   0.2  0.02  0.6    4  4.6  37.5   
4           2022-02-01     4     16.4  2.07   0.2  0.03  0.5  3.7  4.3  37.3   
..                 ...   ...      ...   ...   ...   ...  ...  ...  ...   ...   
95          2022-02-04    23     13.5  2.02  0.28  0.01  0.6    9  9.6  36.6   
96          2022-02-05     0     13.5  2.01  0.25  0.01  0.4  7.1  7.6  39.1   
97          2022-02-05     1     13.6  2.01  0.24     0  0.8  5.4  6.3  40.6   
98          2022-02-05     2     13.6  2.02  0.24     0  0.8  4.6  5.4  41.7   
99          2022-02-05     3     13.5  2.02  0.24     0  0.9  3.6  4.5  43.3   

measurement  ... PM2.5 RAINF

In [11]:
print(combined_features.shape)

(1416, 150548)


In [12]:
print(combined_features.shape)
print(combined_features)
#convert all fields to float
#there are some fields that are not float
def to_float_with_nan(x):
    try:
        return float(x)
    except ValueError:
        return np.nan

# Vectorize the function to apply it to the entire array
vectorized_to_float_with_nan = np.vectorize(to_float_with_nan)

# Replace garbage values with np.nan and convert to float
combined_features = vectorized_to_float_with_nan(combined_features).astype(np.float32)
targets=vectorized_to_float_with_nan(targets).astype(np.float32)

(1416, 150548)
[[0 '16' '2' ... 0.0 0.0 0.0]
 [1 '16' '2.02' ... 0.0 0.0 0.0]
 [2 '16' '2.05' ... 0.0 0.0 0.0]
 ...
 [21 '19.2' '2.01' ... 0.0 0.0 0.0]
 [22 '18.8' '2.01' ... 0.0 0.0 0.0]
 [23 '18.3' '2' ... 0.0 0.0 0.0]]


In [13]:
print(combined_features)
print(targets.shape)

[[ 0.   16.    2.   ...  0.    0.    0.  ]
 [ 1.   16.    2.02 ...  0.    0.    0.  ]
 [ 2.   16.    2.05 ...  0.    0.    0.  ]
 ...
 [21.   19.2   2.01 ...  0.    0.    0.  ]
 [22.   18.8   2.01 ...  0.    0.    0.  ]
 [23.   18.3   2.   ...  0.    0.    0.  ]]
(1416, 18)


In [14]:
print("Checking for NaN and infinite values in combined_features and targets...")
print("NaNs in combined_features:", np.isnan(combined_features).sum())
print("Infinite values in combined_features:", np.isinf(combined_features).sum())
print("NaNs in targets:", np.isnan(targets).sum())
print("Infinite values in targets:", np.isinf(targets).sum())

Checking for NaN and infinite values in combined_features and targets...
NaNs in combined_features: 243
Infinite values in combined_features: 0
NaNs in targets: 243
Infinite values in targets: 0


In [15]:
# Replace NaNs and infinite values in combined_features and targets with zeros
combined_features = np.nan_to_num(combined_features, nan=0.0, posinf=0.0, neginf=0.0)
targets = np.nan_to_num(targets, nan=0.0, posinf=0.0, neginf=0.0)

In [16]:
# Replace NaNs in combined_features with column means
print("Replacing NaNs in combined_features with column means...")
col_mean_combined = np.nanmean(combined_features, axis=0)
inds_combined = np.where(np.isnan(combined_features))
combined_features[inds_combined] = np.take(col_mean_combined, inds_combined[1])

Replacing NaNs in combined_features with column means...


In [17]:
# Replace NaNs in targets with column means
print("Replacing NaNs in targets with column means...")
col_mean_targets = np.nanmean(targets, axis=0)
inds_targets = np.where(np.isnan(targets))
targets[inds_targets] = np.take(col_mean_targets, inds_targets[1])

Replacing NaNs in targets with column means...


In [18]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
combined_features[:,:20]=scaler.fit_transform(combined_features[:,:20])

### Model Design

In [19]:
from copy import deepcopy

print(combined_features.shape)
print(targets.shape)
targets=vectorized_to_float_with_nan(targets).astype(np.float32)

(1416, 150548)
(1416, 18)


In [20]:
print(combined_features.shape)

(1416, 150548)


In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Ensure features and targets are tensors
combined_features = torch.tensor(combined_features, dtype=torch.float32)
targets = torch.tensor(targets, dtype=torch.float32)

# Extract numerical and image features
num_features = combined_features[:, :20]  # First 20 columns are numerical features
num_features = num_features.unsqueeze(1)  # Add an extra dimension for GRU
img_features = combined_features[:, 20:]  # Remaining columns are image features

# Reshape image features to (batch_size, 3, 224, 224)
img_features = img_features.reshape(-1, 3, 224, 224)
#set some of the columns of every row to -1 to simulate missing data
mask = torch.rand_like(num_features) < 0.1
num_features[mask] = -1
# move them to gpu
num_features, img_features, targets = num_features.to(device), img_features.to(device), targets.to(device)
# Define custom dataset
class MultimodalDataset(Dataset):
    def __init__(self, num_features, img_features, targets):
        self.num_features = num_features
        self.img_features = img_features
        self.targets = targets

    def __len__(self):
        return len(self.num_features)

    def __getitem__(self, idx):
        return self.num_features[idx], self.img_features[idx], self.targets[idx]

# Split data into train, validation, test sets
train_size = int(0.7 * len(num_features))
val_size = int(0.1 * len(num_features))
test_size = len(num_features) - train_size - val_size

train_dataset = MultimodalDataset(num_features[:train_size], img_features[:train_size], targets[:train_size])
val_dataset = MultimodalDataset(num_features[train_size:train_size + val_size], img_features[train_size:train_size + val_size], targets[train_size:train_size + val_size])
test_dataset = MultimodalDataset(num_features[train_size + val_size:], img_features[train_size + val_size:], targets[train_size + val_size:])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define the multimodal neural network
class MultimodalNet(nn.Module):
    def __init__(self):
        super(MultimodalNet, self).__init__()
        
        # Define GRU for numerical features
        self.gru_num = nn.GRU(20, 64, 1, batch_first=True)
        
        # Load pre-trained MobileNetV3 as feature extractor
        mobilenet = models.mobilenet_v3_small(pretrained=True)
        self.mobilenet_features = mobilenet.features
        self.mobilenet_classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(576, 128)
        )  # Change to output 128-dim features
        
        # Define linear layers for numerical features
        self.fc1_num = nn.Linear(64, 128)
        self.fc2_num = nn.Linear(128, 64)
        
        # Define linear layers for combined features
        self.fc1_combined = nn.Linear(192, 64)
        self.fc2_combined = nn.Linear(64, 18)  # Output size (18 for regression)

    def forward(self, x_num, x_img):
        # Extract features using GRU for numerical features
        x_num, _ = self.gru_num(x_num)
        x_num = x_num[:, -1, :]  # Only take the last hidden state
        
        # Extract features using MobileNetV3 for image features
        x_img = self.mobilenet_features(x_img)
        x_img = self.mobilenet_classifier(x_img)
        
        # Apply linear layers for numerical features
        x_num = torch.relu(self.fc1_num(x_num))
        x_num = torch.relu(self.fc2_num(x_num))
        
        # Apply linear layers for combined features
        x_combined = torch.cat((x_num, x_img), dim=1)
        x_combined = torch.relu(self.fc1_combined(x_combined))
        x_combined = self.fc2_combined(x_combined)
        
        return x_combined

# Instantiate and train the model
model = MultimodalNet().to(device)
criterion = nn.MSELoss()

optimizer = optim.Adam([
    {'params': model.gru_num.parameters()},  # GRU parameters
    {'params': model.mobilenet_features.parameters(), 'lr': 0.0001},  # MobileNetV3 feature extractor parameters with a lower learning rate
    {'params': model.mobilenet_classifier.parameters(), 'lr': 0.0001},  # MobileNetV3 classifier parameters with a lower learning rate
    {'params': model.fc1_num.parameters()},
    {'params': model.fc2_num.parameters()},
    {'params': model.fc1_combined.parameters()},
    {'params': model.fc2_combined.parameters()}
], lr=0.001)  # Default learning rate for other parameters

# Training loop
for epoch in range(200):
    print(f'Epoch {epoch+1}')
    model.train()
    running_loss = 0.0
    for num_inputs, img_inputs, targets_ in train_loader:
        # num_inputs, img_inputs, targets_ = num_inputs.to(device), img_inputs.to(device), targets_.to(device)

        # Set some of the values in num_inputs to -1 with a probability of 0.1
        # mask = torch.rand_like(num_inputs) < 0.1
        # num_inputs[mask] = -1

        optimizer.zero_grad()
        # outputs = model(num_inputs.unsqueeze(1), img_inputs)
        outputs = model(num_inputs, img_inputs)
        loss = criterion(outputs, targets_)
        if torch.isnan(loss):
            print("NaN loss detected")
            print("Numerical Inputs: ", num_inputs)
            print("Image Inputs: ", img_inputs)
            print("Outputs: ", outputs)
            print("Targets: ", targets_)
            break
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss}')


Using device: cuda




Epoch 1
Epoch 1, Loss: 110816.95361328125
Epoch 2
Epoch 2, Loss: 85235.74255371094
Epoch 3
Epoch 3, Loss: 32394.897094726562
Epoch 4
Epoch 4, Loss: 15674.736053466797
Epoch 5
Epoch 5, Loss: 12755.024017333984
Epoch 6
Epoch 6, Loss: 11292.537574768066
Epoch 7
Epoch 7, Loss: 10241.203155517578
Epoch 8
Epoch 8, Loss: 9842.643013000488
Epoch 9
Epoch 9, Loss: 9704.066696166992
Epoch 10


In [None]:
# Save the model
model_path = 'multimodal_net_gru_mobilenet_dropout.pth'
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': running_loss,
}, model_path)
print(f'Model saved to {model_path}')

In [None]:
class MultimodalNet(nn.Module):
    def __init__(self):
        super(MultimodalNet, self).__init__()
        self.fc1_num = nn.Linear(64, 128)  # 20 numerical features
        self.fc2_num = nn.Linear(128, 64)
        
        self.gru_num = nn.GRU(20, 64, 1, batch_first=True)
        self.gru_img=nn.GRU(2048, 1024, 1, batch_first=True)
        self.fc1_img = nn.Linear(1024, 128)  # ResNet output size
        self.fc2_img = nn.Linear(128, 64)

        self.fc1_combined = nn.Linear(128, 64)
        self.fc2_combined = nn.Linear(64, 18)  # Output size (18 for regression)

    def forward(self, x):
        x_num = x[:, :20].unsqueeze(1)  # First 20 features are numerical
        x_img = x[:, 20:].unsqueeze(1)  # The rest are image features
        x_num, _ = self.gru_num(x_num)
        x_img, _ = self.gru_img(x_img)
        x_num = x_num[:, -1, :]  # Only take the last hidden state
        x_img = x_img[:, -1, :]  # Only take the last hidden state
        x_num = torch.relu(self.fc1_num(x_num))
        x_num = torch.relu(self.fc2_num(x_num))

        x_img = torch.relu(self.fc1_img(x_img))
        x_img = torch.relu(self.fc2_img(x_img))

        x_combined = torch.cat((x_num, x_img), dim=1)
        x_combined = torch.relu(self.fc1_combined(x_combined))
        x_combined = self.fc2_combined(x_combined)

        return x_combined

### Evaluation

In [None]:
# Evaluation
model.eval()
total_loss = 0.0
with torch.no_grad():
    for inputs, targets_ in val_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets_)
        total_loss += loss.item()
print(f'Validation Loss: {total_loss/len(val_loader)}')
# Deployment
# Implement a system to gather new data, preprocess it, and use the trained model to make predictions


### Prediction

In [None]:
# Convert relevant columns to numeric type
numerical_data = numerical_data.apply(pd.to_numeric, errors='coerce')

# Calculate the mean of each measurement for every month, excluding NaN values
monthly_means = numerical_data.groupby('month').mean()

print("Monthly Means:")
print(monthly_means)

In [None]:
import os
import pandas as pd
from PIL import Image,UnidentifiedImageError
import numpy as np
#the values are a bit off for tucheng
#maybe try with banqiao itself
# Path to the folder containing subfolders for each day
image_folder = './input/20220202'

# Load numerical data
numerical_input = pd.read_csv('./input/Tucheng_2022.csv')
#drop the Station column
numerical_input = numerical_input.drop(columns=['Station'])
# Ensure the 'date' column in numerical_data is in datetime format
print(numerical_input.head())
numerical_input['date'] = pd.to_datetime(numerical_input['date'],format="%d-%m-%Y %H:%M",dayfirst=True)

# Function to load images for a specific date
def load_images_for_date(date):
    date_str = date.strftime('%Y%m%d')
    images = []
    for hour in range(24):
        img_name=f"005-{date_str}{hour:02d}00.jpg"
        img_path = os.path.join(image_folder, img_name)
        try: 
            if os.path.exists(img_path):
                with Image.open(img_path) as img:
                    images.append(img.copy())
            else:
                images.append(None)
        except(OSError,UnidentifiedImageError):
            images.append(None)
    return images

# Create a dictionary to store images by date
image_input = {date: load_images_for_date(date) for date in numerical_input['date'].dt.date.unique()}

In [None]:
import torchvision.transforms as transforms
from torchvision.models import resnet50,ResNet50_Weights
import torch

# Preprocess images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load pretrained ResNet model
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # Remove final classification layer

# Extract features from images
def extract_image_features(images):
    features = []
    for img in images:
        if img is not None:
            img = transform(img).unsqueeze(0)  # Add batch dimension
            with torch.no_grad():
                feature = resnet(img).squeeze()  # Remove batch dimension
            features.append(feature.numpy())
        else:
            features.append(np.zeros(2048))  # Handle missing images
    return features

# Extract image features for each date
image_input_features = {date: extract_image_features(images) for date, images in image_input.items()}
#free up space
del resnet

In [None]:
print(image_input_features)

In [None]:
# Reshape numerical data to match image data
numerical_input = numerical_input.melt(id_vars=['date', 'measurement'], var_name='hour', value_name='value')
numerical_input['hour'] = numerical_input['hour'].astype(int)
numerical_input=numerical_input.pivot(index=['date','hour'],columns='measurement',values='value').reset_index()
numerical_input['month']=numerical_input['date'].dt.month
print(numerical_input.head(100))
# Create combined features for each date and hour
combined_input_features = []
targets_input = []
#instead of each row, we take 18 rows at a time
for idx, row in numerical_input.iterrows():
    date = row['date'].date()
    hour = row['hour']
    numerical_features = row.drop(['date']).values
    img_features = image_input_features[date][hour]
    print(img_features)
    print(numerical_features)
    combined_feature = np.concatenate((numerical_features, img_features),axis=None)
    combined_input_features.append(combined_feature)
    targets_input.append(row.drop(['date','hour','month']).values)  # Assuming the target is the value for that hour

combined_input_features = np.array(combined_input_features)
targets_input = np.array(targets_input)


In [None]:
print(combined_input_features.shape)
print(combined_input_features)
#convert all fields to float
#there are some fields that are not float
def to_float_with_nan(x):
    try:
        return float(x)
    except ValueError:
        return 0

# Vectorize the function to apply it to the entire array
vectorized_to_float_with_nan = np.vectorize(to_float_with_nan)

# Replace garbage values with np.nan and convert to float
combined_input_features = vectorized_to_float_with_nan(combined_input_features).astype(np.float32)
targets_input=vectorized_to_float_with_nan(targets_input).astype(np.float32)

In [None]:
print(combined_input_features)
print(targets_input.shape)

In [None]:
print("Checking for NaN and infinite values in combined_input_features and targets_input...")
print("NaNs in combined_input_features:", np.isnan(combined_input_features).sum())
print("Infinite values in combined_input_features:", np.isinf(combined_input_features).sum())
print("NaNs in targets_input:", np.isnan(targets_input).sum())
print("Infinite values in targets_input:", np.isinf(targets_input).sum())

In [None]:
# Replace NaNs and infinite values in combined_input_features and targets_input with zeros
combined_input_features = np.nan_to_num(combined_input_features, nan=0.0, posinf=0.0, neginf=0.0)
targets_input = np.nan_to_num(targets_input, nan=0.0, posinf=0.0, neginf=0.0)

In [None]:
# Replace NaNs in combined_input_features with column means
print("Replacing NaNs in combined_input_features with column means...")
col_mean_combined = np.nanmean(combined_input_features, axis=0)
inds_combined = np.where(np.isnan(combined_input_features))
combined_input_features[inds_combined] = np.take(col_mean_combined, inds_combined[1])

In [None]:
# Replace NaNs in targets_input with column means
print("Replacing NaNs in targets_input with column means...")
col_mean_targets_input = np.nanmean(targets_input, axis=0)
inds_targets_input = np.where(np.isnan(targets_input))
targets_input[inds_targets_input] = np.take(col_mean_targets_input, inds_targets_input[1])

In [None]:
print(numerical_input)

In [None]:
#predict using the model
# Evaluation
input_loader=DataLoader(MultimodalDataset(combined_input_features, targets_input), batch_size=32)
model.eval()
total_loss = 0.0
measures=["AMB_TEMP","CH4","CO","NHMC","NO","NO2","NOx","O3","PM10","PM2.5","RAINFALL","RH","SO2","THC","WD_HR","WIND_DIREC","WIND_SPEED","WS_HR"]
with torch.no_grad():
    for inputs, targets_ in input_loader:
        outputs = model(inputs)
        for i in range(24):
            print(f'Hour {i}')
            for j in range(18):
                print(f'{measures[j]}: {outputs[i][j]}')
        loss = criterion(outputs, targets_)
        total_loss += loss.item()
print(f'Validation Loss: {total_loss/len(input_loader)}')
# Deployment
# Implement a system to gather new data, preprocess it, and use the trained model to make predictions
