In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from utils.preprocessing import preprocess_data

In [None]:
# Read the imputed training table and the test table from the local data/ folder.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/imputed_train.csv', 'data/test.csv')
)

In [None]:
# Run the project's preprocessing helper (utils.preprocessing.preprocess_data)
# over the train frame first, then the test frame.
preprocessed_train, preprocessed_test = map(preprocess_data, (data_train, data_test))

In [None]:
# Calendar columns removed from both frames before modelling.
columns_to_drop = ['Year', 'Month', 'Weekday', 'Day', 'Hour']
preprocessed_train = preprocessed_train.drop(columns_to_drop, axis='columns')
preprocessed_test = preprocessed_test.drop(columns_to_drop, axis='columns')

In [None]:
# Names of the training columns that contain at least one missing value.
nan_columns_train = (
    preprocessed_train.isna().any()
    .loc[lambda has_nan: has_nan]
    .index.tolist()
)
print("Columns with NaN values in preprocessed_train:", nan_columns_train)

In [None]:
# Replace every remaining missing value with 0, mutating both frames in place.
preprocessed_train.fillna(value=0, inplace=True)
preprocessed_test.fillna(value=0, inplace=True)

In [None]:
# Column groups used by the rest of the notebook.

# Columns the model predicts.
target_cols = [
    'valeur_NO2',
    'valeur_CO',
    'valeur_O3',
    'valeur_PM10',
    'valeur_PM25',
]

# Continuous columns that get standardised.
numeric_cols = [
    'precipitation',
    'wind_speed',
    'temperature',
    'humidity',
    'pressure',
    'visibility',
]

# Flag columns.
categorical_cols = [
    'is_holiday',
    'is_jour_ferie',
    'is_winter',
    'is_spring',
    'is_summer',
    'is_fall',
    'is_weekend',
]

# Sine/cosine encoded time columns.
time_cols = [
    'DayOfYear_sin',
    'DayOfYear_cos',
    'HourOfDay_sin',
    'HourOfDay_cos',
]


In [None]:
# Fit the feature scaler on the training rows only, then apply the same
# statistics to both frames so the test set is scaled consistently.
numeric_scaler = StandardScaler().fit(preprocessed_train[numeric_cols])

preprocessed_train[numeric_cols] = numeric_scaler.transform(preprocessed_train[numeric_cols])
preprocessed_test[numeric_cols] = numeric_scaler.transform(preprocessed_test[numeric_cols])


In [None]:
# Standardise the target columns; the fitted scaler is kept so predictions can
# be mapped back to the original units later with inverse_transform.
# NOTE: this overwrites the target columns in place, so running the cell twice
# refits the scaler on already-standardised values.
target_scaler = StandardScaler().fit(preprocessed_train[target_cols])

preprocessed_train[target_cols] = target_scaler.transform(preprocessed_train[target_cols])

    

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D array whose trailing columns are the targets.

    Item ``idx`` is the window of ``seq_length`` consecutive rows that ends at
    row ``idx`` (inclusive). Windows that would start before row 0 are
    left-padded with all-zero rows, so there is exactly one item per data row.

    Args:
        data: 2-D NumPy array of shape (n_rows, n_columns). When ``target_cols``
            is given, the LAST ``len(target_cols)`` columns must be the targets.
        target_cols: Names of the target columns; only the count is used.
            ``None`` means the array has no target columns.
        seq_length: Number of rows in every returned window.
    """

    def __init__(self, data, target_cols=None, seq_length=24):
        self.data = data
        self.target_cols = target_cols
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of rows; every row is the end of one window."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for the window ending at row ``idx``.

        Returns:
            features: float32 tensor of shape (seq_length, n_feature_columns).
            labels: float32 tensor with the target values of row ``idx``, or
                ``None`` when the dataset was built without ``target_cols``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_rows = len(self.data)
        # Support Python-style negative indices and reject out-of-range ones.
        # Raising IndexError is also what lets `for item in dataset` terminate.
        if idx < 0:
            idx += n_rows
        if not 0 <= idx < n_rows:
            raise IndexError(f"index out of range for dataset of length {n_rows}")

        # Take up to seq_length rows of history ending at idx.
        start_idx = max(0, idx - self.seq_length + 1)
        seq = self.data[start_idx:idx + 1]

        # Left-pad with zero rows when there is not enough history yet.
        pad_length = self.seq_length - len(seq)
        if pad_length > 0:
            seq = np.pad(seq, ((pad_length, 0), (0, 0)), 'constant')

        if self.target_cols is None:
            return torch.tensor(seq, dtype=torch.float32), None

        # Use an explicit split column rather than negative slicing so that an
        # empty target list does not turn `:-0` into an empty feature slice.
        split = seq.shape[1] - len(self.target_cols)
        features = seq[:, :split]
        labels = seq[-1, split:]  # targets of the last (current) row only

        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.float32),
        )

In [None]:
# Column layout for both arrays: [numeric features | time features | targets].
# TimeSeriesDataset slices the LAST len(target_cols) columns off as labels, so
# the targets must be the trailing block. With the targets in the middle, four
# target columns would be fed to the model as features and the time encodings
# would be used as labels.
train_data = np.concatenate(
    [
        preprocessed_train[numeric_cols].values,
        preprocessed_train[time_cols].values,
        preprocessed_train[target_cols].values,
    ],
    axis=1,
)
# The test set has no targets; NaN placeholders keep the column layout identical.
test_data = np.concatenate(
    [
        preprocessed_test[numeric_cols].values,
        preprocessed_test[time_cols].values,
        np.full((len(preprocessed_test), len(target_cols)), np.nan),
    ],
    axis=1,
)

In [None]:
# Wrap both arrays in 24-row sliding-window datasets.
train_dataset = TimeSeriesDataset(data=train_data, target_cols=target_cols, seq_length=24)
test_dataset = TimeSeriesDataset(data=test_data, target_cols=target_cols, seq_length=24)


In [None]:
# Training batches are shuffled; test batches keep row order so that the
# predictions line up with the test rows.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

In [None]:
class SimpleLSTM(nn.Module):
    """Single-layer LSTM followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a (batch, seq, input_size) tensor to (batch, output_size)."""
        hidden_states, _final_state = self.lstm(x)
        # Only the hidden state of the final time step feeds the linear head.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

In [None]:
# Model dimensions: one input per numeric/time column, one output per target.
input_size = sum(len(group) for group in (numeric_cols, time_cols))
hidden_size = 10
output_size = len(target_cols)

In [None]:
# Seed PyTorch's global RNG before creating the model so that weight
# initialisation (and the shuffled batch order later drawn from the same global
# RNG) is reproducible on Restart & Run All.
torch.manual_seed(42)

model = SimpleLSTM(input_size, hidden_size, output_size)
# Adam with a small L2 penalty (weight_decay); MSE because this is regression.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.MSELoss()

In [None]:
# ---- Training ----
# Switch to training mode explicitly: this cell ends with model.eval(), so a
# re-run would otherwise train with the model still in evaluation mode.
model.train()
for epoch in range(20):  # Adjust number of epochs as needed
    running_loss = 0.0
    n_seen = 0
    for batch_seq, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_seq)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its row count so the last, smaller batch does
        # not distort the epoch average.
        n_batch_rows = labels.size(0)
        running_loss += loss.item() * n_batch_rows
        n_seen += n_batch_rows
    # Report the mean loss over the whole epoch rather than the last batch only.
    epoch_loss = running_loss / max(n_seen, 1)
    print(f'Epoch {epoch + 1}, Loss: {epoch_loss}')

# ---- Prediction on the test dataset ----
model.eval()  # Set model to evaluation mode
predictions = []
with torch.no_grad():
    # The test labels are NaN placeholders and are ignored here.
    for batch_seq, _unused_labels in test_loader:
        outputs = model(batch_seq)
        predictions.append(outputs.numpy())

# Stack the per-batch outputs into one (n_test_rows, n_targets) array.
predictions = np.concatenate(predictions, axis=0)
print(predictions.shape)


In [None]:
# Map the standardised model outputs back to the original target units.
# NOTE: this rebinds `predictions` to its own transformed value, so the cell is
# not idempotent: running it twice without re-running the prediction cell
# applies the inverse scaling twice.
predictions = target_scaler.inverse_transform(predictions)

In [None]:


# Create submission DataFrame: one row of predictions per test row.
submission = pd.DataFrame(predictions, columns=target_cols)
n_predictions = len(predictions)

# The ids are attached by position below, so the row counts must match exactly.
if n_predictions != len(preprocessed_test):
    raise ValueError(
        f"Got {n_predictions} predictions for {len(preprocessed_test)} test rows"
    )

# Use to_numpy() so the ids are assigned positionally. Assigning the Series
# directly would align on the index and produce NaN or misplaced ids whenever
# preprocessed_test does not carry a default 0..n-1 index.
submission['id'] = (
    pd.to_datetime(preprocessed_test['id']).dt.strftime('%Y-%m-%d %H').to_numpy()
)
submission = submission[['id'] + target_cols]

# Save predictions to CSV and report the file that was actually written.
submission_path = 'submission_lstm_7.csv'
submission.to_csv(submission_path, index=False)

print(f"Submission file created: {submission_path}")