# **Autoencoder Model**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import random

# Set the seed
random.seed(19)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data = pd.read_csv("/content/drive/My Drive/DataThesis/BETN073/working_data.csv")

obj = 0.2

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

In [4]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Select columns to normalize
columns_to_normalize = ['Concentration']

# Fit and transform the selected columns
data_normalized = data.copy()  # Make a copy to avoid modifying the original data
data_normalized[columns_to_normalize] = scaler.fit_transform(data_normalized[columns_to_normalize])

data_normalized = data_normalized[data_normalized['Year'] != 2008]
data_normalized = data_normalized.reset_index()
data_normalized = data_normalized.drop(columns='index')

# Display the normalized data
print(data_normalized)

      Year  Month  Day  Concentration  DayOfWeek  Weekend
0     2009      1    1       0.041503          4        0
1     2009      1    2       0.019506          5        0
2     2009      1    3       0.027512          6        1
3     2009      1    4       0.069484          0        1
4     2009      1    5       0.137438          1        0
...    ...    ...  ...            ...        ...      ...
3983  2019     12   27       0.174413          5        0
3984  2019     12   28       0.146468          6        1
3985  2019     12   29       0.134440          0        1
3986  2019     12   30       0.254002          1        0
3987  2019     12   31       0.086472          2        0

[3988 rows x 6 columns]


In [5]:
missing_values = int(len(data_normalized) * obj)
random_indices = np.random.choice(data_normalized.index, missing_values, replace=False)
random_indices.sort()
data_normalized.loc[random_indices, 'Concentration'] = -1

observed_data = data_normalized[data_normalized['Concentration'] != -1]
missing_data = data_normalized[data_normalized['Concentration'] == -1]

In [11]:
# Define the autoencoder architecture
output_dim_concentration = 1  # Concentration feature
input_dim_year = 1  # Date feature
input_dim_month = 1  # Date feature
input_dim_day = 1  # Date feature
input_dim_dayweek = 1  # Hour feature
input_dim_weekend = 1  # Date feature
encoding_dim = 1  # Adjust the size of the encoded representation as needed

output_concentration = Input(shape=(output_dim_concentration,))
input_year = Input(shape=(input_dim_year,))
input_month = Input(shape=(input_dim_month,))
input_day = Input(shape=(input_dim_day,))
input_dayweek = Input(shape=(input_dim_dayweek,))
input_weekend = Input(shape=(input_dim_weekend,))

# Concatenate inputs
concatenated = Concatenate()([input_year, input_month, input_day, input_dayweek, input_weekend])

# Encoder layers
encoded = Dense(128, activation='relu')(concatenated)  # First hidden layer
encoded = Dense(64, activation='relu')(encoded)      # Second hidden layer
encoded = Dense(16, activation='relu')(encoded)      # Third hidden layer
encoded = Dense(8, activation='relu')(encoded)      # Fourth hidden layer
encoded = Dense(encoding_dim, activation='relu')(encoded)  # Encoding layer #32,16,8

# Decoder layers
decoded = Dense(8, activation='relu')(encoded)      # First hidden layer in decoder
decoded = Dense(16, activation='relu')(decoded)      # Second hidden layer in decoder
decoded = Dense(64, activation='relu')(decoded)      # Third hidden layer in decoder
decoded = Dense(128, activation='relu')(decoded)      # Fourth hidden layer in decoder
decoded = Dense(1, activation='linear')(decoded)

autoencoder = Model([input_year, input_month, input_day, input_dayweek, input_weekend], decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

observed_data_float = {
    'Concentration': observed_data['Concentration'].astype('float32'),
    'Year': observed_data['Year'].astype('float32'),
    'Month': observed_data['Month'].astype('float32'),
    'Day': observed_data['Day'].astype('float32'),
    'DayOfWeek': observed_data['DayOfWeek'].astype('float32'),
    'Weekend': observed_data['Weekend'].astype('float32')
}

missing_data_float = {
    'Concentration': missing_data['Concentration'].astype('float32'),
    'Year': missing_data['Year'].astype('float32'),
    'Month': missing_data['Month'].astype('float32'),
    'Day': missing_data['Day'].astype('float32'),
    'DayOfWeek': missing_data['DayOfWeek'].astype('float32'),
    'Weekend': missing_data['Weekend'].astype('float32')
}

# Train the autoencoder using only observed data
autoencoder.fit([observed_data_float['Year'], observed_data_float['Month'], observed_data_float['Day'], observed_data_float['DayOfWeek'], observed_data_float['Weekend']],
                observed_data_float['Concentration'], epochs=10, batch_size=64, shuffle=True, validation_split=0.2)

# Predict concentrations for missing dates
predicted_concentrations = autoencoder.predict([missing_data_float['Year'], missing_data_float['Month'], missing_data_float['Day'], missing_data_float['DayOfWeek'], missing_data_float['Weekend']])
predicted_measurement = scaler.inverse_transform(predicted_concentrations)


# Fill in the missing values in the DataFrame with the predicted values
missing_data_float['Concentration'] = predicted_measurement.flatten()

to_pred = data.loc[random_indices]
to_pred_concentration = to_pred['Concentration'].to_numpy()
mse_autoencoder = mean_squared_error(to_pred_concentration, missing_data_float['Concentration'])
mse_autoencoder

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


379.3936310991651

In [12]:
mse_autoencoder

379.3936310991651