# **Autoencoder Model**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import sklearn.model_selection as skm

import random

# Set the seed
random.seed(19)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
data = pd.read_csv("/content/drive/My Drive/DataThesis/BETN073/working_data.csv")

obj = 0.2

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

In [4]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Select columns to normalize
columns_to_normalize = ['Concentration']

# Fit and transform the selected columns
data_normalized = data.copy()  # Make a copy to avoid modifying the original data
data_normalized[columns_to_normalize] = scaler.fit_transform(data_normalized[columns_to_normalize])

data_normalized['day_lag1'] = data_normalized['Concentration'].shift(1)
data_normalized['day_lag7'] = data_normalized['Concentration'].shift(7)

data_normalized.dropna(subset=['day_lag7'], inplace=True)
data_normalized = data_normalized.reset_index()

# Display the normalized data
print(data_normalized)

      index  Year  Month  Day  Concentration  DayOfWeek  Weekend  day_lag1  \
0         7  2008      1    8       0.331712          2        0  0.420715   
1         8  2008      1    9       0.329523          3        0  0.331712   
2         9  2008      1   10       0.323105          4        0  0.329523   
3        10  2008      1   11       0.325419          5        0  0.323105   
4        11  2008      1   12       0.342657          6        1  0.325419   
...     ...   ...    ...  ...            ...        ...      ...       ...   
4342   4349  2019     12   27       0.174413          5        0  0.153308   
4343   4350  2019     12   28       0.146468          6        1  0.174413   
4344   4351  2019     12   29       0.134440          0        1  0.146468   
4345   4352  2019     12   30       0.254002          1        0  0.134440   
4346   4353  2019     12   31       0.086472          2        0  0.254002   

      day_lag7  
0     0.049331  
1     0.263704  
2     0.2225

In [5]:
missing_values = int(len(data_normalized) * obj)
random_indices = np.random.choice(data_normalized.index, missing_values, replace=False)
random_indices.sort()
data_normalized.loc[random_indices, 'Concentration'] = -1

observed_data = data_normalized[data_normalized['Concentration'] != -1]
missing_data = data_normalized[data_normalized['Concentration'] == -1]

In [6]:
# Define the autoencoder architecture
output_dim_concentration = 1  # Concentration feature
input_dim_year = 1  # Date feature
input_dim_month = 1  # Date feature
input_dim_day = 1  # Date feature
input_dim_dayweek = 1  # Hour feature
input_dim_weekend = 1  # Date feature
input_dim_daylag = 1 # 1 day lag
input_dim_weeklag = 1 # 1 week lag
encoding_dim = 2  # Adjust the size of the encoded representation as needed

output_concentration = Input(shape=(output_dim_concentration,))
input_year = Input(shape=(input_dim_year,))
input_month = Input(shape=(input_dim_month,))
input_day = Input(shape=(input_dim_day,))
input_dayweek = Input(shape=(input_dim_dayweek,))
input_weekend = Input(shape=(input_dim_weekend,))
input_lag1 = Input(shape=(input_dim_daylag,))
input_lag7 = Input(shape=(input_dim_weeklag,))

# Concatenate inputs
concatenated = Concatenate()([input_year, input_month, input_day, input_dayweek, input_weekend, input_lag1, input_lag7])

# Encoder layers
encoded = Dense(32, activation='relu')(concatenated)  # First hidden layer
encoded = Dense(16, activation='relu')(encoded)      # Second hidden layer
encoded = Dense(8, activation='relu')(encoded)      # Third hidden layer
encoded = Dense(encoding_dim, activation='relu')(encoded)  # Encoding layer

# Decoder layers
decoded = Dense(8, activation='relu')(encoded)      # First hidden layer in decoder
decoded = Dense(16, activation='relu')(decoded)      # Second hidden layer in decoder
decoded = Dense(32, activation='relu')(decoded)      # Third hidden layer in decoder
decoded = Dense(1, activation='linear')(decoded)

autoencoder = Model([input_year, input_month, input_day, input_dayweek, input_weekend, input_lag1, input_lag7], decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

observed_data_float = {
    'Concentration': observed_data['Concentration'].astype('float32'),
    'Year': observed_data['Year'].astype('float32'),
    'Month': observed_data['Month'].astype('float32'),
    'Day': observed_data['Day'].astype('float32'),
    'DayOfWeek': observed_data['DayOfWeek'].astype('float32'),
    'Weekend': observed_data['Weekend'].astype('float32'),
    'day_lag1': observed_data['day_lag1'].astype('float32'),
    'day_lag7': observed_data['day_lag7'].astype('float32'),
}

missing_data_float = {
    'Concentration': missing_data['Concentration'].astype('float32'),
    'Year': missing_data['Year'].astype('float32'),
    'Month': missing_data['Month'].astype('float32'),
    'Day': missing_data['Day'].astype('float32'),
    'DayOfWeek': missing_data['DayOfWeek'].astype('float32'),
    'Weekend': missing_data['Weekend'].astype('float32'),
    'day_lag1': missing_data['day_lag1'].astype('float32'),
    'day_lag7': missing_data['day_lag7'].astype('float32'),
}

# Train the autoencoder using only observed data
autoencoder.fit([observed_data_float['Year'], observed_data_float['Month'], observed_data_float['Day'], observed_data_float['DayOfWeek'], observed_data_float['Weekend'], observed_data_float['day_lag1'], observed_data_float['day_lag7']],
                observed_data_float['Concentration'], epochs=20, batch_size=8, shuffle=True, validation_split=0.2)

# Predict concentrations for missing dates
predicted_concentrations = autoencoder.predict([missing_data_float['Year'], missing_data_float['Month'], missing_data_float['Day'], missing_data_float['DayOfWeek'], missing_data_float['Weekend'], missing_data['day_lag1'], missing_data['day_lag7']])
predicted_measurement = scaler.inverse_transform(predicted_concentrations)


# Fill in the missing values in the DataFrame with the predicted values
missing_data_float['Concentration'] = predicted_measurement.flatten()

to_pred = data.loc[random_indices]
to_pred_concentration = to_pred['Concentration'].to_numpy()
mse_autoencoder = mean_squared_error(to_pred_concentration, missing_data_float['Concentration'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
mse_autoencoder

382.81107635438036