# Wage LSTM Model - Training (1997-2020), Testing (2021-2023)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

import gdown
import time

In [None]:
# Load the dataset: download the wage CSV from Google Drive, show a short
# raw preview, then parse it into the `data` DataFrame used by later cells.
url = 'https://drive.google.com/uc?id=1wjTiPLhi938Ro-jfjVHF0d_YPvsLaRc3'

# Download the file
output = 'data_wage.csv'
gdown.download(url, output, quiet=False)

# Check the file content. Only the first 500 characters are shown, so only
# that much is read instead of loading the whole file into memory.
with open(output, 'r') as file:
    content = file.read(500)
    print("File content preview:")
    print(content)

# Load the CSV file
try:
    data = pd.read_csv(output, delimiter=',')
except pd.errors.ParserError as e:
    # Report the problem, then re-raise: every later cell needs `data`, and
    # swallowing the error here would only surface as a confusing NameError.
    print("Error parsing CSV file:", e)
    raise
else:
    print(data.head())

In [None]:
# Parse the reference dates (ISO year-month-day strings) into datetimes
data['ref_date'] = pd.to_datetime(data['ref_date'], format='%Y-%m-%d')

# Turn the occupation labels into integer codes: the column is first forced
# to strings, then each distinct string is mapped to an integer by the encoder.
label_encoder = LabelEncoder()
occupation_labels = data['occupation_classification'].astype(str)
data['occupation_classification'] = occupation_labels
data['occupation_code'] = label_encoder.fit_transform(occupation_labels)

In [None]:
# Scale the selected feature columns of the full dataset into the [0, 1] range.
# NOTE(review): `scaler` is fitted again on the training split further down,
# so this full-dataset fit only determines the contents of `scaled_data`.
scaler = MinMaxScaler(feature_range=(0, 1))
full_feature_frame = data[['value', 'date_ordinal', 'sex_binary', 'age_group_numeric', 'geo_code', 'occupation_code']]
scaled_data = scaler.fit_transform(full_feature_frame)

# Function to create sequences for forecasting
def create_sequences(data, seq_length):
    """Slice a 2-D array into overlapping windows and next-step targets.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` (all columns); its
    target is column 0 of row ``i + seq_length``.

    Args:
        data: 2-D array-like of shape (n_rows, n_features). Column 0 is
            taken as the value to predict.
        seq_length: Number of consecutive rows per window (expected to be a
            positive integer).

    Returns:
        A tuple ``(xs, ys)`` where ``xs`` has shape
        (n_windows, seq_length, n_features) and ``ys`` has shape
        (n_windows,), with ``n_windows = n_rows - seq_length``. When there
        are not enough rows for a single window, both arrays are empty but
        ``xs`` still carries the (seq_length, n_features) trailing
        dimensions so that shape-based code downstream keeps working.
    """
    data = np.asarray(data)
    n_windows = len(data) - seq_length

    if n_windows <= 0:
        # Too few rows for even one window: return correctly shaped empties
        # rather than a bare (0,) array that has lost its trailing dimensions.
        empty_xs = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        empty_ys = np.empty((0,), dtype=data.dtype)
        return empty_xs, empty_ys

    xs = np.stack([data[i:i + seq_length] for i in range(n_windows)])
    # Targets are column 0 of every row that follows a complete window;
    # copy so the result does not alias the input array.
    ys = data[seq_length:, 0].copy()
    return xs, ys

# Window size: each sample uses the previous 12 rows to predict the next one
# (described as "past 12 months" originally — assumes one row per month in
# chronological order; TODO confirm against the dataset).
SEQ_LENGTH = 12

# Columns fed to the scaler; 'value' comes first because the sequence builder
# reads the target from column 0.
feature_columns = ['value', 'date_ordinal', 'year', 'month', 'sex_binary', 'age_group_numeric', 'geo_code', 'occupation_code']

# Chronological split: rows before the cutoff train the model, the rest test it
cutoff_date = '2021-01-01'
train_data = data.loc[data['ref_date'] < cutoff_date]
test_data = data.loc[data['ref_date'] >= cutoff_date]

# Fit the scaler on the training rows only, then apply that same scaling to
# the test rows.
scaled_train_data = scaler.fit_transform(train_data[feature_columns])
scaled_test_data = scaler.transform(test_data[feature_columns])

In [None]:
# Create sequences for training and testing
X_train, y_train = create_sequences(scaled_train_data, SEQ_LENGTH)
X_test, y_test = create_sequences(scaled_test_data, SEQ_LENGTH)

# Define the LSTM model for forecasting: one 50-unit LSTM layer followed by a
# single linear output, trained with Adam on mean squared error.
model = Sequential()
model.add(Input(shape=(SEQ_LENGTH, X_train.shape[2])))  # (timesteps, features)
model.add(LSTM(50, activation='relu'))
model.add(Dense(1))  # Output layer for regression
model.compile(optimizer='adam', loss='mse')

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Train the model for a long run; the per-epoch validation loss recorded in
# `history` is what later cells inspect. validation_split=0.2 reserves 20% of
# the training sequences for validation.
start = time.time()
history = model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.2)
end = time.time()

# Convert elapsed time to minutes and seconds
elapsed_time = end - start
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)

print(f"\nTraining time: {minutes} minutes and {seconds} seconds")

In [None]:
# Define the smoothing function
def smooth_curve(points, factor=0.9):
    """Return an exponentially smoothed copy of ``points`` as a list.

    The first value is kept unchanged; every later value is blended with the
    previous smoothed value as ``previous * factor + point * (1 - factor)``.
    An empty input yields an empty list.
    """
    remaining = iter(points)
    try:
        first_value = next(remaining)
    except StopIteration:
        return []

    smoothed = [first_value]
    for value in remaining:
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

# Per-epoch training and validation losses recorded by the last fit() call
training_log = history.history
loss = training_log['loss']
val_loss = training_log['val_loss']

# Smooth both curves so the overall trend is easier to see than in the raw values
smoothed_loss, smoothed_val_loss = smooth_curve(loss), smooth_curve(val_loss)

# Draw both smoothed curves against 1-based epoch numbers
for curve, curve_label in (
    (smoothed_loss, 'Smoothed Training Loss'),
    (smoothed_val_loss, 'Smoothed Validation Loss'),
):
    plt.plot(range(1, len(curve) + 1), curve, label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# The best epoch is where the smoothed validation loss is lowest; argmin is
# 0-based, so add 1 to get the epoch number.
best_epoch = np.argmin(smoothed_val_loss) + 1
print(f"Best epoch based on validation loss: {best_epoch}")


# Refine the LSTM model: rebuild the same architecture from scratch and train
# it only for the number of epochs selected from the validation-loss curve.
model = Sequential()
model.add(Input(shape=(SEQ_LENGTH, X_train.shape[2])))
model.add(LSTM(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

# Train the model
start = time.time()
history = model.fit(X_train, y_train, epochs=best_epoch, batch_size=32, validation_split=0.2)
end = time.time()

# Recompute the elapsed time for THIS run; without this the message below
# would repeat the duration of the earlier 500-epoch training.
elapsed_time = end - start
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)

print(f"\nTraining time: {minutes} minutes and {seconds} seconds")

In [None]:
# Score the model on the held-out sequences (loss is in scaled units)
loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')

# Predict the scaled target for every test sequence
predictions = model.predict(X_test)


def _target_to_original_scale(target_column):
    """Undo the scaling for a single-column array of target values.

    The column is padded with zero columns up to the width of the scaled
    training data before calling the scaler; only column 0 of the result
    (the target) is kept and the placeholder columns are discarded.
    """
    zero_padding = np.zeros((target_column.shape[0], scaled_train_data.shape[1] - 1))
    padded = np.concatenate((target_column, zero_padding), axis=1)
    return scaler.inverse_transform(padded)[:, 0]


# Bring predictions and ground truth back to the original value scale
predictions_inv = _target_to_original_scale(predictions)
y_test_inv = _target_to_original_scale(y_test.reshape(-1, 1))

# Show the first few actual/predicted pairs side by side
comparison = pd.DataFrame({'Actual': y_test_inv, 'Predicted': predictions_inv})
print(comparison.head())