<a href="https://colab.research.google.com/github/nicha1997/Data-Analytics-for-Business/blob/main/LSTM_Model_for_2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import layers, Input
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Load the sales workbook from the author's local disk.
file_path = r'D:\desktop\set.xlsx'  # Local file path
data = pd.read_excel(io=file_path)
# Parse the 'date' column so it can be compared against pandas Timestamps.
dates = pd.to_datetime(data.loc[:, 'date'])

# Define a function to create a dataset, considering leap years.
def create_dataset_for_single_item(dates, dataset, lookback_years=4, target_year=2017):
    """Build same-calendar-day samples for every day of ``target_year``.

    For each day of the target year, the input window holds the values
    recorded on the same calendar date in each of the ``lookback_years``
    preceding years (most recent year first), and the label is the value
    recorded on the target day itself.  The target-day value is never part
    of its own input window.

    Calendar arithmetic is done with ``pd.DateOffset(years=k)``, so a
    29 February target is matched against 28 February in non-leap years.

    Args:
        dates: Datetime-like sequence (e.g. a pandas Series) giving the date
            of each row of ``dataset``.  When a date occurs more than once,
            its first occurrence is used.
        dataset: Array-like indexed by the same positions as ``dates``.
        lookback_years: Number of previous years used as input features.
        target_year: Calendar year whose days are used as prediction targets.

    Returns:
        A ``(dataX, dataY)`` tuple of NumPy arrays.  ``dataX[i]`` holds the
        ``lookback_years`` historical values for sample ``i`` and
        ``dataY[i]`` is the matching label.  Target days that are missing
        from ``dates``, or whose lookback window is incomplete, are skipped
        (with a printed notice), so both arrays always have equal length.
    """
    # Map each date to its first positional index once, instead of scanning
    # the whole date column for every single lookup.
    first_index = {}
    for position, stamp in enumerate(pd.DatetimeIndex(dates)):
        first_index.setdefault(stamp, position)

    target_dates = pd.date_range(
        start=pd.Timestamp(year=target_year, month=1, day=1),
        end=pd.Timestamp(year=target_year, month=12, day=31),
    )

    dataX, dataY = [], []
    for target_date in target_dates:
        # Resolve the label first so X and Y can never get out of step.
        target_idx = first_index.get(target_date)
        if target_idx is None:
            print(f"Target date not found: {target_date} in dataset")  # Debug information.
            continue

        window = []
        # Start at 1: an offset of 0 would be the target day itself and
        # would leak the label into the model input.
        for years_back in range(1, lookback_years + 1):
            current_date = target_date - pd.DateOffset(years=years_back)
            current_idx = first_index.get(current_date)
            if current_idx is None:
                print(f"Date not found: {current_date} in dataset")  # Debug information.
            else:
                window.append(dataset[current_idx])

        # Only keep samples with a complete lookback window.
        if len(window) == lookback_years:
            dataX.append(window)
            dataY.append(dataset[target_idx])

    return np.array(dataX), np.array(dataY)

# Predict 'store1item1'.
target_column = 'store1item1'
print(f"Predicting sales of {target_column}...")

# Get the sales data for 'store1item1'.
dataset = data[target_column].values

# Normalize the data to [0, 1]; MinMaxScaler expects a 2-D column, hence the reshape.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_target = scaler.fit_transform(dataset.reshape(-1, 1))  # Normalize the target data.

# Call the function to create the dataset (one sample per day of 2017).
trainX, trainY = create_dataset_for_single_item(dates, scaled_target, lookback_years=4)

# Debug information.
print(f"trainX shape: {trainX.shape}, trainY shape: {trainY.shape}")

# Check if enough data has been generated.
if len(trainX) == 0 or len(trainY) == 0:
    print("The generated dataset is empty, please check the input data and date range.")
    # `exit()` is a helper injected by the `site` module and is not always
    # available; raising SystemExit is the portable way to stop, and a
    # non-zero status marks the run as failed.
    raise SystemExit(1)

# Chronological 80/20 split: the first 80% of samples train the model and
# the last 20% are held out for testing.
split_index = int(0.8 * len(trainX))
X_train, X_test = trainX[:split_index], trainX[split_index:]
y_train, y_test = trainY[:split_index], trainY[split_index:]

# Ensure the dimensions of X_train and X_test are (samples, timesteps, 1) as the LSTM expects.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build the LSTM model: two stacked LSTM layers followed by a dense head.
input_shape = Input(shape=(X_train.shape[1], 1))  # One timestep per lookback year.
lstm1 = layers.LSTM(128, return_sequences=True)(input_shape)
lstm2 = layers.LSTM(256)(lstm1)
dense1 = layers.Dense(128, activation="relu")(lstm2)
dropout = layers.Dropout(rate=0.2)(dense1)
output_shape = layers.Dense(1)(dropout)

# Define the model.
lstm_model = tf.keras.Model(inputs=input_shape, outputs=output_shape)
lstm_model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=["mse"])

# Train the model.
lstm_model.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.2, verbose=1)

# Predict the testing set.
y_pred = lstm_model.predict(X_test)

# Inverse normalize the predicted and actual values back to sales units.
y_test_real = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_real = scaler.inverse_transform(y_pred)

# Calculate RMSE and R².
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_real))
r2 = r2_score(y_test_real, y_pred_real)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Save the actual and predicted values as a CSV file.
# The test set is the *tail* of the 2017 samples, so its dates are anchored
# at the end of the year rather than at January 1st.
# NOTE(review): this assumes no 2017 day was skipped while building the
# samples; confirm if the source data has gaps.
comparison_df = pd.DataFrame({
    'Date': pd.date_range(end='2017-12-31', periods=len(y_test_real)),
    'Actual': y_test_real.flatten(),
    'Predicted': y_pred_real.flatten(),
    'StoreItem': target_column
})

output_file_path = r'D:\desktop\store1item1_actual_vs_predicted_2017.csv'
comparison_df.to_csv(output_file_path, index=False)
print(f"The actual and predicted values have been saved as CSV files:{output_file_path}")

# Plot a comparison chart of actual values and predicted values.
plt.figure(figsize=(10, 6))
plt.plot(comparison_df['Date'], comparison_df['Actual'], label='Actual', color='blue')
plt.plot(comparison_df['Date'], comparison_df['Predicted'], label='Predicted', color='red')
plt.title(f'Actual vs Predicted for {target_column} in 2017')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()