# Step-by-Step Implementation of GNN for Predicting Rice Prices

This notebook demonstrates how to implement a Graph Neural Network (GNN) to predict rice prices for the next 30 days across 38 provinces in Indonesia. The model captures both temporal (time-based) and spatial (province-based) relationships.

### Step 1: Prepare the Dataset
1. Load the dataset `harga_beras_premium.csv`.
2. Preprocess the data:
   - Convert the `Tanggal` column to datetime format.
   - Normalize the `Harga` column using `MinMaxScaler`.
   - Create an adjacency matrix to represent spatial relationships between provinces.
   - Generate a feature matrix for time-series data.

### Step 2: Prepare Data for PyTorch Geometric
1. Create an `edge_index` tensor to represent spatial connections.
2. Use the time-series data as node features.
3. Define the target as the last 30 days of prices.

### Step 3: Define the Spatio-Temporal GCN Model
1. Use two GCN layers to capture spatial relationships.
2. Add a fully connected layer to capture temporal dependencies.

### Step 4: Train the Model
1. Use Mean Squared Error (MSE) as the loss function.
2. Optimize the model using Adam optimizer.
3. Train the model for a specified number of epochs.

### Step 5: Make Predictions
1. Use the trained model to predict rice prices for the next 30 days.
2. Denormalize the predictions to get the actual prices.

In [25]:
# Step 1: Prepare the Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
df = pd.read_csv('harga_beras_premium.csv')
df = df.dropna()  # Remove rows with missing values
df['Tanggal'] = pd.to_datetime(df['Tanggal'])
df.sort_values('Tanggal', inplace=True)

# Normalize the price column
scaler = MinMaxScaler()
df['Harga'] = scaler.fit_transform(df[['Harga']])

# Create adjacency matrix based on provinces
provinces = df['Provinsi'].unique()
province_index = {province: idx for idx, province in enumerate(provinces)}
adj_matrix = np.zeros((len(provinces), len(provinces)))

# Example: Connect provinces based on geographical proximity (simplified)
for i in range(len(provinces)):
    for j in range(len(provinces)):
        if i != j:
            adj_matrix[i, j] = 1  # Fully connected graph for simplicity

# Ensure all time-series data have the same length
min_length = min(len(df[df['Provinsi'] == province]['Harga'].values) for province in provinces)

features = []
for province in provinces:
    province_data = df[df['Provinsi'] == province]['Harga'].values
    if len(province_data) > min_length:
        province_data = province_data[:min_length]  # Trim to minimum length
    elif len(province_data) < min_length:
        province_data = np.pad(province_data, (0, min_length - len(province_data)), 'constant')  # Pad with zeros
    features.append(province_data)

features = np.array(features).T  # Shape: (time_steps, num_provinces)

  df['Tanggal'] = pd.to_datetime(df['Tanggal'])


In [26]:
# Step 3: Define the Spatio-Temporal GCN Model
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv

class SpatioTemporalGCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SpatioTemporalGCN, self).__init__()
        self.gcn1 = GCNConv(input_dim, hidden_dim)
        self.gcn2 = GCNConv(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.gcn1(x, edge_index)
        x = torch.relu(x)
        x = self.gcn2(x, edge_index)
        x = self.fc(x)
        return x

# Initialize the model
input_dim = features.shape[1]  # Number of provinces
hidden_dim = 64
output_dim = features.shape[1]  # Match the number of provinces (38)
model = SpatioTemporalGCN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [27]:
# Import required libraries
from torch_geometric.data import Data
import torch.optim as optim
from torch_geometric.utils import add_self_loops

# Prepare data for PyTorch Geometric
edge_index = torch.tensor(np.array(np.nonzero(adj_matrix)), dtype=torch.long)
x = torch.tensor(features[-30:], dtype=torch.float)  # Use only the last 30 days
data = Data(x=x, edge_index=edge_index)

# Adjust edge_index to match the reduced input data (30 days)
valid_nodes = list(range(x.size(0)))  # Nodes corresponding to the last 30 days
edge_index = edge_index[:, (edge_index[0] < len(valid_nodes)) & (edge_index[1] < len(valid_nodes))]

# Recalculate edge_index to ensure it matches valid nodes
valid_indices = torch.arange(x.size(0))
edge_index = edge_index[:, (edge_index[0] < valid_indices.size(0)) & (edge_index[1] < valid_indices.size(0))]

# Remap edge_index to match valid nodes
node_mapping = {old_idx: new_idx for new_idx, old_idx in enumerate(valid_nodes)}
edge_index = torch.tensor(
    [[node_mapping[node.item()] for node in edge_index[0] if node.item() in node_mapping],
     [node_mapping[node.item()] for node in edge_index[1] if node.item() in node_mapping]],
    dtype=torch.long
)

# Revalidate edge_index to ensure it matches valid nodes
valid_edges = (edge_index[0] < len(valid_nodes)) & (edge_index[1] < len(valid_nodes))
edge_index = edge_index[:, valid_edges]

# Ensure edge_index matches valid nodes and add self-loops
edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# Define target (last 30 days of prices)
y = torch.tensor(features[-30:], dtype=torch.float)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

RuntimeError: index 30 is out of bounds for dimension 0 with size 30

In [None]:
# Step 5: Make Predictions
model.eval()
predicted_prices = model(data).detach().numpy()
predicted_prices = scaler.inverse_transform(predicted_prices)  # Denormalize

# Print the predicted prices for the next 30 days
print("Predicted Prices for the Next 30 Days:", predicted_prices)

In [None]:
# Step 6: Evaluate the Model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate evaluation metrics
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, mape, r2

# Denormalize the true values
y_true = scaler.inverse_transform(y.numpy())

# Calculate metrics
rmse, mae, mape, r2 = calculate_metrics(y_true, predicted_prices)

# Print the results
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}%")
print(f"R2 Score: {r2}")