In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, r2_score
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [2]:

xy = pd.read_parquet('XY_version1.parquet')
xy.head()

Unnamed: 0,year,area,avg_yield_maize_corn_1y,avg_yield_maize_corn_3y,avg_yield_maize_corn_6y,avg_yield_other_vegetables_fresh_nec_1y,avg_yield_other_vegetables_fresh_nec_3y,avg_yield_other_vegetables_fresh_nec_6y,avg_yield_potatoes_1y,avg_yield_potatoes_3y,...,Y_oil_palm_fruit,Y_other_vegetables_fresh_nec,Y_potatoes,Y_rice,Y_soya_beans,Y_sugar_beet,Y_sugar_cane,Y_tomatoes,Y_watermelons,Y_wheat
0,1983,Afghanistan,1665.8,1668.633333,1636.283333,6919.2,6846.166667,6561.216667,15511.4,15265.133333,...,,7065.7,15764.7,2258.1,,20000.0,19375.0,,9754.9,1258.0
1,1984,Afghanistan,1664.1,1666.3,1649.75,7065.7,6959.033333,6775.366667,15764.7,15566.6,...,,7155.1,14444.4,2241.6,,20000.0,19354.8,,9630.0,1231.9
2,1985,Afghanistan,1661.2,1663.7,1656.9,7155.1,7046.666667,6897.8,14444.4,15240.166667,...,,7145.9,14090.9,2248.2,,3333.3,19333.3,,9556.7,1227.7
3,1986,Afghanistan,1665.2,1663.5,1666.066667,7145.9,7122.233333,6984.2,14090.9,14766.666667,...,,7249.5,15866.7,2240.0,,12500.0,20000.0,,10058.8,1189.0
4,1987,Afghanistan,1687.5,1671.3,1668.8,7249.5,7183.5,7071.266667,15866.7,14800.666667,...,,7524.8,15500.0,2000.0,,15000.0,20000.0,,9952.9,1284.9


In [3]:
latlong  = pd.read_csv('coordinates_countries_full_209.csv')
latlong['area'] = latlong['Area'].str.replace(' ', '_')
latlong = latlong[['area', 'latitude', 'longitude']]

In [4]:
xy = xy.merge(latlong, on='area')
xy = pd.get_dummies(xy, columns=['area'])
xy.columns = (
    xy.columns
    .str.strip()             # remove leading/trailing spaces
    .str.replace(' ', '_')   # replace spaces
    .str.replace(r'[^A-Za-z0-9_]', '', regex=True)  # remove special chars
)
xy.columns.tolist()

['year',
 'avg_yield_maize_corn_1y',
 'avg_yield_maize_corn_3y',
 'avg_yield_maize_corn_6y',
 'avg_yield_other_vegetables_fresh_nec_1y',
 'avg_yield_other_vegetables_fresh_nec_3y',
 'avg_yield_other_vegetables_fresh_nec_6y',
 'avg_yield_potatoes_1y',
 'avg_yield_potatoes_3y',
 'avg_yield_potatoes_6y',
 'avg_yield_rice_1y',
 'avg_yield_rice_3y',
 'avg_yield_rice_6y',
 'avg_yield_sugar_cane_1y',
 'avg_yield_sugar_cane_3y',
 'avg_yield_sugar_cane_6y',
 'avg_yield_wheat_1y',
 'avg_yield_wheat_3y',
 'avg_yield_wheat_6y',
 'avg_yield_oil_palm_fruit_1y',
 'avg_yield_oil_palm_fruit_3y',
 'avg_yield_oil_palm_fruit_6y',
 'avg_yield_barley_1y',
 'avg_yield_barley_3y',
 'avg_yield_barley_6y',
 'avg_yield_soya_beans_1y',
 'avg_yield_soya_beans_3y',
 'avg_yield_soya_beans_6y',
 'avg_yield_sugar_beet_1y',
 'avg_yield_sugar_beet_3y',
 'avg_yield_sugar_beet_6y',
 'avg_yield_watermelons_1y',
 'avg_yield_watermelons_3y',
 'avg_yield_watermelons_6y',
 'avg_yield_cucumbers_and_gherkins_1y',
 'avg_yield_cuc

In [5]:
features_col =[
 'avg_yield_maize_corn_1y',
 'avg_yield_maize_corn_3y',
 'avg_yield_maize_corn_6y',
 'avg_yield_other_vegetables_fresh_nec_1y',
 'avg_yield_other_vegetables_fresh_nec_3y',
 'avg_yield_other_vegetables_fresh_nec_6y',
 'avg_yield_potatoes_1y',
 'avg_yield_potatoes_3y',
 'avg_yield_potatoes_6y',
 'avg_yield_rice_1y',
 'avg_yield_rice_3y',
 'avg_yield_rice_6y',
 'avg_yield_sugar_cane_1y',
 'avg_yield_sugar_cane_3y',
 'avg_yield_sugar_cane_6y',
 'avg_yield_wheat_1y',
 'avg_yield_wheat_3y',
 'avg_yield_wheat_6y',
 'avg_yield_oil_palm_fruit_1y',
 'avg_yield_oil_palm_fruit_3y',
 'avg_yield_oil_palm_fruit_6y',
 'avg_yield_barley_1y',
 'avg_yield_barley_3y',
 'avg_yield_barley_6y',
 'avg_yield_soya_beans_1y',
 'avg_yield_soya_beans_3y',
 'avg_yield_soya_beans_6y',
 'avg_yield_sugar_beet_1y',
 'avg_yield_sugar_beet_3y',
 'avg_yield_sugar_beet_6y',
 'avg_yield_watermelons_1y',
 'avg_yield_watermelons_3y',
 'avg_yield_watermelons_6y',
 'avg_yield_cucumbers_and_gherkins_1y',
 'avg_yield_cucumbers_and_gherkins_3y',
 'avg_yield_cucumbers_and_gherkins_6y',
 'avg_yield_tomatoes_1y',
 'avg_yield_tomatoes_3y',
 'avg_yield_tomatoes_6y',
 'avg_yield_bananas_1y',
 'avg_yield_bananas_3y',
 'avg_yield_bananas_6y',
 'avg_yield_cassava_fresh_1y',
 'avg_yield_cassava_fresh_3y',
 'avg_yield_cassava_fresh_6y',
 'rain_Jan',
 'rain_Feb',
 'rain_Mar',
 'rain_Apr',
 'rain_May',
 'rain_Jun',
 'rain_Jul',
 'rain_Aug',
 'rain_Sep',
 'rain_Oct',
 'rain_Nov',
 'rain_Dec',
 'avg_rain_1_3',
 'avg_rain_3_6',
 'avg_rain_6_9',
 'avg_rain_10_12',
 'avg_rain_1_12',
 'solar_Jan',
 'solar_Feb',
 'solar_Mar',
 'solar_Apr',
 'solar_May',
 'solar_Jun',
 'solar_Jul',
 'solar_Aug',
 'solar_Sep',
 'solar_Oct',
 'solar_Nov',
 'solar_Dec',
 'avg_solar_1_3',
 'avg_solar_3_6',
 'avg_solar_6_9',
 'avg_solar_10_12',
 'avg_solar_1_12',
 'temp_Jan',
 'temp_Feb',
 'temp_Mar',
 'temp_Apr',
 'temp_May',
 'temp_Jun',
 'temp_Jul',
 'temp_Aug',
 'temp_Sep',
 'temp_Oct',
 'temp_Nov',
 'temp_Dec',
 'avg_temp_1_3',
 'avg_temp_3_6',
 'avg_temp_6_9',
 'avg_temp_10_12',
 'avg_temp_1_12',
 'sum_rain_1_3',
 'sum_rain_3_6',
 'sum_rain_6_9',
 'sum_rain_10_12',
 'sum_rain_1_12',

 'latitude',
 'longitude',
 'area_Afghanistan',
 'area_Albania',
 'area_Algeria',
 'area_Angola',
 'area_Antigua_and_Barbuda',
 'area_Argentina',
 'area_Armenia',
 'area_Australia',
 'area_Austria',
 'area_Azerbaijan',
 'area_Bahamas',
 'area_Bangladesh',
 'area_Barbados',
 'area_Belarus',
 'area_Belgium',
 'area_BelgiumLuxembourg',
 'area_Belize',
 'area_Benin',
 'area_Bhutan',
 'area_Bolivia_Plurinational_State_of',
 'area_Bosnia_and_Herzegovina',
 'area_Botswana',
 'area_Brazil',
 'area_Bulgaria',
 'area_Burkina_Faso',
 'area_Burundi',
 'area_Cabo_Verde',
 'area_Cambodia',
 'area_Cameroon',
 'area_Canada',
 'area_Central_African_Republic',
 'area_Chad',
 'area_Chile',
 'area_China',
 'area_China_Taiwan_Province_of',
 'area_China_mainland',
 'area_Colombia',
 'area_Comoros',
 'area_Congo',
 'area_Costa_Rica',
 'area_Croatia',
 'area_Cuba',
 'area_Czechia',
 'area_Czechoslovakia',
 'area_Cte_dIvoire',
 'area_Democratic_Peoples_Republic_of_Korea',
 'area_Democratic_Republic_of_the_Congo',
 'area_Denmark',
 'area_Djibouti',
 'area_Dominica',
 'area_Dominican_Republic',
 'area_Ecuador',
 'area_Egypt',
 'area_El_Salvador',
 'area_Eritrea',
 'area_Eswatini',
 'area_Ethiopia',
 'area_Ethiopia_PDR',
 'area_Fiji',
 'area_France',
 'area_French_Guiana',
 'area_Gabon',
 'area_Gambia',
 'area_Georgia',
 'area_Germany',
 'area_Ghana',
 'area_Greece',
 'area_Grenada',
 'area_Guatemala',
 'area_Guinea',
 'area_GuineaBissau',
 'area_Guyana',
 'area_Haiti',
 'area_Honduras',
 'area_Hungary',
 'area_India',
 'area_Indonesia',
 'area_Iran_Islamic_Republic_of',
 'area_Iraq',
 'area_Israel',
 'area_Italy',
 'area_Jamaica',
 'area_Japan',
 'area_Jordan',
 'area_Kazakhstan',
 'area_Kenya',
 'area_Kuwait',
 'area_Kyrgyzstan',
 'area_Lao_Peoples_Democratic_Republic',
 'area_Lebanon',
 'area_Lesotho',
 'area_Libya',
 'area_Lithuania',
 'area_Luxembourg',
 'area_Madagascar',
 'area_Malawi',
 'area_Malaysia',
 'area_Maldives',
 'area_Mali',
 'area_Mauritania',
 'area_Mauritius',
 'area_Mexico',
 'area_Micronesia_Federated_States_of',
 'area_Montenegro',
 'area_Morocco',
 'area_Mozambique',
 'area_Myanmar',
 'area_Namibia',
 'area_Nepal',
 'area_Netherlands_Kingdom_of_the',
 'area_New_Caledonia',
 'area_New_Zealand',
 'area_Nicaragua',
 'area_Niger',
 'area_Nigeria',
 'area_North_Macedonia',
 'area_Oman',
 'area_Pakistan',
 'area_Panama',
 'area_Papua_New_Guinea',
 'area_Paraguay',
 'area_Peru',
 'area_Philippines',
 'area_Poland',
 'area_Portugal',
 'area_Puerto_Rico',
 'area_Qatar',
 'area_Republic_of_Korea',
 'area_Republic_of_Moldova',
 'area_Romania',
 'area_Russian_Federation',
 'area_Rwanda',
 'area_Runion',
 'area_Saint_Vincent_and_the_Grenadines',
 'area_Sao_Tome_and_Principe',
 'area_Saudi_Arabia',
 'area_Senegal',
 'area_Serbia',
 'area_Serbia_and_Montenegro',
 'area_Sierra_Leone',
 'area_Slovakia',
 'area_Slovenia',
 'area_Somalia',
 'area_South_Africa',
 'area_South_Sudan',
 'area_Spain',
 'area_Sri_Lanka',
 'area_Sudan',
 'area_Sudan_former',
 'area_Suriname',
 'area_Sweden',
 'area_Switzerland',
 'area_Syrian_Arab_Republic',
 'area_Tajikistan',
 'area_Thailand',
 'area_TimorLeste',
 'area_Togo',
 'area_Trinidad_and_Tobago',
 'area_Turkmenistan',
 'area_Trkiye',
 'area_USSR',
 'area_Uganda',
 'area_Ukraine',
 'area_United_Arab_Emirates',
 'area_United_Republic_of_Tanzania',
 'area_United_States_of_America',
 'area_Uruguay',
 'area_Uzbekistan',
 'area_Vanuatu',
 'area_Venezuela_Bolivarian_Republic_of',
 'area_Viet_Nam',
 'area_Yemen',
 'area_Yugoslav_SFR',
 'area_Zambia',
 'area_Zimbabwe']




Start LSTM Model

In [6]:
# === Build clean dataset for LSTM (similar to Model_Part 5) ===

# 1) Keep only non-negative Y_rice and drop NaN / inf in all columns
target_col = "Y_rice"

xy_drop_na = xy[xy[target_col].notna() & (xy[target_col] >= 0)].copy()

# Remove inf and NaN in all columns
xy_drop_na = xy_drop_na.replace([np.inf, -np.inf], np.nan).dropna()

# 2) Build feature list automatically
#    - remove all Y_* target columns
#    - remove one-hot area_* columns
remove_targets = [c for c in xy_drop_na.columns if c.startswith("Y_")]
remove_areas   = [c for c in xy_drop_na.columns if c.startswith("area_")]

feature_cols = [
    c for c in xy_drop_na.columns
    if c not in remove_targets
    and c not in remove_areas
    and c != target_col
]

print("Removed target columns:", remove_targets)
print("Removed area columns:", len(remove_areas))
print("Final feature count:", len(feature_cols))

# 3) Train / Val / Test split by year (same style as Model_Part 5)
TRAIN_END_YEAR = 2016
VAL_END_YEAR   = 2020

train_mask = xy_drop_na["year"] <= TRAIN_END_YEAR
val_mask   = (xy_drop_na["year"] > TRAIN_END_YEAR) & (xy_drop_na["year"] <= VAL_END_YEAR)
test_mask  = xy_drop_na["year"] > VAL_END_YEAR

print("Train samples:", train_mask.sum())
print("Val samples:  ", val_mask.sum())
print("Test samples: ", test_mask.sum())

# 4) Scale features using only the training data
scaler = StandardScaler()

X_train = scaler.fit_transform(xy_drop_na.loc[train_mask, feature_cols])
X_val   = scaler.transform(xy_drop_na.loc[val_mask,   feature_cols])
X_test  = scaler.transform(xy_drop_na.loc[test_mask,  feature_cols])

y_train = xy_drop_na.loc[train_mask, target_col].values
y_val   = xy_drop_na.loc[val_mask,   target_col].values
y_test  = xy_drop_na.loc[test_mask,  target_col].values

# 5) Clean arrays used later for tensors (LSTM 2 uses these names)
X_train_clean = X_train
X_val_clean   = X_val
X_test_clean  = X_test

y_train_clean = y_train
y_val_clean   = y_val
y_test_clean  = y_test



Removed target columns: ['Y_bananas', 'Y_barley', 'Y_cassava_fresh', 'Y_cucumbers_and_gherkins', 'Y_maize_corn', 'Y_oil_palm_fruit', 'Y_other_vegetables_fresh_nec', 'Y_potatoes', 'Y_rice', 'Y_soya_beans', 'Y_sugar_beet', 'Y_sugar_cane', 'Y_tomatoes', 'Y_watermelons', 'Y_wheat']
Removed area columns: 175
Final feature count: 104
Train samples: 131
Val samples:   20
Test samples:  15


In [7]:
import numpy as np
import torch
import torch.nn as nn

# If not defined yet:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# === 1. Combine train + val into a single training set (like example) ===
X_train_full = np.concatenate([X_train_clean, X_val_clean], axis=0)
y_train_full = np.concatenate([y_train_clean, y_val_clean], axis=0).reshape(-1, 1)

X_test = X_test_clean
y_test = y_test_clean.reshape(-1, 1)

print("X_train_full:", X_train_full.shape)
print("y_train_full:", y_train_full.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

# === 2. Create sliding windows (same style as `slider` in the example) ===
slider = 5  # number of timesteps in each LSTM input sequence

def create_sequences(X, y, slider):
    X_seq, y_seq = [], []
    for i in range(len(X) - slider + 1):
        X_seq.append(X[i:i+slider])
        y_seq.append(y[i+slider-1])
    return np.array(X_seq), np.array(y_seq)

X_train, y_train = create_sequences(X_train_full, y_train_full, slider)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, slider)

print("After sliding windows:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test_seq:", X_test_seq.shape, "y_test_seq:", y_test_seq.shape)


Using device: cpu
X_train_full: (151, 104)
y_train_full: (151, 1)
X_test: (15, 104)
y_test: (15, 1)
After sliding windows:
X_train: (147, 5, 104) y_train: (147, 1)
X_test_seq: (11, 5, 104) y_test_seq: (11, 1)


In [8]:
# Convert to PyTorch datasets (same style as example)
train_dataset = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_train),
    torch.FloatTensor(y_train)
)

test_dataset = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_test_seq),
    torch.FloatTensor(y_test_seq)
)

# Split training set into train and validation (80/20, like example)
train_size = int(0.8 * len(train_dataset))
val_size   = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)

print("Train samples:", len(train_dataset))
print("Val samples:  ", len(val_dataset))
print("Test samples: ", len(test_dataset))


Train samples: 117
Val samples:   30
Test samples:  11


In [9]:
# set hyperparameters (same format as LSTM_example)
n_neuron       = 64
activation     = "ReLU"
num_epochs     = 50
learning_rate  = 0.001
minibatch_size = 64
model_num      = 1  # optional label if you save multiple models


In [10]:
# Create data loaders for batched training (same as example)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=minibatch_size,
    shuffle=False
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=minibatch_size,
    shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=minibatch_size,
    shuffle=False
)

print("Train batches:", len(train_loader))
print("Val batches:", len(val_loader))
print("Test batches:", len(test_loader))


Train batches: 2
Val batches: 1
Test batches: 1


In [11]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, activation="ReLU", num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            getattr(nn, activation)(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        out, _ = self.lstm(x)    # [batch, seq_len, hidden]
        x = out[:, -1, :]        # last timestep
        x = self.fc(x)
        return x

# Instantiate model in SAME way as example
model = LSTMModel(
    input_size=X_train.shape[2],
    hidden_size=n_neuron,
    output_size=y_train.shape[-1],
    activation=activation,
    num_layers=2
)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

model.to(device)
print(model)


LSTMModel(
  (lstm): LSTM(104, 64, num_layers=2, batch_first=True)
  (fc): Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [12]:
from utils import pytorch_train  # same as in LSTM_example (3).ipynb

train_losses_mse, val_losses_mse = pytorch_train(
    model,
    optimizer,
    criterion=criterion,
    device=device,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs
)

import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

model.eval()
y_test_true_all = []
y_test_pred_all = []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        out = model(xb)
        y_test_true_all.append(yb.cpu().numpy())
        y_test_pred_all.append(out.cpu().numpy())

y_test_true_all = np.concatenate(y_test_true_all, axis=0).ravel()
y_test_pred_all = np.concatenate(y_test_pred_all, axis=0).ravel()

test_mse = mean_squared_error(y_test_true_all, y_test_pred_all)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test_true_all, y_test_pred_all)

print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R²:   {test_r2:.4f}")

# Scatter plot (true vs predicted)
plt.figure(figsize=(6, 6))
plt.scatter(y_test_true_all, y_test_pred_all, alpha=0.5)
min_val = min(y_test_true_all.min(), y_test_pred_all.min())
max_val = max(y_test_true_all.max(), y_test_pred_all.max())
plt.plot([min_val, max_val], [min_val, max_val], "r--", label="y = x")
plt.xlabel("True Y_rice")
plt.ylabel("Predicted Y_rice")
plt.title("Test: True vs Predicted Y_rice (LSTM, example format)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'xarray'