In [1]:
import os
import pickle
import sys

import copy
import random
import joblib
import numpy as np
import pandas as pd
import rioxarray as rxr
import torch
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

sys.path.append("..")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src.model_utils import reshape_data, downsample
from src.dataprocessing import generate_subsets, generate_blocks

## Paths to data

In [2]:
# Relative locations of the input arrays (.npy) and of the target raster (.tif)
path_to_npys_data = os.path.join("..", "data", "npys_data")

# Feature tables and target table stored as .npy files
pathFeatures = os.path.join(path_to_npys_data, "features_initial_data.npy")
pathMorf = os.path.join(path_to_npys_data, "features_morf_data.npy")
pathTarget = os.path.join(path_to_npys_data, "target_croplands.npy")

# Target raster
pathTarget_tif = os.path.join("..", "data", "target", "target_croplands.tif")

## Data

In [3]:
# Feature tables
# NOTE: allow_pickle=True lets np.load unpickle objects stored in the .npy
# files, so these files must come from a trusted source.
climate_features = pd.DataFrame.from_dict(
    np.load(pathFeatures, allow_pickle=True), orient="columns"
)
morf_features = pd.DataFrame.from_dict(
    np.load(pathMorf, allow_pickle=True), orient="columns"
)

# Column names of each table, captured before any column is dropped
climate_keys = list(climate_features.keys())
morf_keys = list(morf_features.keys())

# Store both lists of column names next to the input arrays
for keys_filename, keys_to_save in (
    ("climate_keys.pkl", climate_keys),
    ("morf_keys.pkl", morf_keys),
):
    with open(os.path.join(path_to_npys_data, keys_filename), "wb") as file:
        pickle.dump(keys_to_save, file)

In [4]:
# Target variable: load the label table and read its "Target" column as integers
target_table = pd.DataFrame.from_dict(
    np.load(pathTarget, allow_pickle=True), orient="columns"
)
target_classes = target_table["Target"].astype(int)

# Every class above 3 (classes 4 and 5) is relabelled as class 0
merged_classes = np.where(target_classes > 3, 0, target_classes)
y = pd.DataFrame({"target": merged_classes})

## Data Preparation 


### Main/hold-out split by geographic region (longitude/latitude bounds)

In [5]:
# Combine the climate features, the morf features and the target into one
# table, then split that table by coordinates into a main set and a hold-out.

# latitude/longitude are removed from the climate table; the filters below read
# them from the combined table, so they are expected to come from morf_features.
# Rebinding the name with errors="ignore" (instead of inplace=True) keeps this
# cell re-runnable: a second run no longer raises KeyError on columns that were
# already dropped.
climate_features = climate_features.drop(
    columns=["latitude", "longitude"], errors="ignore"
)
data = pd.concat([climate_features, morf_features, y], axis=1)

# Main set: rows with longitude <= 100.
main_data = data[data["longitude"] <= 100]
# Hold-out: rows with 115 <= longitude <= 135 and 42 <= latitude <= 55.
# Rows matching neither filter are not used by either subset.
hold_out = data[
    (115 <= data["longitude"]) & (data["longitude"] <= 135)
    & (42 <= data["latitude"]) & (data["latitude"] <= 55)
]

# Every column except the last one (the target) is a feature column.
X_keys = list(data.keys()[:-1])

with open(os.path.join(path_to_npys_data, "X_keys.pkl"), "wb") as file:
    pickle.dump(X_keys, file)

# The combined table is not needed once the two subsets exist.
del data

In [6]:
# Separate the target column from the feature columns and convert both parts
# to NumPy arrays. pop() removes "target" from the frame, so what is left in
# the frame afterwards is the feature matrix. Because of that removal this
# cell cannot be run twice on the same frames.

# main set
y = main_data.pop("target").to_numpy()
X = main_data.to_numpy()

# hold-out set
y_holdout = hold_out.pop("target").to_numpy()
X_holdout = hold_out.to_numpy()

### Target one-hot encoding, train/val/test split and min-max scaling

In [7]:
y = pd.DataFrame(y, columns=["Target"])

# One-hot encode the target classes as a dense array.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
ohe = ohe.fit(y)
y = ohe.transform(y)

# Train/test split: 2% of the main set is held back, stratified by class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.02, stratify=y, random_state=123
)

# The held-back 2% is halved into the test and validation sets
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=123
)

# Define scaler based on train set only, and store it for later reuse
scaler = MinMaxScaler()
minmax = scaler.fit(X_train)
joblib.dump(minmax, os.path.join(path_to_npys_data, "scaler_FR_RUS.save"))

# Normalization of every subset with the train-fitted minmax scaler
X_train = minmax.transform(X_train)
X_val = minmax.transform(X_val)
X_test = minmax.transform(X_test)
X_holdout = minmax.transform(X_holdout)

# From here on X and y are dictionaries; they are filled with the
# "Train"/"Val"/"Test" entries in a later cell.
X = dict()
y = dict()

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("X_holdout shape:", X_holdout.shape)

X_train shape: (21392910, 164)
X_val shape: (218295, 164)
X_test shape: (218295, 164)
X_holdout shape: (3241413, 164)


### Random under-/over-sampling (RUS/ROS) of the train set

In [8]:
# Resample the train set. The one-hot rows are collapsed to class indices with
# argmax before being handed to downsample().
# NOTE: y_train is overwritten with whatever downsample() returns, so this cell
# is meant to be run once after the split cell, not repeatedly.
train_class_ids = np.argmax(y_train, axis=1)
X_train, y_train = downsample(X_train, train_class_ids, oversampling=True)

Initial data:
[(0, 17225658), (3, 2691221), (2, 1240911), (1, 235120)]
Resampled data:
[(0, 2691221), (1, 2691221), (2, 2691221), (3, 2691221)]


In [9]:
# Collect the three subsets in the X / y dictionaries under the keys
# "Train", "Val" and "Test".
X["Train"] = X_train
X["Val"] = X_val
X["Test"] = X_test
# The resampled train labels are one-hot encoded again here. The encoder was
# fitted on a frame whose only column is named "Target"; using the same column
# name avoids scikit-learn's feature-name mismatch warning.
y["Train"] = ohe.transform(pd.DataFrame(y_train, columns=["Target"]))
y["Val"] = y_val
y["Test"] = y_test

In [10]:
# Save hold-out data (scaled features and raw class labels) as pickle files.
# The output directory is created first, so a fresh checkout does not fail
# with FileNotFoundError when the files are opened for writing.
holdout_dir = os.path.join("..", "data", "processed_files", "pkls")
os.makedirs(holdout_dir, exist_ok=True)

with open(os.path.join(holdout_dir, "X_FR_RUS_holdout.pkl"), "wb") as file:
    pickle.dump(X_holdout, file)

with open(os.path.join(holdout_dir, "y_holdout.pkl"), "wb") as file:
    pickle.dump(y_holdout, file)

In [11]:
# Write the X and y dictionaries to their own pickle files
pkls_dir = os.path.join("..", "data", "processed_files", "pkls")

for pkl_name, payload in (("X_FR_RUS.pkl", X), ("y_FR_RUS.pkl", y)):
    with open(os.path.join(pkls_dir, pkl_name), "wb") as fp:
        pickle.dump(payload, fp)

In [12]:
def _as_feature_frame(array):
    """Wrap a feature matrix in a DataFrame whose columns are named by X_keys."""
    return pd.DataFrame(array, columns=X_keys)


# reshape_data() returns a monthly part, a static part and the two key lists;
# the key lists are only kept from the train call.
X_Train_monthly, X_Train_static, monthly_keys, static_keys = reshape_data(
    _as_feature_frame(X_train)
)
X_val_monthly, X_val_static, _, _ = reshape_data(_as_feature_frame(X_val))
X_test_monthly, X_test_static, _, _ = reshape_data(_as_feature_frame(X_test))

# Replace the flat matrices in X by (monthly, static) pairs.
X["Train"] = (X_Train_monthly, X_Train_static)
X["Val"] = (X_val_monthly, X_val_static)
X["Test"] = (X_test_monthly, X_test_static)

In [13]:
# Store the monthly and static key lists next to the input arrays
for keys_filename, keys_to_save in (
    ("monthly_keys.pkl", monthly_keys),
    ("static_keys.pkl", static_keys),
):
    with open(os.path.join(path_to_npys_data, keys_filename), "wb") as file:
        pickle.dump(keys_to_save, file)

In [14]:
# Write the (monthly, static) version of X and the y dictionary to pickle files
lstm_pkls_dir = os.path.join("..", "data", "processed_files", "pkls")

for pkl_name, payload in (("X_FR_RUS_lstm.pkl", X), ("y_FR_RUS_lstm.pkl", y)):
    with open(os.path.join(lstm_pkls_dir, pkl_name), "wb") as fp:
        pickle.dump(payload, fp)