In [33]:
import os
import pickle
import sys

import copy
import random
import joblib
import numpy as np
import pandas as pd
import rioxarray as rxr
import torch
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

sys.path.append('..')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# from src.model_utils import downsample, get_unique_values, reshape_data

## Paths to data

In [34]:
# Locations of the preprocessed .npy inputs and of the target raster
path_to_npys_data = os.path.join("..", "data", "npys_data")

# All three .npy dumps live directly under path_to_npys_data
pathTarget, pathFeatures, pathMorf = (
    os.path.join(path_to_npys_data, npy_name)
    for npy_name in (
        "target_croplands.npy",
        "features_initial_data.npy",
        "features_morf_data.npy",
    )
)
pathTarget_tif = os.path.join("..", "data", "target", "target_croplands.tif")

## Data

In [35]:
# Features: load the climate and morphological tables from their .npy dumps.
# allow_pickle=True unpickles arbitrary objects, so only point these paths at trusted files.
climate_features = pd.DataFrame.from_dict(
    np.load(pathFeatures, allow_pickle=True), orient="columns"
)
morf_features = pd.DataFrame.from_dict(
    np.load(pathMorf, allow_pickle=True), orient="columns"
)

# Column names of each table, pickled next to the .npy inputs
climate_keys = climate_features.columns.tolist()
morf_keys = morf_features.columns.tolist()

with open(os.path.join(path_to_npys_data, "climate_keys.pkl"), 'wb') as file:
    pickle.dump(climate_keys, file)

with open(os.path.join(path_to_npys_data, "morf_keys.pkl"), 'wb') as file:
    pickle.dump(morf_keys, file)

In [36]:
# Target variable: read the "Target" column as integer class labels
y = (
    pd.DataFrame.from_dict(np.load(pathTarget, allow_pickle=True), orient="columns")["Target"]
    .astype(int)
)
# Labels up to 3 are kept; anything larger (classes 4 and 5) is folded into class 0
y = pd.DataFrame({"target": np.where(y <= 3, y, 0)})


In [37]:
# Join climate features, morphological features and the target column-wise,
# then split the table geographically into the main area and a hold-out window.
# NOTE(review): latitude/longitude are dropped from climate_features here, so the
# filters below presumably rely on morf_features providing those columns — confirm.
data = pd.concat(
    [climate_features.drop(columns=['latitude', 'longitude']), morf_features, y],
    axis=1,
)

# Main area: everything at longitude <= 100
main_data = data.loc[data['longitude'] <= 100]

# Hold-out: longitude in [115, 135] and latitude in [42, 55], bounds inclusive
in_holdout_window = data['longitude'].between(115, 135) & data['latitude'].between(42, 55)
hold_out = data.loc[in_holdout_window]

with open(os.path.join(path_to_npys_data, "hold_out.pkl"), 'wb') as file:
    pickle.dump(hold_out, file)

# Release the full table and the temporary mask
del data, in_holdout_window

In [38]:
# Number of samples per class in the main area; np.unique returns the counts
# in ascending order of the class labels it found
class_counts = np.unique(main_data['target'].to_numpy(), return_counts=True)[-1]

# Overall number of samples across all classes
total_samples = class_counts.sum()

In [39]:
# Display (rows, columns) of the main-area table as a quick size check
main_data.shape

(21819700, 165)

In [40]:
# get nrows and ncols using preprocessed tif
# Open the raster once and reuse it for both the data and the "x" coordinate
# mask (the previous one-liner opened the same file twice).
target_raster = rxr.open_rasterio(pathTarget_tif).squeeze()

# Keep only the columns with x <= 100; this mirrors the `longitude <= 100`
# filter applied to main_data (assuming the raster's x axis is longitude — confirm).
nrows, ncols = target_raster.where(target_raster["x"] <= 100, drop=True).shape

# Only the grid shape is needed afterwards
del target_raster
print(nrows, ncols)

2450 8906


In [41]:
# Reshape features and target dataframes back to the original raster grid.
# NOTE: pop() removes "target" from main_data in place, so this cell cannot be
# re-run without first re-creating main_data.
y = main_data.pop("target").to_numpy().reshape(nrows, ncols)

# Capture the feature names before .values turns the frame into a bare ndarray;
# the array itself has no .keys(), so the names must come from the DataFrame.
X_keys = list(main_data.columns)
X = main_data.values.reshape(nrows, ncols, -1)

# One name per entry along the last axis of X
assert X.shape[-1] == len(X_keys)

with open(os.path.join(path_to_npys_data, "X_keys.pkl"), 'wb') as file:
    pickle.dump(X_keys, file)

In [42]:
# 2. Function to generate 200x200 blocks from data
def generate_blocks(data, block_size=(200, 200)):
    """Tile `data` into blocks along its first two axes.

    Parameters
    ----------
    data : array-like
        Object exposing `.shape` and supporting 2-D slicing (`data[a:b, c:d]`).
        Any axes after the first two are carried along untouched.
    block_size : tuple of int
        `(rows, cols)` of a full block.

    Returns
    -------
    list
        Slices of `data`, ordered top-to-bottom and, within each band of rows,
        left-to-right. Blocks on the bottom/right edge are smaller than
        `block_size` when the data extent is not a multiple of it.
    """
    height, width = data.shape[:2]
    tile_h, tile_w = block_size

    return [
        data[top:top + tile_h, left:left + tile_w]
        for top in range(0, height, tile_h)
        for left in range(0, width, tile_w)
    ]

In [95]:
def generate_subsets(blocks, empty, max_iterations, class_counts, options, options_distr, train_portion, val_test_portion):
    """Randomly distribute blocks over the subsets without exceeding per-class quotas.

    On every iteration one block is drawn at random from `blocks` and the
    subsets are tried in a shuffled order. The block goes into the first subset
    where, for every class, the subset's running count plus the block's count
    stays within `class_counts[cls] * portion`. A block that fits nowhere is put
    back into `blocks` and may be drawn again later.

    Parameters
    ----------
    blocks : list
        Candidate blocks; element `[1]` of each block is the label array.
        Mutated in place (blocks are popped and possibly re-appended).
    empty : dict
        Per-class counts added to every block's counts before the quota check
        (the callers in this notebook pass all zeros).
    max_iterations : int
        Upper bound on the number of draws.
    class_counts : sequence
        Total number of samples per class; quotas are fractions of these.
    options : list of list
        The subsets being filled; mutated in place. Index 0 uses
        `train_portion`, every other index uses `val_test_portion`.
    options_distr : list of dict
        Running per-class counts for each subset; mutated in place.
    train_portion, val_test_portion : float
        Quota fractions for subset 0 and for the remaining subsets.

    Returns
    -------
    tuple
        `(options, options_distr, blocks)` — the same objects that were passed
        in, with `blocks` holding whatever could not be placed.
    """
    n_classes = len(class_counts)
    draws = 0

    while blocks and draws < max_iterations:
        candidate = blocks.pop(random.randint(0, len(blocks) - 1))

        # Per-class label counts of the candidate, with absent classes set to 0
        labels, counts = np.unique(candidate[1].flatten(), return_counts=True)
        observed = dict(zip(labels, counts))
        block_distr = {cls: observed.get(cls, 0) for cls in [0, 1, 2, 3]}
        for cls, extra in empty.items():
            block_distr[cls] += extra

        # Try the subsets in random order so no subset is systematically favoured
        order = list(range(len(options)))
        random.shuffle(order)

        placed = False
        for target in order:
            share = train_portion if target == 0 else val_test_portion
            within_quota = all(
                options_distr[target][cls] + block_distr[cls] <= class_counts[cls] * share
                for cls in range(n_classes)
            )
            if within_quota:
                options[target].append(candidate)
                for cls, amount in block_distr.items():
                    options_distr[target][cls] += amount
                placed = True
                break

        if not placed:
            # Nothing could take it right now; return it to the pool
            blocks.append(candidate)

        draws += 1

    return options, options_distr, blocks

In [None]:
# Seed the stdlib RNG that generate_subsets draws from so the split is
# reproducible (same value as the random_state used for train_test_split below).
random.seed(123)

# The 200x200 tiling of X and y is identical for every attempt, so build it once.
# Each element is an (X_block, y_block) pair.
all_blocks = list(zip(
    generate_blocks(X, block_size=(200, 200)),
    generate_blocks(y, block_size=(200, 200)),
))

# Repeat the randomized assignment up to 100 times and keep the attempt that
# leaves the fewest unassigned blocks.
minblocks = np.inf
for _attempt in tqdm(range(100)):
    # generate_subsets pops from the list it receives, so hand it a fresh shallow copy
    blocks = list(all_blocks)

    max_iterations = 10 * len(blocks)

    train, val, test = [], [], []
    train_distr, val_distr, test_distr, empty = ({0: 0, 1: 0, 2: 0, 3: 0} for _ in range(4))

    options = [train, val, test]
    options_distr = [train_distr, val_distr, test_distr]

    options, options_distr, blocks = generate_subsets(blocks, empty, max_iterations, class_counts, options, options_distr, 0.8, 0.1)

    if len(blocks) < minblocks:
        minblocks = len(blocks)
        # Every attempt builds brand-new lists and dicts, so keeping references is
        # enough; a deepcopy would duplicate the block data itself for no benefit.
        # NOTE: the stored blocks are whatever generate_blocks returned (slices of
        # X / y), so modifying them in place later would write through to X / y.
        results = options
        results_distr = options_distr
        residual_blocks = blocks
    if minblocks == 0:
        break

# work with residuals increasing limits: retry the leftover blocks of the best
# attempt with the quotas relaxed to 0.85 (train) and 0.15 (val / test).
# results, results_distr and residual_blocks are updated in place by this call.
max_iterations = 10 * minblocks
options, options_distr, blocks = generate_subsets(residual_blocks, empty, max_iterations, class_counts, results, results_distr, 0.85, 0.15)

In [105]:
# Check residuals: print the per-class label counts of every block that is
# still unassigned. Iterate residual_blocks directly so the loop bound and the
# indexed list cannot disagree.
for i, residual in enumerate(residual_blocks):
    print("Block ", i)
    print(np.unique(residual[1].flatten(), return_counts=True)[1])

In [106]:
# Check results distr: share of each class's samples that landed in each subset.
# The loop variable is not called `set`, which would shadow the builtin for the
# rest of the kernel session.
for s, subset_name in enumerate(['train', 'val', 'test']):
    print(subset_name, [results_distr[s][i] / class_counts[i] for i in range(4)])

train [0.8004722015891051, 0.8132737018481314, 0.7990903066878018, 0.7991303442208421]
val [0.09993501066201975, 0.11509349027584424, 0.10091530090250612, 0.10567763380216041]
test [0.09959278774887509, 0.0716328078760243, 0.09999439240969213, 0.09519202197699757]


## Data Preparation 


### Target one-hot encoding and train/test split

In [16]:
# NOTE(review): this cell carries a lower execution count (In [16]) than the
# block-splitting cells above it, and by this point X / y have been reshaped to
# the raster grid — it presumably predates the block-based split. Confirm that
# it is still meant to run, and on which X / y.
y = pd.DataFrame(y, columns=["Target"])
# Fit a one-hot encoder on the target column and replace y by its encoding.
# NOTE(review): recent scikit-learn releases renamed `sparse` to `sparse_output`
# — check against the pinned version.
ohe = OneHotEncoder(handle_unknown="ignore", sparse=False).fit(y)
y = ohe.transform(y)

# Train/test split: 25% of the samples are held out, stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=123
)

# Split the held-out part again: 15% of it becomes validation, the rest stays test
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.15, stratify=y_test, random_state=123
)

# Fit the min-max scaler on the training split only and persist it
scaler = MinMaxScaler()
minmax = scaler.fit(X_train)
joblib.dump(minmax, os.path.join(path_to_npys_data, "scaler.save"))

# Normalization using minmax scaler (same fitted scaler for all three splits)
X_train = minmax.transform(X_train)
X_test = minmax.transform(X_test)
X_val = minmax.transform(X_val)

# X and y are rebound to empty dicts here; a later cell fills them per split
X = dict()
y = dict()

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_val shape:", X_val.shape)

X_train shape: (10228764, 162)
X_test shape: (2898149, 162)
X_val shape: (511439, 162)


In [18]:
# Check class distribution of the training split (argmax turns one-hot rows into class indices)
# NOTE(review): get_unique_values is not defined in the visible notebook; the
# `from src.model_utils import ...` line that names it is commented out — confirm
# the import before running.
get_unique_values(np.argmax(y_train, 1))

[(1, 110024), (2, 439606), (3, 879902), (0, 8799232)]

In [19]:
# Downsample the training and validation splits with oversampling disabled.
# Per the recorded output below, class 0 is cut down to the size of class 3 and
# the other classes keep their counts.
# NOTE(review): downsample is not defined in the visible notebook; the
# `from src.model_utils import ...` line that names it is commented out — confirm.
X_train, y_train = downsample(X_train, np.argmax(y_train, 1), oversampling=False)
X_val, y_val = downsample(X_val, np.argmax(y_val, 1), oversampling=False)

Initial data:
[(0, 8799232), (3, 879902), (2, 439606), (1, 110024)]
Resampled data:
[(0, 879902), (3, 879902), (2, 439606), (1, 110024)]
Initial data:
[(0, 439962), (3, 43995), (2, 21981), (1, 5501)]
Resampled data:
[(0, 43995), (3, 43995), (2, 21981), (1, 5501)]


In [20]:
# Collect the three splits into the X / y dictionaries keyed by split name
X["Train"] = X_train
X["Val"] = X_val
X["Test"] = X_test
# Train / Val labels are passed through the fitted encoder again, presumably
# because downsample returned class indices rather than one-hot rows — confirm.
y["Train"] = ohe.transform(pd.DataFrame(y_train))
y["Val"] = ohe.transform(pd.DataFrame(y_val))
# The test labels are stored exactly as they came out of the split
y["Test"] = y_test

In [21]:
# Persist the split dictionaries (features and labels) as pickle files
pkls_dir = os.path.join("..", "data", "processed_files", "pkls")

with open(os.path.join(pkls_dir, "X_down.pkl"), "wb") as fp:
    pickle.dump(X, fp)

with open(os.path.join(pkls_dir, "y_down.pkl"), "wb") as fp:
    pickle.dump(y, fp)

In [22]:
# Replace each split in X by the output of reshape_data applied to a DataFrame
# of that split (the result is later saved under an "_lstm" file name).
# NOTE(review): neither `keys` nor `reshape_data` is defined in the visible
# notebook (the src.model_utils import is commented out; only climate_keys and
# morf_keys exist) — confirm where both are meant to come from.
X["Train"] = reshape_data(pd.DataFrame(X_train, columns=keys))
X["Val"] = reshape_data(pd.DataFrame(X_val, columns=keys))
X["Test"] = reshape_data(pd.DataFrame(X_test, columns=keys))

In [24]:
# Persist the reshaped feature dictionary and the label dictionary as pickle files
pkls_dir = os.path.join("..", "data", "processed_files", "pkls")

with open(os.path.join(pkls_dir, "X_down_lstm.pkl"), "wb") as fp:
    pickle.dump(X, fp)

with open(os.path.join(pkls_dir, "y_down_lstm.pkl"), "wb") as fp:
    pickle.dump(y, fp)