In [1]:
import os
import pickle
import sys

import copy
import random
import joblib
import numpy as np
import pandas as pd
import rioxarray as rxr
import torch
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

sys.path.append("..")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src.model_utils import reshape_data
from src.dataprocessing import generate_subsets, generate_blocks

/home/crop_dev/miniconda/envs/crop_env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


## Paths to data

In [None]:
# Root folder that stores the pre-computed .npy arrays
path_to_npys_data = os.path.join("..", "data", "npys_data")

# Feature tables (initial + "morf" features) and the target table, all .npy files
pathFeatures = os.path.join(path_to_npys_data, "features_initial_data.npy")
pathMorf = os.path.join(path_to_npys_data, "features_morf_data.npy")
pathTarget = os.path.join(path_to_npys_data, "target_croplands.npy")

# Raster (.tif) version of the target
pathTarget_tif = os.path.join("..", "data", "target", "target_croplands.tif")

##  Data

In [3]:
# Features
# NOTE: allow_pickle=True makes np.load unpickle arbitrary objects, so these
# files must come from a trusted source.
climate_features = pd.DataFrame.from_dict(
    np.load(pathFeatures, allow_pickle=True), orient="columns"
)
morf_features = pd.DataFrame.from_dict(
    np.load(pathMorf, allow_pickle=True), orient="columns"
)

# Column names of each feature table
climate_keys = list(climate_features.columns)
morf_keys = list(morf_features.columns)

# Persist both name lists next to the .npy inputs
for keys_filename, keys_to_save in (
    ("climate_keys.pkl", climate_keys),
    ("morf_keys.pkl", morf_keys),
):
    with open(os.path.join(path_to_npys_data, keys_filename), "wb") as file:
        pickle.dump(keys_to_save, file)

In [4]:
# Target variable: read the "Target" column and cast it to integer labels
y = pd.DataFrame.from_dict(
    np.load(pathTarget, allow_pickle=True), orient="columns"
)["Target"].astype(int)
# Every label greater than 3 is mapped to class 0; the rest are kept as-is
y = pd.DataFrame({"target": np.where(y.gt(3), 0, y)})

## Data Preparation 


### Train/val/test split using pixel blocks

In [6]:
# Combine climate features, morf features and target into one table, then
# filter it by coordinates into the main area and the hold-out area.
#
# The coordinate columns are removed from the climate table so that `data`
# has a single 'latitude'/'longitude' pair (presumably the one carried by
# morf_features -- TODO confirm). The drop rebinds the name instead of using
# inplace=True and tolerates already-missing columns, so re-running this cell
# no longer raises KeyError.
climate_features = climate_features.drop(
    columns=["latitude", "longitude"], errors="ignore"
)
data = pd.concat([climate_features, morf_features, y], axis=1)

# Main area: everything with longitude <= 100
main_data = data[data["longitude"] <= 100]
# Hold-out area: longitude in [115, 135] and latitude in [42, 55] (bounds inclusive)
hold_out = data[
    data["longitude"].between(115, 135) & data["latitude"].between(42, 55)
]

# Feature names = every column except the last one ("target")
X_keys = list(data.columns[:-1])

with open(os.path.join(path_to_npys_data, "X_keys.pkl"), "wb") as file:
    pickle.dump(X_keys, file)

# The combined table is no longer needed once the two subsets exist
del data

In [None]:
# Per-class sample counts in the main area, ordered by ascending class label.
# NOTE(review): later cells index class_counts[i] for i in range(4), which
# assumes all four classes 0..3 are present here -- confirm.
target_classes, class_counts = np.unique(main_data["target"], return_counts=True)

# Total number of samples across all classes
total_samples = class_counts.sum()

In [None]:
# Get nrows and ncols of the main area (x <= 100) from the preprocessed tif.
# The raster is opened only once (it used to be opened twice, for the data and
# again for the x-coordinate mask) and is closed when the `with` block exits.
with rxr.open_rasterio(pathTarget_tif) as target_raster:
    target_2d = target_raster.squeeze()
    nrows, ncols = target_2d.where(target_2d["x"] <= 100, drop=True).shape
print(nrows, ncols)

2450 8906


In [None]:
# NOTE: .pop() removes "target" from main_data / hold_out in place, so this
# cell can only run once per freshly built main_data / hold_out.

# Hold-out area: kept flat, one row per sample
y_holdout = hold_out.pop("target").to_numpy()
X_holdout = hold_out.to_numpy()

# Main area: fold the flat table back into the (nrows, ncols) raster grid;
# the feature axis of X is inferred (-1)
y = main_data.pop("target").to_numpy().reshape(nrows, ncols)
X = main_data.to_numpy().reshape(nrows, ncols, -1)

In [None]:
# Search for a train/val/test assignment of 200x200 pixel blocks that leaves
# as few unassigned (residual) blocks as possible, then try to place the
# residuals with relaxed limits.

# Seed the global RNGs so the search is reproducible across kernel restarts.
# NOTE(review): assumes generate_subsets draws from `random` and/or
# `np.random` -- confirm against src.dataprocessing.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
np.random.seed(SPLIT_SEED)

minblocks = np.inf
# The loop variable is `attempt` (not `iter`) so the builtin iter() is not
# shadowed for the rest of the kernel session.
for attempt in tqdm(range(100)):
    # Generate 200x200 blocks from X and y
    X_blocks = generate_blocks(X, block_size=(200, 200))
    y_blocks = generate_blocks(y, block_size=(200, 200))
    blocks = list(zip(X_blocks, y_blocks))

    max_iterations = 10 * len(blocks)

    # Fresh containers for every attempt: block lists per subset and
    # per-class counters keyed by class label 0..3
    train, val, test = [], [], []
    train_distr, val_distr, test_distr, empty = (
        {0: 0, 1: 0, 2: 0, 3: 0} for _ in range(4)
    )

    options = [train, val, test]
    options_distr = [train_distr, val_distr, test_distr]

    options, options_distr, blocks = generate_subsets(
        blocks, empty, max_iterations, class_counts, options, options_distr, 0.8, 0.1
    )

    # Keep a snapshot of the attempt that left the fewest blocks unassigned
    if len(blocks) < minblocks:
        minblocks = len(blocks)
        results = copy.deepcopy(options)
        results_distr = copy.deepcopy(options_distr)
        residual_blocks = copy.deepcopy(blocks)
    # Nothing left over: no better attempt is possible
    if minblocks == 0:
        break

# work with residuals increasing limits
max_iterations = 10 * minblocks
options, options_distr, blocks = generate_subsets(
    residual_blocks,
    empty,
    max_iterations,
    class_counts,
    results,
    results_distr,
    0.85,
    0.15,
)

In [None]:
# Check residuals: per-class pixel counts of every block that is still
# unassigned after the second generate_subsets pass.
# Iterates `blocks` itself; the loop used to be bounded by len(blocks) while
# indexing `residual_blocks`, which shows the wrong blocks whenever the two
# lists differ.
for i, block in enumerate(blocks):
    print("Block ", i)
    # block[1] is the target part of the (X_block, y_block) pair
    print(np.unique(block[1].flatten(), return_counts=True)[1])

In [None]:
# Check the class distribution of the final split: for each subset, print the
# per-class value from its distribution dict divided by the class total.
# - `subset_name` replaces `set`, which shadowed the builtin set() for the
#   rest of the kernel session.
# - `options_distr` is reported because it is returned together with
#   `options`, which is what the train/val/test arrays are built from.
for subset_idx, subset_name in enumerate(["train", "val", "test"]):
    print(
        subset_name,
        [options_distr[subset_idx][i] / class_counts[i] for i in range(4)],
    )

In [36]:
# Flatten the feature part (block[0]) of every block to (pixels, n_features)
# and stack the blocks of each subset: options[0]=train, [1]=val, [2]=test.
n_features = len(X_keys)
X_train, X_val, X_test = (
    np.concatenate(
        [block[0].reshape(-1, n_features) for block in subset_blocks], axis=0
    )
    for subset_blocks in options[:3]
)

In [44]:
# Flatten the target part (block[1]) of every block to a single column and
# stack the blocks of each subset: options[0]=train, [1]=val, [2]=test.
y_train, y_val, y_test = (
    np.concatenate([block[1].reshape(-1, 1) for block in subset_blocks], axis=0)
    for subset_blocks in options[:3]
)

### Target one-hot encoding and feature scaling

In [49]:
# One-hot encode the targets and min-max scale the features.
# NOTE: this cell overwrites y_* / X_* with their transformed versions, so
# running it twice re-encodes and re-scales already transformed arrays and
# overwrites scaler.save -- re-run from the split cells instead.

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new keyword first and fall back for older installations.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
# The encoder is fitted on the training labels only
ohe = ohe.fit(y_train)
y_train = ohe.transform(y_train)
y_val = ohe.transform(y_val)
y_test = ohe.transform(y_test)

# Fit the scaler on the training features only (not on the whole dataset)
# and store it on disk
scaler = MinMaxScaler()
minmax = scaler.fit(X_train)
joblib.dump(minmax, os.path.join(path_to_npys_data, "scaler.save"))

# Normalization using the train-fitted minmax scaler
X_train = minmax.transform(X_train)
X_val = minmax.transform(X_val)
X_test = minmax.transform(X_test)
X_holdout = minmax.transform(X_holdout)

# From here on X and y are dicts that will hold the splits; they replace the
# raster-shaped arrays that used these names in earlier cells.
X = dict()
y = dict()

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("X_holdout shape:", X_holdout.shape)



X_train shape: (17436100, 164)
X_test shape: (2170000, 164)
X_val shape: (2213600, 164)


In [50]:
# Register every split under its name: features in X, targets in y
X["Train"], X["Val"], X["Test"] = X_train, X_val, X_test
y["Train"], y["Val"], y["Test"] = y_train, y_val, y_test

In [None]:
# Save holdout data: one pickle for the features, one for the targets.
# NOTE(review): y_holdout is not passed through the one-hot encoder in this
# notebook, unlike y_train / y_val / y_test -- confirm that is intended.
for holdout_filename, holdout_array in (
    ("X_holdout.pkl", X_holdout),
    ("y_holdout.pkl", y_holdout),
):
    holdout_path = os.path.join(
        "..", "data", "processed_files", "pkls", holdout_filename
    )
    with open(holdout_path, "wb") as file:
        pickle.dump(holdout_array, file)

In [None]:
# Save the split dictionaries (keys "Train" / "Val" / "Test") as pkl files
for dict_filename, split_dict in (("X.pkl", X), ("y.pkl", y)):
    dict_path = os.path.join("..", "data", "processed_files", "pkls", dict_filename)
    with open(dict_path, "wb") as fp:
        pickle.dump(split_dict, fp)

In [22]:
# Replace each entry of X with the output of reshape_data, called on the
# scaled split wrapped in a DataFrame whose columns are named by X_keys.
# NOTE: this overwrites the flat arrays stored in X by the earlier cell.
for split_name, split_array in (
    ("Train", X_train),
    ("Val", X_val),
    ("Test", X_test),
):
    X[split_name] = reshape_data(pd.DataFrame(split_array, columns=X_keys))

In [24]:
# Save the current X and y dictionaries under the *_lstm.pkl names
for lstm_filename, lstm_dict in (("X_lstm.pkl", X), ("y_lstm.pkl", y)):
    lstm_path = os.path.join("..", "data", "processed_files", "pkls", lstm_filename)
    with open(lstm_path, "wb") as fp:
        pickle.dump(lstm_dict, fp)