In [1]:
import os
import pickle
import sys

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder

sys.path.append("..")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src.model_utils import downsample, reshape_data
from src.dataprocessing import get_class_distribution

## Paths to data

In [2]:
# Directory that holds the .npy inputs, relative to this notebook
path_to_npys_data = os.path.join("..", "data", "npys_data")

# Input files: target labels, initial features and "morf" features
pathTarget_3, pathFeatures, pathMorf = (
    os.path.join(path_to_npys_data, npy_name)
    for npy_name in (
        "target_croplands.npy",
        "features_initial_data.npy",
        "features_morf_data.npy",
    )
)

##  Data

In [3]:
def _load_frame(npy_path):
    """Load an .npy file and wrap its contents in a DataFrame."""
    # NOTE(review): allow_pickle=True lets NumPy unpickle objects stored in the
    # file, which can execute arbitrary code -- only load trusted files.
    return pd.DataFrame.from_dict(
        np.load(npy_path, allow_pickle=True), orient="columns"
    )


# Features: the initial features and the "morf" features, joined column-wise
X = _load_frame(pathFeatures)
morf = _load_frame(pathMorf)
X = pd.concat([X, morf], axis=1)

# Keep the feature column names and also store them on disk as keys.pkl
keys = list(X.columns)
keys_pkl_path = os.path.join("..", "data", "processed_files", "pkls", "keys.pkl")
with open(keys_pkl_path, "wb") as keys_file:
    pickle.dump(keys, keys_file)

# Target variable: the "Target" column as an integer NumPy array
y = _load_frame(pathTarget_3)["Target"].astype(int).to_numpy()

In [4]:
# Fold every label above 3 (classes 4 and 5) into class 0; labels 0-3 are kept
y = np.where(y <= 3, y, 0)

## Data Description

In [5]:
# Summary of the feature frame: index range, column count, dtypes, memory usage
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13638352 entries, 0 to 13638351
Columns: 162 entries, sfcWindmax_M1 to morf_33_10
dtypes: float64(162)
memory usage: 16.5 GB


In [6]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,sfcWindmax_M1,sfcWindmax_M2,sfcWindmax_M3,sfcWindmax_M4,sfcWindmax_M5,sfcWindmax_M6,sfcWindmax_M7,sfcWindmax_M8,sfcWindmax_M9,sfcWindmax_M10,...,morf_33_1,morf_33_2,morf_33_3,morf_33_4,morf_33_5,morf_33_6,morf_33_7,morf_33_8,morf_33_9,morf_33_10
0,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.50077,177.346313,0.707366,0.000439,0.207605,0.062292,-0.257171,-0.257601,0.062723,0.713253
1,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.50588,172.595993,0.707884,0.000409,0.201472,0.057969,-0.25212,-0.256513,0.062362,0.71327
2,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.554733,164.047455,0.708955,0.000276,0.169685,0.039095,-0.23285,-0.254409,0.060654,0.713656
3,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.63044,159.827286,0.709747,0.000169,0.140592,0.023969,-0.219259,-0.253394,0.058104,0.714367
4,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.731494,159.067596,0.710274,0.000134,0.11992,0.019065,-0.216999,-0.253654,0.05572,0.715481


## Data Preparation 


### Target one hot encoding and Train/test split

In [7]:
# NOTE: this cell rebinds X and y (see the end of the cell), so it cannot be
# re-run on its own -- re-run from the data-loading cell instead.

# One-hot encode target
y = pd.DataFrame(y, columns=["Target"])
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the current spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
ohe = ohe.fit(y)
y = ohe.transform(y)

# Train/test split: 80 % train, then the remaining 20 % is halved into
# test and validation. Both splits are stratified on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=123
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, stratify=y_test, random_state=123
)

# Normalization: the scaler is fitted on the training split only and then
# applied to all three splits; the fitted scaler is saved next to the inputs.
scaler = MinMaxScaler().fit(X_train)
joblib.dump(scaler, os.path.join(path_to_npys_data, "scaler.save"))
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# X and y are rebound to empty dicts; later cells fill them with the
# "Train" / "Val" / "Test" splits before pickling.
X = dict()
y = dict()

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)

X_train shape: (10910681, 162)
X_val shape: (1363836, 162)
X_test shape: (1363835, 162)


In [8]:
# Class shares in the training split (one-hot rows turned back into class indices)
get_class_distribution(np.argmax(y_train, axis=1))

Class 1: 1.08 %
Class 2: 4.30 %
Class 3: 8.60 %
Class 0: 86.02 %


In [9]:
# Downsample the train and validation splits with oversampling disabled.
# In the printed counts class 0 is reduced to the size of class 3, while
# classes 1 and 2 keep their original counts.
# NOTE: y_train / y_val are replaced by what downsample returns, so re-running
# this cell would apply argmax to that output instead of the one-hot matrix.
X_train, y_train = downsample(
    X_train, np.argmax(y_train, axis=1), oversampling=False
)
X_val, y_val = downsample(
    X_val, np.argmax(y_val, axis=1), oversampling=False
)

Initial data:
[(0, 9385847), (3, 938562), (2, 468913), (1, 117359)]
Resampled data:
[(0, 938562), (3, 938562), (2, 468913), (1, 117359)]
Initial data:
[(0, 1173231), (3, 117321), (2, 58614), (1, 14670)]
Resampled data:
[(0, 117321), (3, 117321), (2, 58614), (1, 14670)]


In [19]:
# Collect the scaled (and, for Train/Val, downsampled) feature splits.
X["Train"] = X_train
X["Val"] = X_val
X["Test"] = X_test
# Re-encode the downsampled labels with the encoder fitted earlier. The frames
# carry the same "Target" column name the encoder was fitted on, so
# scikit-learn's feature-name check sees matching names instead of warning.
# Assumes downsample returns one label per row -- TODO confirm.
y["Train"] = ohe.transform(pd.DataFrame(np.asarray(y_train), columns=["Target"]))
y["Val"] = ohe.transform(pd.DataFrame(np.asarray(y_val), columns=["Target"]))
# The test split was not downsampled, so its labels are still one-hot encoded.
y["Test"] = y_test

In [11]:
# Write the feature and label dictionaries to X_down.pkl and y_down.pkl
pkls_dir = os.path.join("..", "data", "processed_files", "pkls")
for pkl_name, payload in (("X_down.pkl", X), ("y_down.pkl", y)):
    with open(os.path.join(pkls_dir, pkl_name), "wb") as fp:
        pickle.dump(payload, fp)

In [20]:
# Replace each split in X with its reshape_data(...) form. The arrays are first
# wrapped in a frame that carries the original feature names.
# (The result is what gets saved to the *_lstm.pkl files.)
for split_name, split_array in (
    ("Train", X_train),
    ("Val", X_val),
    ("Test", X_test),
):
    X[split_name] = reshape_data(pd.DataFrame(split_array, columns=keys))

In [None]:
# Write the reshaped features and the labels to X_down_lstm.pkl / y_down_lstm.pkl
pkls_dir = os.path.join("..", "data", "processed_files", "pkls")
for pkl_name, payload in (("X_down_lstm.pkl", X), ("y_down_lstm.pkl", y)):
    with open(os.path.join(pkls_dir, pkl_name), "wb") as fp:
        pickle.dump(payload, fp)