In [3]:
import os
import pickle
import sys

import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder

sys.path.append('..')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src.model_utils import downsample, get_unique_values, reshape_data

## Paths to data

In [13]:
# Folder with the .npy inputs, given relative to the notebook's location
path_to_npys_data = os.path.join("..", "data", "npys_data")

# Target, initial-feature and "morf"-feature files inside that folder
pathTarget_3, pathFeatures, pathMorf = (
    os.path.join(path_to_npys_data, file_name)
    for file_name in (
        "target_croplands.npy",
        "features_initial_data.npy",
        "features_morf_data.npy",
    )
)

## Data

In [14]:
def _load_frame(npy_path):
    """Load a pickled ``.npy`` payload and wrap it in a column-oriented DataFrame."""
    payload = np.load(npy_path, allow_pickle=True)
    return pd.DataFrame.from_dict(payload, orient="columns")


# Features: the initial features and the "morf" features, joined column-wise
X = _load_frame(pathFeatures)
morf = _load_frame(pathMorf)
X = pd.concat([X, morf], axis=1)
keys = list(X.columns)

# Store the column order next to the data; later cells rebuild DataFrames from `keys`
keys_path = os.path.join(path_to_npys_data, "keys.pkl")
with open(keys_path, 'wb') as keys_file:
    pickle.dump(keys, keys_file)

# Target variable: the "Target" column as an integer NumPy array
y = _load_frame(pathTarget_3)
y = y["Target"].astype(int).to_numpy()

In [15]:
# Keep labels 0..3 as they are and relabel everything above 3
# (classes 4 and 5 according to the original note) as class 0
y = np.where(y <= 3, y, 0)


## Data Description

In [9]:
# Row count, column count, dtypes and memory footprint of the combined feature frame.
# NOTE(review): needs `X` to still be the DataFrame; the train/test split cell
# further down rebinds `X` to a dict, so run this cell before that one.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13638352 entries, 0 to 13638351
Columns: 162 entries, sfcWindmax_M1 to morf_33_10
dtypes: float64(162)
memory usage: 16.5 GB


In [10]:
# Preview the first five rows of the combined feature frame.
# NOTE(review): needs `X` to still be the DataFrame (it is rebound to a dict
# in the train/test split cell further down).
X.head()

Unnamed: 0,sfcWindmax_M1,sfcWindmax_M2,sfcWindmax_M3,sfcWindmax_M4,sfcWindmax_M5,sfcWindmax_M6,sfcWindmax_M7,sfcWindmax_M8,sfcWindmax_M9,sfcWindmax_M10,...,morf_33_1,morf_33_2,morf_33_3,morf_33_4,morf_33_5,morf_33_6,morf_33_7,morf_33_8,morf_33_9,morf_33_10
0,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.50077,177.346313,0.707366,0.000439,0.207605,0.062292,-0.257171,-0.257601,0.062723,0.713253
1,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.50588,172.595993,0.707884,0.000409,0.201472,0.057969,-0.25212,-0.256513,0.062362,0.71327
2,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.554733,164.047455,0.708955,0.000276,0.169685,0.039095,-0.23285,-0.254409,0.060654,0.713656
3,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.63044,159.827286,0.709747,0.000169,0.140592,0.023969,-0.219259,-0.253394,0.058104,0.714367
4,1.3,1.1,1.0,1.5,1.8,1.2,1.8,1.3,2.0,1.2,...,0.731494,159.067596,0.710274,0.000134,0.11992,0.019065,-0.216999,-0.253654,0.05572,0.715481


In [11]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): needs `X` to still be the DataFrame (it is rebound to a dict
# in the train/test split cell further down).
X.describe()

Unnamed: 0,sfcWindmax_M1,sfcWindmax_M2,sfcWindmax_M3,sfcWindmax_M4,sfcWindmax_M5,sfcWindmax_M6,sfcWindmax_M7,sfcWindmax_M8,sfcWindmax_M9,sfcWindmax_M10,...,morf_33_1,morf_33_2,morf_33_3,morf_33_4,morf_33_5,morf_33_6,morf_33_7,morf_33_8,morf_33_9,morf_33_10
count,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,...,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0,13638350.0
mean,1.380395,1.434316,1.900162,1.415603,1.612003,1.577536,1.505585,1.617272,1.437621,1.418018,...,1.276664,181.3717,0.7065281,-0.0001145581,0.1603969,-0.01585194,-0.4116076,-0.8372061,0.3720454,0.7065811
std,0.4019296,0.4296952,0.4451495,0.3411769,0.4015478,0.392984,0.3429174,0.3614533,0.4322675,0.3602904,...,2.311596,113.9481,0.008674426,0.003606118,0.5425503,0.5146567,1.275188,0.9456932,0.9990941,0.03116058
min,0.0,0.1,0.3,0.3,0.4,0.2,0.3,0.2,0.2,0.3,...,0.0,2.666816e-06,0.4865897,-0.06034555,-23.63145,-8.565227,-9.004397,-9.081969,-0.03625468,0.2555841
25%,1.1,1.1,1.6,1.2,1.3,1.3,1.3,1.4,1.1,1.2,...,0.1882204,95.46693,0.7054717,-3.59056e-05,0.0,-0.00509296,-0.7440413,-1.168685,0.05363546,0.7020443
50%,1.4,1.4,1.9,1.4,1.6,1.6,1.5,1.6,1.4,1.4,...,0.4743695,180.0558,0.7070554,0.0,0.1296582,0.0,-0.2179654,-0.4492523,0.1203083,0.7070925
75%,1.6,1.7,2.2,1.6,1.9,1.8,1.7,1.9,1.7,1.7,...,1.316145,276.8814,0.7083952,0.0007287971,0.3155357,0.103439,0.0,-0.2202093,0.3204067,0.7118509
max,3.0,2.9,3.6,3.0,3.1,3.2,2.9,3.0,3.0,2.9,...,31.49512,360.0,0.859124,0.105768,10.99178,14.991,14.93593,0.06169987,15.04727,0.9723462


## Data Preparation 


### Target one-hot encoding and train/validation/test split

In [16]:
y = pd.DataFrame(y, columns=["Target"])

# One-hot encode the target with a dense output array.
# The keyword for dense output was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
ohe.fit(y)
y = ohe.transform(y)

# Train/test split: 75 % train, 25 % held out, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=123
)

# Split the held-out part again: 15 % of it becomes validation, the rest stays test
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.15, stratify=y_test, random_state=123
)

# Fit the scaler on the training split only (not on the whole dataset), so the
# validation and test splits do not influence the scaling parameters.
# The fitted scaler is saved so the same scaling can be re-applied later.
scaler = MinMaxScaler()
minmax = scaler.fit(X_train)
joblib.dump(minmax, os.path.join(path_to_npys_data, "scaler.save"))

# Normalization using the min-max scaler fitted above
X_train = minmax.transform(X_train)
X_test = minmax.transform(X_test)
X_val = minmax.transform(X_val)

# From here on `X` and `y` are dictionaries that will hold the splits.
# NOTE(review): this rebinding drops the full DataFrame / encoded target, so
# this cell cannot be re-run without first re-running the loading cells.
X = dict()
y = dict()

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_val shape:", X_val.shape)

X_train shape: (10228764, 162)
X_test shape: (2898149, 162)
X_val shape: (511439, 162)


In [18]:
# Class counts of the training split (argmax turns one-hot rows into column indices)
get_unique_values(np.argmax(y_train, axis=1))

[(1, 110024), (2, 439606), (3, 879902), (0, 8799232)]

In [19]:
# Downsample the majority class 0 in the training and validation splits.
# Per the recorded output, class 0 is reduced to the size of class 3 and the
# other classes keep their counts; oversampling=False is passed, so class 1 is
# NOT oversampled here.
# np.argmax(..., 1) converts the one-hot rows back to column indices first.
X_train, y_train = downsample(X_train, np.argmax(y_train, 1), oversampling=False)
X_val, y_val = downsample(X_val, np.argmax(y_val, 1), oversampling=False)

Initial data:
[(0, 8799232), (3, 879902), (2, 439606), (1, 110024)]
Resampled data:
[(0, 879902), (3, 879902), (2, 439606), (1, 110024)]
Initial data:
[(0, 439962), (3, 43995), (2, 21981), (1, 5501)]
Resampled data:
[(0, 43995), (3, 43995), (2, 21981), (1, 5501)]


In [20]:
# Collect the feature splits
X["Train"] = X_train
X["Val"] = X_val
X["Test"] = X_test

# Re-encode the resampled train/validation labels as one-hot rows.
# The encoder was fitted on a frame whose single column is named "Target";
# passing a frame with the same column name avoids scikit-learn's
# feature-name mismatch warning. Building the frame from a dict (rather than
# `pd.DataFrame(labels, columns=[...])`) also works if `downsample` returns a
# named Series instead of a plain array.
# NOTE(review): the labels are argmax column indices; this assumes they equal
# the encoder's category values (classes 0..3) -- with handle_unknown="ignore"
# any other value would silently become an all-zero row.
for split_name, labels in (("Train", y_train), ("Val", y_val)):
    y[split_name] = ohe.transform(
        pd.DataFrame({"Target": np.asarray(labels).ravel()})
    )

# The test labels were never resampled, so they are still one-hot encoded
y["Test"] = y_test

In [21]:
# Save the split dictionaries (features and one-hot targets) as pickle files
pkl_dir = os.path.join("..", "data", "processed_files", "pkls")

# Create the output folder if it does not exist yet, so a fresh checkout does
# not fail with FileNotFoundError only after the expensive preprocessing ran.
os.makedirs(pkl_dir, exist_ok=True)

with open(os.path.join(pkl_dir, "X_down.pkl"), "wb") as fp:
    pickle.dump(X, fp)

with open(os.path.join(pkl_dir, "y_down.pkl"), "wb") as fp:
    pickle.dump(y, fp)

In [22]:
# Pass every split through `reshape_data`, overwriting the entries of `X` in
# place; the column names are re-attached from `keys` before reshaping.
for split_name, split_values in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    X[split_name] = reshape_data(pd.DataFrame(split_values, columns=keys))

In [24]:
# Save the reshaped feature splits and the targets as pickle files.
# `y` has not been modified since it was filled, so y_down_lstm.pkl holds the
# same targets under the LSTM file name.
pkl_dir = os.path.join("..", "data", "processed_files", "pkls")

# Create the output folder if it does not exist yet, so the dump cannot fail
# with FileNotFoundError after the expensive preprocessing has finished.
os.makedirs(pkl_dir, exist_ok=True)

with open(os.path.join(pkl_dir, "X_down_lstm.pkl"), "wb") as fp:
    pickle.dump(X, fp)

with open(os.path.join(pkl_dir, "y_down_lstm.pkl"), "wb") as fp:
    pickle.dump(y, fp)