# Create Training and Testing Data for a Logistic Regression 
## Version: Using Max UH location data during Current and Future Climate (UNBALANCED)

First, import relevant packages.

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
from ncar_jobqueue import NCARCluster
from dask.distributed import Client

Start dask workers with adaptive scaling to load data for training.

In [2]:
# Request PBS-backed dask workers (36 cores and 109GB of memory per job) and
# let the cluster scale adaptively between 1 and 10 workers.
cluster = NCARCluster(memory="109GB", cores=36)
cluster.adapt(minimum=1, maximum=10, wait_count=60)

# Print the generated job script so the submitted resources can be checked.
print(cluster.job_script())

# Connect a client to the cluster; as the last expression of the cell it is
# also rendered as the cell output.
client = Client(cluster)
client

#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q regular
#PBS -A P54048000
#PBS -l select=1:ncpus=36:mem=109GB
#PBS -l walltime=01:00:00
#PBS -e /glade/scratch/molina/
#PBS -o /glade/scratch/molina/
JOB_ID=${PBS_JOBID%%.*}



/glade/work/molina/miniconda3/envs/python-tutorial/bin/python -m distributed.cli.dask_worker tcp://10.148.10.17:42307 --nthreads 36 --memory-limit 109.00GB --name dask-worker--${JOB_ID}-- --death-timeout 60 --local-directory /glade/scratch/molina --interface ib0



0,1
Client  Scheduler: tcp://10.148.10.17:42307  Dashboard: https://jupyterhub.ucar.edu/ch/user/molina/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


Open data sets that contain variables at maximum UH locations.

In [3]:
# Files holding the variables sampled at the maximum-UH location of each storm patch.
path_current = "/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/current_conus1_varsatmaxUH.nc"
path_futures = "/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/future_conus1_varsatmaxUH.nc"

data_current = xr.open_dataset(path_current)
data_futures = xr.open_dataset(path_futures)

Print relevant shapes of current and future climate data. Future climate has more storm patches.

In [4]:
# Length of `x` in each dataset, reported as the number of storm patches.
n_patches_current = data_current.x.shape[0]
n_patches_futures = data_futures.x.shape[0]

print("Total number of storm patches in current climate: ", n_patches_current)
print("Total number of storm patches in future climate: ", n_patches_futures)

Total number of storm patches in current climate:  1387488
Total number of storm patches in future climate:  1419928


Separate the storm patch data into two groups: patches with UH at or above the 75 m2/s2 threshold ("above"), and patches with UH below it ("below").

In [6]:
# UH threshold (m2/s2) separating the two classes. Values equal to the
# threshold are placed in the "above" group (>=); NaN values satisfy neither
# comparison and therefore fall in neither group.
UH_THRESHOLD = 75

# Build each boolean mask once and reuse it for every variable of that climate,
# instead of re-evaluating the comparison on every line.
mask_above_current = data_current.data_prd >= UH_THRESHOLD
mask_below_current = data_current.data_prd < UH_THRESHOLD
mask_above_futures = data_futures.data_prd >= UH_THRESHOLD
mask_below_futures = data_futures.data_prd < UH_THRESHOLD

# ---- current climate, at/above threshold ----
data_uh0_above_current = data_current.data_prd[mask_above_current]

data_tk1_above_current = data_current.data_var_tk1[mask_above_current]
data_tk3_above_current = data_current.data_var_tk3[mask_above_current]
data_tk5_above_current = data_current.data_var_tk5[mask_above_current]
data_tk7_above_current = data_current.data_var_tk7[mask_above_current]

data_ev1_above_current = data_current.data_var_ev1[mask_above_current]
data_ev3_above_current = data_current.data_var_ev3[mask_above_current]
data_ev5_above_current = data_current.data_var_ev5[mask_above_current]
data_ev7_above_current = data_current.data_var_ev7[mask_above_current]

data_eu1_above_current = data_current.data_var_eu1[mask_above_current]
data_eu3_above_current = data_current.data_var_eu3[mask_above_current]
data_eu5_above_current = data_current.data_var_eu5[mask_above_current]
data_eu7_above_current = data_current.data_var_eu7[mask_above_current]

data_pr1_above_current = data_current.data_var_pr1[mask_above_current]
data_pr3_above_current = data_current.data_var_pr3[mask_above_current]
data_pr5_above_current = data_current.data_var_pr5[mask_above_current]
data_pr7_above_current = data_current.data_var_pr7[mask_above_current]

data_qv1_above_current = data_current.data_var_qv1[mask_above_current]
data_qv3_above_current = data_current.data_var_qv3[mask_above_current]
data_qv5_above_current = data_current.data_var_qv5[mask_above_current]
data_qv7_above_current = data_current.data_var_qv7[mask_above_current]

# ---- current climate, below threshold ----
data_uh0_below_current = data_current.data_prd[mask_below_current]

data_tk1_below_current = data_current.data_var_tk1[mask_below_current]
data_tk3_below_current = data_current.data_var_tk3[mask_below_current]
data_tk5_below_current = data_current.data_var_tk5[mask_below_current]
data_tk7_below_current = data_current.data_var_tk7[mask_below_current]

data_ev1_below_current = data_current.data_var_ev1[mask_below_current]
data_ev3_below_current = data_current.data_var_ev3[mask_below_current]
data_ev5_below_current = data_current.data_var_ev5[mask_below_current]
data_ev7_below_current = data_current.data_var_ev7[mask_below_current]

data_eu1_below_current = data_current.data_var_eu1[mask_below_current]
data_eu3_below_current = data_current.data_var_eu3[mask_below_current]
data_eu5_below_current = data_current.data_var_eu5[mask_below_current]
data_eu7_below_current = data_current.data_var_eu7[mask_below_current]

data_pr1_below_current = data_current.data_var_pr1[mask_below_current]
data_pr3_below_current = data_current.data_var_pr3[mask_below_current]
data_pr5_below_current = data_current.data_var_pr5[mask_below_current]
data_pr7_below_current = data_current.data_var_pr7[mask_below_current]

data_qv1_below_current = data_current.data_var_qv1[mask_below_current]
data_qv3_below_current = data_current.data_var_qv3[mask_below_current]
data_qv5_below_current = data_current.data_var_qv5[mask_below_current]
data_qv7_below_current = data_current.data_var_qv7[mask_below_current]

# ---- future climate, at/above threshold ----
data_uh0_above_futures = data_futures.data_prd[mask_above_futures]

data_tk1_above_futures = data_futures.data_var_tk1[mask_above_futures]
data_tk3_above_futures = data_futures.data_var_tk3[mask_above_futures]
data_tk5_above_futures = data_futures.data_var_tk5[mask_above_futures]
data_tk7_above_futures = data_futures.data_var_tk7[mask_above_futures]

data_ev1_above_futures = data_futures.data_var_ev1[mask_above_futures]
data_ev3_above_futures = data_futures.data_var_ev3[mask_above_futures]
data_ev5_above_futures = data_futures.data_var_ev5[mask_above_futures]
data_ev7_above_futures = data_futures.data_var_ev7[mask_above_futures]

data_eu1_above_futures = data_futures.data_var_eu1[mask_above_futures]
data_eu3_above_futures = data_futures.data_var_eu3[mask_above_futures]
data_eu5_above_futures = data_futures.data_var_eu5[mask_above_futures]
data_eu7_above_futures = data_futures.data_var_eu7[mask_above_futures]

data_pr1_above_futures = data_futures.data_var_pr1[mask_above_futures]
data_pr3_above_futures = data_futures.data_var_pr3[mask_above_futures]
data_pr5_above_futures = data_futures.data_var_pr5[mask_above_futures]
data_pr7_above_futures = data_futures.data_var_pr7[mask_above_futures]

data_qv1_above_futures = data_futures.data_var_qv1[mask_above_futures]
data_qv3_above_futures = data_futures.data_var_qv3[mask_above_futures]
data_qv5_above_futures = data_futures.data_var_qv5[mask_above_futures]
data_qv7_above_futures = data_futures.data_var_qv7[mask_above_futures]

# ---- future climate, below threshold ----
data_uh0_below_futures = data_futures.data_prd[mask_below_futures]

data_tk1_below_futures = data_futures.data_var_tk1[mask_below_futures]
data_tk3_below_futures = data_futures.data_var_tk3[mask_below_futures]
data_tk5_below_futures = data_futures.data_var_tk5[mask_below_futures]
data_tk7_below_futures = data_futures.data_var_tk7[mask_below_futures]

data_ev1_below_futures = data_futures.data_var_ev1[mask_below_futures]
data_ev3_below_futures = data_futures.data_var_ev3[mask_below_futures]
data_ev5_below_futures = data_futures.data_var_ev5[mask_below_futures]
data_ev7_below_futures = data_futures.data_var_ev7[mask_below_futures]

data_eu1_below_futures = data_futures.data_var_eu1[mask_below_futures]
data_eu3_below_futures = data_futures.data_var_eu3[mask_below_futures]
data_eu5_below_futures = data_futures.data_var_eu5[mask_below_futures]
data_eu7_below_futures = data_futures.data_var_eu7[mask_below_futures]

data_pr1_below_futures = data_futures.data_var_pr1[mask_below_futures]
data_pr3_below_futures = data_futures.data_var_pr3[mask_below_futures]
data_pr5_below_futures = data_futures.data_var_pr5[mask_below_futures]
data_pr7_below_futures = data_futures.data_var_pr7[mask_below_futures]

data_qv1_below_futures = data_futures.data_var_qv1[mask_below_futures]
data_qv3_below_futures = data_futures.data_var_qv3[mask_below_futures]
data_qv5_below_futures = data_futures.data_var_qv5[mask_below_futures]
data_qv7_below_futures = data_futures.data_var_qv7[mask_below_futures]

There are more storm patches that exceed the UH threshold in the future climate than the current climate:

In [7]:
# Number of storm patches in the "above" group for each climate.
# Note: that group was selected with >= 75, so patches exactly at the
# threshold are included in these counts.
n_above_current = data_uh0_above_current.shape[0]
n_above_futures = data_uh0_above_futures.shape[0]

print("Current climate >75m2/s2: ", n_above_current)
print("Future climate >75m2/s2: ", n_above_futures)

Current climate >75m2/s2:  33284
Future climate >75m2/s2:  39753


Define helper functions for scaling the data and for permuting and splitting it into training and testing sets.

In [30]:
def minmax_scale_apply(thedata):
    """Rescale the input to the range [0, 1] with scikit-learn's MinMaxScaler.

    The scaler is fitted on `thedata` itself and then applied to it, so the
    minimum and maximum come from the very data being transformed.
    """
    from sklearn.preprocessing import MinMaxScaler
    # fit_transform is fit() followed by transform() on the same data.
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(thedata)

def standardize_scale_apply(thedata):
    """Standardize the data to zero mean and unit standard deviation.

    To interpret the result: "this data point is X standard deviations
    below/above the mean of the data set."

    The mean and standard deviation are computed over all elements of
    `thedata`, ignoring NaNs for both statistics. NaN entries stay NaN in the
    output, while the remaining entries are still standardized.

    Parameters:
    thedata: array of values to standardize.

    Returns:
    Array of the same shape as `thedata` holding the standardized values.
    """
    # Use the NaN-aware estimator for the spread as well as for the mean;
    # a plain np.std would turn the whole result into NaN if any value is NaN.
    return np.divide((thedata - np.nanmean(thedata)), np.nanstd(thedata))

def standardize_scale_apply_test(thedatatrain, thedatatest):
    """Standardize test data using the training mean and standard deviation.

    Both statistics are computed over all elements of `thedatatrain`, ignoring
    NaNs, and then applied to `thedatatest`. No statistic is taken from the
    test data itself.

    Parameters:
    thedatatrain: training values that define the mean and standard deviation.
    thedatatest: test values to standardize.

    Returns:
    Array of the same shape as `thedatatest` holding the standardized values.
    """
    # NaN-aware spread to match the NaN-aware mean; a plain np.std would make
    # every output NaN if the training data contains a single NaN.
    return np.divide((thedatatest - np.nanmean(thedatatrain)), np.nanstd(thedatatrain))


def data_permute(data_a, data_b, data_split=0.6, spit_result=False):

    """
    Shuffle and split above/below-threshold data into training and test sets.

    The training set takes int(len(data_a) * data_split) samples from EACH
    group, so it holds equal numbers of above and below samples. The test set
    takes the remaining above samples plus a slice of the below samples whose
    upper bound is int((1 - data_split) * (len(data_a) + len(data_b))), so the
    test set is not balanced.

    All permutations use fixed seeds, so repeated calls with inputs of the
    same lengths select and order samples identically. This is what keeps
    separately processed variables (and the labels) aligned row by row.
    Private random generators are used, so the global NumPy random state is
    left untouched.

    Input Parameters:
    data_a: 1-D data above the specified UH threshold (array-like supporting
            integer-array indexing, e.g. np.ndarray or xarray.DataArray).
    data_b: 1-D data below the specified UH threshold (same kind as data_a).
    data_split: fraction of data_a used for training (float).
    spit_result: if True, also return the binary labels
                 (1 = above threshold, 0 = below threshold).

    Returns:
    (train_data, test_data) when spit_result is False, or
    (train_data, test_data, train_labels, test_labels) when it is True.
    Labels are shuffled with the same permutation as their data.
    """

    n_above = data_a.shape[0]
    n_below = data_b.shape[0]

    # number of samples taken from each group for training.
    n_train = int(n_above * data_split)
    # upper bound of the below-threshold test slice; the arithmetic is kept in
    # its original form so floating-point rounding (and thus the selected
    # samples) does not change.
    stop_below = int((((n_above * (1 - data_split)) * n_below) / n_above) + (n_above * (1 - data_split)))

    # one permutation per group; a legacy RandomState seeded with 0 yields the
    # same order as np.random.seed(0) followed by np.random.permutation.
    order_a = np.random.RandomState(0).permutation(n_above)
    order_b = np.random.RandomState(0).permutation(n_below)

    train_patches_a = data_a[order_a[:n_train]]
    test_patches_a = data_a[order_a[n_train:]]
    train_patches_b = data_b[order_b[:n_train]]
    test_patches_b = data_b[order_b[n_train:stop_below]]

    #combine the above and below threshold data into one set each.
    total_train_data = np.hstack([train_patches_a, train_patches_b])
    total_test_data = np.hstack([test_patches_a, test_patches_b])

    #shuffle/permute the combined datasets (seed 5 for train, 10 for test).
    train_order = np.random.RandomState(5).permutation(total_train_data.shape[0])
    test_order = np.random.RandomState(10).permutation(total_test_data.shape[0])
    train_data = total_train_data[train_order]
    test_data = total_test_data[test_order]

    if not spit_result:
        return train_data, test_data

    #labels are built in the same above-then-below order as the data and
    #shuffled with the same index arrays, so they stay aligned.
    result_train_data = np.hstack([np.ones(train_patches_a.shape[0]), np.zeros(train_patches_b.shape[0])])
    result_test_data = np.hstack([np.ones(test_patches_a.shape[0]), np.zeros(test_patches_b.shape[0])])
    return train_data, test_data, result_train_data[train_order], result_test_data[test_order]

Permute, combine, and split the above and below threshold data into training and testing data sets.

In [31]:
# Split every variable into training and testing arrays, one call per variable.
# data_permute uses fixed random seeds, so calls whose inputs have the same
# lengths select and order samples identically; the rows of the separately
# split variables therefore line up with each other.
#
# Only the first call for each climate passes spit_result=True to also get the
# 1/0 (above/below threshold) labels. Those labels are reused for all other
# variables of that climate, which relies on the alignment described above.

# ---- current climate ----
train_tk1_current, test_tk1_current, train_uh0_current, test_uh0_current = data_permute(data_tk1_above_current, 
                                                                                        data_tk1_below_current, spit_result=True)

train_tk3_current, test_tk3_current = data_permute(data_tk3_above_current, data_tk3_below_current)
train_tk5_current, test_tk5_current = data_permute(data_tk5_above_current, data_tk5_below_current)
train_tk7_current, test_tk7_current = data_permute(data_tk7_above_current, data_tk7_below_current)

train_ev1_current, test_ev1_current = data_permute(data_ev1_above_current, data_ev1_below_current)
train_ev3_current, test_ev3_current = data_permute(data_ev3_above_current, data_ev3_below_current)
train_ev5_current, test_ev5_current = data_permute(data_ev5_above_current, data_ev5_below_current)
train_ev7_current, test_ev7_current = data_permute(data_ev7_above_current, data_ev7_below_current)

train_eu1_current, test_eu1_current = data_permute(data_eu1_above_current, data_eu1_below_current)
train_eu3_current, test_eu3_current = data_permute(data_eu3_above_current, data_eu3_below_current)
train_eu5_current, test_eu5_current = data_permute(data_eu5_above_current, data_eu5_below_current)
train_eu7_current, test_eu7_current = data_permute(data_eu7_above_current, data_eu7_below_current)

train_pr1_current, test_pr1_current = data_permute(data_pr1_above_current, data_pr1_below_current)
train_pr3_current, test_pr3_current = data_permute(data_pr3_above_current, data_pr3_below_current)
train_pr5_current, test_pr5_current = data_permute(data_pr5_above_current, data_pr5_below_current)
train_pr7_current, test_pr7_current = data_permute(data_pr7_above_current, data_pr7_below_current)

train_qv1_current, test_qv1_current = data_permute(data_qv1_above_current, data_qv1_below_current)
train_qv3_current, test_qv3_current = data_permute(data_qv3_above_current, data_qv3_below_current)
train_qv5_current, test_qv5_current = data_permute(data_qv5_above_current, data_qv5_below_current)
train_qv7_current, test_qv7_current = data_permute(data_qv7_above_current, data_qv7_below_current)


# ---- future climate ----
train_tk1_futures, test_tk1_futures, train_uh0_futures, test_uh0_futures = data_permute(data_tk1_above_futures, 
                                                                                        data_tk1_below_futures, spit_result=True)

train_tk3_futures, test_tk3_futures = data_permute(data_tk3_above_futures, data_tk3_below_futures)
train_tk5_futures, test_tk5_futures = data_permute(data_tk5_above_futures, data_tk5_below_futures)
train_tk7_futures, test_tk7_futures = data_permute(data_tk7_above_futures, data_tk7_below_futures)

train_ev1_futures, test_ev1_futures = data_permute(data_ev1_above_futures, data_ev1_below_futures)
train_ev3_futures, test_ev3_futures = data_permute(data_ev3_above_futures, data_ev3_below_futures)
train_ev5_futures, test_ev5_futures = data_permute(data_ev5_above_futures, data_ev5_below_futures)
train_ev7_futures, test_ev7_futures = data_permute(data_ev7_above_futures, data_ev7_below_futures)

train_eu1_futures, test_eu1_futures = data_permute(data_eu1_above_futures, data_eu1_below_futures)
train_eu3_futures, test_eu3_futures = data_permute(data_eu3_above_futures, data_eu3_below_futures)
train_eu5_futures, test_eu5_futures = data_permute(data_eu5_above_futures, data_eu5_below_futures)
train_eu7_futures, test_eu7_futures = data_permute(data_eu7_above_futures, data_eu7_below_futures)

train_pr1_futures, test_pr1_futures = data_permute(data_pr1_above_futures, data_pr1_below_futures)
train_pr3_futures, test_pr3_futures = data_permute(data_pr3_above_futures, data_pr3_below_futures)
train_pr5_futures, test_pr5_futures = data_permute(data_pr5_above_futures, data_pr5_below_futures)
train_pr7_futures, test_pr7_futures = data_permute(data_pr7_above_futures, data_pr7_below_futures)

train_qv1_futures, test_qv1_futures = data_permute(data_qv1_above_futures, data_qv1_below_futures)
train_qv3_futures, test_qv3_futures = data_permute(data_qv3_above_futures, data_qv3_below_futures)
train_qv5_futures, test_qv5_futures = data_permute(data_qv5_above_futures, data_qv5_below_futures)
train_qv7_futures, test_qv7_futures = data_permute(data_qv7_above_futures, data_qv7_below_futures)

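Create arrays containing the training and testing features in the correct shape (samples x features), with standardization applied. Each feature is standardized separately; the test features are standardized using the mean and standard deviation of the corresponding training feature.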

In [32]:
# Feature columns in their final order: tk, ev, eu, pr, qv, each at the
# 1, 3, 5 and 7 levels. The train and test lists must stay in the same order
# because they are paired positionally below.
train_columns_current = [
    train_tk1_current, train_tk3_current, train_tk5_current, train_tk7_current,
    train_ev1_current, train_ev3_current, train_ev5_current, train_ev7_current,
    train_eu1_current, train_eu3_current, train_eu5_current, train_eu7_current,
    train_pr1_current, train_pr3_current, train_pr5_current, train_pr7_current,
    train_qv1_current, train_qv3_current, train_qv5_current, train_qv7_current,
]
test_columns_current = [
    test_tk1_current, test_tk3_current, test_tk5_current, test_tk7_current,
    test_ev1_current, test_ev3_current, test_ev5_current, test_ev7_current,
    test_eu1_current, test_eu3_current, test_eu5_current, test_eu7_current,
    test_pr1_current, test_pr3_current, test_pr5_current, test_pr7_current,
    test_qv1_current, test_qv3_current, test_qv5_current, test_qv7_current,
]
train_columns_futures = [
    train_tk1_futures, train_tk3_futures, train_tk5_futures, train_tk7_futures,
    train_ev1_futures, train_ev3_futures, train_ev5_futures, train_ev7_futures,
    train_eu1_futures, train_eu3_futures, train_eu5_futures, train_eu7_futures,
    train_pr1_futures, train_pr3_futures, train_pr5_futures, train_pr7_futures,
    train_qv1_futures, train_qv3_futures, train_qv5_futures, train_qv7_futures,
]
test_columns_futures = [
    test_tk1_futures, test_tk3_futures, test_tk5_futures, test_tk7_futures,
    test_ev1_futures, test_ev3_futures, test_ev5_futures, test_ev7_futures,
    test_eu1_futures, test_eu3_futures, test_eu5_futures, test_eu7_futures,
    test_pr1_futures, test_pr3_futures, test_pr5_futures, test_pr7_futures,
    test_qv1_futures, test_qv3_futures, test_qv5_futures, test_qv7_futures,
]

# Training features: each column is standardized with its own statistics and
# reshaped to (n, 1) so the columns stack side by side.
X_train_current = np.hstack([
    standardize_scale_apply(train_col.reshape(-1,1))
    for train_col in train_columns_current
])

# Test features: each column is standardized with the statistics of the
# matching training column.
X_test_current = np.hstack([
    standardize_scale_apply_test(train_col.reshape(-1,1), test_col.reshape(-1,1))
    for train_col, test_col in zip(train_columns_current, test_columns_current)
])

X_train_futures = np.hstack([
    standardize_scale_apply(train_col.reshape(-1,1))
    for train_col in train_columns_futures
])

X_test_futures = np.hstack([
    standardize_scale_apply_test(train_col.reshape(-1,1), test_col.reshape(-1,1))
    for train_col, test_col in zip(train_columns_futures, test_columns_futures)
])

In [33]:
# Report (samples, features) for each assembled feature matrix.
for feature_matrix in (X_train_current, X_test_current, X_train_futures, X_test_futures):
    print(feature_matrix.shape)

(39940, 20)
(548339, 20)
(47702, 20)
(560022, 20)


Save training and testing data for current and future climate as one file for future use.

In [34]:
# Column labels, in the same order as the columns of the X_* feature matrices.
feature_names = np.array(["tk_1km", "tk_3km", "tk_5km", "tk_7km",
                          "ev_1km", "ev_3km", "ev_5km", "ev_7km",
                          "eu_1km", "eu_3km", "eu_5km", "eu_7km",
                          "pr_1km", "pr_3km", "pr_5km", "pr_7km",
                          "qv_1km", "qv_3km", "qv_5km", "qv_7km"])

# Each train/test set has a different number of samples, so each gets its own
# sample dimension (a, b, c, d); all share the 'features' dimension.
data_assemble = xr.Dataset(
    data_vars={
        'X_train_current': (['a', 'features'], X_train_current),
        'X_train_current_label': (['a'], train_uh0_current),
        'X_test_current': (['b', 'features'], X_test_current),
        'X_test_current_label': (['b'], test_uh0_current),
        'X_train_futures': (['c', 'features'], X_train_futures),
        'X_train_futures_label': (['c'], train_uh0_futures),
        'X_test_futures': (['d', 'features'], X_test_futures),
        'X_test_futures_label': (['d'], test_uh0_futures),
    },
    coords={'feature': (['features'], feature_names)},
)

In [35]:
# Display the assembled dataset to check its dimensions and variables.
data_assemble

<xarray.Dataset>
Dimensions:                (a: 39940, b: 548339, c: 47702, d: 560022, features: 20)
Coordinates:
    feature                (features) <U6 'tk_1km' 'tk_3km' ... 'qv_7km'
Dimensions without coordinates: a, b, c, d, features
Data variables:
    X_train_current        (a, features) float32 0.916849 ... -1.7484128
    X_train_current_label  (a) float64 1.0 1.0 1.0 0.0 1.0 ... 0.0 0.0 1.0 0.0
    X_test_current         (b, features) float32 -0.4930225 ... -1.129027
    X_test_current_label   (b) float64 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
    X_train_futures        (c, features) float32 -0.8571867 ... 2.1425257
    X_train_futures_label  (c) float64 1.0 0.0 1.0 0.0 1.0 ... 0.0 1.0 1.0 0.0
    X_test_futures         (d, features) float32 0.73516613 ... -0.760373
    X_test_futures_label   (d) float64 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0

In [36]:
# Write the combined training/testing data for both climates to one netCDF file.
output_path = "/glade/scratch/molina/WRF_CONUS1_derived/logistic_regression/varsatUHmax_traintestdata_unbalanced.nc"
data_assemble.to_netcdf(output_path)