# Create Training and Testing Data for a Logistic Regression
## Version: Using Mean Patch data during Current or Future Climate (UNBALANCED)
### Requires: create_UHindx_file_step1, ..file_step2, ..file_step3 (e.g., future_uh75patches_12.nc)
First, import relevant packages.

In [1]:
import xarray as xr
import numpy as np
from ncar_jobqueue import NCARCluster
from dask.distributed import Client

Choose the climate to work with (e.g., current or future).

In [75]:
which_climate = 'current'

Start dask workers with adaptive scaling to load data for training.

In [3]:
#--------------------------------------------------

#if __name__== "__main__":

#start dask workers
cluster = NCARCluster(memory="109GB", cores=36)
cluster.adapt(minimum=1, maximum=10, wait_count=60)
cluster
#print scripts
print(cluster.job_script())
#start client
client = Client(cluster)
client

#--------------------------------------------------

#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q regular
#PBS -A P54048000
#PBS -l select=1:ncpus=36:mem=109GB
#PBS -l walltime=01:00:00
#PBS -e /glade/scratch/molina/
#PBS -o /glade/scratch/molina/
JOB_ID=${PBS_JOBID%%.*}



/glade/work/molina/miniconda3/envs/python-tutorial/bin/python -m distributed.cli.dask_worker tcp://10.148.10.17:36175 --nthreads 36 --memory-limit 109.00GB --name dask-worker--${JOB_ID}-- --death-timeout 60 --local-directory /glade/scratch/molina --interface ib0



0,1
Client  Scheduler: tcp://10.148.10.17:36175  Dashboard: https://jupyterhub.ucar.edu/ch/user/molina/proxy/8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B



Load storm patch data that was previously separated using UH>75 and UH<75 m2/s2 thresholds.

In [76]:
data_dec_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_12.nc", 
                                   parallel=True, combine='by_coords')
data_jan_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_01.nc",
                                   parallel=True, combine='by_coords')
data_feb_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_02.nc", 
                                   parallel=True, combine='by_coords')

data_mar_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_03.nc", 
                                   parallel=True, combine='by_coords')
data_apr_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_04.nc", 
                                   parallel=True, combine='by_coords')
data_may_above = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_uh75patches_05.nc", 
                                   parallel=True, combine='by_coords')

data_above = xr.concat([data_dec_above, data_jan_above, data_feb_above, data_mar_above, data_apr_above, data_may_above], dim='patch')

In [77]:
data_dec_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_12.nc",
                                  parallel=True, combine='by_coords')
data_jan_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_01.nc",
                                   parallel=True, combine='by_coords')
data_feb_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_02.nc",
                                   parallel=True, combine='by_coords')

data_mar_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_03.nc",
                                   parallel=True, combine='by_coords')
data_apr_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_04.nc",
                                   parallel=True, combine='by_coords')
data_may_below = xr.open_mfdataset(f"/glade/scratch/molina/WRF_CONUS1_derived/storm_envs/{which_climate}_nonuh75patches_05.nc",
                                   parallel=True, combine='by_coords')

data_below = xr.concat([data_dec_below, data_jan_below, data_feb_below, data_mar_below, data_apr_below, data_may_below], dim='patch')

In [78]:
print("The ratio of strongly rotating to not strongly rotating storm patches expressed as a percent is about: ", 
      round((data_above.patch.size/data_below.patch.size)*100), "%")

The ratio of strongly rotating to not strongly rotating storm patches expressed as a percent is about:  2 %


Creation of various functions for use in analysis.

In [79]:
def create_traintest_data(data_b, data_a, split_perc=0.6, return_label=False):
    #balancing of above and below threshold data for training data, spitting out remainder for testing
    #permute and slice the below threshold data to equal the above threshold data shape.
    
    #train above
    np.random.seed(0)
    select_data = np.random.permutation(data_a.shape[0])[:int(data_a.shape[0]*split_perc)]
    train_above = data_a[select_data]
    
    #train below
    np.random.seed(0)
    select_data = np.random.permutation(data_b.shape[0])[:int(data_a.shape[0]*split_perc)]
    train_below = data_b[select_data]
    
    #test above
    np.random.seed(0)
    select_data = np.random.permutation(data_a.shape[0])[int(data_a.shape[0]*split_perc):]
    test_above = data_a[select_data]
    
    #test below
    np.random.seed(0)
    #slicing to get respective ratio of above to below UH data patches
    select_data = np.random.permutation(data_b.shape[0])[int(data_a.shape[0]*split_perc):
                                                         int((((data_a.shape[0]*(1-split_perc))*data_b.shape[0])/data_a.shape[0])+(data_a.shape[0]*(1-split_perc)))]
    test_below = data_b[select_data]

    #create the label data
    train_above_label = np.ones(train_above.shape[0])
    train_below_label = np.zeros(train_below.shape[0])
    test_above_label = np.ones(test_above.shape[0])
    test_below_label = np.zeros(test_below.shape[0])
    
    #merge above and below data in prep to shuffle/permute
    train_data = np.vstack([train_above, train_below])
    if return_label:
        train_label = np.hstack([train_above_label, train_below_label])
    test_data = np.vstack([test_above, test_below])
    if return_label:
        test_label = np.hstack([test_above_label, test_below_label])
    
    #finally, permute the data that has been merged and properly balanced
    np.random.seed(10)
    train_data = np.random.permutation(train_data)
    np.random.seed(10)
    test_data = np.random.permutation(test_data)
    if not return_label:
        return train_data, test_data  
    if return_label:
        np.random.seed(10)
        train_label = np.random.permutation(train_label)
        np.random.seed(10)
        test_label = np.random.permutation(test_label)    
        return train_data, test_data, train_label, test_label


def minmax_scale_apply(thedata):
    #apply min max normalize the input data
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(thedata)
    return scaler.transform(thedata)

def standardize_scale_apply(thedata):
    #standardization of the data
    #to interpret: "this data point is X standard deviations below/above the mean of the data set."
    return np.divide((thedata - np.nanmean(thedata)), np.std(thedata))

def standardize_scale_apply_test(thedatatrain, thedatatest):
    #standardization of the test data using the training mean and standard deviation.
    return np.divide((thedatatest - np.nanmean(thedatatrain)), np.std(thedatatrain))


Extract the permuted below threshold data of the length of the above threshold data to balance the distribution of above and below threshold storm patches.

In [80]:
#above 
mean_above_data_temp1 = data_above.temp_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_temp3 = data_above.temp_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_temp5 = data_above.temp_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_temp7 = data_above.temp_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_above_data_evwd1 = data_above.evwd_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_evwd3 = data_above.evwd_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_evwd5 = data_above.evwd_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_evwd7 = data_above.evwd_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_above_data_euwd1 = data_above.euwd_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_euwd3 = data_above.euwd_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_euwd5 = data_above.euwd_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_euwd7 = data_above.euwd_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_above_data_qvap1 = data_above.qvap_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qvap3 = data_above.qvap_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qvap5 = data_above.qvap_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qvap7 = data_above.qvap_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_above_data_pres1 = data_above.pres_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_pres3 = data_above.pres_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_pres5 = data_above.pres_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_pres7 = data_above.pres_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_above_data_qgrp1 = data_above.qgrp_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qgrp3 = data_above.qgrp_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qgrp5 = data_above.qgrp_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_above_data_qgrp7 = data_above.qgrp_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

#below
mean_below_data_temp1 = data_below.temp_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_temp3 = data_below.temp_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_temp5 = data_below.temp_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_temp7 = data_below.temp_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_below_data_evwd1 = data_below.evwd_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_evwd3 = data_below.evwd_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_evwd5 = data_below.evwd_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_evwd7 = data_below.evwd_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_below_data_euwd1 = data_below.euwd_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_euwd3 = data_below.euwd_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_euwd5 = data_below.euwd_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_euwd7 = data_below.euwd_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_below_data_qvap1 = data_below.qvap_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qvap3 = data_below.qvap_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qvap5 = data_below.qvap_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qvap7 = data_below.qvap_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_below_data_pres1 = data_below.pres_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_pres3 = data_below.pres_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_pres5 = data_below.pres_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_pres7 = data_below.pres_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

mean_below_data_qgrp1 = data_below.qgrp_sev_1.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qgrp3 = data_below.qgrp_sev_3.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qgrp5 = data_below.qgrp_sev_5.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)
mean_below_data_qgrp7 = data_below.qgrp_sev_7.mean(axis=(1,2), skipna=True).values.reshape(-1, 1)

In [81]:
mean_train_temp1, mean_test_temp1, train_label, test_label = create_traintest_data(mean_below_data_temp1, mean_above_data_temp1, split_perc=0.6, return_label=True)
mean_train_temp3, mean_test_temp3 = create_traintest_data(mean_below_data_temp3, mean_above_data_temp3, split_perc=0.6, return_label=False)
mean_train_temp5, mean_test_temp5 = create_traintest_data(mean_below_data_temp5, mean_above_data_temp5, split_perc=0.6, return_label=False)
mean_train_temp7, mean_test_temp7 = create_traintest_data(mean_below_data_temp7, mean_above_data_temp7, split_perc=0.6, return_label=False)

mean_train_evwd1, mean_test_evwd1 = create_traintest_data(mean_below_data_evwd1, mean_above_data_evwd1, split_perc=0.6, return_label=False)
mean_train_evwd3, mean_test_evwd3 = create_traintest_data(mean_below_data_evwd3, mean_above_data_evwd3, split_perc=0.6, return_label=False)
mean_train_evwd5, mean_test_evwd5 = create_traintest_data(mean_below_data_evwd5, mean_above_data_evwd5, split_perc=0.6, return_label=False)
mean_train_evwd7, mean_test_evwd7 = create_traintest_data(mean_below_data_evwd7, mean_above_data_evwd7, split_perc=0.6, return_label=False)

mean_train_euwd1, mean_test_euwd1 = create_traintest_data(mean_below_data_euwd1, mean_above_data_euwd1, split_perc=0.6, return_label=False)
mean_train_euwd3, mean_test_euwd3 = create_traintest_data(mean_below_data_euwd3, mean_above_data_euwd3, split_perc=0.6, return_label=False)
mean_train_euwd5, mean_test_euwd5 = create_traintest_data(mean_below_data_euwd5, mean_above_data_euwd5, split_perc=0.6, return_label=False)
mean_train_euwd7, mean_test_euwd7 = create_traintest_data(mean_below_data_euwd7, mean_above_data_euwd7, split_perc=0.6, return_label=False)

mean_train_qvap1, mean_test_qvap1 = create_traintest_data(mean_below_data_qvap1, mean_above_data_qvap1, split_perc=0.6, return_label=False)
mean_train_qvap3, mean_test_qvap3 = create_traintest_data(mean_below_data_qvap3, mean_above_data_qvap3, split_perc=0.6, return_label=False)
mean_train_qvap5, mean_test_qvap5 = create_traintest_data(mean_below_data_qvap5, mean_above_data_qvap5, split_perc=0.6, return_label=False)
mean_train_qvap7, mean_test_qvap7 = create_traintest_data(mean_below_data_qvap7, mean_above_data_qvap7, split_perc=0.6, return_label=False)

mean_train_pres1, mean_test_pres1 = create_traintest_data(mean_below_data_pres1, mean_above_data_pres1, split_perc=0.6, return_label=False)
mean_train_pres3, mean_test_pres3 = create_traintest_data(mean_below_data_pres3, mean_above_data_pres3, split_perc=0.6, return_label=False)
mean_train_pres5, mean_test_pres5 = create_traintest_data(mean_below_data_pres5, mean_above_data_pres5, split_perc=0.6, return_label=False)
mean_train_pres7, mean_test_pres7 = create_traintest_data(mean_below_data_pres7, mean_above_data_pres7, split_perc=0.6, return_label=False)

In [82]:
data_scaled_train_temp1 = standardize_scale_apply(mean_train_temp1)
data_scaled_train_temp3 = standardize_scale_apply(mean_train_temp3)
data_scaled_train_temp5 = standardize_scale_apply(mean_train_temp5)
data_scaled_train_temp7 = standardize_scale_apply(mean_train_temp7)

data_scaled_train_evwd1 = standardize_scale_apply(mean_train_evwd1)
data_scaled_train_evwd3 = standardize_scale_apply(mean_train_evwd3)
data_scaled_train_evwd5 = standardize_scale_apply(mean_train_evwd5)
data_scaled_train_evwd7 = standardize_scale_apply(mean_train_evwd7)

data_scaled_train_euwd1 = standardize_scale_apply(mean_train_euwd1)
data_scaled_train_euwd3 = standardize_scale_apply(mean_train_euwd3)
data_scaled_train_euwd5 = standardize_scale_apply(mean_train_euwd5)
data_scaled_train_euwd7 = standardize_scale_apply(mean_train_euwd7)

data_scaled_train_qvap1 = standardize_scale_apply(mean_train_qvap1)
data_scaled_train_qvap3 = standardize_scale_apply(mean_train_qvap3)
data_scaled_train_qvap5 = standardize_scale_apply(mean_train_qvap5)
data_scaled_train_qvap7 = standardize_scale_apply(mean_train_qvap7)

data_scaled_train_pres1 = standardize_scale_apply(mean_train_pres1)
data_scaled_train_pres3 = standardize_scale_apply(mean_train_pres3)
data_scaled_train_pres5 = standardize_scale_apply(mean_train_pres5)
data_scaled_train_pres7 = standardize_scale_apply(mean_train_pres7)

In [83]:
data_scaled_test_temp1 = standardize_scale_apply_test(mean_train_temp1, mean_test_temp1)
data_scaled_test_temp3 = standardize_scale_apply_test(mean_train_temp1, mean_test_temp3)
data_scaled_test_temp5 = standardize_scale_apply_test(mean_train_temp1, mean_test_temp5)
data_scaled_test_temp7 = standardize_scale_apply_test(mean_train_temp1, mean_test_temp7)

data_scaled_test_evwd1 = standardize_scale_apply_test(mean_train_evwd1, mean_test_evwd1)
data_scaled_test_evwd3 = standardize_scale_apply_test(mean_train_evwd3, mean_test_evwd3)
data_scaled_test_evwd5 = standardize_scale_apply_test(mean_train_evwd5, mean_test_evwd5)
data_scaled_test_evwd7 = standardize_scale_apply_test(mean_train_evwd7, mean_test_evwd7)

data_scaled_test_euwd1 = standardize_scale_apply_test(mean_train_euwd1, mean_test_euwd1)
data_scaled_test_euwd3 = standardize_scale_apply_test(mean_train_euwd3, mean_test_euwd3)
data_scaled_test_euwd5 = standardize_scale_apply_test(mean_train_euwd5, mean_test_euwd5)
data_scaled_test_euwd7 = standardize_scale_apply_test(mean_train_euwd7, mean_test_euwd7)

data_scaled_test_qvap1 = standardize_scale_apply_test(mean_train_qvap1, mean_test_qvap1)
data_scaled_test_qvap3 = standardize_scale_apply_test(mean_train_qvap3, mean_test_qvap3)
data_scaled_test_qvap5 = standardize_scale_apply_test(mean_train_qvap5, mean_test_qvap5)
data_scaled_test_qvap7 = standardize_scale_apply_test(mean_train_qvap7, mean_test_qvap7)

data_scaled_test_pres1 = standardize_scale_apply_test(mean_train_pres1, mean_test_pres1)
data_scaled_test_pres3 = standardize_scale_apply_test(mean_train_pres3, mean_test_pres3)
data_scaled_test_pres5 = standardize_scale_apply_test(mean_train_pres5, mean_test_pres5)
data_scaled_test_pres7 = standardize_scale_apply_test(mean_train_pres7, mean_test_pres7)

In [84]:
X_train = np.hstack([data_scaled_train_temp1, data_scaled_train_temp3, data_scaled_train_temp5, data_scaled_train_temp7,                     
                     data_scaled_train_evwd1, data_scaled_train_evwd3, data_scaled_train_evwd5, data_scaled_train_evwd7,     
                     data_scaled_train_euwd1, data_scaled_train_euwd3, data_scaled_train_euwd5, data_scaled_train_euwd7,
                     data_scaled_train_qvap1, data_scaled_train_qvap3, data_scaled_train_qvap5, data_scaled_train_qvap7,
                     data_scaled_train_pres1, data_scaled_train_pres3, data_scaled_train_pres5, data_scaled_train_pres7
])

In [85]:
X_test = np.hstack([data_scaled_test_temp1, data_scaled_test_temp3, data_scaled_test_temp5, data_scaled_test_temp7,
                    data_scaled_test_evwd1, data_scaled_test_evwd3, data_scaled_test_evwd5, data_scaled_test_evwd7,
                    data_scaled_test_euwd1, data_scaled_test_euwd3, data_scaled_test_euwd5, data_scaled_test_euwd7,
                    data_scaled_test_qvap1, data_scaled_test_qvap3, data_scaled_test_qvap5, data_scaled_test_qvap7,
                    data_scaled_test_pres1, data_scaled_test_pres3, data_scaled_test_pres5, data_scaled_test_pres7
])

In [86]:
data_assemble = xr.Dataset({
    'X_train':(['a','features'], X_train),
    'X_train_label':(['a'], train_label),
    'X_test':(['b','features'], X_test),
    'X_test_label':(['b'], test_label),
    },
     coords=
    {'feature':(['features'],np.array(["tk_1km", "tk_3km", "tk_5km", "tk_7km",
                                       "ev_1km", "ev_3km", "ev_5km", "ev_7km",
                                       "eu_1km", "eu_3km", "eu_5km", "eu_7km",
                                       "pr_1km", "pr_3km", "pr_5km", "pr_7km",
                                       "qv_1km", "qv_3km", "qv_5km", "qv_7km"])),
    })

In [87]:
data_assemble

<xarray.Dataset>
Dimensions:        (a: 39940, b: 548339, features: 20)
Coordinates:
    feature        (features) <U6 'tk_1km' 'tk_3km' ... 'qv_5km' 'qv_7km'
Dimensions without coordinates: a, b, features
Data variables:
    X_train        (a, features) float32 0.5012527 0.7719492 ... 0.17607747
    X_train_label  (a) float64 1.0 1.0 1.0 0.0 1.0 1.0 ... 1.0 1.0 0.0 0.0 1.0
    X_test         (b, features) float32 0.40386754 -1.7664597 ... -0.6877184
    X_test_label   (b) float64 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0

In [108]:
data_assemble.to_netcdf(f"/glade/scratch/molina/WRF_CONUS1_derived/logistic_regression/{which_climate}_meanpatch_traintestdata_unbalanced.nc")