In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: marcodia
"""
import numpy as np
import random
import xarray as xr
import pandas as pd
import datetime as dt
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
import tensorflow as tf

import import_ipynb
import sys
import os 

import network_arch as network
import metrics
import plot
import settings 
import functions_misc as fnc

In [None]:
# # BUILD AND COMPILE THE NN
def make_model():
    """Build and compile a fresh classifier from the current module-level settings.

    Reads the module-level names HIDDENS, X_train, NLABEL, RIDGE1, DROPOUT,
    NETWORK_SEED and LR_INIT at call time, so those must be set before calling.
    The model is only built and compiled here; fitting is done by the caller.

    Returns
    -------
    tuple
        ``(model, loss_function)``: the compiled model returned by
        ``network.defineNN`` and the categorical cross-entropy loss object
        it was compiled with.
    """
    # Drop any Keras state left over from a previously built model.
    tf.keras.backend.clear_session()

    # NOTE(review): the activation is fixed to 'relu' here; the `act_fun` entry
    # read from the settings is not used by this function -- confirm intended.
    architecture = dict(
        input1_shape=X_train.shape[1],
        output_shape=NLABEL,
        ridge_penalty1=RIDGE1,
        dropout=DROPOUT,
        act_fun='relu',
        network_seed=NETWORK_SEED,
    )
    model = network.defineNN(HIDDENS, **architecture)

    loss_function = tf.keras.losses.CategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(learning_rate=LR_INIT)
    tracked_metrics = [
        tf.keras.metrics.CategoricalAccuracy(name="categorical_accuracy", dtype=None),
        metrics.PredictionAccuracy(NLABEL),
    ]
    model.compile(optimizer=optimizer, loss=loss_function, metrics=tracked_metrics)

    return model, loss_function

# #---------------------------------------------------
# #LEARNING RATE CALLBACK FUNCTION
# def scheduler(epoch, lr):
#     # This function keeps the initial learning rate for the first ten epochs
#     # and decreases it exponentially after that.
#     if epoch < 10:
#         return lr
#     else:
#         return lr * tf.math.exp(-0.1)

In [None]:
# Experiment identifier: selects the settings entry and names the output folder.
EXPERIMENT = 'exp_0/exp_000'

# Input folders: predictor fields (X) and predictand fields (Y).
ddir_X = '/Users/marcodia/Research/Data/global_daily_anomalies/'
ddir_Y = '/Users/marcodia/Research/Data/processed_fields/precip_data/'

# Output folder is derived from EXPERIMENT so that changing the experiment
# cannot leave results being written into another experiment's directory.
ddir_out = '/Users/marcodia/Research/salinity_s2s/experiments/' + EXPERIMENT + '/'

In [None]:

# Fetch this experiment's settings and expose each entry as a module-level name.
params = settings.get_settings(EXPERIMENT)

# Predictor / predictand variables and their regions.
PREDICTOR_VAR, PREDICTAND_VAR, REGION_TOR, REGION_TAND = (
    params[key] for key in ('PREDICTOR_VAR', 'PREDICTAND_VAR', 'REGION_TOR', 'REGION_TAND')
)

# Ensemble members assigned to each data split.
training_ens, validation_ens, testing_ens, train_list = (
    params[key] for key in ('training_ens', 'validation_ens', 'testing_ens', 'train_list')
)

# Offsets (in days) between predictor and predictand timestamps.
lead, days_average = (params[key] for key in ('lead', 'days_average'))

# Seed for numpy / random / tensorflow.
GLOBAL_SEED = params['GLOBAL_SEED']

# Network and optimisation hyperparameters.
(HIDDENS, DROPOUT, RIDGE1, LR_INIT, BATCH_SIZE,
 RANDOM_SEED, act_fun, N_EPOCHS, PATIENCE) = (
    params[key] for key in ('HIDDENS', 'DROPOUT', 'RIDGE1', 'LR_INIT', 'BATCH_SIZE',
                            'RANDOM_SEED', 'act_fun', 'N_EPOCHS', 'PATIENCE')
)

# Not part of the settings: appears in the predictand file name ('<window_size>daysum').
window_size = 5

In [None]:
#>>>>> SET UP: seeds, label count and time axes <<<<<
# Seed every random number generator used in this notebook with the same value.
for seed_everything in (np.random.seed, random.seed, tf.compat.v1.random.set_random_seed):
    seed_everything(GLOBAL_SEED)

# Number of output classes.
NLABEL = 2

# Period covered by the data files (YEARS is part of the file names).
YEARS = '1850-1949'
STRT = pd.to_datetime('05-01-1850')
END = pd.to_datetime('08-31-1949') + dt.timedelta(days=1)

# Daily no-leap calendar between the two dates (YYYY-MM-DD strings), then
# reduced to the entries selected by fnc.is_mjja.
# NOTE(review): is_mjja presumably flags May-August months -- confirm in functions_misc.
first_day = STRT.date().isoformat()
last_day = END.date().isoformat()
time_range = xr.cftime_range(first_day, last_day, calendar='noleap')
in_season = fnc.is_mjja(time_range.month)
time_range_szn = time_range.where(in_season).dropna()

# Predictor timestamps are the seasonal dates themselves; predictand timestamps
# are the same dates shifted forward by (lead + days_average) days.
predictor_shift = dt.timedelta(days=0)
predictand_shift = dt.timedelta(days=lead + days_average)
TIME_X = xr.DataArray(time_range_szn + predictor_shift, dims=['time'])
TIME_Y = xr.DataArray(time_range_szn + predictand_shift, dims=['time'])

In [None]:
# ----- X TRAINING ------
# Load each training ensemble member, keep only the time steps equal to TIME_X,
# and collect all members in one (ens, time, lat, lon) array.
X_all = None
for member_idx, member in enumerate(train_list):
    # str(member) so the file name can be built whether train_list holds strings
    # or integers (the other file names in this notebook are built the same way).
    X_finame = PREDICTOR_VAR+'_'+REGION_TOR+'_'+YEARS+'_'+'ens'+str(member)+'_dailyanom_detrend.nc'
    X_all_full = xr.open_dataarray(ddir_X+X_finame)
    X = X_all_full.where(X_all_full.time == TIME_X, drop=True)
    del X_all_full

    if X_all is None:
        # Allocate the NaN-filled container once, using the first member's
        # shape and coordinates (coordinates converted to numpy, as before).
        X_all = xr.DataArray(np.full((len(train_list),) + X.shape, np.nan),
                             dims = ['ens','time','lat','lon'],
                             coords = [('ens', np.arange(0, len(train_list))),
                                       ('time', np.array(X.time)),
                                       ('lat', np.array(X.lat)),
                                       ('lon', np.array(X.lon))])

    X_all[member_idx,:,:,:] = X
    del X

# Merge (ens, time) into one sample dimension and put it first.
Xtrain = X_all.stack(time_all=('ens','time'))      # lat, lon, time_all
Xtrain = Xtrain.transpose('time_all','lat','lon')  # time_all, lat, lon

# Standardise every grid cell with the training mean and standard deviation.
Xtrain_std = np.std(Xtrain,axis=0)
Xtrain_mean = np.mean(Xtrain,axis=0)
Xtrain = (Xtrain-Xtrain_mean)/Xtrain_std
# Flatten (lat, lon) into a single feature dimension `z`.
X_train = Xtrain.stack(z=('lat','lon'))

# ---------- X VALIDATION----------
X_finame  = PREDICTOR_VAR+'_'+REGION_TOR+'_'+YEARS+'_'+'ens'+str(validation_ens)+'_dailyanom_detrend.nc'
Xval = xr.open_dataarray(ddir_X+X_finame)

Xval= Xval.where(Xval.time == TIME_X, drop=True)
# Validation data is standardised with the TRAINING statistics.
Xval_unstack = (Xval - Xtrain_mean)/Xtrain_std

Xval = Xval_unstack.stack(z=('lat','lon'))

In [None]:
#Xval_unstack

In [None]:
# Remove every feature column (`z`) that contains a NaN at any time step.
X_train = X_train.dropna(dim='z', how = 'any')
X_val = Xval.dropna(dim='z', how = 'any')

# The two drops are independent, so verify that both sets kept exactly the same
# grid cells; otherwise the network would be validated on misaligned inputs.
assert X_train.indexes['z'].equals(X_val.indexes['z']), (
    "training and validation predictors kept different feature columns after dropna"
)

In [None]:
#%% ----- Y TRAINING--------
# Load the predictand of every training ensemble member, keep the time steps
# equal to TIME_Y, and gather the members in one (ens, time) array.
n_members = len(train_list)
Y_all = None
for member_idx, member in enumerate(train_list):
    Ytrain_finame = PREDICTAND_VAR+'_'+REGION_TAND+'_'+YEARS+'_ens'+str(member)+'_'+str(window_size)+'daysum.nc'

    Y_all_full = xr.open_dataarray(ddir_Y+Ytrain_finame)
    Y = Y_all_full.where(Y_all_full.time == TIME_Y, drop=True)
    del Y_all_full

    if Y_all is None:
        # NaN-filled container, allocated once from the first member's time axis.
        Y_all = xr.DataArray(np.full((n_members, Y.shape[0]), np.nan),
                             dims = ['ens','time'],
                             coords = [('ens', np.arange(0, n_members)),
                                       ('time', np.array(Y.time))])

    Y_all[member_idx,:] = Y
 

In [None]:
# Merge (ens, time) into one sample dimension `time_all`, the same stacking
# that is applied to the training predictors.
Y_use = Y_all.stack(time_all=('ens','time'))

In [None]:
# Inspect the stacked training predictand.
Y_use

In [None]:
# Binary training labels: 1 where the predictand is at or above the 80th
# percentile of all stacked training samples, 0 otherwise.
light_val = np.percentile(Y_use, 80)
at_or_above_threshold = Y_use >= light_val
Ytrain_class = at_or_above_threshold.astype(int)

In [None]:
#Ytrain = (np.array(output_class).reshape(-1,1) == np.unique(output_class)).astype(int)


In [None]:
# Share of the training labels (Ytrain_class) that falls into each class.
def calcpercent(cat):
    """Return the percentage of Ytrain_class equal to `cat`, truncated to an integer, as a string."""
    n_in_class = np.sum(np.array(Ytrain_class) == cat)
    percent = n_in_class / len(Ytrain_class) * 100
    return str(percent.astype(int))

# Print out the sizes of each class
print('Frequency for each Precip Category')
for class_label, class_id in (('Light: ', 0), ('Heavy: ', 1)):
    print(class_label + calcpercent(class_id) + '%')

In [None]:
# # Distribution of precip concentrations

# fig, axs = plt.subplots(2, 4, figsize = (15,8))

# for m in np.arange(0,8):
#     ax = axs[m//4,m%4]
# #    sb.displot(Y_all[m,:], kind='hist')
#     sb.histplot(Y_all[m,:], color='black', ax=ax)
#     ax.set(xticks=(np.arange(0,55,step=5)))

#     ax.set_xlabel('mm')
#     Y_use = Y_all[m,:]
#     light_val = np.percentile(Y_use, 40)
#     mod_val   = np.percentile(Y_use,80)
    
#     ax.axvline(x=light_val, color='goldenrod')
#     ax.axvline(x=mod_val, color='red')
#     ax.set_title('Training Ensemble Member '+str(m+1))
#     ax.text(8, 50, 'Light', rotation=90, color='goldenrod')
#     ax.text(16, 50, 'Moderate', rotation=90, color='red')

# fig.tight_layout(pad=1.0)
# print('Histograms of Midwest Summer Precip 5-day Sums')



In [None]:
# ----- Y VALIDATION --------
# Load the validation member's predictand and keep the time steps equal to TIME_Y.
Yval_finame = PREDICTAND_VAR+'_'+REGION_TAND+'_'+YEARS+'_ens'+str(validation_ens)+'_'+str(window_size)+'daysum.nc'

Y_all_full = xr.open_dataarray(ddir_Y+Yval_finame)
Y = Y_all_full.where(Y_all_full.time == TIME_Y, drop=True)

# Binary validation labels: 1 at or above the 80th percentile, 0 otherwise.
# NOTE(review): the threshold is recomputed from the validation member itself
# (overwriting the training `light_val`) instead of reusing the training
# threshold -- confirm this is intended.
light_val = np.percentile(Y, 80)
Yval_class = (Y >= light_val).astype(int)
# One-hot version of the labels: one column per distinct class value.
Yval = (np.array(Yval_class).reshape(-1,1) == np.unique(Yval_class)).astype(int)

# Percentage (truncated to an integer, as a string) of validation labels equal to `cat`.
calcpercent = lambda cat: str((np.sum(np.array(Yval_class) == cat)/len(Yval_class)*100).astype(int))

# Print out the sizes of each class (the header previously said "Ozone",
# a copy-paste leftover; the predictand here is precipitation).
print('Frequency for each Precip Category')
print('Light: ' + calcpercent(0) + '%')
print('Heavy: ' + calcpercent(1) + '%')

In [None]:
# Inspect the binary training labels.
Ytrain_class

In [None]:
# from collections import Counter

# from imblearn.over_sampling import RandomOverSampler
# over_sampler = RandomOverSampler(random_state=42)
# X_bal, Y_bal = over_sampler.fit_resample(X_train, Ytrain_class)
# print(f"Training target statistics: {Counter(Y_bal)}")
# #print(f"Validation target statistics: {Counter(y_test)}")

In [None]:
# # How often does our data fall into each category? This is just for the last ensemble member in training
# calcpercent = lambda cat: str((np.sum(np.array(Y_bal) == cat)/len(Y_bal)*100).astype(int))

# # Print out the sizes of each class
# print('Frequency for each Category')
# print('Light: ' + calcpercent(0) + '%')

# print('Moderate: ' + calcpercent(1) + '%')
# print('Heavy: ' + calcpercent(2) + '%')

In [None]:
### Class weights: how strongly the loss counts each class (passed to model.fit).
# Fixed weights are used here. An inverse-frequency alternative would be, for
# each class k:  1 / np.mean(Ytrain[:, k] == 1)
CLASS_WEIGHT = {
    0: 1,
    1: 3,
}

In [None]:
# Show the class weights that will be handed to model.fit.
CLASS_WEIGHT

In [None]:
# ----- Make one hot vector -----
# The encoder is fitted on the training labels only and then re-used for the
# validation labels, so both one-hot arrays share the same column <-> class
# mapping even if a class happened to be missing from the validation labels.
enc = preprocessing.OneHotEncoder()
onehotlabels      = enc.fit_transform(np.array(Ytrain_class).reshape(-1, 1)).toarray()
onehotlabels_val  = enc.transform(np.array(Yval_class).reshape(-1, 1)).toarray()

In [None]:
# Shape of the one-hot training labels (rows should match X_train).
onehotlabels.shape

In [None]:
# Shape of the training predictors.
X_train.shape

In [None]:
# Shape of the one-hot validation labels (rows should match X_val).
onehotlabels_val.shape

In [None]:
# Shape of the validation predictors.
X_val.shape

In [None]:
# Folder that model weights and figures are written to.
ddir_out


In [None]:
# Values explored by the training loop (every combination is trained).
dropout = [0.0, 0.1, 0.3, 0.7]   # dropout settings
ridge = [0.0, 0.1, 0.5, 1.0]     # ridge penalties
batch = [64]                     # batch sizes

# These two replace the values that were read from the settings.
HIDDENS = [64, 32]
RANDOM_SEED = [98]               # network seeds to loop over

In [None]:
# -------------------- TRAIN NN --------------------
# Train one network for every (seed, dropout, ridge, batch size) combination.
# The loop variables are module-level names that make_model() reads.
for NETWORK_SEED in RANDOM_SEED:
    for DROPOUT in dropout:
        for RIDGE1 in ridge:
            for BATCH_SIZE in batch:
                print(NETWORK_SEED)

                # The network seed changes the random seed for the initialized weights,
                # so a different seed can give a different result (e.g. a different
                # minimum in the loss).
                # ----- MAKE NN -----
                # Stop when the validation loss has not improved for PATIENCE epochs
                # and roll the weights back to the best epoch.
                es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                               patience=PATIENCE,
                                                               mode='auto',
                                                               restore_best_weights=True,
                                                               verbose=1)
                callbacks = [es_callback]

                model, loss_function = make_model()

                # Keep only as many one-hot columns as the network has outputs.
                hotlabels = onehotlabels[:,:model.output_shape[-1]]
                hotlabels_val = onehotlabels_val[:,:model.output_shape[-1]]

                # ----- TRAINING NETWORK -----
                start_time = time.time()
                history = model.fit(X_train,
                                    hotlabels,
                                    validation_data=(X_val, hotlabels_val),
                                    class_weight = CLASS_WEIGHT,
                                    batch_size=BATCH_SIZE,
                                    epochs=N_EPOCHS,
                                    shuffle=True,
                                    verbose=0,
                                    callbacks=callbacks,
                                   )
                stop_time = time.time()
                tf.print(f"Elapsed time during fit = {stop_time - start_time:.2f} seconds\n")

                # ----- SAVE MODEL -----
                fi = '_operationalseed'+str(NETWORK_SEED)+str(DROPOUT)+str(RIDGE1)+str(BATCH_SIZE)+'.h5' 
                model.save_weights(ddir_out+fi)

                # ----- PLOT THE RESULTS -----
                plot.plot_results(
                    history,
                    exp_info=(N_EPOCHS, HIDDENS, LR_INIT, BATCH_SIZE, NETWORK_SEED, PATIENCE, RIDGE1, DROPOUT, CLASS_WEIGHT),
                    showplot=True
                )

                # ----- PRINT THE RESULTS -----
                predictions = np.argmax(model.predict(X_val),axis=-1)
                predictions_training = np.argmax(model.predict(X_train),axis=-1)
                confusion_training = tf.math.confusion_matrix(labels=Ytrain_class, predictions=predictions_training)
                confusion = tf.math.confusion_matrix(labels=Yval_class, predictions=predictions)
                # Precision per class: correct predictions of the class divided by
                # all predictions of that class (a column of the confusion matrix).
                zero_precision  = (np.sum(confusion[0,0])/np.sum(confusion[:,0])) * 100
                one_precision   = (np.sum(confusion[1,1])/np.sum(confusion[:,1])) * 100

                # Number of times network predicts a given class
                zero_predictions  = (np.shape(np.where(predictions==0))[1]/predictions.shape[0])* 100
                one_predictions   = (np.shape(np.where(predictions==1))[1]/predictions.shape[0])* 100

                # Format the numbers instead of slicing their string form, which
                # printed e.g. 100.0 as "10%" and 9.5 as "9.%".
                print(f'Zero prediction accuracy: {zero_precision:.0f}%')
                print(f'Zero: {zero_predictions:.1f}% of predictions')
                print(f'One prediction accuracy: {one_precision:.0f}%')
                print(f'One: {one_predictions:.1f}% of predictions')

                print('Validation Loss at Best Epoch: '+str(es_callback.best*1))

                # ----- END LOOP -----

In [None]:
# ----- PRINT THE RESULTS -----
# Same report as inside the training loop, for the model left in `model`
# (the last one trained).
predictions = np.argmax(model.predict(X_val),axis=-1)
predictions_training = np.argmax(model.predict(X_train),axis=-1)
confusion_training = tf.math.confusion_matrix(labels=Ytrain_class, predictions=predictions_training)
confusion = tf.math.confusion_matrix(labels=Yval_class, predictions=predictions)
# Precision per class: correct predictions of the class divided by all
# predictions of that class (a column of the confusion matrix).
zero_precision  = (np.sum(confusion[0,0])/np.sum(confusion[:,0])) * 100
one_precision   = (np.sum(confusion[1,1])/np.sum(confusion[:,1])) * 100

# Number of times network predicts a given class
zero_predictions  = (np.shape(np.where(predictions==0))[1]/predictions.shape[0])* 100
one_predictions   = (np.shape(np.where(predictions==1))[1]/predictions.shape[0])* 100

# Format the numbers instead of slicing their string form, which printed
# e.g. 100.0 as "10%" and 9.5 as "9.%".
print(f'Zero prediction accuracy: {zero_precision:.0f}%')
print(f'Zero: {zero_predictions:.1f}% of predictions')
print(f'One prediction accuracy: {one_precision:.0f}%')
print(f'One: {one_predictions:.1f}% of predictions')


In [None]:
# Inspect the one-hot training labels.
onehotlabels

In [None]:
# Model output for the training and validation predictors:
# one row per sample, one likelihood per class.
Ptrain = model.predict(X_train)
Pval = model.predict(X_val)

# Predicted class = column holding the largest likelihood in each row.
Cptrain = np.argmax(Ptrain, axis=1)
Cpval = np.argmax(Pval, axis=1)

# True class = position of the 1 in each row of the full one-hot labels.
Cttrain = np.argmax(onehotlabels, axis=1)
Ctval = np.argmax(onehotlabels_val, axis=1)


In [None]:
from sklearn.metrics import f1_score, accuracy_score
print('Validation Categorical Accuracy:', accuracy_score(Ctval, Cpval) )

# Weight equal to the inverse of the frequency of the class.
# The frequencies come from the one-hot TRAINING labels (one value per class);
# multiplying by the one-hot validation labels and summing across the class
# axis picks, for every validation sample, the weight of its true class.
# (The previous version used the predictor matrices X_train / X_val here,
# which does not give one weight per validation sample.)
class_frequency = np.mean(onehotlabels, axis=0)
cat_weights = np.sum((1 / class_frequency) * onehotlabels_val, axis=1)
print('Validation Weighted Categorical Accuracy:', accuracy_score(Ctval, Cpval, sample_weight=cat_weights) )

In [None]:
def confusion_matrix(predclasses, targclasses):
    """Display a predicted-versus-target table of percentages.

    The cell in row ``P(p)`` and column ``T(t)`` is the percentage of the
    samples whose target class is ``t`` that were predicted as class ``p``.
    Only classes that occur in ``targclasses`` get a row and a column.

    Parameters
    ----------
    predclasses : array-like of int
        Predicted class of every sample.
    targclasses : array-like of int
        True class of every sample (same length as ``predclasses``).

    Returns
    -------
    None
        The styled table is shown with ``display``.
    """
    predclasses = np.asarray(predclasses)
    targclasses = np.asarray(targclasses)
    class_names = np.unique(targclasses)

    table = [
        [100 * np.mean(predclasses[targclasses == true_class] == pred_class)
         for true_class in class_names]
        for pred_class in class_names
    ]

    # Titles must match the number of classes actually present: a fixed list
    # of three titles made pandas raise for the two-class problem.
    known_labels = {2: ["Light", "Heavy"],
                    3: ["Light", "Moderate", "Heavy"]}
    labels = known_labels.get(len(class_names), [str(name) for name in class_names])
    class_titles_t = ["T(" + label + ")" for label in labels]
    class_titles_p = ["P(" + label + ")" for label in labels]

    conf_matrix = pd.DataFrame(table, index=class_titles_p, columns=class_titles_t)
    display(conf_matrix.style.background_gradient(cmap='Blues').format("{:.1f}"))

In [None]:
# Class likelihoods from the current model for the training and validation predictors.
Ptrain = model.predict(X_train)
Pval = model.predict(X_val)

# Predicted class per sample: index of the largest likelihood along axis 1.
Cptrain = np.argmax(Ptrain, axis=1)
Cpval = np.argmax(Pval, axis=1)

# True class per sample, taken from the one-hot labels used for fitting.
Cttrain = np.argmax(hotlabels, axis=1)
Ctval = np.argmax(hotlabels_val, axis=1)

In [None]:
# Show the predicted-vs-target percentage table for each data split.
print("Predicted versus Target Classes")
print()
for split_title, split_predicted, split_target in (
    ("Training", Cptrain, Cttrain),
    ("Validation", Cpval, Ctval),
):
    print(split_title)
    confusion_matrix(split_predicted, split_target)

In [None]:
# Confusion matrix (counts) of the current model on the validation data,
# drawn as an annotated heat map and saved to the output folder.
val_likelihoods = model.predict(X_val)
predictions = np.argmax(val_likelihoods, axis=-1)
confusion = tf.math.confusion_matrix(labels=Yval_class, predictions=predictions)

heatmap_ax = sns.heatmap(confusion, annot=True, cmap=plt.cm.Reds, alpha=0.5, fmt='g')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.figure.savefig(ddir_out+'confusion_matrix_lastseed.png', format='png')