In [2]:
import import_ipynb
import settings
from netCDF4 import Dataset
import h5py

importing Jupyter notebook from settings.ipynb


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Marybeth Arcodia 
"""

import numpy as np
import xarray as xr

import matplotlib.pyplot as plt
from numpy.polynomial import polynomial
import import_ipynb
import sys
import os 

import pandas as pd
import datetime as dt
import time
import matplotlib.pyplot as plt
import seaborn as sns
import random
import tensorflow as tf


#%% Filter
########################################
#interp2.5_hist_1850-1949_PRECT_0.nc
########################################
# Presumably the number of output classes -- not referenced in the visible cells; confirm.
NLABEL = 2

# Year span embedded in the input and output file names.
YEARS = '1850-1949'

# Analysis window: 1 Nov 1850 up to the day after 28 Feb 1949.
STRT = pd.Timestamp(year=1850, month=11, day=1)
END = pd.Timestamp(year=1949, month=2, day=28) + dt.timedelta(days=1)

2023-05-08 15:14:21.056325: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
#-- Pull Parameters from settings.ipynb ------
exp_num = 1

# One pass per sub-experiment: ids exp_num*100 through exp_num*100+9.
for sub_exp in np.arange(exp_num*100, exp_num*100+10):
    FOLDER = f'exp_{exp_num}'
    EXPERIMENT = f'{FOLDER}/exp_{sub_exp}'

    print(f"Experiment {sub_exp}")

    # NOTE(review): DIRECTORY_IN and DIRECTORY_OUT are not assigned anywhere in
    # the visible cells; they must be defined before this cell can run.
    ddir_in = DIRECTORY_IN
    ddir_out = DIRECTORY_OUT
    params = settings.get_settings(EXPERIMENT)

    # Variable names and region tags used to build file names.
    (PREDICTOR_VAR, PREDICTAND_VAR,
     REGION_TOR, REGION_TAND) = (
        params[key] for key in
        ('PREDICTOR_VAR', 'PREDICTAND_VAR', 'REGION_TOR', 'REGION_TAND'))

    # Ensemble-member split; train_list / train_val_list are string tags that
    # get concatenated into output file names further down.
    (training_ens, validation_ens, testing_ens,
     train_list, train_val_list) = (
        params[key] for key in
        ('training_ens', 'validation_ens', 'testing_ens',
         'train_list', 'train_val_list'))

    # days_average is the running-mean window used below; lead is not used in this cell.
    lead, days_average = params['lead'], params['days_average']

    # Remaining settings; only GLOBAL_SEED is used in this cell.
    (GLOBAL_SEED, HIDDENS, DROPOUT, RIDGE1, LR_INIT,
     BATCH_SIZE, RANDOM_SEED, act_fun, N_EPOCHS, PATIENCE) = (
        params[key] for key in
        ('GLOBAL_SEED', 'HIDDENS', 'DROPOUT', 'RIDGE1', 'LR_INIT',
         'BATCH_SIZE', 'RANDOM_SEED', 'act_fun', 'N_EPOCHS', 'PATIENCE'))

#>>>>>SET UP <<<<<<<<<<<<<<<
    # Seed NumPy, the stdlib RNG and TensorFlow with the same value.
    np.random.seed(GLOBAL_SEED)
    random.seed(GLOBAL_SEED)
    tf.compat.v1.random.set_random_seed(GLOBAL_SEED)

    # Members that feed the ensemble mean (training + validation), and the
    # members that get anomalies computed (those plus testing).
    ens_calc_members = np.hstack((training_ens, validation_ens))
    anom_calc_members = np.hstack((ens_calc_members, testing_ens))
    print(ens_calc_members)
    print(anom_calc_members)
    
    # Stack every training+validation member along a new 'ens' axis so the
    # ensemble mean can be taken in one call.
    #
    # The 'ens' axis is sized from ens_calc_members (the members actually
    # looped over). train_val_list is concatenated into the output file name
    # below as a string, so len(train_val_list) is a character count, not a
    # member count, and could be too small for the fill loop.
    n_members = len(ens_calc_members)
    for count, i in enumerate(ens_calc_members):
        infile = 'interp2.5_LE2-'+str(i)+'.'+PREDICTOR_VAR+'.'+REGION_TOR+'.'+YEARS+'.nc'
        # Context manager closes the file once this member has been copied.
        with xr.open_dataset(ddir_in+infile) as member_ds:
            X = member_ds[PREDICTOR_VAR]   # just the data variable, without time_bnds

            if count == 0:  # allocate the NaN-filled container once, from the first member
                # Coordinates are converted to NumPy before building the DataArray.
                X_nptime = np.array(X.time)
                X_nplat = np.array(X.lat)
                X_nplon = np.array(X.lon)
                X_all = xr.DataArray(np.full((n_members, X.shape[0], X.shape[1], X.shape[2]), np.nan),
                                     dims = ['ens','time','lat','lon'],
                                     coords = [('ens',np.arange(n_members)+1),('time', X_nptime),('lat',X_nplat),('lon',X_nplon)])

            if PREDICTOR_VAR == 'PRECT':
                # m/s * 1000mm/m * 86400s/day
                X_all[count,:,:,:] = X * 1000. * 86400.
            else:
                X_all[count,:,:,:] = X
        del X, infile

    #Take mean over ensembles & save
    X_all_mean = X_all.mean('ens',skipna=True)
    X_all_mean.to_netcdf(ddir_out+'ensmean-'+train_val_list+'_'+REGION_TOR+'_'+YEARS+'_'+PREDICTOR_VAR+'.nc')
    
### Calculate Anomalies by Subtracting Ensemble Mean
    del X_nptime, X_nplat, X_nplon, X_all
    del count, i 
    
    ensmean = X_all_mean#xr.open_dataset(ddir_out+'ensmean-'+train_val_list+'_'+REGION_TOR+'_'+YEARS+'_'+PREDICTOR_VAR+'.nc')['__xarray_dataarray_variable__']
    ens_stacked = ensmean.stack(z=('lat','lon'))
    
    del X_all_mean

    #To be able to save all anomaly data in 1 file for training
    X_nptime = np.array(ensmean.time)                 #for some annoying reason, it needed to be converted to numpy for creating DataArray   
    X_nplat = np.array(ensmean.lat)
    X_nplon = np.array(ensmean.lon)
    anom_concat = xr.DataArray(np.zeros((len(training_ens),ensmean.shape[0],ensmean.shape[1],ensmean.shape[2]))+np.nan,
                                dims = ['ens','time','lat','lon'],
                                coords = [('ens',np.arange(len(training_ens))),('time', X_nptime),('lat',X_nplat),('lon',X_nplon)])
    del X_nptime, X_nplat, X_nplon

#Calculate Daily Anomaly by subtracting the climatology from each grid point, for each day 
# For every member (training, validation and testing) and every calendar day,
# a straight line is fitted to the ENSEMBLE-MEAN values of that calendar day
# across years; the fitted line is then subtracted from the member's values
# for the same calendar day. Each member's result is written to its own file,
# and the first len(training_ens) members are also collected in anom_concat.

    count = 0 
    for i in anom_calc_members:
        infile = 'interp2.5_LE2-'+str(i)+'.'+PREDICTOR_VAR+'.'+REGION_TOR+'.'+YEARS+'.nc'     

        outfile_singles = PREDICTOR_VAR+'_'+REGION_TOR+'_'+YEARS+'_ens'+str(i)+'_dailyanom_detrend.nc'

        # NOTE(review): the dataset opened here is never explicitly closed.
        X = xr.open_dataset(ddir_in+infile)[PREDICTOR_VAR]
        if PREDICTOR_VAR == 'PRECT':
            X = X * 1000. * 86400. # m/s * 1000mm/m * 86400s/day = [mm/day]

        # Flatten lat/lon into one 'z' dimension to match ens_stacked.
        Xstacked = X.stack(z=('lat', 'lon'))
        del X
        
        # Day-of-year label for every time step of this member.
        temp = Xstacked['time.dayofyear']
        #climo_full = []
        anom = []
        for label,ens_group in ens_stacked.groupby('time.dayofyear'):

            Xgroup = Xstacked.where(temp == label, drop = True)     #group all Jan 1s together, fit curve 

            # Degree-1 fit of the ensemble mean for this calendar day, using the
            # position within the group (0, 1, 2, ...) as the x axis.
            curve = polynomial.polyfit(np.arange(0,ens_group.shape[0]),ens_group,1) #fit a line to all Jan 1s and not full timeseries
            trend = polynomial.polyval(np.arange(0,ens_group.shape[0]),curve,tensor=True)  #don't assume trend is the same throughout seasonal cycle
            # polyval returns (z, time); swap to (time, z) to line up with Xgroup.
            trend = np.swapaxes(trend,0,1)
            # trend is a plain NumPy array, so this subtraction is positional.
            # NOTE(review): assumes Xgroup and ens_group have the same number of
            # time steps in the same order -- confirm for all input files.
            diff  = Xgroup - trend 
            anom.append(diff)

        # Reassemble the per-day pieces, restore lat/lon, and put time back in order.
        anom_xr_trend = xr.concat(anom,dim='time').unstack()
        anom_xr_trend = anom_xr_trend.sortby('time')

        X_nptime = np.array(anom_xr_trend.time)                 #need to convert to numpy for creating DataArray   
        X_nplat = np.array(anom_xr_trend.lat)
        X_nplon = np.array(anom_xr_trend.lon)

        # Copy into a fresh (time, lat, lon) DataArray with plain NumPy
        # coordinates and write this member's anomalies to its own file.
        detrend_anom_4write = xr.DataArray(np.zeros((anom_xr_trend.shape[0],anom_xr_trend.shape[1],anom_xr_trend.shape[2]))+np.nan,
                     dims = ['time','lat','lon'],
                     coords = [('time', X_nptime),('lat',X_nplat),('lon',X_nplon)]) 
        detrend_anom_4write[:,:,:] = anom_xr_trend[:,:,:]
        detrend_anom_4write.to_netcdf(ddir_out+outfile_singles)

        # Only the first len(training_ens) members are stored in anom_concat;
        # this relies on the training members coming first in anom_calc_members.
        if count < len(training_ens):
            anom_concat[count,:,:,:] = anom_xr_trend   
            count = count+1

    # anom_concat was allocated with len(training_ens) entries along 'ens',
    # so this slice keeps everything.
    anom_concat = anom_concat[:len(training_ens),:,:,:]
    outfile_concat = PREDICTOR_VAR+'_'+REGION_TOR+'_'+YEARS+'_'+'ens'+train_list+'_dailyanom_detrend.nc'
    anom_concat.to_netcdf(ddir_out+outfile_concat)
    
    # Drop loop temporaries before the next stage.
    del outfile_concat, anom, diff, trend, curve, Xgroup, temp, Xstacked, detrend_anom_4write


    # Running averages of the anomalies for the prediction problem.
    #### Only Needed for Predictand Variable
    # NOTE(review): anom_concat was built from the PREDICTOR_VAR / REGION_TOR
    # files above; confirm that this is also the intended predictand field.
    predictand_var = anom_concat
    del anom_concat

    # Spatial average over the region to predict: collapse latitude first,
    # then longitude, leaving an (ens, time) series.
    lat_avg = predictand_var.mean(dim='lat')
    box_avg = lat_avg.mean(dim='lon')
    del predictand_var
    
#    Check that data has 50% above median
    for ENS in range(len(training_ens)):
        # Plain NumPy copy of this member's box-average series; box_avg itself
        # is left untouched because it is used again further down.
        member_series = np.array(box_avg[ENS,:])
        member_median = np.median(member_series)

        # Count the values strictly above the median directly, rather than
        # overwriting a scratch copy with sentinel values and counting those.
        num_above_median = np.count_nonzero(member_series > member_median)

        # Use box_avg's own time axis: predictand_var has already been deleted
        # at this point in the loop body.
        perc_above_median = np.round((num_above_median/len(box_avg.time))*100,2)
        print('Ensemble '+str(ENS)+ ' has '+ str(perc_above_median) + '% above the median of '+str(np.round(member_median,3)))

    # Running mean over `days_average` days to make the series less noisy.
    # With center=False the first (days_average-1) time steps have no full
    # window and come out as NaN; they are dropped afterwards because the
    # network cannot handle NaNs. Remove the dropna call if a full-length
    # series is required.
    box_time_np = np.array(box_avg.time)

    weekly_run_avg = xr.DataArray(
        np.zeros((len(training_ens), box_avg.shape[1])),
        dims = ['ens','time'],
        coords = [('ens', np.arange(len(training_ens))), ('time', box_time_np)])

    # Fill the container one training member at a time.
    for member_idx in range(len(training_ens)):
        member_series = box_avg[member_idx,:]
        weekly_run_avg[member_idx,:] = member_series.rolling(time=days_average, center=False).mean()
    del member_series

    # Drop the leading NaN time steps for all members in one go.
    weekly_run_avg = weekly_run_avg.dropna(dim="time")

    del box_avg, lat_avg, box_time_np
    
#Check to see if averaging precip makes distribution more gaussian
# This check is important to keep just to make sure data is close to ~50%

    copy_week_avg = np.copy(weekly_run_avg)

    print("Check distribution about 0 for "+str(days_average)+"-day average")
    for ENS in range(0,len(training_ens)): 
        ind = (copy_week_avg[ENS,:]<=0)
        copy_week_avg[ENS,ind] = 0.0
        ind = (copy_week_avg[ENS,:]>0)
        copy_week_avg[ENS,ind] = 1.0

        num_above_zero = np.count_nonzero(copy_week_avg[ENS,:] == 1)
        #print(num_above_zero)
        perc_above_zero = np.round((num_above_zero/len(weekly_run_avg.time))*100,2)
        print('Ensemble '+str(ENS)+ ' has '+ str(perc_above_zero) + '% above 0.0 mm/day')

    outfile_concat = PREDICTAND_VAR+'_'+REGION_TAND+'_boxavg_'+YEARS+'_'+'ens'+train_list+'_dailyanom_detrend_'+str(days_average)+'dayavg.nc'
    weekly_run_avg.to_netcdf(ddir_out+outfile_concat)
    
    del weekly_run_avg, outfile_concat, copy_week_avg, ind, num_above_zero, perc_above_zero


    ## Compute Weekly Avg for the Validation and the Testing Ensemble Member
    # Both held-out members go through identical steps, so they share one loop:
    # read the member's saved daily anomalies, average over the box, take the
    # running mean, drop the leading NaNs and write the result.
    # NOTE(review): the per-member anomaly files above are written with
    # PREDICTOR_VAR / REGION_TOR in their names but are read here with
    # PREDICTAND_VAR / REGION_TAND -- confirm these match for every experiment.
    for held_out_ens in (validation_ens, testing_ens):
        anom_file = ddir_out+PREDICTAND_VAR+'_'+REGION_TAND+'_'+YEARS+'_ens'+str(held_out_ens)+'_dailyanom_detrend.nc'

        # The context manager closes the file once the box average is in memory.
        with xr.open_dataset(anom_file) as anom_ds:
            # Unnamed DataArrays are stored under this default variable name.
            predictand_var = anom_ds['__xarray_dataarray_variable__']

            #Need to spatially average over the region we want to predict
            # (latitude first, then longitude, leaving a time series).
            lat_avg = predictand_var.mean(dim='lat')
            box_avg = lat_avg.mean(dim='lon')
        del predictand_var

        # Running mean over days_average days; the first (days_average-1)
        # time steps have no full window and are dropped.
        weekly_run_avg = box_avg.rolling(time=days_average, center=False).mean()
        weekly_run_avg = weekly_run_avg.dropna(dim="time")

        outfile_concat = PREDICTAND_VAR+'_'+REGION_TAND+'_boxavg_'+YEARS+'_'+'ens'+str(held_out_ens)+'_dailyanom_detrend_'+str(days_average)+'dayavg.nc'
        weekly_run_avg.to_netcdf(ddir_out+outfile_concat)

        del weekly_run_avg, outfile_concat, anom_file

Experiment 100
[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8 9]


KeyboardInterrupt: 