In [1]:
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import random
import warnings 
warnings.filterwarnings("ignore")
import pickle
import math

In [31]:
class split_dataset:
    def __init__(self, configs, test_size, random):
        self.main_dir = configs['main_dir']
        self.year = configs['year']
        self.pattern_filename = configs['pattern_filename'][0]
        self.test_size = test_size
        self.random = random

    def random_split(list_elements, test_size, random=False):
        '''
        Input: 
            - list_elements : list of observations, datetime format is not required
            - test_size : (float) from 0.0 to 1.0, proportion of test dataset size required, generally test_size < 0.5 
            - random : if True, select randomly days from list days
        Output: 
            - train_days : list of training days from total days of dataset 
            - test_days : list of testing days from total days of dataset
        '''
        # get number of list training days 
        counts = len(list_elements)
        counts_train = np.int((1-test_size)*counts)
        li1 = np.arange(0, len(list_elements),1)
        
        if random:
            # set random index for training days 
            ind_train = np.random.choice(range(counts), counts_train, replace=False)
            
            # get index left for testing days 
            li2 = np.array(ind_train)
            dif1 = np.setdiff1d(li1, li2)
            dif2 = np.setdiff1d(li2, li1)
            ind_test = np.concatenate((dif1, dif2))

            # get training & testing days 
            train_list = np.array(list_elements)[ind_train]
            test_list = np.array(list_elements)[ind_test]
        else:
            train_list = list_elements[:counts_train]
            test_list = list_elements[counts_train:]
        return train_list, test_list

    def get_list_paths(self):
        ### pattern_filename = 'ipral_calib_03_*_000000_1440.nc'
        if len(self.year) > 1:
            print(self.year)
            list_paths = [sorted(Path(self.main_dir, yy).glob(self.pattern_filename)) for yy in self.year]
        else:
            list_paths = sorted(Path(input_dir, year).glob(pattern_filename))
        
        print(list_paths)
        train_paths, test_paths = split_dataset.random_split(list_paths, self.test_size, self.random)
        return train_paths, test_paths

    def get_list_days(self):
        train_paths, test_paths = split_dataset.get_list_paths(self)
        train_days = [path.stem.split("_")[3] for path in train_paths]
        test_days = [path.stem.split("_")[3] for path in test_paths]
        return train_days, test_days

In [32]:
import random
configs_ipral = {
    'main_dir' : Path('/homedata/nmpnguyen/NETCDF/v2/'),
    'pattern_filename' : ['ipral_calib_03_','_000000_1440.nc'],
    'year' : ['2018', '2019'],   
    'variables_name' : {
        'ATB' : 'Total_Calib_Attn_Backscatter', 
        'AMB' : 'Attn_Molecular_Backscatter', 
        'time' : 'time',
        'range' : 'range'
    }, 
    'instrument' : 'IPRAL',
    'output_dir' : Path('/homedata/nmpnguyen/IPRAL/learning_model_test/Products'),
    'random_version' : random.randint(0,1000)
}

In [33]:
dataset_split = split_dataset(configs_ipral, 0.3, True)

In [34]:
dataset_split.get_list_paths()

NameError: name 'yy' is not defined