### Author: Md Fahim Hasan
### Work Email: mdfahim.hasan@bayer.com

In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from glob import glob
import rasterio as rio
import geopandas as gpd
import dask.dataframe as ddf
from rasterio.mask import mask
from shapely.geometry import mapping
from rasterio.enums import MergeAlg, Resampling

## General functions

In [2]:
def read_raster_arr_object(raster_file, rasterio_obj=False, band=1, get_file=True, change_dtype=True):
    """
    Get raster array and (optionally) the opened raster file.

    :param raster_file: Input raster filepath, or an already opened rasterio dataset if rasterio_obj=True.
    :param rasterio_obj: Set True if raster_file is a rasterio object. In that case only the array is returned
                         and the dataset is left open for the caller.
    :param band: Selected band to read. Default set to 1.
    :param get_file: Set to False if raster file is not required. The dataset opened by this function is then
                     closed before returning.
    :param change_dtype: Set to True if want to change raster data type to float32 and replace the nodata value
                         with NaN. Default set to True.

    :return: Raster numpy array and rasterio object file (get_file=True, rasterio_obj=False),
             otherwise only the raster numpy array.
    """
    opened_here = not rasterio_obj
    if opened_here:
        raster_file = rio.open(raster_file)
    else:
        get_file = False

    # only close a dataset that was opened here and is not handed back to the caller
    close_after_read = opened_here and not get_file
    try:
        raster_arr = raster_file.read(band)
        nodata = raster_file.nodata
    finally:
        if close_after_read:
            raster_file.close()

    if change_dtype:
        raster_arr = raster_arr.astype(np.float32)
        # explicit None check so that a nodata value of 0 is also replaced with NaN
        if nodata is not None:
            raster_arr[np.isclose(raster_arr, nodata)] = np.nan

    if get_file:
        return raster_arr, raster_file
    return raster_arr
    

def write_array_to_raster(raster_arr, raster_file, transform, output_path, ref_file=None, nodata=-9999):
    """
    Write a 2D raster array to a single-band GeoTIFF.

    :param raster_arr: Raster array data to be written. Its first two dimensions give output height and width.
    :param raster_file: Opened rasterio raster file whose crs is used (ignored if ref_file is given).
    :param transform: Affine transformation matrix (ignored if ref_file is given).
    :param output_path: Output filepath.
    :param ref_file: Optional reference raster filepath. If given, crs and transform are taken from it instead of
                     raster_file/transform.
    :param nodata: no_data_value set as -9999.

    :return: Output filepath.
    """
    if ref_file:
        # read the georeferencing from the reference raster and release its handle right away
        with rio.open(ref_file) as ref_raster:
            crs = ref_raster.crs
            transform = ref_raster.transform
    else:
        crs = raster_file.crs

    with rio.open(
            output_path,
            'w',
            driver='GTiff',
            height=raster_arr.shape[0],
            width=raster_arr.shape[1],
            dtype=raster_arr.dtype,
            count=1,
            crs=crs,
            transform=transform,
            nodata=nodata
    ) as dst:
        dst.write(raster_arr, 1)

    return output_path
   
     
def make_lat_lon_array_from_raster(input_raster, nodata=-9999):
    """
    Make lon, lat arrays holding the centroid coordinate of each pixel of the input raster.

    params:
    input_raster : Input raster filepath that will be used as reference raster.
    nodata : No data value. Default set to -9999. Pixels whose band-1 value equals this are assigned
             this value in both output arrays.

    returns: (lon_arr, lat_arr) - x and y coordinate arrays (in the raster's CRS) with the same shape as the raster.
             Note the order: lon first, lat second.
    """
    # the dataset is only needed for the band-1 values and the transform; close it afterwards
    with rio.open(input_raster) as raster_file:
        raster_arr = raster_file.read(1)
        raster_transform = raster_file.transform

    # calculating x/y (lon/lat) of each cell's centroid
    height, width = raster_arr.shape
    cols, rows = np.meshgrid(np.arange(width), np.arange(height))
    xs, ys = rio.transform.xy(rows=rows, cols=cols, transform=raster_transform)

    # flattening and reshaping to the input_raster's array size
    lon_arr = np.array(xs).flatten().reshape(raster_arr.shape)
    lat_arr = np.array(ys).flatten().reshape(raster_arr.shape)

    # assigning no_data_value
    nodata_mask = raster_arr == nodata
    lon_arr[nodata_mask] = nodata
    lat_arr[nodata_mask] = nodata

    return lon_arr, lat_arr


def mask_raster_array_by_shapefile(input_raster, mask_shape, output_dir=None, raster_name=None, invert=False,
                                   crop=True, save_masked_arr=False):
    """
    Mask a raster using an input shapefile.

    Parameters:
    input_raster: Input raster filepath.
    mask_shape : Reference shape file to crop input_raster. Only the FIRST geometry of the shapefile is used.
    output_dir : Defaults to None. Set a output raster directory path if save_masked_arr is True.
    raster_name : Defaults to None. Set a output raster name if save_masked_arr is True.
    invert : If False (default) pixels outside shapes will be masked.
             If True, pixels inside shape will be masked.
    crop : Whether to crop the raster to the extent of the shapes. Set to False if invert=True is used.
    save_masked_arr : Set to true if want to save cropped/masked raster array. If True, must provide raster_name and
                      output_dir.

    returns : (masked raster array, masked raster filepath) if save_masked_arr is True,
              otherwise only the masked raster array.

    raises : ValueError if save_masked_arr is True but output_dir or raster_name is missing.
    """
    if save_masked_arr and (output_dir is None or raster_name is None):
        raise ValueError('output_dir and raster_name must be provided when save_masked_arr=True')

    # the raster values are read by mask() itself, so the dataset is only opened (and closed afterwards)
    with rio.open(input_raster) as input_file:
        shapefile = gpd.read_file(mask_shape)
        geoms = shapefile['geometry'].values  # array of shapely geometries
        geoms = [mapping(geoms[0])]  # NOTE(review): only the first geometry is used for masking

        # masking
        masked_arr, masked_transform = mask(dataset=input_file, shapes=geoms, filled=True, crop=crop,
                                            invert=invert, all_touched=False)
        masked_arr = masked_arr.squeeze()  # Remove axes of length 1 from the array

        # in case raster is not saved return only masked raster array
        if not save_masked_arr:
            return masked_arr

        # naming output file
        makedirs([output_dir])
        output_raster = os.path.join(output_dir, raster_name)

        # saving output raster while the input dataset (source of the crs) is still open
        masked_raster = write_array_to_raster(raster_arr=masked_arr, raster_file=input_file,
                                              transform=masked_transform, output_path=output_raster)

    return masked_arr, masked_raster

    
def resample_raster_based_on_ref_raster(input_raster, ref_raster, output_dir, raster_name, resampling_alg=Resampling.bilinear,
                                        paste_value_on_ref_raster=False):
    """
    Resample raster based on a reference raster.

    params:
    input_raster : Filepath of input raster to resample.
    ref_raster : Filepath of the reference raster. Its height/width/affine transformation/crs are used for the output.
    output_dir : Filepath of output directory. Created if it does not exist.
    raster_name : File name of the resampled output raster.
    resampling_alg : resampling algorithm. Can be Resampling.nearest/ Resampling.bilinear/Resampling.cubic or
                     any resampling algorithm rasterio supports. Default set to Resampling.bilinear.
    paste_value_on_ref_raster : Set to True if want to have nodata pixels on the resampled raster similar to reference raster.

    returns: The resampled output raster filepath. The output is written with nodata value -9999.
    """
    nodata_value = -9999

    makedirs([output_dir])

    ref_arr, ref_file = read_raster_arr_object(ref_raster)

    try:
        # target shape. use a reference raster (created using GIS for a specific region) to decide.
        resampled_height, resampled_width = ref_arr.shape

        with rio.open(input_raster) as dataset:
            # resample data to target shape
            resampled_arr = dataset.read(1,
                                         out_shape=(1,
                                                    resampled_height,
                                                    resampled_width),
                                         resampling=resampling_alg)

        resampled_arr = resampled_arr.squeeze()  # removing the 1 (for count) from the dimension

        if paste_value_on_ref_raster:
            # reference nodata pixels were converted to NaN when the reference raster was read
            resampled_arr = np.where(np.isnan(ref_arr), nodata_value, resampled_arr)

        # Saving the resampled data
        output_raster = os.path.join(output_dir, raster_name)
        write_array_to_raster(raster_arr=resampled_arr, raster_file=ref_file,
                              transform=ref_file.transform, output_path=output_raster,
                              ref_file=None, nodata=nodata_value)
    finally:
        # the reference dataset is only needed for its crs/transform; release the handle
        ref_file.close()

    return output_raster

## Soil-Elevation 4km ARD functions

In [None]:
def resample_soil_elevation_data_4km(dataset_list, output_folder, reference_raster, paste_value_on_ref_raster=True):
    """
    Resample each soil/elevation raster onto the grid of a reference raster (4km).

    :param dataset_list : A list of dataset file paths.
    :param output_folder : Filepath of output folder to save data.
    :param reference_raster : A reference raster filepath. This raster will be used as resampling reference.
                              Can use one of the finalized weather datasets.
    :param paste_value_on_ref_raster : Set to True if want to have nodata pixels on the resampled raster similar to reference raster.

    returns: None. The resampled rasters are saved in output_folder.
    """
    # file names containing one of these keywords are saved under a fixed name;
    # 'elevation' is checked before 'slope', any other raster keeps its original file name
    fixed_names = {'elevation': 'elevation.tif', 'slope': 'slope.tif'}

    for raster_fp in dataset_list:
        out_name = os.path.basename(raster_fp)
        for keyword, fixed_name in fixed_names.items():
            if keyword in out_name:
                out_name = fixed_name
                break

        # bilinear resampling onto the reference raster's grid
        resample_raster_based_on_ref_raster(input_raster=raster_fp,
                                            ref_raster=reference_raster,
                                            output_dir=output_folder,
                                            raster_name=out_name,
                                            resampling_alg=Resampling.bilinear,
                                            paste_value_on_ref_raster=paste_value_on_ref_raster)
        
                
def generate_soil_elevation_ARD(input_data_directory, output_folder, savename):
    """
    Compile soil-elevation datasets to a dataframe. The dataframe will be regarded as an Analytical Ready Dataset (ARD).

    Each '*.tif' in input_data_directory becomes one column (named after the file name up to the first '.').
    'lat' and 'lon' columns are derived from the last raster in the list. Rows with any NaN are dropped.

    params:
    input_data_directory : (str) Filepath of input datasets' directory.
    output_folder : (str) Filepath of output directory/folder. Created if it does not exist.
    savename : (str) Name of the output parquet file. '.parquet' is appended if not already in the name.

    returns: compiled soil and elevation ARD dataframe (also saved as a parquet file in output_folder).
    """
    makedirs([output_folder])

    datasets = glob(os.path.join(input_data_directory, '*.tif'))

    # a dictionary where flattened raster values will be stored under variable_name
    variable_dict = {}

    for data in datasets:
        variable_name = os.path.basename(data).split('.')[0]  # extracted variable name

        print(f'compiling data for {variable_name}...')

        data_arr = read_raster_arr_object(data, get_file=False).flatten()  # read data as array and flattened it

        # storing flattened data in a dictionary under the variable name
        variable_dict[variable_name] = list(data_arr)

    # adding lat/lon info once, using the last dataset as the reference grid
    if datasets:
        lon_arr, lat_arr = make_lat_lon_array_from_raster(datasets[-1], nodata=-9999)
        variable_dict['lat'] = list(lat_arr.flatten())
        variable_dict['lon'] = list(lon_arr.flatten())

    variable_df = pd.DataFrame(variable_dict)
    variable_df = variable_df.dropna()

    # saving data
    if '.parquet' in savename:
        output_parquet_file = os.path.join(output_folder, savename)
    else:
        output_parquet_file = os.path.join(output_folder, f'{savename}.parquet')

    variable_df.to_parquet(output_parquet_file, index=False)

    return variable_df
    

## Soil-Elevation 100m ARD functions

In [7]:
def mask_resample_soil_elevation_datasets_to_shapefile(input_datasets_fp, output_dir, masking_shapefile, target_raster,
                                                       paste_value_on_target_raster=True):
    """
    Mask each soil/elevation raster with a shapefile, then resample it onto the grid of the target raster (100m).

    params:
    input_datasets_fp : A list of input dataset filepaths.
    output_dir : Output data directory. Resampled rasters are saved here and the interim masked rasters
                 in its 'masked' sub-folder.
    masking_shapefile : Filepath of shapefile to mask the data with.
    target_raster : 100m ref raster. Should be a satellite raster of 100m resolution that is inside the shapefile.
    paste_value_on_target_raster : Set to True if want to have nodata pixels on the resampled raster similar to target raster.

    returns: None.
    """
    for dataset_fp in input_datasets_fp:
        # file name up to the first '.'; collapsed to 'elevation'/'slope' when it contains that keyword
        # ('elevation' is checked first)
        file_stem = os.path.basename(dataset_fp).split('.')[0]
        variable_name = next((keyword for keyword in ('elevation', 'slope') if keyword in file_stem), file_stem)

        print(f'Masking > Resampling data for {variable_name}...')

        # interim masked rasters go to a 'masked' folder inside the output directory
        interim_masked_dir = os.path.join(output_dir, 'masked')
        makedirs([output_dir, interim_masked_dir])

        raster_name = f'{variable_name}.tif'

        # Step 1: mask (and crop) with the shapefile, saving the result in the interim folder
        mask_raster_array_by_shapefile(input_raster=dataset_fp,
                                       mask_shape=masking_shapefile,
                                       output_dir=interim_masked_dir,
                                       raster_name=raster_name,
                                       invert=False,
                                       crop=True,
                                       save_masked_arr=True)

        # Step 2: resample the masked raster so that its pixels align with the target_raster's pixels
        resample_raster_based_on_ref_raster(input_raster=os.path.join(interim_masked_dir, raster_name),
                                            ref_raster=target_raster,
                                            output_dir=output_dir,
                                            raster_name=raster_name,
                                            paste_value_on_ref_raster=paste_value_on_target_raster)

## Weather 4km ARD functions

In [4]:
def create_weather_4km_preARD_dataframes(data_directories_list, output_folder, savename,
                                         dataset_in_each_chunk=3):
    """
    Compile 4km weather datasets to multiple preARD dataframes.
    Weather datasets have extended daily records and will take high memory to compile into single dataframe at once; therefore,
    compiling weather datasets into multiple dataframes (referring to as preARD). These preARD dataframes will be combined into
    a single dataframe and averaged to form the 4km weather ARD in a later step.

    params:
    data_directories_list : (list) List of str of all weather data (model-interpolated/resampled) directories.
                            Daily rasters ('*.tif') are read from each directory; the date is taken from the part of the
                            file name after the last '_'.
    output_folder : (str) Main output folder. Each chunk is saved as '<output_folder>/<savename>/<savename>_<chunk no>.parquet'.
    savename : (str) Name of the output parquet file.
    dataset_in_each_chunk : (int) Number of variables (directories) compiled together into one preARD. Default set to 3.

    returns: None.
    """
    makedirs([output_folder])

    # All datasets except avg_temp, min_Rhumid, max_Rhumid, TotNetSR have modeled and TWC data mixed (TWC data mixed from 2015)
    # The number of days in avg_temp, min_Rhumid, max_Rhumid, TotNetSR are 7809, while in other datasets it's 7821
    # creating a separate data_chunk for avg_temp, min_Rhumid, max_Rhumid, TotNetSR so that they can be saved as separate preARD
    # NOTE(review): the directory name matched below is 'TotNet_SR' (with underscore) - confirm against the data folders
    separated_names = ['avg_temp', 'min_Rhumid', 'max_Rhumid', 'TotNet_SR']
    separated_datasets = [i for i in data_directories_list if os.path.basename(i) in separated_names]

    # Removing the separated datasets from data directories list
    data_directories_list = [i for i in data_directories_list if i not in separated_datasets]

    # creating a list of list of directories. Each individual list will be processed as a chunk and saved as individual parquet
    data_chunks = [data_directories_list[x:x + dataset_in_each_chunk]
                   for x in range(0, len(data_directories_list), dataset_in_each_chunk)]

    # creating a separate chunk for avg_temp, min_Rhumid, max_Rhumid, TotNetSR
    data_chunks.append(separated_datasets)

    # an empty chunk (e.g. none of the separated variables were provided) has nothing to compile
    data_chunks = [chunk for chunk in data_chunks if chunk]

    for num_chunk, datasets in enumerate(data_chunks):
        # a dictionary where daily dataset values will be stored under variable_name
        variable_dict = {}

        for data_dir in datasets:
            # sorted by file name (date) so that all variables are compiled in same serial
            all_data = sorted(glob(os.path.join(data_dir, '*.tif')))

            variable_name = os.path.basename(data_dir).split('.')[0]  # extracted variable name

            print(f'compiling data for {variable_name}...')

            # lat/lon info is only compiled once per chunk, from the rasters of the last directory
            add_lat_lon = data_dir == datasets[-1]

            values, dates, lats, lons = [], [], [], []

            # loop for reading datasets and storing pixel info
            for data in all_data:
                data_arr = read_raster_arr_object(data, get_file=False).flatten()  # read data as array and flattened it

                # extracting date info; repeated once per pixel of the daily dataset
                date = os.path.basename(data).split('.')[0].split('_')[-1]

                values.extend(list(data_arr))
                dates.extend([int(date)] * len(data_arr))

                if add_lat_lon:
                    lon_arr, lat_arr = make_lat_lon_array_from_raster(data, nodata=-9999)
                    lats.extend(list(lat_arr.flatten()))
                    lons.extend(list(lon_arr.flatten()))

            # a directory without rasters contributes no column
            if all_data:
                variable_dict[variable_name] = values
                variable_dict['date'] = dates  # rebuilt per variable; the last variable's dates are kept
                if add_lat_lon:
                    variable_dict['lat'] = lats
                    variable_dict['lon'] = lons

        variable_df = pd.DataFrame(variable_dict)
        variable_ddf = ddf.from_pandas(variable_df, npartitions=10)
        variable_ddf = variable_ddf.dropna()
        variable_ddf = variable_ddf.reset_index()

        # saving data
        output_parquet_file = os.path.join(output_folder, savename, f'{savename}_{num_chunk}.parquet')
        variable_ddf.to_parquet(output_parquet_file, write_index=False)

        
def compile_preARD_multiDF_to_ARD(parquet_folder, output_folder, savename):
    """
    Compile multiple preARD dataframes (parquet files) into a single ARD dataframe.

    The preARD dataframes are merged on ['date', 'lat', 'lon'] (inner join) and then averaged for each
    (month_day, lat, lon) combination, where month_day is 'MM_DD' taken from the date (assumed to be in YYYYMMDD form).

    params:
    parquet_folder : Filepath of folder where multiple parquet files (dataframes) are saved.
    output_folder : Filepath of output folder where the single parquet file (dataframe) will be saved.
                    Created if it does not exist.
    savename : (str) Name of the output parquet file. '.parquet' is appended if not already in the name.

    returns: Compiled single dataframe.

    raises: FileNotFoundError if parquet_folder contains no '*.parquet' entries.
    """
    parquet_files = glob(os.path.join(parquet_folder, '*.parquet'))
    if not parquet_files:
        raise FileNotFoundError(f'No parquet files found in {parquet_folder}')

    compiled_df = None
    for parq in parquet_files:
        df = pd.read_parquet(parq)
        # the 'index' column only exists if the index was reset before saving; nothing to drop otherwise
        df = df.drop(columns=['index'], errors='ignore')
        if compiled_df is None:
            compiled_df = df
        else:
            compiled_df = compiled_df.merge(df, on=['date', 'lat', 'lon'])

    # 'MM_DD' from the date's string form (characters 4-5 and 6 onwards)
    date_str = compiled_df['date'].astype(str)
    compiled_df['month_day'] = date_str.str[4:6] + '_' + date_str.str[6:]

    # Averaging dataframe for month_day to generate the final ARD
    compiled_df = compiled_df.drop(columns=['date'])
    # the grouping columns themselves are not averaged
    columns_to_compile = [col for col in compiled_df.columns if col not in ['month_day', 'lat', 'lon']]

    compiled_df = compiled_df.groupby(['month_day', 'lat', 'lon'])[columns_to_compile].mean().reset_index()

    # saving the ARD
    if '.parquet' not in savename:
        savename = savename + '.parquet'

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_parquet = os.path.join(output_folder, savename)
    compiled_df.to_parquet(output_parquet, index=False)

    return compiled_df

## Weather 100m ARD functions

In [None]:
def resample_mask_resample_weather_datasets_to_shapefile(input_data_dir_list, output_main_dir, masking_shapefile,
                                                         target_raster_whole_aoi, target_raster_smaller_aoi):
    """
    Performs resample > mask > resample operations on weather data to take it from low resolution (4km) to
    high resolution (100m) for a specified smaller AOI.

    params:
    input_data_dir_list : List of input dataset directory filepaths. All '*.tif' rasters in each directory are processed.
    output_main_dir : Output data main directory where masked and resampled data will be saved.
                      A sub-directory named after each input directory is used per variable.
    masking_shapefile : Filepath of shapefile to mask the data with.
    target_raster_whole_aoi : 100m ref raster for a larger AOI (similar to input data extent).
    target_raster_smaller_aoi : 100m ref raster for the smaller AOI for which data will be masked and resampled.

    returns: None.
    """
    for data_dir in input_data_dir_list:
        variable_name = os.path.basename(data_dir)
        print(f'Resampling > Masking > Resampling data for {variable_name}...')

        rasters_to_process = glob(os.path.join(data_dir, '*.tif'))

        # final rasters go to the variable's folder; the two interim products to sub-folders inside it
        final_dir = os.path.join(output_main_dir, variable_name)
        first_resample_dir = os.path.join(output_main_dir, variable_name, 'resampled')
        masked_dir = os.path.join(output_main_dir, variable_name, 'masked')
        makedirs([final_dir, first_resample_dir, masked_dir])

        for raster_fp in rasters_to_process:
            # output name: file name up to the first '.', with a '.tif' extension
            out_name = os.path.basename(raster_fp).partition('.')[0] + '.tif'

            # Step 1: resample to 100m on the grid of the whole-AOI target raster
            whole_aoi_fp = resample_raster_based_on_ref_raster(input_raster=raster_fp,
                                                               ref_raster=target_raster_whole_aoi,
                                                               output_dir=first_resample_dir,
                                                               raster_name=out_name)

            # Step 2: mask (and crop) the 100m raster with the smaller-AOI shapefile
            _, masked_fp = mask_raster_array_by_shapefile(input_raster=whole_aoi_fp,
                                                          mask_shape=masking_shapefile,
                                                          output_dir=masked_dir,
                                                          raster_name=out_name,
                                                          invert=False,
                                                          crop=True,
                                                          save_masked_arr=True)

            # Step 3: resample again so that the pixels align with target_raster_smaller_aoi's pixels
            resample_raster_based_on_ref_raster(input_raster=masked_fp,
                                                ref_raster=target_raster_smaller_aoi,
                                                output_dir=final_dir,
                                                raster_name=out_name)

            # the whole-AOI 100m raster is only an interim product; delete it
            os.remove(whole_aoi_fp)
            
            
def create_weather_satellite_100m_preARD_dataframes(weather_data_directories_list, satellite_data_directories_list,
                                                    output_folder, savename, dataset_in_each_chunk=3):
    """
    Compile 100m weather and satellite datasets to multiple preARD dataframes.
    Weather datasets have extended daily records and will take high memory to compile into single dataframe at once; therefore,
    compiling weather datasets into multiple dataframes (referring to as preARD). These preARD dataframes will be combined into
    a single dataframe and averaged to form the weather ARD in a later step.

    *** satellite datasets will be given input as a separate list because it doesn't have the same daily coverage as the weather
    data and can cause unequal array issue while being compiled with the weather datasets.

    params:
    weather_data_directories_list : List of filepaths of weather data directories.
    satellite_data_directories_list : List of filepaths of satellite data directories. Compiled as its own chunk.
    output_folder : Filepath (str) of output folder. Each chunk is saved as
                    '<output_folder>/<savename>/<savename>_<chunk no>.parquet'.
    savename : (str) Name of the output parquet file.
    dataset_in_each_chunk : (int) Number of weather variables (directories) compiled together into one preARD.
                            Default set to 3.

    returns: None.
    """
    makedirs([output_folder])

    # All datasets except avg_temp, min_Rhumid, max_Rhumid, TotNetSR have modeled and TWC data mixed (TWC data mixed from 2015)
    # The number of days in avg_temp, min_Rhumid, max_Rhumid, TotNetSR are 7809, while in other datasets it's 7821
    # creating a separate data_chunk for avg_temp, min_Rhumid, max_Rhumid, TotNetSR so that they can be saved as separate preARD
    # NOTE(review): the directory name matched below is 'TotNet_SR' (with underscore) - confirm against the data folders
    separated_names = ['avg_temp', 'min_Rhumid', 'max_Rhumid', 'TotNet_SR']
    separated_datasets = [i for i in weather_data_directories_list if os.path.basename(i) in separated_names]

    # Removing the separated datasets from data directories list
    weather_data_directories_list = [i for i in weather_data_directories_list if i not in separated_datasets]

    # creating a list of list of directories. Each individual list will be processed as a chunk and saved as individual parquet
    data_chunks = [weather_data_directories_list[x:x + dataset_in_each_chunk]
                   for x in range(0, len(weather_data_directories_list), dataset_in_each_chunk)]

    # creating a separate chunk for avg_temp, min_Rhumid, max_Rhumid, TotNetSR
    data_chunks.append(separated_datasets)

    # Adding satellite_data_directories_list as a separate data_chunk
    data_chunks.append(satellite_data_directories_list)

    # an empty chunk (e.g. no separated variables or no satellite data provided) has nothing to compile
    data_chunks = [chunk for chunk in data_chunks if chunk]

    for num_chunk, datasets in enumerate(data_chunks):
        # a dictionary where daily dataset values will be stored under variable_name
        variable_dict = {}

        for data_dir in datasets:
            # sorted by file name (date) so that all variables are compiled in same serial
            all_data = sorted(glob(os.path.join(data_dir, '*.tif')))

            variable_name = os.path.basename(data_dir).split('.')[0]  # extracted variable name

            print(f'compiling data for {variable_name}...')

            # lat/lon info is only compiled once per chunk, from the rasters of the last directory
            add_lat_lon = data_dir == datasets[-1]

            values, dates, lats, lons = [], [], [], []

            # loop for reading datasets and storing pixel info
            for data in all_data:
                data_arr = read_raster_arr_object(data, get_file=False).flatten()  # read data as array and flattened it

                # extracting date info; repeated once per pixel of the daily dataset
                date = os.path.basename(data).split('.')[0].split('_')[-1]

                values.extend(list(data_arr))
                dates.extend([int(date)] * len(data_arr))

                if add_lat_lon:
                    lon_arr, lat_arr = make_lat_lon_array_from_raster(data, nodata=-9999)
                    lats.extend(list(lat_arr.flatten()))
                    lons.extend(list(lon_arr.flatten()))

            # a directory without rasters contributes no column
            if all_data:
                variable_dict[variable_name] = values
                variable_dict['date'] = dates  # rebuilt per variable; the last variable's dates are kept
                if add_lat_lon:
                    variable_dict['lat'] = lats
                    variable_dict['lon'] = lons

        variable_df = pd.DataFrame(variable_dict)
        variable_ddf = ddf.from_pandas(variable_df, npartitions=10)
        variable_ddf = variable_ddf.dropna()
        variable_ddf = variable_ddf.reset_index()

        # saving data
        output_parquet_file = os.path.join(output_folder, savename, f'{savename}_{num_chunk}.parquet')
        variable_ddf.to_parquet(output_parquet_file, write_index=False)

## Miscellaneous

In [5]:
def copy_file(input_dir_file, copy_dir, search_by='*.tif', rename=None):
    """
    Copy a file, or all matching files of a directory, to the specified directory.

    input_dir_file is treated as a single file if its path contains '.tif', otherwise as a directory
    whose files matching search_by are copied.

    :param input_dir_file: File path of input directory/ Path of the file to copy.
    :param copy_dir: File path of copy directory. Created if it does not exist.
    :param search_by: Pattern used to select files when a directory is copied. Default set to '*.tif'.
    :param rename: New name of file (without extension) if required. Default set to None.
                   Doesn't work if a directory is being copied.

    :returns: File path of copied file. For a directory, the path of the last copied file,
              or None if no file matched search_by.
    """
    os.makedirs(copy_dir, exist_ok=True)

    # single file
    if '.tif' in input_dir_file:
        if rename is not None:
            target_name = f'{rename}.tif'
        else:
            target_name = os.path.basename(input_dir_file)

        copied_fp = os.path.join(copy_dir, target_name)
        shutil.copyfile(input_dir_file, copied_fp)
        return copied_fp

    # directory: stays None when nothing matches, instead of failing on an unassigned variable
    copied_fp = None
    for each in glob(os.path.join(input_dir_file, search_by)):
        copied_fp = os.path.join(copy_dir, os.path.basename(each))
        shutil.copyfile(each, copied_fp)

    return copied_fp


def makedirs(directory_list):
    """
    Create every directory of the list that does not exist yet (including missing parent directories).

    :param directory_list: A list of directories to create.

    :returns: None.
    """
    for dir_path in directory_list:
        # existing paths are left untouched
        if os.path.exists(dir_path):
            continue
        os.makedirs(dir_path)