# In [None]:
import glob #---------------------------------------------------- To read the files or folders in a system directory
import re #------------------------------------------------------ To replace characters in a string
import numpy    as np
import datetime
import warnings #------------------------------------------------ To suppress warnings
import h5py #---------------------------------------------------- To read hdf5 files
import pandas   as pd
import time as TTT
from   scipy                         import spatial #-------------------------------------- To extract the values and indices of k nearest neighbors
from   ast                           import literal_eval #----------------------------------- For literal evaluation of a string to extract python objects
from   pyproj                        import Proj, transform #----------------------------- To interconvert different projections
from   netCDF4                       import Dataset #------------------------------------ To read nc , nc4 and hdf4 files
from   photutils.utils               import ShepardIDWInterpolator as idw #------ To use Shepard's Inverse Distance Weighing Interpolation tool
from   IPython.core.interactiveshell import InteractiveShell  #---- Output all jupyter lab inputs instead of the last one
from   IPython.display               import Markdown, display

# Notebook/session configuration.
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Suppress all warnings for the rest of the session.
warnings.simplefilter('ignore')
def printmd(string):
    """Show *string* in the notebook output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

### Define Non-Iterative Functions and Variables

# Projection pair used by extract_pixel_coordinates() to convert tile x/y into lon/lat.
# NOTE(review): the ``init='epsg:...'`` form is flagged as deprecated by newer pyproj
# releases -- confirm against the installed version before upgrading.
in_proj  = Proj('+proj=sinu +R=6371007.181 +nadgrids=@null +wktext') #------ Input projection: sinusoidal, sphere radius 6371007.181
out_proj = Proj(init='epsg:4326') #----------------------------------------- Output projection: EPSG:4326


def Times(x): #------------------------------------------ Build the HH:MM:SSSS time string from a sounding ID
    """Return the time part of sounding ID *x* formatted as ``HH:MM:SSSS``.

    The ID is converted to text, its first eight characters are discarded,
    and the remainder is split into two 2-character fields plus whatever is
    left, joined with colons.
    """
    clock = str(x)[8:]
    fields = (clock[:2], clock[2:4], clock[4:])
    return ':'.join(fields)

hours = [0, 3000000, 6000000, 9000000, 12000000, 15000000, 18000000, 21000000, 23595900]
def f(x): #--------------------------------------------- Find the interval of `hours` that contains the sif time x (an HHMMSSSS integer)
    """Return the ``(lower, upper)`` pair of consecutive ``hours`` entries bracketing *x*.

    *x* is a time of day written as an integer in the same digit layout as
    the entries of ``hours`` (e.g. ``12345678`` for ``12:34:5678``).

    Intervals are treated as half-open ``[lower, upper)``, so a value lying
    exactly on a boundary belongs to the interval it starts.  Values from the
    last entry of ``hours`` up to the end of the day are assigned to the last
    interval; previously these (and exact boundary values) matched nothing
    and ended in an ``IndexError``.

    Raises:
        ValueError: if *x* is negative, not below 24000000, or NaN.
    """
    end_of_day = 24000000
    if not (hours[0] <= x < end_of_day):
        raise ValueError('time {!r} is outside the range [0, {})'.format(x, end_of_day))
    for lb, ub in zip(hours[:-1], hours[1:]):
        if lb <= x < ub:
            return lb, ub
    # x lies between the last entry of `hours` and the end of the day
    return hours[-2], hours[-1]

def format_time(t): #----------------------------------- Trim a raw HH:MM:SSSSSSSS time down to HH:MM:SSSS
    """Return *t* with its last four elements removed."""
    return t[:-4]

def nn(latitude_list,longitude_list,target): #---------- Index of the nearest neighbor of target (sum of absolute differences)
    """Return the index of the (latitude, longitude) pair closest to *target*.

    *target* is indexed as ``(longitude, latitude)``.  Closeness is the sum
    of the absolute latitude and longitude differences; the two input
    sequences are walked in lockstep.
    """
    lon0 = target[0]
    lat0 = target[1]
    distances = []
    for lat, lon in zip(latitude_list, longitude_list):
        distances.append(abs(lat - lat0) + abs(lon - lon0))
    return np.argmin(distances)

# Tile bounds table consumed by tile_finder(): column 0 = vertical tile number,
# column 1 = horizontal tile number, columns 2-3 = lower/upper longitude bound,
# columns 4-5 = lower/upper latitude bound.
data = np.genfromtxt('sn_bound_10deg.txt', skip_header = 7, skip_footer = 3)
def tile_finder(Lat,Lon,bounds=None): #----------------- Find modis tile numbers in which the argument lat,lon lies
    """Return the ``(H, V)`` tile numbers of the first tile whose bounds contain the point.

    Args:
        Lat: latitude of the point.
        Lon: longitude of the point.
        bounds: optional 2-D table laid out like the module-level ``data``
            (col 0 = vertical tile no., col 1 = horizontal tile no.,
            cols 2-3 = longitude min/max, cols 4-5 = latitude min/max).
            Defaults to ``data``.

    Returns:
        Tuple ``(H, V)`` of zero-padded two-character strings.

    Raises:
        IndexError: if no row of the table contains the point.  The exception
            type matches what the previous unbounded scan raised when it ran
            off the end of the table; only the message is now explicit.
    """
    table = data if bounds is None else bounds
    for row in table:
        # bounds are inclusive on both sides; the first matching row wins
        if row[4] <= Lat <= row[5] and row[2] <= Lon <= row[3]:
            V = str(int(row[0])).zfill(2)
            H = str(int(row[1])).zfill(2)
            return H, V
    raise IndexError('no tile in the bounds table contains lat={}, lon={}'.format(Lat, Lon))

def extract_pixel_coordinates(ULx,Uly,LRx,LRy,shape):
    """Return the longitude and latitude of every pixel centre of a tile.

    Args:
        ULx, ULy: projected x/y of the tile's upper-left corner (``in_proj`` units).
        LRx, LRy: projected x/y of the tile's lower-right corner.
        shape: ``(n_rows, n_cols)`` of the tile's data array.

    Returns:
        ``(plon, plat)``: flattened coordinates, one entry per pixel, ordered
        like ``array.flatten()`` of an ``(n_rows, n_cols)`` data array.

    The x axis is sized with ``shape[1]`` (columns) and the y axis with
    ``shape[0]`` (rows); previously both used ``shape[0]``, which is only
    correct for square tiles.
    """
    n_rows, n_cols = shape[0], shape[1]
    # linspace(..., endpoint=False) gives the leading edge of each pixel; shift by
    # half a pixel to reach the centre. The signs assume x grows and y shrinks
    # from the upper-left to the lower-right corner.
    x        = np.linspace(ULx, LRx, n_cols, endpoint=False) + abs((ULx-LRx)/(2*n_cols))
    y        = np.linspace(ULy, LRy, n_rows, endpoint=False) - abs((ULy-LRy)/(2*n_rows))
    xx, yy   = np.meshgrid(x,y) # both have shape (n_rows, n_cols)
    xs       = xx.flatten()
    ys       = yy.flatten()
    plon, plat = transform(in_proj, out_proj, xs, ys)
    return plon, plat

def temporal_interpolation(time1,val1,time2,val2,timeX):
    """Linearly interpolate between two timed values and return the value at *timeX*.

    Args:
        time1, time2: times of the two known values, in any form
            ``pd.to_datetime`` accepts (the callers pass ``'HH:MM:SS'``).
        val1, val2: values at *time1* and *time2*.
        timeX: time at which the value is wanted; it must fall on the
            one-second grid between *time1* and *time2*.

    Returns:
        The interpolated value at *timeX*.

    Raises:
        KeyError: if *timeX* is not one of the grid points.

    The one-second grid is requested with a ``Timedelta`` instead of the
    ``'S'`` alias, which newer pandas releases deprecate, and *timeX* is
    parsed with ``pd.to_datetime`` (the same parser used for the end points)
    rather than being passed as a raw string label, so the lookup does not
    depend on how a time-only string label is resolved by the index.
    """
    endpoints = pd.Series([val1, val2], index=pd.to_datetime([time1, time2]))
    # Upsample to one value per second and fill the gaps linearly
    per_second = endpoints.resample(pd.Timedelta(seconds=1)).interpolate(method='linear')
    return per_second.loc[pd.to_datetime(timeX)]

# glob.glob() returns paths in arbitrary order. The processing loop reads
# par_folder_list[folder_number+1] as "the next day's folder", so the folder
# lists must be sorted; the file lists are sorted too for a reproducible run order.
fpar_folder_list  = sorted(glob.glob('MCD15A3H/*'))
par_folder_list   = sorted(glob.glob('MCD18A2/*' ))
ref_folder_list   = sorted(glob.glob('MCD43A4/*' ))

sif_file_list     = sorted(glob.glob('OCO2_sif/*.nc4'))    #------------------------------------- List of all OCO2 files
calipso_file_list = sorted(glob.glob('OCO2_calipso/*.h5')) #------------------------------------- List of all OCO2-CALIPSO files

data              = np.genfromtxt('sn_bound_10deg.txt', skip_header = 7, skip_footer = 3) #------ File having tile numbers and IDs

for sif_file in sif_file_list: #---------------------------------------------------------------------------Read one sif file at a time
    TTT1 = TTT.time()
    
    sif_date        = datetime.datetime.strptime(sif_file.split('_')[3], '%y%m%d').strftime("%Y-%m-%d") #-----Extract sif date
    sif_julian_day  = datetime.datetime.strptime(sif_file.split('_')[3], '%y%m%d').strftime("%j") #-----------Extract sif julian day
    sif             = Dataset(sif_file, mode='r') #-----------------------------------------------------------Open sif file
    calipso_df_list = [] #------------------------------------------------------------------------------------Create an empty list
    for calipso_file in calipso_file_list: #------------------------------------------------------------------Loop through all calipso files
        calipso_date      = datetime.datetime.strptime(calipso_file.split('_')[5], '%y%m%d').strftime("%Y-%m-%d")
        if calipso_date  == sif_date: #----------------------------------------------------------------------------If calipso date matches sif date,
            calipso       = h5py.File(calipso_file, mode='r') #----------------------------------------------------open the calipso file
            calipso_ID    = calipso['OCO2_sounding_id'                                           ][:]
            calipso_dist  = calipso['matchup_distance_km'                                        ][:]
            calipso_index = calipso['matchup_Xindex'                                             ][:]
            calipso_dfs   = pd.DataFrame({'sounding_id':calipso_ID.flatten(),'Xindex':calipso_index.flatten(),'Xdistance':calipso_dist.flatten()}) #-----Create dataframe with variables
            calipso_dfs[calipso_dfs.Xindex==-999.0] = np.nan #------- Replace missing values with nan
            calipso_dfs.dropna(inplace=True) #----------------------- Drop missing values of Xindex
            calipso_dfs[calipso_dfs.Xdistance>=2.0] = np.nan
            calipso_dfs.dropna(inplace=True)
            calipso_df_list.append(calipso_dfs) #-------------------- Add all calipso dataframes into a list
    calipso_df                     = pd.concat(calipso_df_list, ignore_index = True).drop_duplicates() #---------Create a final calipso dataframe for a day

    cloud_albedo                   = sif.groups['Cloud'].variables['albedo'                 ][:].flatten() #--------Read sif variables and flatten them
    cloud_flag                     = sif.groups['Cloud'].variables['cloud_flag'             ][:].flatten()
    cloud_co2_ratio                = sif.groups['Cloud'].variables['co2_ratio'              ][:].flatten()
    cloud_delta_surface_pressure   = sif.groups['Cloud'].variables['delta_surface_pressure' ][:].flatten()
    cloud_o2_ratio                 = sif.groups['Cloud'].variables['o2_ratio'               ][:].flatten()
    vapor_pressure_deficit         = sif.groups['Meteo'].variables['vapor_pressure_deficit' ][:].flatten()
    temperature_2m                 = sif.groups['Meteo'].variables['2m_temperature'         ][:].flatten()
    temperature_skin               = sif.groups['Meteo'].variables['skin_temperature'       ][:].flatten()
    specific_humidity              = sif.groups['Meteo'].variables['specific_humidity'      ][:].flatten()
    surface_pressure               = sif.groups['Meteo'].variables['surface_pressure'       ][:].flatten()
    wind_speed                     = sif.groups['Meteo'].variables['wind_speed'             ][:].flatten()
    continuum_radiance_757nm       = sif.variables         ['continuum_radiance_757nm'      ][:].flatten()
    continuum_radiance_771nm       = sif.variables         ['continuum_radiance_771nm'      ][:].flatten()
    daily_correction_factor        = sif.variables         ['daily_correction_factor'       ][:].flatten()
    footprint                      = sif.variables         ['footprint'                     ][:].flatten()
    IGBP_index                     = sif.variables         ['IGBP_index'                    ][:].flatten()
    latitude                       = sif.variables         ['latitude'                      ][:].flatten()
    longitude                      = sif.variables         ['longitude'                     ][:].flatten()
    measurement_mode               = sif.variables         ['measurement_mode'              ][:].flatten()
    orbit_number                   = sif.variables         ['orbit_number'                  ][:].flatten()
    reduced_chi2_757nm             = sif.variables         ['reduced_chi2_757nm'            ][:].flatten()
    reduced_chi2_771nm             = sif.variables         ['reduced_chi2_771nm'            ][:].flatten()
    sensor_azimuth_angle           = sif.variables         ['sensor_azimuth_angle'          ][:].flatten()
    sensor_zenith_angle            = sif.variables         ['sensor_zenith_angle'           ][:].flatten()
    SIF_757nm                      = sif.variables         ['SIF_757nm'                     ][:].flatten()
    SIF_757nm_relative             = sif.variables         ['SIF_757nm_relative'            ][:].flatten()
    SIF_757nm_uncert               = sif.variables         ['SIF_757nm_uncert'              ][:].flatten()
    SIF_771nm                      = sif.variables         ['SIF_771nm'                     ][:].flatten()
    SIF_771nm_relative             = sif.variables         ['SIF_771nm_relative'            ][:].flatten()
    SIF_771nm_uncert               = sif.variables         ['SIF_771nm_uncert'              ][:].flatten()
    solar_azimuth_angle            = sif.variables         ['solar_azimuth_angle'           ][:].flatten()
    solar_zenith_angle             = sif.variables         ['solar_zenith_angle'            ][:].flatten()
    sounding_id                    = sif.variables         ['sounding_id'                   ][:].flatten()
    surface_altitude               = sif.variables         ['surface_altitude'              ][:].flatten()
    time                           = sif.variables         ['time'                          ][:].flatten()
    uncorrected_SIF_757nm          = sif.variables         ['uncorrected_SIF_757nm'         ][:].flatten()
    uncorrected_SIF_757nm_relative = sif.variables         ['uncorrected_SIF_757nm_relative'][:].flatten()
    uncorrected_SIF_771nm          = sif.variables         ['uncorrected_SIF_771nm'         ][:].flatten()
    uncorrected_SIF_771nm_relative = sif.variables         ['uncorrected_SIF_771nm_relative'][:].flatten()

      
    sif_rows  = [(SIF_757nm[i], cloud_albedo[i], cloud_flag[i], cloud_co2_ratio[i], cloud_delta_surface_pressure[i], cloud_o2_ratio[i], vapor_pressure_deficit[i],
                  temperature_2m[i], temperature_skin[i], specific_humidity[i], surface_pressure[i], wind_speed[i], continuum_radiance_757nm[i],
                  continuum_radiance_771nm[i],daily_correction_factor[i], footprint[i], IGBP_index[i], latitude[i], longitude[i], measurement_mode[i],
                  orbit_number[i], reduced_chi2_757nm[i],reduced_chi2_771nm[i], sensor_azimuth_angle[i], sensor_zenith_angle[i], SIF_757nm_relative[i],
                  SIF_757nm_uncert[i], SIF_771nm[i],SIF_771nm_relative[i], SIF_771nm_uncert[i], solar_azimuth_angle[i], solar_zenith_angle[i], sounding_id[i],
                  surface_altitude[i], time[i], uncorrected_SIF_757nm[i], uncorrected_SIF_757nm_relative[i], uncorrected_SIF_771nm[i], uncorrected_SIF_771nm_relative[i])
                  for i in range(0,len(sounding_id))]
    
    column_labels = ['SIF_757nm', 'cloud_albedo', 'cloud_flag', 'cloud_co2_ratio', 'cloud_delta_surface_pressure', 'cloud_o2_ratio', 'vapor_pressure_deficit',
                     'temperature_2m', 'temperature_skin', 'specific_humidity', 'surface_pressure', 'wind_speed', 'continuum_radiance_757nm',
                     'continuum_radiance_771nm','daily_correction_factor', 'footprint', 'IGBP_index', 'latitude', 'longitude', 'measurement_mode',
                     'orbit_number', 'reduced_chi2_757nm','reduced_chi2_771nm', 'sensor_azimuth_angle', 'sensor_zenith_angle',
                     'SIF_757nm_relative', 'SIF_757nm_uncert', 'SIF_771nm','SIF_771nm_relative', 'SIF_771nm_uncert', 'solar_azimuth_angle', 'solar_zenith_angle',
                     'sounding_id', 'surface_altitude', 'time','uncorrected_SIF_757nm', 'uncorrected_SIF_757nm_relative', 'uncorrected_SIF_771nm',
                     'uncorrected_SIF_771nm_relative']
     
    sif_df                         = pd.DataFrame(sif_rows,columns = column_labels) #-------- Create sif variables' dataframe
    
    
    calipso_sif_merger             = pd.merge(sif_df, calipso_df, on = ['sounding_id'], how = 'inner') #--------Merge sif and calipso on sounding _id

    calipso_sif_merger['Date']     = calipso_sif_merger['sounding_id'].map(lambda x: '-'.join([str(x)[:4],str(x)[4:6],str(x)[6:]])[:10]) #-----Create new date column
    calipso_sif_merger['SIF_Time'] = calipso_sif_merger['sounding_id'].map(lambda x: Times(x))  #----------------------------------------------Create new time column
    calipso_sif_merger['tile_h'  ] = calipso_sif_merger.apply(lambda x: tile_finder(x['latitude'], x['longitude'])[0], axis=1) #------Create new horizontal tile column
    calipso_sif_merger['tile_v'  ] = calipso_sif_merger.apply(lambda x: tile_finder(x['latitude'], x['longitude'])[1], axis=1) #------Create new vertical tile column
    calipso_sif_merger             = calipso_sif_merger.dropna(how='any')
    calipso_sif_merger.to_csv('Processed_sif/df_sif_{}.csv'.format(sif_date), index=False)
    grp         = calipso_sif_merger.groupby(['tile_h', 'tile_v']).agg(lambda x: list(x))  #----Group sif-calipso merger(from now on called SIF*) by tile id
    grp         = grp.reset_index() #-----------------------------------------------------------Reset indices
    l_ungrouped = len(calipso_sif_merger)
    l_grouped   = len(grp)
    
    df          = grp.copy() #--------make a copy of the grouped file
    print('------------------------------------------------------------------------------------------------------------------------------------------------')
    printmd('**For {}, there are {} sif footprints scattered over {} tiles.**'.format(sif_date, l_ungrouped, l_grouped))
    print('------------------------------------------------------------------------------------------------------------------------------------------------')  
    
    
    
    # START PAR PROCESSING
    

    print('PAR Processing Started...\n')
    shape              = (240,240)
    Each_Par_Tile_Data = []   #-------------Create empty list to store all the par extracted data as tuples
    
    for index,h_sif,v_sif,sif_lon,sif_lat,sif_time,sif_sid in zip(df.index,df['tile_h'],df['tile_v'],df['longitude'],df['latitude'],df['SIF_Time'],df['sounding_id']): #Loop through rows
        print('h{}v{}'.format(h_sif,v_sif), flush = True, sep=',', end=' ')
        
        for folder_number in range(len(par_folder_list)): #-------------------------Go inside folders of daily par files
            par_julian_day    = par_folder_list[folder_number].split('/')[1] #------Extract par julian day
            
            if sif_julian_day == par_julian_day: #------------------------------------If sif julian day is same as par folder julian day,
                par_file_list = glob.glob(par_folder_list[folder_number]+'/*.hdf')#---open that folder (this folder contains par files for that day)
            
                for par_file in par_file_list: #--------------------------------------Loop through par files in par file list created in above line
                    h_par = par_file.split('.')[2][1:3] #---------------Extract par h tile no.
                    v_par = par_file.split('.')[2][4:6] #---------------Extract par v tile no.
                    
                    if (h_par==h_sif) and (v_par==v_sif): #-------------If sif tiles match with par tiles, open par file otherwise go to the next par file to check.
                        par       = Dataset(par_file, mode='r')
                        par_date  = datetime.datetime.strptime(par_file.split('.')[1][1:], '%Y%j').strftime("%Y-%m-%d")
                        gmt_0000  = par.variables['GMT_0000_PAR'][:].flatten() #-----------------------------------------Read par 3-hourly variables
                        gmt_0300  = par.variables['GMT_0300_PAR'][:].flatten()
                        gmt_0600  = par.variables['GMT_0600_PAR'][:].flatten()
                        gmt_0900  = par.variables['GMT_0900_PAR'][:].flatten()
                        gmt_1200  = par.variables['GMT_1200_PAR'][:].flatten()
                        gmt_1500  = par.variables['GMT_1500_PAR'][:].flatten()
                        gmt_1800  = par.variables['GMT_1800_PAR'][:].flatten()
                        gmt_2100  = par.variables['GMT_2100_PAR'][:].flatten()
                        struct    = getattr(par, 'StructMetadata.0')
                        struct1   = struct[struct.find('UpperLeftPointMtrs'): struct.find('LowerRightMtrs')][19:-3] #-------------]
                        struct2   = struct[struct.find('LowerRightMtrs')    : struct.find('Projection')    ][15:-3] #-------------] Extract upper right and lower left
                        ULx, ULy  = literal_eval(struct1) #-----------------------------------------------------------------------] coordinates of the tile of opened par file
                        LRx, LRy  = literal_eval(struct2) #-----------------------------------------------------------------------]
                        par_lon,par_lat = extract_pixel_coordinates(ULx,ULy,LRx,LRy,shape) #----------- Extract the par lat,lon meshgrid in proper projection 
                        tree      = spatial.KDTree(   list(  zip(par_lon, par_lat) )) #---------------- Create a nearest neighbor spatial tree

                        for sub in range(len(sif_time)): #--------------------------Loop through the list of a dataframe cell (grouped dataframe cells consist of lists of values)
                            target          = (sif_lat[sub] , sif_lon[sub])  #---------------------------------------------------Make sif coordinates target for spatial interpolation
                            sif_time_sub    = re.sub(':', '', sif_time[sub]) #---------------------------------------------------Remove : from sif_time
                            timeX           = pd.to_datetime(sif_time_sub.ljust(8, "0"), format="%H%M%S%f").strftime("%H:%M:%S")#--Change resolution from SSSS to SS for faster interpolation
                            lower_bound_key = str('interp_gmt_')+str(f(int(sif_time_sub))[0]).zfill(8)[:4] #---Find lower bound of interval in which sif time lies
                       
                            if f(int(sif_time_sub))[1] == 23595900: #-----------------------------------------------If upper bound is greater than 2100 hours that is 235959
                                upper_bound_key = str('interp_gmt_')+str(f(int(sif_time_sub))[1]).zfill(8)[:6] #----upper key becomes interp_gmt_235959
                            else:                                                                              #----else                   
                                upper_bound_key = str('interp_gmt_')+str(f(int(sif_time_sub))[1]).zfill(8)[:4] #----upper key is as it is (interp_gmt_1200,1500,1800,etc)
                        
                            neigh5          = tree.query([(sif_lon[sub], sif_lat[sub])], k=10)[1][0] #---Find k=3 nearest spatial neighbors indices at target
                            lon_for_idw     = [par_lon[i] for i in neigh5] #----------------------------Extract longitudes at these indices
                            lat_for_idw     = [par_lat[i] for i in neigh5] #----------------------------Extract latitudes also
                            coors_for_idw   = [(i,j) for i,j in zip(lat_for_idw,lon_for_idw)] #---------Create a list of coordinates (lat lon tuples)

                            gmt_0000n       = [gmt_0000[i] for i in neigh5] #---------------------------Find variable values at nearest neighbors
                            gmt_0300n       = [gmt_0300[i] for i in neigh5]
                            gmt_0600n       = [gmt_0600[i] for i in neigh5]
                            gmt_0900n       = [gmt_0900[i] for i in neigh5]
                            gmt_1200n       = [gmt_1200[i] for i in neigh5]
                            gmt_1500n       = [gmt_1500[i] for i in neigh5]
                            gmt_1800n       = [gmt_1800[i] for i in neigh5]
                            gmt_2100n       = [gmt_2100[i] for i in neigh5]

                            func_gmt_0000   = idw(coors_for_idw, gmt_0000n) #---------------------------Create inverse distance weighing (IDW) interpolation function at nn coordinates
                            func_gmt_0300   = idw(coors_for_idw, gmt_0300n)
                            func_gmt_0600   = idw(coors_for_idw, gmt_0600n)
                            func_gmt_0900   = idw(coors_for_idw, gmt_0900n)
                            func_gmt_1200   = idw(coors_for_idw, gmt_1200n)
                            func_gmt_1500   = idw(coors_for_idw, gmt_1500n)
                            func_gmt_1800   = idw(coors_for_idw, gmt_1800n)
                            func_gmt_2100   = idw(coors_for_idw, gmt_2100n)

                            interp_gmt_0000 = func_gmt_0000(target) #-----------------------------------Find interpolated hourly par at target
                            interp_gmt_0300 = func_gmt_0300(target)
                            interp_gmt_0600 = func_gmt_0600(target)
                            interp_gmt_0900 = func_gmt_0900(target)
                            interp_gmt_1200 = func_gmt_1200(target)
                            interp_gmt_1500 = func_gmt_1500(target)
                            interp_gmt_1800 = func_gmt_1800(target)
                            interp_gmt_2100 = func_gmt_2100(target)

                            interp_gmt_235959 = 0
                            mydict = {'interp_gmt_0000':interp_gmt_0000, 'interp_gmt_0300':interp_gmt_0300, 'interp_gmt_0600'  :interp_gmt_0600,
                                      'interp_gmt_0900':interp_gmt_0900, 'interp_gmt_1200':interp_gmt_1200, 'interp_gmt_1500'  :interp_gmt_1500,
                                      'interp_gmt_1800':interp_gmt_1800, 'interp_gmt_2100':interp_gmt_2100, 'interp_gmt_235959':interp_gmt_235959} #---Save variables in dictionary 
                        
                        
                        
                            lower_bound_value = mydict[lower_bound_key] #-----Find lower 3 hourly par value
                            upper_bound_value = mydict[upper_bound_key] #-----Find upper 3 hourly par value

                            if upper_bound_value == interp_gmt_235959:  #---------------------------- If upper par value > 2100, we need to extract its value from next day par file
                                par_file_list = glob.glob(par_folder_list[folder_number+1]+'/*.hdf') #---Open next day par file list

                                for par_file in par_file_list:  #----------------------------]
                                    h_par = par_file.split('.')[2][1:3]#---------------------] Check for date and tile
                                    v_par = par_file.split('.')[2][4:6]#---------------------]

                                    if (h_par == h_sif) and (v_par == v_sif): #-----------------------------------------] Repeat above procedure
                                        par = Dataset(par_file, mode='r') #---------------------------------------------] once again just
                                        gmt_235959        = par.variables['GMT_0000_PAR'][:].flatten() #----------------] to obtain the spatially
                                        lon_for_idw       = [par_lon[i] for i in neigh5] #------------------------------] interpolated par 
                                        lat_for_idw       = [par_lat[i] for i in neigh5] #------------------------------] value at the upper bound
                                        coors_for_idw     = [(i,j) for i,j in zip(lat_for_idw,lon_for_idw)] #-----------]
                                        gmt_235959        = [gmt_235959[i] for i in neigh5] #---------------------------]
                                        func_gmt_235959   = idw(coors_for_idw, gmt_235959) #----------------------------]
                                        interp_gmt_235959 = func_gmt_235959(target) #-----------------------------------]
                                        break
                                upper_bound_value = interp_gmt_235959
                            
                            # Do TIME INTERPOLATION
                            time1a = pd.to_datetime(lower_bound_key[11:].ljust(8, "0"), format="%H%M%S%f").strftime("%H:%M:%S")
                            time2a = pd.to_datetime(upper_bound_key[11:].ljust(8, "0"), format="%H%M%S%f").strftime("%H:%M:%S")
                            val1   = lower_bound_value
                            val2   = upper_bound_value
                            valX   = temporal_interpolation(time1a,val1,time2a,val2,timeX)
                            Each_Par_Tile_Data.append((sif_sid[sub],sif_lat[sub],sif_lon[sub],valX)) #--------Add sif sounding id, sif lat, sif lon and final interpolated value to list
    
    DF = pd.DataFrame(np.array(Each_Par_Tile_Data), columns=['sounding_id','latitude','longitude', 'par']) #---------------Write each list of par data in a dataframe
    DF.to_csv('Processed_par/df_par_{}.csv'.format(sif_date),index=False)
    print('\n')


    # START REF PROCESSING
    
    print('REF Processing Started...\n')
    shape2              = (2400,2400)
    Each_Ref_Tile_Data  = []
    for index,h_sif,v_sif,sif_lon,sif_lat,sif_time,sif_sid in zip(df.index,df['tile_h'],df['tile_v'],df['longitude'],df['latitude'],df['SIF_Time'],df['sounding_id']):
        print('h{}v{}'.format(h_sif,v_sif), flush = True, sep=',', end=' ')
        
        for folder_number in range(len(ref_folder_list)):
            ref_julian_day    = ref_folder_list[folder_number].split('/')[1]
            
            if sif_julian_day == ref_julian_day:
                ref_file_list = glob.glob(ref_folder_list[folder_number]+'/*.hdf')
            
                for num3,ref_file in enumerate(ref_file_list):
                    h_ref = ref_file.split('.')[2][1:3]
                    v_ref = ref_file.split('.')[2][4:6]
                    
                    if (h_ref==h_sif) and (v_ref==v_sif):
                        ref_date          = datetime.datetime.strptime(ref_file.split('.')[1][1:], '%Y%j').strftime("%Y-%m-%d")
                        ref               = Dataset(ref_file, mode='r')
                        struct            = getattr(ref, 'StructMetadata.0')
                        struct1           = struct[struct.find('UpperLeftPointMtrs'): struct.find('LowerRightMtrs')][19:-3]
                        struct2           = struct[struct.find('LowerRightMtrs')    : struct.find('Projection')    ][15:-3]
                        ULx, ULy          = literal_eval(struct1)
                        LRx, LRy          = literal_eval(struct2)
                        nrb1x             = ref.variables['Nadir_Reflectance_Band1'][:].flatten() 
                        nrb2x             = ref.variables['Nadir_Reflectance_Band2'][:].flatten()
                        ref_lonx,ref_latx = extract_pixel_coordinates(ULx,ULy,LRx,LRy,shape2)
                        ref_lat           = ref_latx[(nrb1x.mask == False) & (nrb2x.mask == False)] #----- Drop missing values
                        ref_lon           = ref_lonx[(nrb1x.mask == False) & (nrb2x.mask == False)] #----- values where
                        nrb1              = nrb1x   [(nrb1x.mask == False) & (nrb2x.mask == False)] #----- mask == True
                        nrb2              = nrb2x   [(nrb1x.mask == False) & (nrb2x.mask == False)] #----- for both reflectance bands
                        tree              = spatial.KDTree( list(  zip(ref_lon, ref_lat) ))
                    
                        for sub in range(len(sif_time)):
                            target         = (sif_lat[sub] , sif_lon[sub])
                            neigh          = tree.query([(sif_lon[sub], sif_lat[sub])], k=4)[1][0] #---Find k=4 nearest spatial neighbors indices at target
                            lon_for_idw    = [ref_lon[i] for i in neigh] #----------------------------Extract longitudes at these indices
                            lat_for_idw    = [ref_lat[i] for i in neigh] #----------------------------Extract latitudes also
                            coors_for_idw  = [(i,j) for i,j in zip(lat_for_idw,lon_for_idw)] 
                            nrb_01         = [nrb1[i] for i in neigh]  #---------------------------Find variable values at nearest neighbors
                            nrb_02         = [nrb2[i] for i in neigh]
                            func_nrb_01    = idw(coors_for_idw, nrb_01) #---------------------------Create inverse distance weighing (IDW) interpolation function at nn coordinates
                            func_nrb_02    = idw(coors_for_idw, nrb_02)
                            interp_nrb1    = func_nrb_01(target) #-----------------------------------Find interpolated hourly par at target
                            interp_nrb2    = func_nrb_02(target)

                            Each_Ref_Tile_Data.append((sif_sid[sub], sif_lat[sub], sif_lon[sub], interp_nrb1, interp_nrb2))
                
                        
    DG = pd.DataFrame(np.array(Each_Ref_Tile_Data),columns=['sounding_id','latitude','longitude','nrb1','nrb2'])
    DG.to_csv('Processed_ref/df_ref_{}.csv'.format(sif_date),index=False)
    print('\n')


    # START FPAR PROCESSING
    #
    # For every row of `df` (one h/v tile with its soundings), look up the
    # FPAR/LAI granules of the same tile whose Julian day is the SIF day or one
    # of the three following days, and interpolate 'Fpar_500m' and 'Lai_500m'
    # to each sounding by inverse-distance weighting over the nearest valid
    # pixels. One tuple per (sounding, matching granule) is collected and the
    # result is written to 'Processed_fpar/df_fpar_<sif_date>.csv'.
    # NOTE(review): assumes the per-row entries of df['longitude'], df['latitude'],
    # df['SIF_Time'] and df['sounding_id'] are equal-length sequences -- confirm.

    print('FPAR Processing Started...\n')
    shape3               = (2400,2400)
    Each_fpar_Tile_Data  = []
    n_neighbors          = 4 #------------------------------------------------------------Number of nearest pixels fed to the IDW interpolator
    fpar_days_wanted     = range(int(sif_julian_day), int(sif_julian_day) + 4) #----------SIF day and the three days after it
    for index,h_sif,v_sif,sif_lon,sif_lat,sif_time,sif_sid in zip(df.index,df['tile_h'],df['tile_v'],df['longitude'],df['latitude'],df['SIF_Time'], df['sounding_id']):
        print('h{}v{}'.format(h_sif,v_sif), flush = True, sep=',', end=' ')

        for fpar_folder in fpar_folder_list:
            fpar_julian_day = fpar_folder.split('/')[1]
            if int(fpar_julian_day) not in fpar_days_wanted:
                continue

            for fpar_file in glob.glob(fpar_folder + '/*.hdf'):
                tile_token = fpar_file.split('.')[2] #-------------------------------------Tile token of the file name; characters 1-2 and 4-5 hold h and v
                h_fpar     = tile_token[1:3]
                v_fpar     = tile_token[4:6]
                if (h_fpar != h_sif) or (v_fpar != v_sif):
                    continue

                fpar_date  = datetime.datetime.strptime(fpar_file.split('.')[1][1:], '%Y%j').strftime("%Y-%m-%d")

                # Read everything needed inside a context manager so the granule
                # is closed again; the arrays are copied into memory by flatten().
                with Dataset(fpar_file, mode='r') as fpar:
                    struct   = getattr(fpar, 'StructMetadata.0')
                    fpar500x = fpar.variables['Fpar_500m'][:].flatten()
                    lai500x  = fpar.variables['Lai_500m' ][:].flatten()

                # Tile corner coordinates are parsed out of the metadata string.
                struct1             = struct[struct.find('UpperLeftPointMtrs'): struct.find('LowerRightMtrs')][19:-3]
                struct2             = struct[struct.find('LowerRightMtrs')    : struct.find('Projection')    ][15:-3]
                ULx, ULy            = literal_eval(struct1)
                LRx, LRy            = literal_eval(struct2)
                fpar_lonx,fpar_latx = extract_pixel_coordinates(ULx,ULy,LRx,LRy,shape3)

                # Keep only pixels valid in BOTH layers. getmaskarray always
                # returns a full boolean array, even when nothing is masked
                # (where `.mask` can be the scalar `nomask`).
                valid = ~(np.ma.getmaskarray(fpar500x) | np.ma.getmaskarray(lai500x))
                if np.count_nonzero(valid) < n_neighbors:
                    # Too few valid pixels for a k-nearest lookup: the returned
                    # indices would point past the end of the arrays.
                    continue
                fpar_lon = fpar_lonx[valid]
                fpar_lat = fpar_latx[valid]
                fpar500  = fpar500x [valid]
                lai500   = lai500x  [valid]
                tree     = spatial.KDTree( list(  zip(fpar_lon, fpar_lat) ))

                for sub in range(len(sif_time)):
                    # The tree is built and queried in (lon, lat) order, while the
                    # IDW coordinates and target are both given in (lat, lon) order.
                    target          = (sif_lat[sub] , sif_lon[sub])
                    neigh           = tree.query([(sif_lon[sub], sif_lat[sub])], k=n_neighbors)[1][0] #---Indices of the nearest valid pixels to the sounding
                    coors_for_idw   = [(fpar_lat[i], fpar_lon[i]) for i in neigh] #------------------------(lat, lon) of those pixels
                    fpar500_01      = [fpar500[i] for i in neigh]  #---------------------------------------Variable values at the nearest neighbors
                    lai500_02       = [lai500 [i] for i in neigh]
                    func_fpar500_01 = idw(coors_for_idw, fpar500_01) #-------------------------------------IDW interpolation function over the neighbor coordinates
                    func_lai500_02  = idw(coors_for_idw, lai500_02)
                    interp_fpar500  = func_fpar500_01(target) #---------------------------------------------Interpolated Fpar_500m at the sounding
                    interp_lai500   = func_lai500_02(target) #----------------------------------------------Interpolated Lai_500m at the sounding
                    #ind        = tree.query([(sif_lon[sub],sif_lat[sub])], k=1)[1][0] #-----------In case nearest value needs to be taken as it is
                    Each_fpar_Tile_Data.append((sif_sid[sub],sif_lat[sub],sif_lon[sub],interp_fpar500, interp_lai500))


    DH = pd.DataFrame(np.array(Each_fpar_Tile_Data),columns=['sounding_id','latitude','longitude','Fpar_500m','Lai_500m'])
    DH.to_csv('Processed_fpar/df_fpar_{}.csv'.format(sif_date),index=False)

    
    # Join the SIF table with the three interpolated product tables and write
    # the merged columns to a grouped netCDF4 file named after the SIF input.
    di  = calipso_sif_merger
    dj  = DF
    dk  = DG
    dl  = DH

    # Only soundings present in every table survive the inner joins.
    merge_keys = ['sounding_id','latitude','longitude']
    dM1 = pd.merge(di,  dj, on=merge_keys, how='inner')
    dM2 = pd.merge(dM1, dk, on=merge_keys, how='inner')
    dM3 = pd.merge(dM2, dl, on=merge_keys, how='inner')

    # Rename to the variable names that are looked up in the template files below.
    dM3 = dM3.rename({'par':'PAR', 'nrb1':'Nadir_Reflectance_Band1', 'nrb2':'Nadir_Reflectance_Band2'}, axis=1)
    L = len(dM3)
    with Dataset('OCO2_sif/oco2_LtSIF_180501_B8100r_180703004855s.nc4','r') as src_sif,\
         Dataset("Data__%s.nc4"%i_sif[21:-4], "w")                                 as output ,\
         Dataset('MCD18A2/121/MCD18A2.A2018121.h01v08.006.2019091180956.hdf')    as src_par,\
         Dataset('MCD43A4/121/MCD43A4.A2018121.h00v08.006.2018130031808.hdf')    as src_ref,\
         Dataset('MCD15A3H/121/MCD15A3H.A2018121.h00v08.006.2018129231051.hdf')  as src_fpar:

        # (output group, template dataset) pairs, in the order they are searched.
        # The template files only supply global attributes, variable datatypes
        # and variable attributes; the data itself comes from dM3.
        sources = (
            ("Group_OCO2",     src_sif ),
            ("Group_MCD18A2",  src_par ),
            ("Group_MCD43A4",  src_ref ),
            ("Group_MCD15A3H", src_fpar),
        )

        # Create every group up front so each exists even if no column matches it.
        for group_name, _ in sources:
            output.createGroup(group_name)

        # Copy global attributes; later sources overwrite same-named attributes.
        for _, src in sources:
            output.setncatts(src.__dict__)

        for dfname in dM3.columns:
            for group_name, src in sources:
                if dfname not in src.variables:
                    continue
                outvar = '/%s/%s' % (group_name, dfname)
                # Each variable gets a root-level dimension named after itself;
                # reuse it if the same column matched an earlier template file,
                # since defining a dimension name twice is an error.
                if dfname not in output.dimensions:
                    output.createDimension(dfname, L)
                output.createVariable(outvar, src.variables[dfname].datatype, (dfname,))
                output[outvar].setncatts(src[dfname].__dict__)
                output[outvar][:] = np.array(dM3[dfname])
                    
    # Report the wall-clock time elapsed since TTT1, rounded to whole minutes.
    TTT2            = TTT.time()
    elapsed_minutes = np.round((TTT2 - TTT1) / 60)
    print(elapsed_minutes, 'minutes')

### Write netCDF

In [None]:

#dM2 = dM1.merge(dk, on=['sounding_id','latitude','longitude'], how='inner')
#len(dM2)
#dM3 = dM2.merge(dl, on=['sounding_id','latitude','longitude'], how='inner')
#len(dM3)
#dM3 = dM3.dropna(subset=['par'])#,'nrb1','nrb2','Fpar_500m','Lai_500m'])
#3L   = len(dM3)
#print('Dimensions:',L)
#3dM3 = dM3.rename({'par':'PAR', 'nrb1':'Nadir_Reflectance_Band1', 'nrb2':'Nadir_Reflectance_Band2'}, axis=1)
#break

In [None]:
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import glob

# Debugging cell: re-read the per-date CSVs written by the processing loop and
# record which SIF soundings have no match in the PAR table.
# Files are paired purely by sorted position, so the four folders are assumed to
# hold one CSV per date with names that sort the same way -- TODO confirm.
sif_list  = sorted(glob.glob('Processed_sif/*.csv' ))
par_list  = sorted(glob.glob('Processed_par/*.csv' ))
ref_list  = sorted(glob.glob('Processed_ref/*.csv' ))
fpar_list = sorted(glob.glob('Processed_fpar/*.csv'))

for i_sif, i_par, i_ref, i_fpar in zip(sif_list, par_list, ref_list, fpar_list):

    di  = pd.read_csv(i_sif, dtype={'sounding_id':float})

    dj  = pd.read_csv(i_par)
    dk  = pd.read_csv(i_ref)
    dl  = pd.read_csv(i_fpar)
    #di['sounding_id'].nunique(),dj['sounding_id'].nunique(),dk['sounding_id'].nunique(),dl['sounding_id'].nunique()
    #di['latitude'].nunique(),dj['latitude'].nunique(),dk['latitude'].nunique(),dl['latitude'].nunique()
    #di['longitude'].nunique(),dj['longitude'].nunique(),dk['longitude'].nunique(),dl['longitude'].nunique()
    dM1 = di .merge(dj, on=['sounding_id'])#,'latitude','longitude'], how='inner')

    # SIF rows whose sounding_id did not survive the join with the PAR table.
    kk = di[~di['sounding_id'].isin(dM1['sounding_id'])]
    # NOTE(review): the same file is rewritten on every iteration, so only the
    # last date's unmatched rows remain on disk -- confirm this is intended.
    kk.to_csv('missing.csv',index=False)
    #len(dM1)
    #3dM1.info()
    #dM2 = dM1.merge(dk, on=['sounding_id','latitude','longitude'], how='inner')
    #len(dM2)
    #dM3 = dM2.merge(dl, on=['sounding_id','latitude','longitude'], how='inner')
    #len(dM3)
    #dM3 = dM3.dropna(subset=['par'])#,'nrb1','nrb2','Fpar_500m','Lai_500m'])
    #3L   = len(dM3)
    #print('Dimensions:',L)
    #3dM3 = dM3.rename({'par':'PAR', 'nrb1':'Nadir_Reflectance_Band1', 'nrb2':'Nadir_Reflectance_Band2'}, axis=1)
    #break

In [None]:
#di['sounding_id'].where(di['sounding_id'].values==dj['sounding_id'].values).notna()
# Inner-join the two tables on sounding_id only, show the size of the result,
# and dump the matched ids (with the merge's row index) to 'vvv.csv'.
l = di.merge(dj, on=['sounding_id'], how='inner')
l.shape
l[['sounding_id']].to_csv('vvv.csv')

In [None]:
# Show the (rows, columns) of the three per-product tables that are merged with
# the SIF table: DF, DG (reflectance: nrb1/nrb2) and DH (Fpar_500m/Lai_500m).
# NOTE(review): DF is presumably the PAR table -- it is built outside this cell.
DF.shape
DG.shape
DH.shape

In [None]:
    # Write the columns of dM3 to a grouped netCDF4 file named after the SIF
    # input. Relies on dM3, L and i_sif left behind by earlier cells.
    with Dataset('OCO2_sif/oco2_LtSIF_180501_B8100r_180703004855s.nc4','r') as src_sif,\
         Dataset("Datas__%s.nc4"%i_sif[21:-4], "w")                                 as output ,\
         Dataset('MCD18A2/121/MCD18A2.A2018121.h01v08.006.2019091180956.hdf')    as src_par,\
         Dataset('MCD43A4/121/MCD43A4.A2018121.h00v08.006.2018130031808.hdf')    as src_ref,\
         Dataset('MCD15A3H/121/MCD15A3H.A2018121.h00v08.006.2018129231051.hdf')  as src_fpar:

        # (output group, template dataset) pairs, in the order they are searched.
        # The template files only supply global attributes, variable datatypes
        # and variable attributes; the data itself comes from dM3.
        sources = (
            ("Group_OCO2",     src_sif ),
            ("Group_MCD18A2",  src_par ),
            ("Group_MCD43A4",  src_ref ),
            ("Group_MCD15A3H", src_fpar),
        )

        # Create every group up front so each exists even if no column matches it.
        for group_name, _ in sources:
            output.createGroup(group_name)

        # Copy global attributes; later sources overwrite same-named attributes.
        for _, src in sources:
            output.setncatts(src.__dict__)

        for dfname in dM3.columns:
            for group_name, src in sources:
                if dfname not in src.variables:
                    continue
                outvar = '/%s/%s' % (group_name, dfname)
                # Each variable gets a root-level dimension named after itself;
                # reuse it if the same column matched an earlier template file,
                # since defining a dimension name twice is an error.
                if dfname not in output.dimensions:
                    output.createDimension(dfname, L)
                output.createVariable(outvar, src.variables[dfname].datatype, (dfname,))
                output[outvar].setncatts(src[dfname].__dict__)
                output[outvar][:] = np.array(dM3[dfname])