In [1]:
import pandas as pd
import glob
import os
import shutil 


In [2]:
def _timestamp_from_csv_name(csv_path):
    """Return the timestamp token embedded in the name of a CSV file.

    The token is the second underscore-separated field of the file name with
    everything after the first '.' removed, for example
    'some/folder/Milan_2020-04-01 0800.csv' -> '2020-04-01 0800'.
    Only the file name is inspected, so underscores in the folder names do
    not shift the position of the token.

    Raises:
        ValueError: if the file name has no second field or the token is
            shorter than the 15 characters of 'YYYY-MM-DD HHMM'.
    """
    name_fields = os.path.basename(csv_path).split('_')
    if len(name_fields) < 2:
        raise ValueError("No '_<date time>' token in file name: " + csv_path)
    date_time = name_fields[1].split('.')[0]
    if len(date_time) < 15:
        raise ValueError("Unexpected date/time token '" + date_time + "' in file name: " + csv_path)
    return date_time


def gridData(csvfiles):
    """Grid every CSV, export it to NetCDF and index it in Open Data Cube.

    For each path in ``csvfiles`` the function:
      1. reads the CSV and keeps only the columns that are not dropped below;
      2. outer-joins it to the reference grid on ``quadkey``;
      3. saves the result as ``<netcdf folder>/<date time>.nc``;
      4. writes the dataset metadata document and copies it to the
         ``cubeenv/dataset`` folder as ``<PID>.yaml``;
      5. runs ``datacube dataset add`` on that document.
    Finally the processed paths are appended (comma separated) to
    ``loaded_csv.txt``.

    Args:
        csvfiles: list of paths of the CSV files to process. The file names
            must contain a 'YYYY-MM-DD HHMM' token as their second
            underscore-separated field.

    Returns:
        The string 'DONE!'.

    Raises:
        ValueError: if a file name does not contain the expected token.
    """
    # Nothing to process: skip all I/O so no stray ',' is appended to the log.
    if not csvfiles:
        return 'DONE!'

    # Definition of the paths to folders and files:
    grid_path = 'C:/git/FB_ODC_2021/griglia_csv/GRIGLIA_MILANO.csv'
    path_to_netcdf_folder = 'C:/git/FB_ODC_2021/netcdf_files'
    origin = 'C:/git/FB_ODC_2021/empty_yaml.yaml'
    # Same absolute path that the calling cell reads, so the log is found
    # regardless of the kernel's working directory.
    loaded_csv_log = 'C:/git/FB_ODC_2021/loaded_csv.txt'

    # Load the grid
    grid = pd.read_csv(grid_path)

    for i, csv_path in enumerate(csvfiles):
        # The date/time token names the NetCDF file and the metadata document;
        # parsing it first makes a badly named file fail before any output is written.
        date_time = _timestamp_from_csv_name(csv_path)

        # Temporary dataframe, from csv
        temp_df = pd.read_csv(csv_path)
        # Unnecessary columns are dropped
        temp_df = temp_df.drop(columns = ['country','lon', 'lat', 'n_baseline', 'n_difference', 'density_crisis', 'density_baseline', 'percent_change', 'clipped_z_score', 'ds'])
        # '\N' placeholders are set to 0 before the conversion to float
        temp_df['n_crisis'] = temp_df['n_crisis'].replace('\\N', 0).astype(float)

        # Merge of the csv with the grid on quadkey
        temp_gridded = grid.merge(temp_df, on = 'quadkey', how = 'outer')
        temp_gridded = temp_gridded.rename(columns = {'latitude_g':'latitude', 'longitude_g':'longitude'})
        # Assign the result: an in-place fillna on a selected column is not
        # guaranteed to modify the parent dataframe.
        temp_gridded['n_crisis'] = temp_gridded['n_crisis'].fillna(0)

        temp_gridded['date_time'] = date_time
        temp_gridded = temp_gridded.set_index(['quadkey'])
        print('ok',i,'gridded')

        # The gridded dataframe is transformed into an xarray object
        temp_xarray = temp_gridded.to_xarray()
        print('ok',i,'to xarray')

        # The xarray object is saved as a NetCDF file in a specific folder
        netcdf_path = path_to_netcdf_folder+'/'+date_time+'.nc'
        temp_xarray.to_netcdf(netcdf_path)
        print('ok',i,'in netcdf')

        # Writing of the metadata on a yaml file
        # 'YYYY-MM-DD HHMM' -> 'YYYY-MM-DDTHH:MM:00.000Z'
        datetime_string = date_time[0:10]+"T"+date_time[11:13]+":"+date_time[13:15]+":00.000Z"
        # The digits of the date/time token form the last group of the dataset id
        PID = date_time.replace("-", "").replace(" ", "")
        to_write = "$schema: https://schemas.opendatacube.org/dataset \n \nid: 00000000-0000-0000-0000-"+PID+"\n\nproduct:\n  name: FB_POI_MILANO\n  href: https://dataforgood.fb.com/ \n  format: NetCDF\n\ncrs: epsg:4326\n\ngeometry:\n  type: Polygon\n  coordinates: [[[ 8.995056152343800, 45.311597470877999], [8.995056152343800, 45.627484179430269], [9.549865722656120, 45.627484179430269], [9.549865722656120, 45.311597470877999], [ 8.995056152343800, 45.311597470877999]]]\n\ngrids:\n  default:\n    shape: [102,83] \n    transform: [1,0,0,0,1,0,0,0,1]\n\nlineage: {}\n\nmeasurements:\n  n_crisis:\n    layer: n_crisis\n    path: "+netcdf_path+"\n    nodata: -9999\n\nproperties:\n  odc:file_format: NetCDF\n  datetime: "+datetime_string
        with open(origin, "w") as metadata_file:
            metadata_file.write(to_write)
        target = "C:/git/FB_ODC_2021/cubeenv/dataset/"+PID+".yaml"
        # shutil library is used to save a copy of the file in the folder containing all metadata
        shutil.copy(origin, target)

        command = "datacube dataset add "+ target
        status = os.system(command)
        if status != 0:
            # Report the failure instead of silently ignoring the exit status
            print('warning: "' + command + '" exited with status', status)

    # In conclusion the names of the newly processed csv are written in loaded_csv.txt
    to_write_on_txt = ",".join(csvfiles)+','
    with open(loaded_csv_log, "a") as output:
        output.write(to_write_on_txt)

    return 'DONE!'

In [3]:
# Check if there are new csv files in /Coronavirus Disease Prevention/Population Map/milan to upload in OpenDataCube
fold_path = 'C:/git/FB_ODC_2021/milan'
loaded_csv_path = "C:/git/FB_ODC_2021/loaded_csv.txt"

# Only non-empty files are considered
csvfiles = [file for file in glob.glob(fold_path+"/"+"*.csv") if os.stat(file).st_size != 0]

# 'loaded_csv.txt' contains the comma-separated names of the already loaded files;
# already_loaded is the list of these names. On the very first run the file does
# not exist yet, which simply means that nothing has been loaded so far.
if os.path.exists(loaded_csv_path):
    with open(loaded_csv_path, "r") as txt:
        already_loaded = (txt.read()).split(',')
else:
    already_loaded = []

# Keep only the csv that are not already loaded (set lookup instead of repeated list.remove)
already_loaded_set = set(already_loaded)
csvfiles = [file for file in csvfiles if file not in already_loaded_set]

if len(csvfiles) != 0:
    print(len(csvfiles), 'new CSVs have been found')
    print('Processing of CSVs is started...')
    ret = gridData(csvfiles)
    if ret == 'DONE!':
        print('CSVs have been processed and transformed in NETCDF format, metadata dataset have been created and correctly uploaded in ODC')
        
    else:
        print('Something whent wrong :(')
else:
    print('No new csv have been found, all csv are already uploaded in ODC')

No new csv have been found, all csv are already uploaded in ODC


# General information about ODC 

In [1]:
import datacube
#from odc.ui import DcViewer

In [2]:
# Create the Datacube object used by all the following cells; "FB_ODC" is the application name passed to it.
dc = datacube.Datacube(app = "FB_ODC")

  username=username, password=password,


In [3]:
# Columns of the product table shown in the overview below
display_columns = ["name", "description", "platform", "instrument", "crs", "resolution"]

# Ask the ODC database for all the products it contains
list_of_products = dc.list_products()
products = dc.list_products()

# Overview of the products, restricted to the selected columns and ordered by index
products.loc[:, display_columns].sort_index()

Unnamed: 0_level_0,name,description,platform,instrument,crs,resolution
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,prova_1,FB for good data about population,,,EPSG:4326,


In [4]:
# Select a specific product and show its measurements:
# dc.list_measurements() returns a table of measurements, and .loc[product]
# keeps only the rows that belong to the selected product.
product = 'prova_1'
measurements = dc.list_measurements()
measurements.loc[product]

Unnamed: 0_level_0,name,dtype,units,nodata
measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
n_crisis,n_crisis,float64,people,-9999
quadkey,quadkey,int64,adimentional,-9999


In [5]:
# DcViewer comes from odc.ui; it is imported here because the import next to
# `import datacube` is commented out, so on a fresh kernel the name would be undefined.
from odc.ui import DcViewer

# Interactive viewer of the datasets indexed for 2020, centred on the study area
DcViewer(dc=dc,
         time='2020',
         width='800px',
         center=(45.469, 9.265),
         zoom=10)

VBox(children=(HBox(children=(Dropdown(layout=Layout(flex='0 1 auto', width='10em'), options=('Population_MI',…

# Loading data

In [5]:
#from odc.ui import with_ui_cbk

# Spatial extent of the query (same corner coordinates as the dataset geometry)
latitude_bounds = (45.311597470877999, 45.627484179430269)
longitude_bounds = (8.995056152343800, 9.549865722656120)

# dc.load loads all the datasets of product 'prova_1' that match the given
# spatial and temporal extent.
# NOTE(review): resolution is expressed in the units of output_crs, which for
# EPSG:4326 should be degrees; (-10, 10) would then be far larger than the
# requested extent - confirm the intended value.
load_query = dict(
    product='prova_1',
    latitude=latitude_bounds,
    longitude=longitude_bounds,
    time='2020',
    resolution=(-10, 10),
    output_crs='EPSG:4326',
)
ds = dc.load(**load_query)

TypeError: zip argument #2 must support iteration

In [12]:
# Print the text summary of the loaded dataset (dimensions and data variables)
print(ds)

<xarray.Dataset>
Dimensions:  ()
Data variables:
    *empty*


In [2]:
import python_utils as utils