In [1]:
import os 
import glob
import gzip
import shutil
import torch
import rasterio
import pandas as pd 
import numpy as np
from tfrecord.torch.dataset import TFRecordDataset

In [2]:
# Table read with pandas in the next cell (its `year` and `country` columns are used).
CSV = os.path.join("..", "data", "dataset_viirs_only.csv")

# The trailing "" makes os.path.join end the path with a separator, so file
# names and glob patterns can be appended by plain string concatenation.
RECORDS_DIR = os.path.join("..", "data", "landsat_7_less", "")
TIF_DIR = os.path.join("..", "data", "landsat_7_less", "")

# Image bands stored in each record, in the order they are written to the GeoTIFF.
BANDS = ['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1', 'NIGHTLIGHTS']

# TFRecord feature description: the scalar fields plus every band, all parsed as float.
DESCRIPTOR = {
    feature: "float"
    for feature in ['cluster', 'lat', 'lon', 'wealthpooled', *BANDS]
}

In [3]:
# Load the table stored at CSV; the following cell reads its `year` and
# `country` columns to locate the matching TFRecord folders.
# NOTE(review): the name `csv` would shadow the standard-library `csv` module
# if that module were ever imported in this notebook.
csv=pd.read_csv(CSV)

In [4]:
# Build records[year][country] -> list of TFRecord files found on disk for
# every (year, country) pair that occurs in the table.
records = dict()
for year in csv.year.unique():
    records[year] = dict()
    # Countries present for this year only; no DataFrame copy is needed
    # because nothing but the names is used.
    countries_in_year = csv.loc[csv.year == year, "country"].unique()
    for country in countries_in_year:
        # Folder names may carry an arbitrary prefix before "<country>_<year>";
        # the trailing "*" matches both plain and gzip-compressed records.
        pattern = RECORDS_DIR+"*"+str(country)+"_"+str(year)+"/*.tfrecord*"
        # glob returns matches in arbitrary order; sort them so the preview
        # below and the processing order are reproducible between runs.
        records[year][country] = sorted(glob.glob(pattern))
records[2015]['angola'][:5]

['../data/landsat_7_less/angola_2015/604.tfrecord.gz',
 '../data/landsat_7_less/angola_2015/605.tfrecord.gz',
 '../data/landsat_7_less/angola_2015/606.tfrecord.gz',
 '../data/landsat_7_less/angola_2015/607.tfrecord.gz',
 '../data/landsat_7_less/angola_2015/608.tfrecord.gz']

In [5]:
def decompress_tfrecord(tfrecord_archive):
    """Gunzip ``tfrecord_archive`` next to itself and return the new path.

    The output path is the archive path with its last three characters (the
    ``.gz`` suffix) removed. An existing file at that path is overwritten.
    """
    target_path = tfrecord_archive[:-3]  # same name without ".gz"
    with gzip.open(tfrecord_archive, 'rb') as compressed, \
            open(target_path, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)
    return target_path

def tensor_to_string(data, variable):
    """Return the first element of ``data[variable]`` as text without dots.

    ``data[variable]`` must offer ``.numpy()`` and be indexable as ``[0][0]``.
    The value is formatted with ``str`` and every "." is removed, so a value
    printed as ``604.0`` comes back as ``"6040"``.
    """
    first_value = data[variable].numpy()[0][0]
    as_text = str(first_value)
    return "".join(as_text.split("."))

def tfrecord_to_tif(data, filename, image_size=255):
    """Write one TFRecord example to a multi-band GeoTIFF below ``TIF_DIR``.

    Parameters
    ----------
    data : mapping
        One batch of size 1 from the TFRecord loader. For every name in
        ``BANDS``, ``data[name][0]`` must offer ``.numpy()`` and contain
        ``image_size * image_size`` values.
    filename : str
        Output path relative to ``TIF_DIR`` (e.g. ``"angola_2015/604.tif"``).
    image_size : int, optional
        Edge length of the square image. Defaults to 255, the value that was
        previously hard-coded.

    Returns
    -------
    None
        The function only writes the file.
    """
    bands = [
        data[name][0].numpy().reshape((image_size, image_size))
        for name in BANDS
    ]
    # Stacking gives shape (band, row, col) and one common dtype for the file.
    stacked = np.array(bands)

    tif_path = TIF_DIR + filename
    # The "<country>_<year>" folder is not guaranteed to exist under TIF_DIR.
    out_dir = os.path.dirname(tif_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager closes the dataset even when a write fails.
    with rasterio.open(tif_path, 'w', driver='GTiff',
                       height=image_size, width=image_size,
                       count=len(BANDS), dtype=str(stacked.dtype),
                       crs='epsg:3857',
                       transform=None) as tif:
        for band_index, band in enumerate(stacked, start=1):
            # NOTE(review): each band is written transposed. This reproduces
            # the earlier np.swapaxes(..., 0, 2) layout so new files match the
            # ones already generated; confirm whether that orientation is
            # intended before changing it.
            tif.write(band.T, band_index)

In [None]:
# Convert every example of every TFRecord file into a GeoTIFF stored as
# "<country>_<year>/<cluster>.tif" (relative to TIF_DIR).
for year in records:
    print(year)
    for country, archives in records[year].items():
        # An empty list simply yields no iterations.
        for tfrecord_archive in archives:
            if tfrecord_archive.endswith('.gz'):
                # decompress_tfrecord returns the path of the extracted file.
                tfrecord = decompress_tfrecord(tfrecord_archive=tfrecord_archive)
            else:
                tfrecord = tfrecord_archive
            dataset = TFRecordDataset(tfrecord, index_path=None, description=DESCRIPTOR)
            loader = torch.utils.data.DataLoader(dataset, batch_size=1)
            for data in loader:
                # tensor_to_string removes the decimal point ("604.0" -> "6040");
                # dropping the last character leaves the integer part.
                # NOTE(review): this assumes cluster ids are whole numbers
                # printed with a single trailing ".0" — confirm for large ids.
                cluster_id = tensor_to_string(data, "cluster")[:-1]
                filename = str(country)+"_"+str(year)+"/"+cluster_id+".tif"
                tfrecord_to_tif(data, filename)

#### Get `MEAN` and `STD` over the dataset

In [1]:
from helper import get_dataset_mean, get_dataset_std
import pickle
import glob
import os
import numpy as np
# Pickle file that receives the normalisation statistics in the last cell.
NORMALIZER = os.path.join('..', 'datasets', 'normalizer.pkl')
# Image root passed to the mean/std helpers; the trailing "" keeps a path
# separator at the end of the string.
RECORDS_DIR = os.path.join("..", "data", "landsat_7_less", "")

2023-08-28 14:31:12.404545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-28 14:31:12.468940: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [3]:
# Compute the dataset mean with the project helper; the displayed result has
# one value per channel (8 in total, matching num_channels).
# NOTE(review): get_dataset_mean is defined in helper.py, which is not shown
# here — confirm which files under RECORDS_DIR it reads and how it averages.
mean = get_dataset_mean( image_dir=RECORDS_DIR, num_channels=8 )
mean

array([5.17201265e-02, 8.82524713e-02, 1.02674783e-01, 2.64303597e-01,
       2.49776030e-01, 1.71501024e-01, 3.00020234e+02, 1.59417214e+00])

In [2]:
# Compute the dataset standard deviation with the project helper, passing the
# `mean` obtained in the previous cell as dataset_mean.
# NOTE(review): this cell needs `mean` to exist, yet the recorded execution
# counts show it was last run before the mean cell — re-run top to bottom
# (Restart & Run All) to make sure the stored values belong together.
std = get_dataset_std( image_dir=RECORDS_DIR, num_channels=8, dataset_mean=mean )
std

array([6.19937578e-04, 1.27377769e-03, 3.33014929e-03, 4.55055797e-03,
       9.47793062e-03, 8.13588032e-03, 2.26191530e+01, 1.01391193e+00])

In [6]:
# Store the statistics as a (mean, std) tuple under a single named entry and
# pickle the resulting dict to NORMALIZER.
normalizer = {'landsat_+_nightlights': (mean, std)}
with open(NORMALIZER, "wb") as pickle_file:
    pickle.dump(normalizer, pickle_file)