In [1]:
import os 
import glob

import torch
import rasterio
import pandas as pd 
import numpy as np
from tfrecord.torch.dataset import TFRecordDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
CSV              = os.path.join( "..", "data", "geometry_less_dataset.csv" )
RECORDS_DIR      = os.path.join( "..", "data", "landsat_7", "" )
TIF_DIR          = os.path.join( "..", "data", "landsat_tif","" )
BANDS            = ['BLUE','GREEN','RED','NIR','SWIR1','SWIR2','TEMP1','NIGHTLIGHTS']
DESCRIPTOR       = {
                'system:index':"byte",
                'filename':"byte", 
                'wealthpooled':"float",
                'bounding_box':"byte",
                'BLUE':"float",
                'GREEN':"float",
                'RED':"float",
                'NIR':"float",
                'SWIR1':"float",
                'SWIR2':"float",
                'TEMP1':"float",
                'NIGHTLIGHTS':"float"
              }   

In [5]:
csv = pd.read_csv(CSV)
csv.head()

Unnamed: 0,country,year,cluster,lat,lon,households,wealthpooled,filename
0,angola,2011,1,-12.350257,13.534922,36,2.312757,-12_3_13_53.tif
1,angola,2011,2,-12.360865,13.551494,32,2.010293,-12_3_13_55.tif
2,angola,2011,3,-12.613421,13.413085,36,0.877744,-12_6_13_41.tif
3,angola,2011,4,-12.581454,13.397711,35,1.066994,-12_5_13_39.tif
4,angola,2011,5,-12.578135,13.418748,37,1.750153,-12_5_13_41.tif


In [6]:
records = dict()
for year in csv.year.unique():
    records[year]=dict()
    sub_year = csv[ csv.year == year ]
    for country in sub_year.country.unique():
        sub_country = sub_year[ sub_year.country == country ].copy()
        pattern = RECORDS_DIR+"*"+str(country)+"_"+str(year)+"*.tfrecord"
        # files = [file.replace('(1)','') for file in glob.glob(pattern)]
        records[year][country] = glob.glob(pattern)


In [7]:
def tensor_to_string(data, variable):
    filename = (data[variable].numpy())[0]
    return "".join([chr(item) for item in filename])

def update_csv(csv, idx, bounding_box, filename):
    csv.iloc[int(idx), csv.columns.get_loc('geometry')] = bounding_box
    csv.iloc[int(idx), csv.columns.get_loc('filename')] = filename
    return csv

def tfrecord_to_tif(data, filename, mins, maxs):
    arrays = []
    for i in range in len(BANDS):
        new_arr = data[BANDS[i]][0].numpy().reshape((255,255))
        arrays.append(new_arr)
        mins[i] = min(mins[i], new_arr.min())
        maxs[i] = max(maxs[i], new_arr.max())

    arr = np.swapaxes(np.array(arrays), 0,2 )
    tif_path = TIF_DIR + filename
    transform = rasterio.Affine(1, 0, 0, 0, 1, 0)
    tif = rasterio.open(tif_path, 'w', driver='GTiff',
                            height = arr.shape[0], width = arr.shape[1],
                            count=7, dtype=str(arr.dtype),
                            crs='epsg:3857',
                            transform=transform)
    for i in range(len(BANDS)):
        tif.write(arr[:,:,i],i+1)
    tif.close()

    return mins, maxs

In [8]:
mins=[1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3]
maxs=[-1e3, -1e3, -1e3, -1e3, -1e3, -1e3, -1e3, -1e3]

for year in records:
    for country in records[year]:
        if records[year][country]==[]:
            continue
        tfrecord_path = records[year][country][0]
        dataset = TFRecordDataset(tfrecord_path, index_path=None, description=DESCRIPTOR)
        loader = torch.utils.data.DataLoader(dataset, batch_size=1)
        iterator = iter(loader)
        while (data := next(iterator, None)) is not None:
            idx = tensor_to_string(data, "system:index")
            filename = tensor_to_string(data, "filename")
            bounding_box = tensor_to_string(data, "bounding_box")
            csv = update_csv(csv, idx, bounding_box, filename)
            mins, maxs = tfrecord_to_tif(data, filename)

In [9]:
print(mins, maxs)

[1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0] [-1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0]
