In [1]:
import os 
import glob
import gzip
import shutil
import torch
import rasterio
import pandas as pd 
import numpy as np
from tfrecord.torch.dataset import TFRecordDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Location of the dataset table.
CSV = os.path.join("..", "data", "dataset.csv")

# The trailing "" makes os.path.join end each folder with a path separator,
# so file names can be appended by plain string concatenation.
RECORDS_DIR = os.path.join("..", "data", "landsat_7", "")
TIF_DIR = os.path.join("..", "data", "landsat_7", "")

# Feature names read from each record; the order fixes the band order of the
# GeoTIFFs written by tfrecord_to_tif.
BANDS = ["BLUE", "GREEN", "RED", "NIR", "SWIR1", "SWIR2", "TEMP1", "NIGHTLIGHTS"]

# Feature name -> type string, passed as `description` to TFRecordDataset.
# Every feature (metadata and bands alike) is declared as "float".
DESCRIPTOR = dict.fromkeys(["cluster", "lat", "lon", "wealthpooled", *BANDS], "float")

In [5]:
# Load the dataset table; later cells read its `year`, `country` and `cluster` columns.
csv=pd.read_csv(CSV)

In [6]:
# Index the TFRecord files on disk by survey year and country:
# records[year][country] -> list of matching paths (empty when nothing matches).
records = dict()
for year in csv.year.unique():
    records[year] = dict()
    sub_year = csv[ csv.year == year ]
    for country in sub_year.country.unique():
        # Only the country name is needed here, so no per-country copy of the
        # table is made.
        # NOTE(review): the leading "*" also matches any folder whose name merely
        # ends with "<country>_<year>" — confirm this is intended.
        pattern = RECORDS_DIR+"*"+str(country)+"_"+str(year)+"/*.tfrecord"
        records[year][country] = glob.glob(pattern)
# Show a few paths for one year/country as a sanity check.
records[2011]['angola'][:5]

['../data/landsat_7/angola_2011/1.tfrecord',
 '../data/landsat_7/angola_2011/2.tfrecord',
 '../data/landsat_7/angola_2011/3.tfrecord',
 '../data/landsat_7/angola_2011/4.tfrecord',
 '../data/landsat_7/angola_2011/5.tfrecord']

In [7]:
def decompress_tfrecord(tfrecord_archive):
    """Decompress a gzip archive into a sibling file without the ".gz" suffix.

    Args:
        tfrecord_archive: path of the gzip archive (``str`` or path-like). The
            name must end in ".gz" (case-insensitive).

    Returns:
        str: path of the decompressed file, i.e. the archive path with its last
        three characters removed.

    Raises:
        ValueError: if the path does not end in ".gz". The check runs before
            any file is opened, so no truncated output file is left behind.
    """
    archive_path = os.fsdecode(tfrecord_archive)
    # Blindly dropping the last three characters would mangle any other name
    # (e.g. "x.bin" -> "x."), so refuse it explicitly.
    if not archive_path.lower().endswith(".gz"):
        raise ValueError(
            "expected a path ending in '.gz', got: " + repr(archive_path)
        )

    output_path = archive_path[:-3]
    # The archive is opened first, so a missing input fails before the output
    # file is created.
    with gzip.open(archive_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return output_path

def tensor_to_string(data, variable):
    """Return the first scalar of ``data[variable]`` as text with all dots removed.

    Args:
        data: mapping from feature name to an object exposing ``.numpy()``.
        variable: key of the feature to convert.

    Returns:
        str: ``str()`` of element ``[0][0]`` of the converted value, with every
        "." stripped (so a value printed as "12.0" becomes "120").
    """
    first_value = data[variable].numpy()[0][0]
    pieces = str(first_value).split(".")
    return "".join(pieces)

def tfrecord_to_tif(data, filename, mins, maxs):
    """Write the bands of one TFRecord example to a multi-band GeoTIFF.

    Every feature listed in ``BANDS`` is read from ``data``, reshaped to a
    255x255 tile and written as one band (in ``BANDS`` order, 1-based) of the
    file ``TIF_DIR + filename``. The running per-band extrema are updated on
    the way.

    Args:
        data: mapping from feature name to a batched tensor; only the first
            element of each batch is used.
        filename: output path relative to ``TIF_DIR``.
        mins: list holding one running minimum per band; updated in place.
        maxs: list holding one running maximum per band; updated in place.

    Returns:
        tuple: the same ``mins`` and ``maxs`` lists after the update.
    """
    tiles = []
    for i, band in enumerate(BANDS):
        tile = data[band][0].numpy().reshape((255, 255))
        tiles.append(tile)
        mins[i] = min(mins[i], tile.min())
        maxs[i] = max(maxs[i], tile.max())

    # Keep the (band, row, col) layout. The earlier np.swapaxes(..., 0, 2)
    # turned it into (col, row, band), so each band was written transposed.
    # NOTE(review): assumes each flattened band is stored row-major — confirm
    # against the exporter that produced the TFRecords.
    stacked = np.array(tiles)

    tif_path = TIF_DIR + filename
    # Create the per-country folder when TIF_DIR does not already contain it.
    out_dir = os.path.dirname(tif_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # transform = rasterio.Affine(1, 0, 0, 0, 1, 0)
    # The context manager closes the dataset even if a band write fails.
    with rasterio.open(tif_path, 'w', driver='GTiff',
                       height=stacked.shape[1], width=stacked.shape[2],
                       count=stacked.shape[0], dtype=str(stacked.dtype),
                       crs='epsg:3857',
                       transform=None) as tif:
        for i in range(stacked.shape[0]):
            tif.write(stacked[i], i + 1)

    return mins, maxs

In [9]:
# Convert every TFRecord example into a GeoTIFF and track per-band extrema.
# Start from +/-inf so the result is correct whatever the value range is
# (fixed sentinels such as 1e3 / -1e3 are wrong for data beyond them).
mins = [float("inf")] * len(BANDS)
maxs = [float("-inf")] * len(BANDS)
for year in records:
    print(year)
    for country in records[year]:
        # An empty file list simply yields no iterations.
        for tfrecord in records[year][country]:
            dataset = TFRecordDataset(tfrecord, index_path=None, description=DESCRIPTOR)
            loader = torch.utils.data.DataLoader(dataset, batch_size=1)
            for data in loader:
                # "cluster" is declared as a float feature; tensor_to_string removes
                # the "." and [:-1] drops the last remaining character
                # (presumably the trailing "0" of an integral value like "12.0").
                filename = str(country)+"_"+str(year)+"/"+tensor_to_string(data, "cluster")[:-1]+".tif"
                mins, maxs = tfrecord_to_tif(data, filename, mins, maxs)

2011
2015
2012
2017
1999
2010
2014
2004
2018
1998
2013
2005
2016
2019
2003
2009
1997
1996
2006


In [10]:
# Per-band minima and maxima accumulated during the conversion, in BANDS order.
print(mins, maxs)

[-0.0994, -0.0574, -0.0318, -0.0209, -0.0102, -0.0152, 0.0, -0.07087274] [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 316.7, 3104.1401]


In [None]:
# CHECK INTEGRITY
# Index the generated GeoTIFFs and read each one completely, so a file that
# cannot be opened or read raises here.
# NOTE: this rebinds `records`; from here on it lists .tif files rather than
# .tfrecord files.
records = dict()
for year in csv.year.unique():
    records[year]=dict()
    sub_year = csv[ csv.year == year ]
    for country in sub_year.country.unique():
        # The tifs are written under TIF_DIR, so look for them there.
        pattern = TIF_DIR+"*"+str(country)+"_"+str(year)+"/*.tif"
        records[year][country] = glob.glob(pattern)
for year in records:
    print(year)
    for country in records[year]:
        # An empty file list simply yields no iterations.
        for tif in records[year][country]:
            # Close each dataset right after reading; leaving them open leaks
            # one file handle per tile.
            with rasterio.open(tif) as tile:
                tile.read()
            

In [13]:
def map_filename( row ):
    """Set the row's ``filename`` entry to ``<country>_<year>/<cluster>.tif``.

    Args:
        row: pandas Series (one table row) with ``country``, ``year`` and
            ``cluster`` entries. It is modified in place.

    Returns:
        The same row, with ``filename`` set.
    """
    # Item assignment creates the entry when it is missing. Attribute
    # assignment (row.filename = ...) would only set a Python attribute in
    # that case and the value would be silently lost.
    row["filename"] = row["country"]+"_"+str(row["year"])+"/"+str(row["cluster"])+".tif"
    return row

# Recompute `filename` row by row, then overwrite the source CSV so the stored
# table carries the .tif paths (index=False keeps the index out of the file).
csv = csv.apply(map_filename, axis=1)
csv.to_csv(CSV,index=False)

In [14]:
# Preview the first rows to check the `filename` column.
csv.head()

Unnamed: 0,country,year,cluster,lat,lon,households,wealthpooled,geometry,filename,bounding_box
0,angola,2011,1,-12.350257,13.534922,36,2.312757,POINT (1506700.58557273 -1385596.0684884773),angola_2011/1.tif,POLYGON ((1503340.58557273 -1388956.0684884773...
1,angola,2011,2,-12.360865,13.551494,32,2.010293,POINT (1508545.372017885 -1386804.9130245172),angola_2011/2.tif,POLYGON ((1505185.372017885 -1390164.913024517...
2,angola,2011,3,-12.613421,13.413085,36,0.877744,POINT (1493137.790366379 -1415600.6075743325),angola_2011/3.tif,POLYGON ((1489777.790366379 -1418960.607574332...
3,angola,2011,4,-12.581454,13.397711,35,1.066994,POINT (1491426.3440705661 -1411954.2588894619),angola_2011/4.tif,POLYGON ((1488066.3440705661 -1415314.25888946...
4,angola,2011,5,-12.578135,13.418748,37,1.750153,POINT (1493768.1835246533 -1411575.617279712),angola_2011/5.tif,POLYGON ((1490408.1835246533 -1414935.61727971...


In [15]:
# Row count vs. number of distinct filenames: a smaller second number means
# several rows share the same .tif path.
print(len(csv))
print(len(csv.filename.unique()))

32466
32201
