In [1]:
import os 
import glob
import gzip
import shutil
import torch
import rasterio
import pandas as pd 
import numpy as np
from tfrecord.torch.dataset import TFRecordDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
CSV              = os.path.join( "..", "data", "dataset.csv" )
RECORDS_DIR      = os.path.join( "..", "data", "landsat_7less","" )
TIF_DIR          = os.path.join( "..", "data", "landsat_7less","" )
BANDS            = ['BLUE','GREEN','RED','NIR','SWIR1','SWIR2','TEMP1','NIGHTLIGHTS']
DESCRIPTOR       = {
                'cluster':"float",
                'lat':"float", 
                "lon":"float",
                'wealthpooled':"float",
                'BLUE':"float",
                'GREEN':"float",
                'RED':"float",
                'NIR':"float",
                'SWIR1':"float",
                'SWIR2':"float",
                'TEMP1':"float",
                'NIGHTLIGHTS':"float"
              }   

In [4]:
csv=pd.read_csv(CSV)

In [7]:
records = dict()
for year in csv.year.unique():
    records[year]=dict()
    sub_year = csv[ csv.year == year ]
    for country in sub_year.country.unique():
        sub_country = sub_year[ sub_year.country == country ].copy()
        pattern = RECORDS_DIR+"*"+str(country)+"_"+str(year)+"/*.tfrecord*"
        records[year][country] = glob.glob(pattern)
records[2015]['angola'][:5]

['../data/landsat_7less/angola_2015/604.tfrecord.gz',
 '../data/landsat_7less/angola_2015/605.tfrecord.gz',
 '../data/landsat_7less/angola_2015/606.tfrecord.gz',
 '../data/landsat_7less/angola_2015/607.tfrecord.gz',
 '../data/landsat_7less/angola_2015/608.tfrecord.gz']

In [8]:
def decompress_tfrecord(tfrecord_archive):
    with gzip.open(tfrecord_archive, 'rb') as f_in:
        # WITHOUT .GZ
        with open(tfrecord_archive[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    return tfrecord_archive[:-3]

def tensor_to_string(data, variable):
    filename = (data[variable].numpy())[0][0]
    return str(filename).replace(".","")

def tfrecord_to_tif(data, filename, mins, maxs):
    arrays = []
    for i in range(len(BANDS)):
        new_arr = data[BANDS[i]][0].numpy().reshape((255,255))
        arrays.append(new_arr)
        mins[i] = min(mins[i], new_arr.min())
        maxs[i] = max(maxs[i], new_arr.max())

    arr = np.swapaxes(np.array(arrays), 0, 2 )
    tif_path = TIF_DIR + filename
    # transform = rasterio.Affine(1, 0, 0, 0, 1, 0)
    tif = rasterio.open(tif_path, 'w', driver='GTiff',
                            height = arr.shape[0], width = arr.shape[1],
                            count=8, dtype=str(arr.dtype),
                            crs='epsg:3857',
                            transform=None)
    for i in range(len(BANDS)):
        tif.write(arr[:,:,i],i+1)
    tif.close()

    return mins, maxs

In [10]:
mins=[1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3, 1e3]
maxs=[-1e3, -1e3, -1e3, -1e3, -1e3, -1e3, -1e3, -1e3]
given_names = set()
for year in records:
    print(year)
    for country in records[year]:
        if records[year][country]==[]:
            continue
        for tfrecord_archive in records[year][country]:
            tfrecord = decompress_tfrecord(tfrecord_archive=tfrecord_archive)
            dataset = TFRecordDataset(tfrecord, index_path=None, description=DESCRIPTOR)
            loader = torch.utils.data.DataLoader(dataset, batch_size=1)
            iterator = iter(loader)
            while (data := next(iterator, None)) is not None:
                filename = str(country)+"_"+str(year)+"/"+tensor_to_string(data, "cluster")[:-1]+".tif"
                mins, maxs = tfrecord_to_tif(data, filename, mins, maxs)

2011
2015


  dataset = writer(


2012
2017
1999
2010
2014
2004
2018
1998
2013
2005
2016
2019
2003
2009
1997
1996
2006


In [11]:
print(mins, maxs)

[-0.0788, -0.0133, -0.0428, -0.0308, 0.0, 0.0, 0.0, -0.07087274] [0.65745, 0.7449, 0.80175, 0.90695, 1.4221, 1.57635, 310.4, 461.17834]


In [12]:
# CHECK INTEGRITY
records = dict()
for year in csv.year.unique():
    records[year]=dict()
    sub_year = csv[ csv.year == year ]
    for country in sub_year.country.unique():
        sub_country = sub_year[ sub_year.country == country ].copy()
        pattern = RECORDS_DIR+"*"+str(country)+"_"+str(year)+"/*.tif"
        records[year][country] = glob.glob(pattern)
for year in records:
    print(year)
    for country in records[year]:
        if records[year][country]==[]:
            continue
        for tif in records[year][country]:
            tile = rasterio.open(tif) 
            tile_= tile.read()

2011
2015


  dataset = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


2012
2017
1999
2010
2014
2004
2018
1998
2013
2005
2016
2019
2003
2009
1997
1996
2006


In [9]:
def map_filename( row ):
    row.filename = row.country+"_"+str(row.year)+"/"+str(row.cluster)+".tif"
    return row

csv = csv.apply(map_filename, axis=1)
csv.to_csv(CSV,index=False)

In [10]:
csv.head()

Unnamed: 0,country,year,cluster,lat,lon,households,wealthpooled
0,angola,2011,1,-12.350257,13.534922,36,2.312757
1,angola,2011,2,-12.360865,13.551494,32,2.010293
2,angola,2011,3,-12.613421,13.413085,36,0.877744
3,angola,2011,4,-12.581454,13.397711,35,1.066994
4,angola,2011,5,-12.578135,13.418748,37,1.750153


In [11]:
print(len(csv))
print(len(csv.filename.unique()))

32466


AttributeError: 'DataFrame' object has no attribute 'filename'