In [1]:
import os
import json
import requests
import time
import pandas as pd
import datetime
from matplotlib import pyplot as plt
import numpy as np
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from os import listdir, path
from os.path import isfile, join
import h5py

In [2]:
coords = []
with open("../data/countries.geojson", "r") as countries_geojson:
    country_dict = json.load(countries_geojson)["features"]
for obj in country_dict:
    name = obj['properties']['ADMIN']
    if name == "Bangladesh":
        coords = obj['geometry']["coordinates"]
flat_coords = []
for sublist in coords:
    for coord in sublist:
        for c in coord:
            flat_coords.append(c)
            
flat_coords += [flat_coords[0]]
bangladesh_geo = Polygon(flat_coords)
# print(bangladesh_geo)

In [14]:
# directory containing all hdf files to process
IN_DIR_PATH = '/atlas/u/jihyeonlee/handlabeling/delta+1/jihyeon'
OUT_DIR_PATH = IN_DIR_PATH + 'filtered_out/'
CACHE_DIR_PATH = '/atlas/u/jihyeonlee/handlabeling/cache/'

if not os.path.exists(OUT_DIR_PATH):
    os.makedirs(OUT_DIR_PATH)
    
if not os.path.exists(CACHE_DIR_PATH):
    os.makedirs(CACHE_DIR_PATH)

In [12]:
# delete any existing examples{N}_new.hdf5 and collect paths to all examples_{N}.hdf5

filepaths = []
for dir_path, subdirs, files in os.walk(IN_DIR_PATH):
#     print(dir_path, subdirs, files)
    if len(files) != 0:
        for file in files:
            full_path = dir_path + file
            if file[-9:] == "_new.hdf5":
                os.remove(full_path)
            elif file[:9] == 'examples_':
                filepaths += [full_path]
        
filepaths.sort() 
filepaths

['/atlas/u/jihyeonlee/handlabeling/positives/examples_0.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_1.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_2.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_3.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_4.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_5.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_6.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_7.hdf5',
 '/atlas/u/jihyeonlee/handlabeling/positives/examples_9.hdf5']

In [13]:
for file_index, filepath in enumerate(filepaths):
    print("Starting file #{}: {}".format(file_index + 1, filepath))
    new_filepath = filepath[:-5] + "_new.hdf5"
    
    if os.path.exists(new_filepath):
        print("Skip, already done!")
    else:
        with h5py.File(filepath, 'r') as file:
            bad_idxs = []
            all_tile_bounds = file['bounds']
            for tile_idx in range(all_tile_bounds.shape[0]):
                bounds = all_tile_bounds[tile_idx]
                corners = [[bounds[0], bounds[1]], [bounds[0], bounds[3]], [bounds[2], bounds[3]], [bounds[2], bounds[1]]]
                tile_in_country = False
                for point in corners:
                    if bangladesh_geo.contains(Point(point)):
                        tile_in_country = True
                if not tile_in_country:
                    bad_idxs += [tile_idx]
            print("# incorrect tiles found: {}".format(len(bad_idxs)))
            
            # write filtered out tiles to their own file
            with h5py.File(OUT_DIR_PATH + 'india_tiles' + str(file_index) + '.hdf5', 'w') as out_file:
                for key in file.keys():
                    out_file.create_dataset(key, data=[file[key][idx] for idx in bad_idxs])
            
            # write saved tiles to their own file
            new_file = h5py.File(new_filepath, 'w')
            for key in file.keys():
                dataset = np.delete(np.array(file[key]), bad_idxs, 0)
                new_file.create_dataset(key, data=dataset)
            print("# examples in new file:", np.array(new_file["bounds"]).shape[0])            
            new_file.close()

Starting file #1: /atlas/u/jihyeonlee/handlabeling/positives/examples_0.hdf5
# incorrect tiles found: 189
# examples in new file: 810
Starting file #2: /atlas/u/jihyeonlee/handlabeling/positives/examples_1.hdf5
# incorrect tiles found: 281
# examples in new file: 718
Starting file #3: /atlas/u/jihyeonlee/handlabeling/positives/examples_2.hdf5
# incorrect tiles found: 256
# examples in new file: 743
Starting file #4: /atlas/u/jihyeonlee/handlabeling/positives/examples_3.hdf5
# incorrect tiles found: 208
# examples in new file: 791
Starting file #5: /atlas/u/jihyeonlee/handlabeling/positives/examples_4.hdf5
# incorrect tiles found: 279
# examples in new file: 720
Starting file #6: /atlas/u/jihyeonlee/handlabeling/positives/examples_5.hdf5
# incorrect tiles found: 172
# examples in new file: 827
Starting file #7: /atlas/u/jihyeonlee/handlabeling/positives/examples_6.hdf5
# incorrect tiles found: 171
# examples in new file: 828
Starting file #8: /atlas/u/jihyeonlee/handlabeling/positives/e

In [17]:
for filepath in filepaths:
    cache_filename = "-".join(filepath[:-5].split("/")[5:]) + ".hdf5"
    # move the old files into cache & rename
    os.rename(filepath, CACHE_DIR_PATH + cache_filename)
    print(cache_filename)

positives-examples_0.hdf5
positives-examples_1.hdf5
positives-examples_2.hdf5
positives-examples_3.hdf5
positives-examples_4.hdf5
positives-examples_5.hdf5
positives-examples_6.hdf5
positives-examples_7.hdf5
positives-examples_9.hdf5


In [80]:
# if need to revert last step
# for filepath in filepaths:
#     cache_filename = "-".join(filepath[:-5].split("/")[5:]) + ".hdf5"
#     # move the old files into cache & rename
#     os.rename(CACHE_DIR_PATH + '/' + cache_filename, filepath)

In [19]:
# test to make sure everything worked properly
with h5py.File('/atlas/u/jihyeonlee/handlabeling/cache/positives-examples_0.hdf5', 'r') as f:
    print(f.keys())
    print(f['labels'][:20])
    
with h5py.File('/atlas/u/jihyeonlee/handlabeling/positives/examples_0_new.hdf5', 'r') as f:
    print(f.keys())
    print(f['labels'][:20])

<KeysViewHDF5 ['bounds', 'images', 'indices', 'labels', 'pred_labels']>
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
<KeysViewHDF5 ['bounds', 'images', 'indices', 'labels', 'pred_labels']>
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
