# Big Earth Net Preprocessing
## Irrigation Capstone Fall 2020
### TP Goter

This notebook is used to preprocess the GeoTiff files that contain the Sentinel-2 MSI data comprising the BigEarthNet dataset into TFRecords files. It is based on the preprocessing scripts from the BigEarthNet repo, but has been updated to work in Colaboratory with Python3.7+ and TensorFlow 2.3.

This version of the preprocessor is for specifically isolating the irrigated and non-irrigated examples.

In [1]:
import pandas as pd
import tensorflow as tf
from glob import glob
import os
#from matplotlib import pyplot as plt
#%matplotlib inline
import numpy as np
from tqdm import tqdm
#from google.colab import drive
#import seaborn as sns
#from matplotlib.cm import get_cmap
#import folium
#import gdal
import rasterio
import csv
import json

In [2]:
print(pd.__version__)
print(tf.__version__)


1.1.2
2.3.1


## Mount Google Drive and Set Paths

In [3]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [4]:
#base_path = '/content/gdrive/My Drive/Capstone Project'
big_earth_path ='./BigEarthNet-v1.0/'

## Create Symbolic Link(s)
Set up a symbolic link to allow for easy Python module imports. Then check to make sure the link works (it is a Unix link so check from shell)

In [5]:
!ln -s './bigearthnet-models/' bemodels

ln: bemodels/bigearthnet-models: File exists


In [6]:
!ls bemodels

README.md           bigearthnet-models  prep_splits.py      tensorflow_utils.py
[34m__pycache__[m[m         label_indices.json  [34msplits[m[m


In [7]:
from bemodels import tensorflow_utils

## Process All of the BigEarthNet data
This simple script will loop over all of the subfolders in the BigEarthNet-v1.0 folder. Currently this folder does not contain the entirety of the BigEarthNet Dataset. Due to this issue, the original scripting was modified to run through the train, test, val sets and only process files if they exist. The previous script simply aborted if a file was listed in the train.csv file and was not in the directory.

### Note: This processing takes a really long time. 
We need to determine if there is a better way to get this data ready for ingestion into our models.

In [8]:
with open('./bigearthnet-models/label_indices.json', 'rb') as f:
    label_indices = json.load(f)

root_folder = big_earth_path
out_folder = './tfrecords'
splits = glob(f'./bigearthnet-models/splits/train.csv')

# Checks the existence of patch folders and populate the list of patch folder paths
folder_path_list = []
if not os.path.exists(root_folder):
    print('ERROR: folder', root_folder, 'does not exist')




In [9]:
patch_names_list = []
split_names = []
for csv_file in splits:
    patch_names_list.append([])
    split_names.append(os.path.basename(csv_file).split('.')[0])
    with open(csv_file, 'r') as fp:
        csv_reader = csv.reader(fp, delimiter=',')
        for row in csv_reader:
            patch_names_list[-1].append(row[0].strip())    

# tensorflow_utils.prep_tf_record_files(
#     root_folder, out_folder, 
#     split_names, patch_names_list, 
#     label_indices)

In [10]:
len(patch_names_list[0])

269695

In [11]:
irrigated_examples = []
nonirrigated_examples = []
missing_count = 0
for patch_name in tqdm(patch_names_list[0]):
    patch_folder_path = os.path.join(root_folder, patch_name)
    patch_json_path = os.path.join(
                    patch_folder_path, patch_name + '_labels_metadata.json')
    try:
        with open(patch_json_path, 'rb') as f:
                        patch_json = json.load(f)
    except:
#         print(f'Missing Labels for {patch_name}')
        missing_count += 1
        continue

    if 'Permanently irrigated land' in patch_json['labels']:
        irrigated_examples.append(patch_folder_path)
    else:
        nonirrigated_examples.append(patch_folder_path)


100%|██████████| 125866/125866 [20:21<00:00, 103.03it/s]


In [12]:
len(irrigated_examples)

2375

In [13]:
len(nonirrigated_examples)

87739

In [12]:
pos_df = pd.read_csv('./bigearthnet-models/splits/positive_test.csv')
neg_df = pd.read_csv('./bigearthnet-models/splits/negative_test.csv')

In [14]:
pos_df = pd.DataFrame(irrigated_examples,columns=['file'])
neg_df = pd.DataFrame(nonirrigated_examples,columns=['file'])
pos_df.to_csv('./bigearthnet-models/splits/positive_test.csv')
neg_df.to_csv('./bigearthnet-models/splits/negative_test.csv')

# Create Data sets for finetuning. Make total dataset size divisible by 32 or 64 for easy batching

In [30]:
pos_df_1_percent = pos_df.sample(frac=0.027)
pos_df_10_percent = pos_df.sample(frac=0.1346)

In [31]:
print(len(pos_df_1_percent))
print(len(pos_df_10_percent))

64
320


In [32]:
sample_frac_1p = len(pos_df_1_percent)/len(neg_df)
sample_frac_10p = len(pos_df_10_percent)/len(neg_df)

In [33]:
subset_neg_df_1p = neg_df.sample(frac=sample_frac_1p)
subset_neg_df_10p = neg_df.sample(frac=sample_frac_10p)

In [34]:
print(len(subset_neg_df_1p))
print(len(subset_neg_df_10p))

64
320


In [35]:
64*2
320*2

640

In [27]:
# start_index = 0
# stop_index = 0
# # for i in range(5):
# #     print(f'Start Index: {start_index}')
# #     stop_index = len(subset_neg_df)*(i+1)//5
# #     print(f'Stop Index: {stop_index}')
# #     balanced_df = pd.concat([pos_df, subset_neg_df[start_index:stop_index]])
# #     start_index = stop_index
# #     # Shuffle the examples
# #     balanced_df = balanced_df.sample(frac=1)
# #     balanced_df.to_csv(f'./bigearthnet-models/splits/balanced_val{i}.csv')

Start Index: 0
Stop Index: 4971
Start Index: 4971
Stop Index: 9942
Start Index: 9942
Stop Index: 14913
Start Index: 14913
Stop Index: 19884
Start Index: 19884
Stop Index: 24855


In [39]:
balanced_df = pd.concat([pos_df_10_percent, subset_neg_df_10p])
# Shuffle the examples
balanced_df = balanced_df.sample(frac=1)
balanced_df.to_csv(f'./bigearthnet-models/splits/balanced_train_13percent.csv')

In [40]:
splits = glob(f'./bigearthnet-models/splits/balanced_train_13percent.*')
patch_names_list = []
split_names = []
for csv_file in splits:
    patch_names_list.append([])
    split_names.append(os.path.basename(csv_file).split('.')[0])
    csv_df = pd.read_csv(csv_file)
    patch_names_list[-1] = list(csv_df.file)
    patch_names_list[-1] = [name.split('/')[-1] for name in patch_names_list[-1]]
    

tensorflow_utils.prep_tf_record_files(
    root_folder, out_folder, 
    split_names, patch_names_list, 
    label_indices)

0it [00:00, ?it/s]

INFO: creating the split of balanced_train_13percent is started
  1/640 [..............................] - ETA: 1:27

2it [00:00, 14.69it/s]

  3/640 [..............................] - ETA: 53s 

4it [00:00, 15.35it/s]

  5/640 [..............................] - ETA: 47s

6it [00:00, 15.81it/s]

  7/640 [..............................] - ETA: 44s

8it [00:00, 15.87it/s]

  9/640 [..............................] - ETA: 43s

10it [00:00, 16.11it/s]

 11/640 [..............................] - ETA: 41s

12it [00:00, 16.39it/s]

 13/640 [..............................] - ETA: 41s

14it [00:00, 15.98it/s]

 15/640 [..............................] - ETA: 41s

16it [00:00, 15.77it/s]

 17/640 [..............................] - ETA: 41s

18it [00:01, 15.60it/s]

 19/640 [..............................] - ETA: 40s

20it [00:01, 16.01it/s]

 21/640 [..............................] - ETA: 40s

22it [00:01, 16.00it/s]

 23/640 [>.............................] - ETA: 40s

24it [00:01, 15.96it/s]

 25/640 [>.............................] - ETA: 39s

26it [00:01, 16.10it/s]

 27/640 [>.............................] - ETA: 39s

28it [00:01, 15.96it/s]

 29/640 [>.............................] - ETA: 39s

30it [00:01, 15.21it/s]

 31/640 [>.............................] - ETA: 39s

32it [00:02, 15.89it/s]

 33/640 [>.............................] - ETA: 42s

34it [00:02, 11.15it/s]

 35/640 [>.............................] - ETA: 42s

36it [00:02, 11.77it/s]

 37/640 [>.............................] - ETA: 42s

38it [00:02, 12.73it/s]

 39/640 [>.............................] - ETA: 43s

40it [00:02, 10.69it/s]

 41/640 [>.............................] - ETA: 43s

42it [00:02, 11.93it/s]

 43/640 [=>............................] - ETA: 42s

44it [00:03, 13.15it/s]

 45/640 [=>............................] - ETA: 42s

46it [00:03, 13.75it/s]

 47/640 [=>............................] - ETA: 41s

48it [00:03, 14.52it/s]

 49/640 [=>............................] - ETA: 41s

50it [00:03, 15.27it/s]

 51/640 [=>............................] - ETA: 41s

52it [00:03, 15.49it/s]

 53/640 [=>............................] - ETA: 40s

54it [00:03, 15.37it/s]

 55/640 [=>............................] - ETA: 40s

56it [00:03, 15.15it/s]

 57/640 [=>............................] - ETA: 40s

58it [00:03, 15.66it/s]

 59/640 [=>............................] - ETA: 40s

60it [00:04, 16.09it/s]

 61/640 [=>............................] - ETA: 39s

62it [00:04, 15.69it/s]

 63/640 [=>............................] - ETA: 39s

64it [00:04, 16.09it/s]

 65/640 [==>...........................] - ETA: 39s

66it [00:04, 16.44it/s]

 67/640 [==>...........................] - ETA: 38s

68it [00:04, 16.65it/s]

 69/640 [==>...........................] - ETA: 38s

70it [00:04, 16.66it/s]

 70/640 [==>...........................] - ETA: 38s

72it [00:04, 17.28it/s]

 73/640 [==>...........................] - ETA: 38s

74it [00:04, 17.36it/s]

 75/640 [==>...........................] - ETA: 38s

76it [00:05, 14.64it/s]

 77/640 [==>...........................] - ETA: 38s

78it [00:05, 15.11it/s]

 79/640 [==>...........................] - ETA: 37s

80it [00:05, 15.22it/s]

 81/640 [==>...........................] - ETA: 37s

82it [00:05, 14.88it/s]

 83/640 [==>...........................] - ETA: 37s

84it [00:05, 15.07it/s]

 85/640 [==>...........................] - ETA: 37s

86it [00:05, 13.36it/s]

 87/640 [===>..........................] - ETA: 37s

88it [00:05, 14.20it/s]

 89/640 [===>..........................] - ETA: 37s

90it [00:06, 15.04it/s]

 91/640 [===>..........................] - ETA: 37s

92it [00:06, 15.49it/s]

 93/640 [===>..........................] - ETA: 36s

94it [00:06, 16.03it/s]

 95/640 [===>..........................] - ETA: 36s

96it [00:06, 15.33it/s]

 97/640 [===>..........................] - ETA: 36s

98it [00:06, 15.17it/s]

 99/640 [===>..........................] - ETA: 36s

100it [00:06, 15.99it/s]

101/640 [===>..........................] - ETA: 36s

102it [00:06, 16.05it/s]

103/640 [===>..........................] - ETA: 35s

104it [00:06, 16.08it/s]

105/640 [===>..........................] - ETA: 35s

106it [00:07, 16.41it/s]

107/640 [====>.........................] - ETA: 35s

108it [00:07, 16.50it/s]

109/640 [====>.........................] - ETA: 35s

110it [00:07, 15.00it/s]

111/640 [====>.........................] - ETA: 35s

112it [00:07, 15.27it/s]

113/640 [====>.........................] - ETA: 35s

114it [00:07, 15.70it/s]

115/640 [====>.........................] - ETA: 35s

116it [00:07, 13.45it/s]

117/640 [====>.........................] - ETA: 35s

118it [00:07, 14.19it/s]

119/640 [====>.........................] - ETA: 34s

120it [00:07, 15.11it/s]

121/640 [====>.........................] - ETA: 34s

122it [00:08, 15.37it/s]

123/640 [====>.........................] - ETA: 34s

124it [00:08, 15.81it/s]

125/640 [====>.........................] - ETA: 34s

126it [00:08, 15.99it/s]

127/640 [====>.........................] - ETA: 34s

128it [00:08, 15.89it/s]

129/640 [=====>........................] - ETA: 34s

130it [00:08, 13.91it/s]

131/640 [=====>........................] - ETA: 34s

132it [00:08, 12.37it/s]

133/640 [=====>........................] - ETA: 34s

134it [00:08, 13.32it/s]

135/640 [=====>........................] - ETA: 34s

136it [00:09, 14.30it/s]

137/640 [=====>........................] - ETA: 33s

138it [00:09, 14.63it/s]

139/640 [=====>........................] - ETA: 33s

140it [00:09, 15.07it/s]

141/640 [=====>........................] - ETA: 34s

142it [00:09, 10.42it/s]

143/640 [=====>........................] - ETA: 34s

144it [00:09, 11.89it/s]

145/640 [=====>........................] - ETA: 33s

146it [00:09, 13.02it/s]

147/640 [=====>........................] - ETA: 33s

148it [00:10, 13.20it/s]

149/640 [=====>........................] - ETA: 33s

150it [00:10, 13.43it/s]



152it [00:10, 14.33it/s]



154it [00:10, 15.19it/s]



156it [00:10, 15.63it/s]



158it [00:10, 15.58it/s]



160it [00:10, 16.07it/s]



162it [00:10, 16.16it/s]



164it [00:11, 15.93it/s]



166it [00:11, 14.76it/s]



168it [00:11, 14.32it/s]



170it [00:11, 13.83it/s]



172it [00:11, 13.26it/s]



174it [00:11, 13.94it/s]



176it [00:11, 12.56it/s]



178it [00:12, 13.25it/s]



180it [00:12, 14.19it/s]



182it [00:12, 14.63it/s]



184it [00:12, 15.49it/s]



186it [00:12, 16.17it/s]



188it [00:12, 16.71it/s]



190it [00:12, 15.97it/s]



193it [00:12, 16.94it/s]



195it [00:13, 16.02it/s]



197it [00:13, 15.54it/s]



199it [00:13, 15.96it/s]



201it [00:13, 16.39it/s]



203it [00:13, 16.29it/s]



205it [00:13, 16.71it/s]



207it [00:13, 17.38it/s]



209it [00:13, 17.27it/s]



211it [00:14, 17.11it/s]



213it [00:14, 17.17it/s]



215it [00:14, 16.36it/s]



217it [00:14, 16.21it/s]



219it [00:14, 16.53it/s]



221it [00:14, 16.35it/s]



223it [00:14, 16.42it/s]



225it [00:14, 14.44it/s]



227it [00:15, 15.56it/s]



229it [00:15, 15.89it/s]



231it [00:15, 15.34it/s]



233it [00:15, 15.71it/s]



235it [00:15, 16.25it/s]



237it [00:15, 16.27it/s]



239it [00:15, 16.69it/s]



241it [00:15, 15.96it/s]



243it [00:16, 16.57it/s]



245it [00:16, 16.42it/s]



247it [00:16, 16.79it/s]



249it [00:16, 16.55it/s]



251it [00:16, 16.39it/s]



253it [00:16, 16.43it/s]



255it [00:16, 16.83it/s]



257it [00:16, 16.52it/s]



259it [00:17, 16.84it/s]



261it [00:17, 16.89it/s]



263it [00:17, 16.68it/s]



265it [00:17, 16.77it/s]



267it [00:17, 16.46it/s]



269it [00:17, 16.19it/s]



271it [00:17, 16.88it/s]



273it [00:18,  7.89it/s]



275it [00:18,  7.95it/s]



277it [00:18,  9.65it/s]



279it [00:18, 11.25it/s]



281it [00:18, 12.21it/s]



283it [00:19, 13.60it/s]



285it [00:19, 14.60it/s]



287it [00:19, 15.13it/s]



289it [00:19, 15.60it/s]



291it [00:19, 15.81it/s]



293it [00:19, 16.56it/s]



295it [00:19, 16.20it/s]



297it [00:19, 14.52it/s]



299it [00:20, 15.10it/s]



301it [00:20, 15.47it/s]



303it [00:20, 15.23it/s]



305it [00:20, 15.67it/s]



307it [00:20, 15.21it/s]



309it [00:20, 15.67it/s]



311it [00:20, 15.57it/s]



313it [00:20, 15.99it/s]



315it [00:21, 15.99it/s]



317it [00:21, 12.22it/s]



319it [00:21, 13.50it/s]



321it [00:21, 14.41it/s]



323it [00:21, 15.24it/s]



325it [00:21, 16.03it/s]



327it [00:21, 16.43it/s]



329it [00:21, 16.42it/s]



331it [00:22, 16.56it/s]



333it [00:22, 16.34it/s]



335it [00:22, 16.87it/s]



337it [00:22, 17.05it/s]



339it [00:22, 16.39it/s]



342it [00:22, 17.26it/s]



344it [00:22, 15.30it/s]



346it [00:23, 15.27it/s]



348it [00:23, 15.51it/s]



350it [00:23, 16.54it/s]



352it [00:23, 16.74it/s]



354it [00:23, 16.79it/s]



356it [00:23, 16.59it/s]



358it [00:23, 16.85it/s]



360it [00:23, 17.01it/s]



362it [00:23, 17.01it/s]



364it [00:24, 16.57it/s]



366it [00:24, 14.32it/s]



368it [00:24, 11.07it/s]



370it [00:24, 12.44it/s]



372it [00:24, 13.58it/s]



374it [00:24, 14.43it/s]



376it [00:25, 14.88it/s]



378it [00:25, 15.26it/s]



380it [00:25, 15.89it/s]



382it [00:25, 16.33it/s]



384it [00:25, 12.04it/s]



386it [00:25, 13.37it/s]



388it [00:25, 13.99it/s]



390it [00:26, 14.74it/s]



392it [00:26, 15.44it/s]



394it [00:26, 15.84it/s]



396it [00:26, 16.07it/s]



398it [00:26, 16.58it/s]



400it [00:26, 16.75it/s]



402it [00:26, 15.36it/s]



404it [00:26, 15.97it/s]



406it [00:26, 15.73it/s]



408it [00:27, 15.01it/s]



410it [00:27, 13.23it/s]



412it [00:27, 14.09it/s]



414it [00:27, 15.21it/s]



416it [00:27, 15.45it/s]



418it [00:27, 16.27it/s]



420it [00:27, 16.13it/s]



422it [00:28, 16.42it/s]



424it [00:28, 16.47it/s]



426it [00:28, 14.86it/s]



428it [00:28, 15.48it/s]



430it [00:28, 13.38it/s]



432it [00:28, 14.17it/s]



434it [00:28, 15.24it/s]



436it [00:29, 14.23it/s]



438it [00:29, 14.29it/s]



440it [00:29, 14.91it/s]



442it [00:29, 14.87it/s]



444it [00:29, 14.75it/s]



446it [00:29, 15.65it/s]



448it [00:29, 16.09it/s]



450it [00:29, 15.22it/s]



452it [00:30, 15.82it/s]



454it [00:30, 16.29it/s]



456it [00:30, 14.06it/s]



458it [00:30, 14.83it/s]



460it [00:30, 15.66it/s]



462it [00:30, 16.38it/s]



464it [00:30, 15.09it/s]



466it [00:30, 16.18it/s]



468it [00:31, 16.12it/s]



470it [00:31, 15.69it/s]



472it [00:31, 14.21it/s]



474it [00:31, 14.34it/s]



476it [00:31, 14.79it/s]



478it [00:31, 15.51it/s]



480it [00:31, 15.97it/s]



482it [00:31, 16.09it/s]



484it [00:32, 16.56it/s]



486it [00:32, 15.33it/s]



488it [00:32, 15.53it/s]



490it [00:32, 15.42it/s]



492it [00:32, 15.93it/s]



494it [00:32, 12.89it/s]



496it [00:32, 13.75it/s]



498it [00:33, 14.66it/s]



500it [00:33, 14.05it/s]



502it [00:33, 13.12it/s]



504it [00:33, 14.05it/s]



506it [00:33, 15.04it/s]



508it [00:33, 15.46it/s]



510it [00:33, 16.53it/s]



512it [00:33, 17.36it/s]



514it [00:34, 11.65it/s]



516it [00:34, 12.75it/s]



518it [00:34, 14.07it/s]



520it [00:34, 14.53it/s]



522it [00:34, 14.80it/s]



524it [00:34, 15.85it/s]



526it [00:35, 15.67it/s]



528it [00:35, 15.41it/s]



530it [00:35, 14.76it/s]



532it [00:35, 15.13it/s]



534it [00:35, 15.89it/s]



536it [00:35, 15.29it/s]



538it [00:35, 15.76it/s]



540it [00:35, 16.00it/s]



542it [00:36, 16.01it/s]



544it [00:36, 16.74it/s]



546it [00:36, 16.70it/s]



548it [00:36, 16.49it/s]



550it [00:36, 14.68it/s]



552it [00:36, 13.68it/s]



554it [00:36, 13.74it/s]



556it [00:36, 14.28it/s]



558it [00:37, 15.16it/s]



560it [00:37, 15.80it/s]



562it [00:37, 16.24it/s]



564it [00:37, 16.52it/s]



566it [00:37, 16.64it/s]



568it [00:37, 16.93it/s]



570it [00:37, 16.87it/s]



572it [00:37, 16.74it/s]



574it [00:38, 17.16it/s]



576it [00:38, 16.99it/s]



578it [00:38, 17.04it/s]



580it [00:38, 17.26it/s]



582it [00:38, 17.20it/s]



584it [00:38, 16.73it/s]



586it [00:38, 15.55it/s]



588it [00:38, 15.79it/s]



590it [00:39, 16.26it/s]



592it [00:39, 16.43it/s]



594it [00:39, 15.44it/s]



596it [00:39,  9.66it/s]



598it [00:39, 11.25it/s]



600it [00:39, 12.36it/s]



602it [00:40, 13.51it/s]



604it [00:40, 13.65it/s]



606it [00:40, 14.43it/s]



608it [00:40, 14.45it/s]



610it [00:40, 15.25it/s]



612it [00:40, 15.95it/s]



614it [00:40, 16.57it/s]



616it [00:40, 16.92it/s]



618it [00:40, 16.66it/s]



620it [00:41, 15.99it/s]



622it [00:41, 16.94it/s]



624it [00:41, 17.11it/s]



626it [00:41, 16.89it/s]



628it [00:41, 16.85it/s]



630it [00:41, 17.32it/s]



632it [00:41, 17.22it/s]



634it [00:41, 17.96it/s]



636it [00:42, 17.55it/s]



638it [00:42, 17.71it/s]



640it [00:42, 15.15it/s]
