# Big Earth Net Preprocessing
## Irrigation Capstone Fall 2020
### TP Goter

This notebook preprocesses the GeoTIFF files that contain the Sentinel-2 MSI data comprising the BigEarthNet dataset into TFRecord files. It is based on the preprocessing scripts from the BigEarthNet repo, but has been updated to work in Colaboratory with Python 3.7+ and TensorFlow 2.3.

This version of the preprocessor specifically isolates the irrigated and non-irrigated examples.

In [1]:
import pandas as pd
import tensorflow as tf
from glob import glob
import os
#from matplotlib import pyplot as plt
#%matplotlib inline
import numpy as np
from tqdm import tqdm
#from google.colab import drive
#import seaborn as sns
#from matplotlib.cm import get_cmap
#import folium
#import gdal
import rasterio
import csv
import json

In [2]:
# Record the pandas and TensorFlow versions this notebook was executed with.
print(pd.__version__, tf.__version__, sep='\n')


1.1.2
2.3.1


## Mount Google Drive and Set Paths

In [3]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [4]:
#base_path = '/content/gdrive/My Drive/Capstone Project'
# Folder holding the BigEarthNet-v1.0 patch sub-folders, given relative to the
# notebook's working directory. Used below as `root_folder`.
big_earth_path ='./BigEarthNet-v1.0/'

## Create Symbolic Link(s)
Set up a symbolic link to allow for easy Python module imports. Then check that the link works (it is a Unix link, so check it from the shell).

In [5]:
!ln -s './bigearthnet-models/' bemodels

ln: bemodels/bigearthnet-models: File exists


In [6]:
# List the contents behind the `bemodels` link to confirm it resolves to the repo folder.
!ls bemodels

README.md           bigearthnet-models  prep_splits.py      tensorflow_utils.py
[34m__pycache__[m[m         label_indices.json  [34msplits[m[m


In [7]:
from bemodels import tensorflow_utils

## Process All of the BigEarthNet data
This simple script will loop over all of the subfolders in the BigEarthNet-v1.0 folder. Currently this folder does not contain the entirety of the BigEarthNet Dataset. Due to this issue, the original scripting was modified to run through the train, test, val sets and only process files if they exist. The previous script simply aborted if a file was listed in the train.csv file and was not in the directory.

### Note: This processing takes a really long time. 
We need to determine if there is a better way to get this data ready for ingestion into our models.

In [8]:
# Load label_indices.json; it is handed unchanged to
# tensorflow_utils.prep_tf_record_files later in the notebook.
with open('./bigearthnet-models/label_indices.json', 'rb') as f:
    label_indices = json.load(f)

root_folder = big_earth_path   # folder containing one sub-folder per patch
out_folder = './tfrecords'     # passed to prep_tf_record_files as the output location

# A literal path, so glob returns either [path] or [] when train.csv is absent.
splits = glob('./bigearthnet-models/splits/train.csv')
if not splits:
    raise FileNotFoundError(
        'ERROR: split file ./bigearthnet-models/splits/train.csv does not exist')

# Kept from the original script; nothing visible in this notebook populates it.
folder_path_list = []

# Fail fast: if the data folder is missing, every patch in the long loops below
# would merely be counted as "missing" and the run would silently produce nothing.
if not os.path.exists(root_folder):
    raise FileNotFoundError(f'ERROR: folder {root_folder} does not exist')




In [9]:
# Build one list of patch names per split CSV; split_names[i] labels patch_names_list[i].
patch_names_list = []
split_names = []
for csv_file in splits:
    # The split name is the file name up to its first dot ('train' for 'train.csv').
    split_names.append(os.path.basename(csv_file).split('.')[0])
    # newline='' is the mode the csv module documents for files given to csv.reader.
    with open(csv_file, 'r', newline='') as fp:
        # Only the first column is used. Blank lines parse as [] and are skipped
        # instead of raising IndexError on row[0].
        patch_names_list.append(
            [row[0].strip() for row in csv.reader(fp, delimiter=',') if row]
        )

# tensorflow_utils.prep_tf_record_files(
#     root_folder, out_folder, 
#     split_names, patch_names_list, 
#     label_indices)

In [10]:
# Number of patch names read from the first split file.
len(patch_names_list[0])

269695

In [None]:
# Sort every listed patch into irrigated / non-irrigated based on its label metadata.
irrigated_examples = []
nonirrigated_examples = []
missing_count = 0  # patches whose metadata JSON could not be opened or parsed
for patch_name in tqdm(patch_names_list[0]):
    patch_folder_path = os.path.join(root_folder, patch_name)
    patch_json_path = os.path.join(
        patch_folder_path, patch_name + '_labels_metadata.json')
    try:
        with open(patch_json_path, 'rb') as f:
            patch_json = json.load(f)
    except (OSError, ValueError):
        # OSError: the file or folder is absent or unreadable.
        # ValueError: the JSON is malformed (json.JSONDecodeError subclasses it).
        # A bare `except:` would also swallow KeyboardInterrupt, so this long
        # loop could not be stopped from the keyboard.
        missing_count += 1
        continue

    # Store the patch folder path (not just the name) under the matching class.
    if 'Permanently irrigated land' in patch_json['labels']:
        irrigated_examples.append(patch_folder_path)
    else:
        nonirrigated_examples.append(patch_folder_path)


  1%|          | 2515/269695 [00:26<45:44, 97.34it/s]  

## Check for Vineyards

In [11]:
# Sort every listed patch into vineyard / non-vineyard based on its label metadata.
vy_examples = []
nonvy_examples = []
missing_count = 0  # patches whose metadata JSON could not be opened or parsed
for patch_name in tqdm(patch_names_list[0]):
    patch_folder_path = os.path.join(root_folder, patch_name)
    patch_json_path = os.path.join(
        patch_folder_path, patch_name + '_labels_metadata.json')
    try:
        with open(patch_json_path, 'rb') as f:
            patch_json = json.load(f)
    except (OSError, ValueError):
        # OSError: the file or folder is absent or unreadable.
        # ValueError: the JSON is malformed (json.JSONDecodeError subclasses it).
        # A bare `except:` would also swallow KeyboardInterrupt, so this long
        # loop could not be stopped from the keyboard.
        missing_count += 1
        continue

    # Store the patch folder path (not just the name) under the matching class.
    if 'Vineyards' in patch_json['labels']:
        vy_examples.append(patch_folder_path)
    else:
        nonvy_examples.append(patch_folder_path)

100%|██████████| 269695/269695 [1:04:08<00:00, 70.08it/s] 


In [12]:
# Number of patches whose labels include 'Vineyards'.
len(vy_examples)

3488

In [13]:
# Number of patches with readable labels that do not include 'Vineyards'.
len(nonvy_examples)

190178

In [17]:
# Load the previously saved positive / negative irrigation training splits.
# NOTE(review): these CSVs are not written by any cell visible in this notebook -
# confirm where they are produced.
positive_irr_csv = './bigearthnet-models/splits/positive_train.csv'
negative_irr_csv = './bigearthnet-models/splits/negative_train.csv'
pos_irr_df = pd.read_csv(filepath_or_buffer=positive_irr_csv)
neg_irr_df = pd.read_csv(filepath_or_buffer=negative_irr_csv)

In [14]:
# Wrap the vineyard / non-vineyard patch folder paths in single-column frames
# and persist each one as a split CSV (the default index column is written too).
pos_df = pd.DataFrame(data=vy_examples, columns=['file'])
neg_df = pd.DataFrame(data=nonvy_examples, columns=['file'])
for _frame, _csv_path in (
    (pos_df, './bigearthnet-models/splits/positive_vy_train.csv'),
    (neg_df, './bigearthnet-models/splits/negative_vy_train.csv'),
):
    _frame.to_csv(_csv_path)

# Create datasets for fine-tuning. Make the total dataset size divisible by 32 or 64 for easy batching.

In [96]:
# Number of rows in the positive (irrigated) training split.
len(pos_irr_df)

4971

In [52]:
# Draw fixed-size positive subsets (32 / 128 / 512 rows) for the fine-tuning splits.
# Requesting `n` rows directly keeps the sizes batch-friendly whatever the length of
# the positive CSV; the previous hand-tuned fractions only yielded these sizes for a
# 4971-row frame. The variable names keep their historical "*_percent" suffix.
_irr_pos_rng = np.random.RandomState(2020)  # seeded so a re-run draws the same subsets
pos_df_1_percent = pos_irr_df.sample(n=32, random_state=_irr_pos_rng)
pos_df_3_percent = pos_irr_df.sample(n=128, random_state=_irr_pos_rng)
pos_df_10_percent = pos_irr_df.sample(n=512, random_state=_irr_pos_rng)

In [54]:
# Show the size of each positive subset, one per line.
print(
    len(pos_df_1_percent),
    len(pos_df_3_percent),
    len(pos_df_10_percent),
    sep='\n',
)

32
128
512


In [56]:
# Fraction of the negative split that matches each positive subset in row count.
neg_irr_total = len(neg_irr_df)
sample_frac_1p, sample_frac_3p, sample_frac_10p = (
    len(pos_subset) / neg_irr_total
    for pos_subset in (pos_df_1_percent, pos_df_3_percent, pos_df_10_percent)
)

In [58]:
# Draw as many negatives as there are positives in each subset so the classes balance.
# The row count is requested directly with `n=` instead of going through a float
# fraction, and the draw is seeded so a re-run reproduces the same negatives.
_irr_neg_rng = np.random.RandomState(2021)
subset_neg_df_1p = neg_irr_df.sample(n=len(pos_df_1_percent), random_state=_irr_neg_rng)
subset_neg_df_3p = neg_irr_df.sample(n=len(pos_df_3_percent), random_state=_irr_neg_rng)
subset_neg_df_10p = neg_irr_df.sample(n=len(pos_df_10_percent), random_state=_irr_neg_rng)

In [60]:
# Show the size of each negative subset, one per line.
print(
    len(subset_neg_df_1p),
    len(subset_neg_df_3p),
    len(subset_neg_df_10p),
    sep='\n',
)

32
128
512


In [76]:
# Draw fixed-size positive vineyard subsets (32 / 128 rows).
# Requesting `n` rows directly replaces fractions that were hand-tuned for a
# 3488-row frame; the names keep their historical "*_percent" suffix.
_vy_pos_rng = np.random.RandomState(2020)  # seeded so a re-run draws the same subsets
pos_vy_df_1_percent = pos_df.sample(n=32, random_state=_vy_pos_rng)
pos_vy_df_3_percent = pos_df.sample(n=128, random_state=_vy_pos_rng)

In [77]:
# Show the size of each positive vineyard subset, one per line.
print(len(pos_vy_df_1_percent), len(pos_vy_df_3_percent), sep='\n')

32
128


In [79]:
# Fraction of the non-vineyard frame that matches each positive subset in row count.
neg_vy_total = len(neg_df)
sample_frac_vy_1p, sample_frac_vy_3p = (
    len(pos_subset) / neg_vy_total
    for pos_subset in (pos_vy_df_1_percent, pos_vy_df_3_percent)
)

In [81]:
# Draw as many non-vineyard rows as there are positives in each subset so the classes
# balance. The row count is requested directly with `n=` instead of going through a
# float fraction, and the draw is seeded so a re-run reproduces the same negatives.
_vy_neg_rng = np.random.RandomState(2021)
subset_neg_vy_df_1p = neg_df.sample(n=len(pos_vy_df_1_percent), random_state=_vy_neg_rng)
subset_neg_vy_df_3p = neg_df.sample(n=len(pos_vy_df_3_percent), random_state=_vy_neg_rng)

In [82]:
# Show the size of each negative vineyard subset, one per line.
print(len(subset_neg_vy_df_1p), len(subset_neg_vy_df_3p), sep='\n')

32
128


In [27]:
# start_index = 0
# stop_index = 0
# # for i in range(5):
# #     print(f'Start Index: {start_index}')
# #     stop_index = len(subset_neg_df)*(i+1)//5
# #     print(f'Stop Index: {stop_index}')
# #     balanced_df = pd.concat([pos_df, subset_neg_df[start_index:stop_index]])
# #     start_index = stop_index
# #     # Shuffle the examples
# #     balanced_df = balanced_df.sample(frac=1)
# #     balanced_df.to_csv(f'./bigearthnet-models/splits/balanced_val{i}.csv')

Start Index: 0
Stop Index: 4971
Start Index: 4971
Stop Index: 9942
Start Index: 9942
Stop Index: 14913
Start Index: 14913
Stop Index: 19884
Start Index: 19884
Stop Index: 24855


In [94]:
# Stack the positive and negative vineyard subsets into one training split.
balanced_df = pd.concat([pos_vy_df_3_percent, subset_neg_vy_df_3p])
# Shuffle so the two classes are interleaved; seeded so the written CSV can be regenerated.
balanced_df = balanced_df.sample(frac=1, random_state=2020)
# Plain string: the path has no placeholders, so no f-string is needed.
balanced_df.to_csv('./bigearthnet-models/splits/final_balanced_train_vy_3percent.csv')

NameError: name 'pos_vy_df_10_percent' is not defined

In [93]:
# Convert the balanced vineyard split into TFRecord files.
# The pattern targets the 3-percent CSV, which is the only balanced vineyard split this
# notebook writes; the earlier "10percent" pattern matched nothing produced here.
splits = glob('./bigearthnet-models/splits/final_balanced_train_vy_3percent.csv')
if not splits:
    # Without this guard an empty glob would pass empty lists on and silently
    # produce no records.
    raise FileNotFoundError(
        'No split CSV found: ./bigearthnet-models/splits/final_balanced_train_vy_3percent.csv')

patch_names_list = []
split_names = []
for csv_file in splits:
    # The split name is the file name up to its first dot.
    split_names.append(os.path.basename(csv_file).split('.')[0])
    csv_df = pd.read_csv(csv_file)
    # The 'file' column holds patch folder paths; keep only the last path component.
    patch_names_list.append([name.split('/')[-1] for name in csv_df['file']])

# NOTE(review): prep_tf_record_files comes from the bigearthnet-models repo; presumably it
# writes one TFRecord file per split into out_folder - confirm against tensorflow_utils.py.
tensorflow_utils.prep_tf_record_files(
    root_folder, out_folder,
    split_names, patch_names_list,
    label_indices)

0it [00:00, ?it/s]

INFO: creating the split of final_balanced_train_vy_3percent is started
  1/256 [..............................] - ETA: 36s

2it [00:00, 13.99it/s]

  3/256 [..............................] - ETA: 22s

4it [00:00, 14.65it/s]

  5/256 [..............................] - ETA: 20s

6it [00:00, 14.48it/s]

  7/256 [..............................] - ETA: 19s

8it [00:00, 14.74it/s]

  9/256 [>.............................] - ETA: 18s

10it [00:00, 14.71it/s]

 11/256 [>.............................] - ETA: 19s

12it [00:00, 12.90it/s]

 13/256 [>.............................] - ETA: 18s

14it [00:01, 13.50it/s]

 15/256 [>.............................] - ETA: 18s

16it [00:01, 14.18it/s]

 17/256 [>.............................] - ETA: 17s

18it [00:01, 14.48it/s]

 19/256 [=>............................] - ETA: 17s

20it [00:01, 14.90it/s]

 21/256 [=>............................] - ETA: 16s

22it [00:01, 15.14it/s]

 23/256 [=>............................] - ETA: 16s

24it [00:01, 14.36it/s]

 25/256 [=>............................] - ETA: 16s

26it [00:01, 13.70it/s]

 27/256 [==>...........................] - ETA: 16s

28it [00:01, 13.83it/s]

 29/256 [==>...........................] - ETA: 16s

30it [00:02, 14.21it/s]

 31/256 [==>...........................] - ETA: 16s

32it [00:02, 14.47it/s]

 33/256 [==>...........................] - ETA: 16s

34it [00:02, 14.58it/s]

 35/256 [===>..........................] - ETA: 15s

36it [00:02, 14.43it/s]

 37/256 [===>..........................] - ETA: 15s

38it [00:02, 14.84it/s]

 39/256 [===>..........................] - ETA: 15s

40it [00:02, 14.61it/s]

 41/256 [===>..........................] - ETA: 15s

42it [00:02, 14.93it/s]

 43/256 [====>.........................] - ETA: 15s

44it [00:03, 14.89it/s]

 45/256 [====>.........................] - ETA: 14s

46it [00:03, 14.61it/s]

 47/256 [====>.........................] - ETA: 14s

48it [00:03, 14.86it/s]

 49/256 [====>.........................] - ETA: 14s

50it [00:03, 14.72it/s]

 51/256 [====>.........................] - ETA: 14s

52it [00:03, 15.22it/s]

 53/256 [=====>........................] - ETA: 14s

54it [00:03, 15.37it/s]

 55/256 [=====>........................] - ETA: 14s

56it [00:03, 13.44it/s]

 57/256 [=====>........................] - ETA: 14s

58it [00:04, 13.67it/s]

 59/256 [=====>........................] - ETA: 13s

60it [00:04, 14.23it/s]



62it [00:04, 15.00it/s]



64it [00:04, 15.21it/s]



66it [00:04, 14.46it/s]



68it [00:04, 15.10it/s]



70it [00:04, 15.43it/s]



72it [00:04, 15.74it/s]



74it [00:05, 15.73it/s]



76it [00:05, 15.62it/s]



78it [00:05, 15.50it/s]



80it [00:05, 15.10it/s]



82it [00:05, 15.05it/s]



84it [00:05, 15.60it/s]



86it [00:05, 15.80it/s]



88it [00:05, 15.76it/s]



90it [00:06, 15.36it/s]



92it [00:06, 15.04it/s]



94it [00:06, 14.22it/s]



96it [00:06, 14.74it/s]



98it [00:06, 14.96it/s]



100it [00:06, 15.16it/s]



102it [00:06, 12.88it/s]



104it [00:07, 13.53it/s]



106it [00:07, 13.90it/s]



108it [00:07, 14.61it/s]



110it [00:07, 15.11it/s]



112it [00:07, 14.56it/s]



114it [00:07, 14.58it/s]



116it [00:07, 15.09it/s]



118it [00:08, 15.39it/s]



120it [00:08, 14.79it/s]



122it [00:08, 14.95it/s]



124it [00:08, 15.29it/s]



126it [00:08, 16.45it/s]



128it [00:08, 16.50it/s]



130it [00:08, 16.82it/s]



132it [00:08, 16.56it/s]



134it [00:09, 16.19it/s]



136it [00:09, 15.96it/s]



138it [00:09, 15.99it/s]



140it [00:09, 15.26it/s]



142it [00:09, 15.66it/s]



144it [00:09, 14.39it/s]



146it [00:09, 14.43it/s]



148it [00:10, 12.90it/s]



150it [00:10, 13.69it/s]



152it [00:10, 14.41it/s]



154it [00:10, 15.02it/s]



156it [00:10, 14.85it/s]



158it [00:10, 15.15it/s]



160it [00:10, 15.54it/s]



162it [00:10, 15.81it/s]



164it [00:11, 15.28it/s]



166it [00:11, 14.46it/s]



168it [00:11, 13.22it/s]



170it [00:11, 13.33it/s]



172it [00:11, 13.66it/s]



174it [00:11, 12.99it/s]



176it [00:11, 13.55it/s]



178it [00:12, 13.93it/s]



180it [00:12, 13.85it/s]



182it [00:12, 13.72it/s]



184it [00:12, 11.81it/s]



186it [00:12, 12.67it/s]



188it [00:12, 13.34it/s]



190it [00:13, 14.08it/s]



192it [00:13, 13.03it/s]



194it [00:13, 13.83it/s]



196it [00:13, 13.02it/s]



198it [00:13, 13.78it/s]



200it [00:13, 14.28it/s]



202it [00:13, 14.68it/s]



204it [00:14, 14.02it/s]



206it [00:14, 14.40it/s]



208it [00:14, 14.08it/s]



210it [00:14, 13.62it/s]



212it [00:14, 13.68it/s]



214it [00:14, 13.42it/s]



216it [00:14, 13.48it/s]



218it [00:15, 14.36it/s]



220it [00:15, 14.55it/s]



222it [00:15, 14.58it/s]



224it [00:15, 12.75it/s]



226it [00:15, 13.28it/s]



228it [00:15, 13.99it/s]



230it [00:15, 14.03it/s]



232it [00:16, 14.24it/s]



234it [00:16, 12.31it/s]



236it [00:16, 13.21it/s]



238it [00:16, 13.68it/s]



240it [00:16,  8.73it/s]



242it [00:17, 10.04it/s]



244it [00:17, 11.03it/s]



246it [00:17, 11.94it/s]



248it [00:17, 12.91it/s]



250it [00:17, 12.46it/s]



252it [00:17, 13.25it/s]



254it [00:17, 12.01it/s]



256it [00:18, 14.14it/s]
