# Big Earth Net Preprocessing
## Irrigation Capstone Fall 2020
### TP Goter

This notebook is used to preprocess the GeoTiff files that contain the Sentinel-2 MSI data comprising the BigEarthNet dataset into dataframes. We originally were using tfrecords, but after creating balanced datasets, we have little enough data to make dataframe storage a reasonable solution. We will use the same standardization routine as used by the root Big Earth Net data, but we will package the standardized/scaled data into a single dataframe with binary labels. It is based on the preprocessing scripts from the BigEarthNet repo, but has been updated to work in Colaboratory with Python3.7+ and TensorFlow 2.3.

This version of the preprocessor is for specifically isolating the irrigated and non-irrigated examples.

In [1]:
import pandas as pd
import tensorflow as tf
from glob import glob
import os
#from matplotlib import pyplot as plt
#%matplotlib inline
import numpy as np
from tqdm import tqdm
#from google.colab import drive
#import seaborn as sns
#from matplotlib.cm import get_cmap
#import folium
#import gdal
import rasterio
import csv
import json
from PIL import Image
import cv2

In [2]:
# Record the pandas and TensorFlow versions (one per line) for reproducibility.
print(pd.__version__, tf.__version__, sep='\n')



1.1.2
2.3.1


## Mount Google Drive and Set Paths

In [3]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [4]:
#base_path = '/content/gdrive/My Drive/Capstone Project'
# Relative path to the BigEarthNet-v1.0 directory (resolved against the
# notebook's working directory). Not referenced by the other cells shown in
# this part of the notebook.
big_earth_path ='./BigEarthNet-v1.0/'

## Convert data to dataframes instead of TFRecords

We already have our splits in csv files in the bigearthnet-models/splits folders. So we just need to read in these files, and concatenate them into one list. We can then convert that to a labeled dataframe.

In [None]:
# Name of the split to convert: selects the CSV read here and is reused for
# the name of the pickle written at the end of the notebook.
FILE = 'balanced_train_4'

# Every entry of the CSV's 'file' column is extended with its own last path
# component, giving the common prefix "<entry>/<last component>" that the
# per-band file names are built from.
split_csv = f'./bigearthnet-models/splits/{FILE}.csv'
filenames_tif = [
    f'{patch_dir}/{patch_dir.rpartition("/")[2]}'
    for patch_dir in pd.read_csv(split_csv)['file']
]

# Preview the first few prefixes.
filenames_tif[:10]

In [None]:
# Sanity check: list the directory that holds the first patch's files
# (everything before the last '/' of its prefix).
os.listdir(filenames_tif[0].rpartition('/')[0])

In [None]:
# Sentinel-2 band identifiers handled by this notebook. Every name listed here
# has a matching entry under both 'mean' and 'std' in BAND_STATS below.
# Note that B10 is not in the list.
BAND_NAMES = ['B01', 'B02', 'B03', 'B04', 'B05',
              'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12']

# Per-band mean and standard deviation, keyed by band name. These are the
# values standardize_feature() subtracts and divides by.
# NOTE(review): presumably the dataset-wide statistics shipped with the
# BigEarthNet preprocessing scripts -- confirm the source before reusing them
# on other data.
BAND_STATS = {
    'mean': {
        'B01': 340.76769064,
        'B02': 429.9430203,
        'B03': 614.21682446,
        'B04': 590.23569706,
        'B05': 950.68368468,
        'B06': 1792.46290469,
        'B07': 2075.46795189,
        'B08': 2218.94553375,
        'B8A': 2266.46036911,
        'B09': 2246.0605464,
        'B11': 1594.42694882,
        'B12': 1009.32729131
    },
    'std': {
        'B01': 554.81258967,
        'B02': 572.41639287,
        'B03': 582.87945694,
        'B04': 675.88746967,
        'B05': 729.89827633,
        'B06': 1096.01480586,
        'B07': 1273.45393088,
        'B08': 1365.45589904,
        'B8A': 1356.13789355,
        'B09': 1302.3292881,
        'B11': 1079.19066363,
        'B12': 818.86747235
    }
}

# Standardize each feature with this helper prior to stacking/reshaping.
def standardize_feature(data, band_name):
    """Z-score one band using the statistics stored in BAND_STATS.

    Args:
        data: Raw band values; anything ``tf.dtypes.cast`` accepts.
        band_name: Key into ``BAND_STATS['mean']`` and ``BAND_STATS['std']``.

    Returns:
        A float32 ``tf.Tensor`` equal to ``(data - mean) / std`` for the band.

    Raises:
        KeyError: If ``band_name`` has no entry in ``BAND_STATS``.
    """
    band_mean = BAND_STATS['mean'][band_name]
    band_std = BAND_STATS['std'][band_name]
    as_float = tf.dtypes.cast(data, tf.float32)
    return (as_float - band_mean) / band_std

In [None]:
# Channel order of the output array: the four bands stacked first, followed by
# the six bands that are resized before being appended. The list names follow
# the original `std_bands_10m` / `std_bands_20m` variables.
# B01 and B09 appear in BAND_NAMES but never entered the stacked output, so
# they are no longer read from disk.
BANDS_10M = ['B04', 'B03', 'B02', 'B08']
BANDS_20M = ['B05', 'B06', 'B07', 'B8A', 'B11', 'B12']

# Spatial size passed to cv2.resize as `dsize` (width, height).
PATCH_SIZE = (120, 120)

# A patch is labelled 1 when this entry is present in its metadata 'labels'.
IRRIGATED_LABEL = 'Permanently irrigated land'


def _read_standardized_band(patch_prefix, band_name):
    """Read one band GeoTIFF of a patch and return it standardized.

    Args:
        patch_prefix: Path prefix shared by all files of a patch; the band
            file is ``f'{patch_prefix}_{band_name}.tif'``.
        band_name: Band identifier, also the key used for standardization.

    Returns:
        NumPy array holding the standardized first raster band of the file.
    """
    # The context manager closes the dataset as soon as the pixels are read;
    # without it every band of every patch leaves a file handle open.
    with rasterio.open(f'{patch_prefix}_{band_name}.tif') as band_ds:
        raw = band_ds.read(1)
    # standardize_feature returns a TensorFlow tensor; convert it once here so
    # the NumPy/OpenCV calls downstream operate on plain arrays.
    return np.asarray(standardize_feature(raw, band_name))


def load_standardized_patch(patch_prefix):
    """Build the standardized multi-band array and binary label for one patch.

    Args:
        patch_prefix: Path prefix shared by the patch's band GeoTIFFs and its
            ``_labels_metadata.json`` file.

    Returns:
        Tuple ``(X, y)`` where ``X`` stacks the BANDS_10M channels followed by
        the resized BANDS_20M channels along the last axis, and ``y`` is a 0-d
        NumPy array: 1 if IRRIGATED_LABEL is among the patch labels, else 0.
    """
    std_bands_10m = np.stack(
        [_read_standardized_band(patch_prefix, name) for name in BANDS_10M],
        axis=2)
    std_bands_20m = np.stack(
        [_read_standardized_band(patch_prefix, name) for name in BANDS_20M],
        axis=2)

    # Bring the second group to PATCH_SIZE so both groups can be concatenated
    # channel-wise (the concatenate requires the first group to already have
    # that spatial size).
    resized_20m = cv2.resize(std_bands_20m, dsize=PATCH_SIZE,
                             interpolation=cv2.INTER_CUBIC)
    msi_std_bands = np.concatenate([std_bands_10m, resized_20m], axis=2)

    with open(f'{patch_prefix}_labels_metadata.json', 'rb') as f:
        patch_json = json.load(f)

    label = np.array(1 if IRRIGATED_LABEL in patch_json['labels'] else 0)
    return msi_std_bands, label


# One (X, y) record per patch; the DataFrame is built once from the full list.
std_data = [load_standardized_patch(file) for file in tqdm(filenames_tif)]

std_df = pd.DataFrame(std_data, columns=['X', 'y'])
del std_data  # the records now live in std_df; drop the duplicate reference

std_df.to_pickle(f'./bigearthnet-models/splits/{FILE}.pkl')
               

In [14]:
# Drop the reference to the DataFrame so its memory can be reclaimed.
del std_df