In [31]:
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
import rasterio
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, os.path.join(os.path.expanduser("~"),"Desktop","projects", "GlacierView","src","common","modules"))
import constants
import pickle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

In [24]:
# Resolve the GlacierView data locations underneath the current user's home directory.
project_root = os.path.join(os.path.expanduser("~"), "Desktop", "projects", "GlacierView")
gv_data_dir = os.path.join(project_root, "data")

# CSV holding the manual annotations.
labels_path = os.path.join(gv_data_dir, "manual_annotations", "labels.csv")

# Root folder of the training rasters.
training_data_dir = os.path.join(gv_data_dir, "ee_data", "training", "UTM")

In [25]:
# Echo the resolved training-data directory so the path can be checked by eye.
training_data_dir

'/Users/mattw/Desktop/projects/GlacierView/data/ee_data/training/UTM'

In [26]:
# Load the manual-annotation CSV found at labels_path into a DataFrame.
df = pd.read_csv(labels_path)

In [5]:
# Derive glims_id from each glacier_pk: everything before the first underscore
# (or the whole value when it contains no underscore).
df['glims_id'] = [pk.partition("_")[0] for pk in df['glacier_pk']]

In [6]:
# Read every annotated raster into memory as a channels-last array with
# negative values clamped to zero.
rasters = []
for glims_id, file_name in zip(df.glims_id, df.glacier_pk):
    # Files are laid out as <training_data_dir>/<glims_id>/<glacier_pk>.
    image_path = os.path.join(training_data_dir, glims_id, file_name)
    # NOTE(review): "r+" opens the dataset for update and the nodata assignment
    # below changes the dataset's metadata; read() is called without masking,
    # so that assignment does not appear to affect the returned pixels.
    # Confirm the on-disk write is intended.
    with rasterio.open(image_path, "r+") as src:
        src.nodata = 0
        bands_first = src.read()
    # Move the leading (band) axis to the end, then zero out negative values.
    channels_last = np.rollaxis(bands_first, 0, 3)
    rasters.append(np.where(channels_last < 0, 0, channels_last))

In [7]:
# Reduce every raster to one common set of bands so they can later be stacked
# into a single array.
stackable_rasters = []
# The keys of l5_band_dict define which bands are kept; the same keys are looked
# up in the l7/l8 dictionaries to obtain the matching channel indices.
# (presumably Landsat 5/7/8 layouts, inferred from the constant names — confirm)
bands_to_keep = constants.l5_band_dict.keys()
l7_bands_to_keep = list(map(constants.l7_band_dict.get, bands_to_keep))
l8_bands_to_keep = list(map(constants.l8_band_dict.get, bands_to_keep))
n_bands_to_keep = len(bands_to_keep)
for raster_index, raster in enumerate(rasters):
    n_bands = raster.shape[2]
    if n_bands == 8:
        # Already in the target layout: keep as is.
        stackable_rasters.append(raster)
    elif n_bands == 10:
        stackable_rasters.append(raster[:, :, l7_bands_to_keep])
    elif n_bands == 12:
        stackable_rasters.append(raster[:, :, l8_bands_to_keep])
    else:
        # Skipping the raster would leave this list shorter than df, so the
        # features would no longer line up row-for-row with df.label.
        raise ValueError(
            f"Raster #{raster_index} has {n_bands} bands; expected 8, 10 or 12."
        )

In [8]:
# Target spatial size shared by every raster after resampling.
HEIGHT = 128
WIDTH = 128

# Resample each raster to WIDTH x HEIGHT with bilinear interpolation.
# cv2.resize expects dsize as (width, height), hence the argument order.
reshaped_rasters = [
    cv2.resize(raster, dsize=(WIDTH, HEIGHT), interpolation=cv2.INTER_LINEAR)
    for raster in stackable_rasters
]

In [9]:
def _scale_band(band):
    """Min-max scale a single band to [0, 1]; a constant band becomes all zeros."""
    lowest, highest = band.min(), band.max()
    if lowest == highest:
        # Avoid dividing by zero when every pixel holds the same value.
        return band - lowest
    return (band - lowest) / (highest - lowest)


# Normalise each band of each raster independently, keeping the channels-last layout.
normalized_rasters = [
    np.moveaxis(
        np.stack([_scale_band(raster[:, :, band_idx]) for band_idx in range(raster.shape[2])]),
        0,
        -1,
    )
    for raster in reshaped_rasters
]


In [19]:
# Features: all normalised rasters joined along a new leading axis.
X = np.stack(normalized_rasters, axis=0)

# Binary target: 1 where the annotated label is greater than 7, otherwise 0.
y = np.where(np.asarray(df["label"]) > 7, 1, 0)

In [30]:
# Class balance relative to the rarest class (the rarest class shows as 1.0).
# Series.value_counts replaces the deprecated top-level pd.value_counts, and the
# counts are computed once instead of twice.
class_counts = pd.Series(y).value_counts()
class_counts / class_counts.min()

0    4.17757
1    1.00000
dtype: float64

In [21]:
# Fixed seed so the split, and therefore the pickled datasets, is identical on
# every Restart & Run All.
RANDOM_SEED = 42

# Hold out 33% of the samples for testing.
# NOTE(review): the classes are imbalanced (about 4:1 in the output above);
# consider stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=RANDOM_SEED
)

# Pair each raster with its label as a list of (features, label) tuples.
training_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

In [22]:
# Persist both splits as pickle files in the current working directory.
for pickle_name, samples in (
    ('training_data.pickle', training_data),
    ('test_data.pickle', test_data),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(samples, handle)