In [1]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

import plotly.graph_objects as go

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split

from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv3D, Activation, BatchNormalization, \
                                     Dropout, Flatten, Dense

2024-04-19 10:07:17.566179: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
target = 'tree_name'
df = pd.read_csv("data/Borough_tree_list_2021July.csv", usecols=[target])

In [3]:
df = df.dropna(subset=target)

In [4]:
def process_data(filepaths, selected_bands):
    for filepath in tqdm(filepaths):
        save_path = Path(filepath.parent).joinpath(f'processed_{filepath.name}')

        if save_path.is_file():
            continue
            
        with open(filepath, 'rb') as f:
            raw_data = np.load(filepath)
    
        for band in selected_bands:
            zero_fraction = (raw_data[band] == 0).sum()/raw_data[band].size
            
            if zero_fraction < 0.6:
                raw_data[band] = np.where(
                    raw_data[band] > 0, raw_data[band], raw_data[band][raw_data[band] > 0].mean())
            else:
                raise ValueError(f'Too many zeros: {zero_fraction:.2f} of data. '
                                 'Modify the condition or employ a different strategy.')
                
        data = raw_data.view((float, len(raw_data.dtype.names)))
        
        with open(save_path, 'wb') as f:
            np.save(f, data)

In [5]:
selected_bands = [f'B{x}' for x in range(2, 9)] + ['B8A', 'B11', 'B12', 'TCI_R', 'TCI_G', 'TCI_B']

In [6]:
filepaths = sorted(list(Path('london').glob(f'london*.npy')), 
                   key=lambda s: int(s.stem.split('_')[-1]))
process_data(filepaths, selected_bands)

  0%|          | 0/34 [00:00<?, ?it/s]

In [19]:
class Conv3DModelCreator:
    def normalise(self, a, p=1):
        upper = np.percentile(a, 100-p)
        lower = np.percentile(a, p)
    
        bounded_a = np.where(a > upper, np.median(a), a)
        bounded_a = np.where(a < lower, np.median(a), bounded_a)
        
        scaled_a = (bounded_a - lower)/(upper - lower)
        return scaled_a
        
    def split_and_preprocess(self, y, X, random_state=42):
        """Split and max scale."""
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        for i in range(X_train.shape[-1]):
            X_train[...,i] = self.normalise(X_train[...,i])
            X_test[...,i] = self.normalise(X_test[...,i])
        
        return X_train, X_test, y_train, y_test

    def build_model_3d(self, num_classes, input_size):
        m = tf.keras.Sequential()
        m.add(Input(input_size))
        for i in range(3): 
            m.add(Conv3D(
                16, 
                strides=1, 
                padding='same',
                kernel_size=(3, 3, 3),
                # kernel_initializer='he_normal',
                # kernel_regularizer=l2(1e-6)
            ))
            m.add(BatchNormalization(axis=-1))
            m.add(Activation('relu'))
            # m.add(Dropout(0.25))
            
        m.add(Flatten())
        m.add(Dense(32, 
                    # kernel_initializer='he_normal', 
                    # kernel_regularizer=l2(1e-6)
                   )
             )
        m.add(BatchNormalization(axis=-1))
        m.add(Activation('relu'))
        # m.add(Dropout(0.25))
        
        m.add(Dense(
            num_classes, 
            activation='sigmoid', 
            # kernel_initializer='glorot_uniform',
            # kernel_regularizer=l2(1e-6)
        ))

        opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
        loss = tf.keras.losses.CategoricalCrossentropy()
        m.compile(
            optimizer=opt,
            loss=loss,
            metrics=['accuracy']
        )
        return m
        
    def run(self, y, X, num_classes, model_name):
        X_train, X_test, y_train, y_test = self.split_and_preprocess(y, X)
        model_path = Path('models').joinpath(model_name)

        if model_path.is_file():
            model = tf.keras.models.load_model(model_path)
        else:
            model = self.build_model_3d(num_classes, X_train.shape[1:])
            
        model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=4,
                  validation_data=(X_test, y_test))
        model.save(model_path)

In [None]:
%%time
all_data = []

n_chunks = 50000

labels = pd.get_dummies(df).to_numpy()

chunks = [labels[i: i + n_chunks] for i in range(0, labels.shape[0], n_chunks)]

model_name = f'conv_london_spring.keras'

for i, chunk in enumerate(chunks):
    filepaths = sorted(list(Path('london').glob(f'processed*_{i}.npy')))
    all_parts = []
    for filepath in filepaths:
        with open(filepath, 'rb') as f:
            data = np.load(f)
        all_parts.append(data)
    features = np.stack(all_parts, axis=3)

    Conv3DModelCreator().run(chunk, features, labels.shape[1], model_name)


Epoch 1/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3ms/step - accuracy: 0.2339 - loss: 2.7357 - val_accuracy: 0.2624 - val_loss: 2.5497
Epoch 2/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.2462 - loss: 2.6311 - val_accuracy: 0.2528 - val_loss: 2.5808
Epoch 3/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.2531 - loss: 2.6180 - val_accuracy: 0.2648 - val_loss: 2.5684
Epoch 4/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.2602 - loss: 2.5940 - val_accuracy: 0.2692 - val_loss: 2.5169
Epoch 5/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.2639 - loss: 2.5802 - val_accuracy: 0.2720 - val_loss: 2.5292
Epoch 6/10
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 4ms/step - accuracy: 0.2610 - loss: 2.5800 - val_accuracy: 0.2694 - val_loss: 2.529

In [9]:
# cm = tf.math.confusion_matrix(labels, predictions, num_classes=None)
# px.imshow(
#     np.array(cm),
#     # animation_frame=0,
#     labels=dict(color="Corr coef"),
#     x=cm[0].index,
#     y=cm[0].columns,
#     title='Confusion Matrix',
#     text_auto=True, aspect='auto', zmin=0, height=500
# )

In [10]:
# target = source.metadata['categories']['multi'] # multi / trinary
# labels = df[target].astype('category').cat.codes
# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in seasons:
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         all_data.append(np.load(f))
    
# model_name = f'conv_all_mean_seasons_multi.keras'

# features = np.stack(all_data, axis=3)
# score = ConvModelCreator().run(labels, features, model_name)
# score

In [11]:
# traces = [go.Scatter(x=seasons, y=seasonal_scores)]
# go.Figure(
#     data=traces,
#     layout={
#         "xaxis": {"title": "Season"},
#         "yaxis": {"title": "Accuracy"},
#         "title": "Conv2d accuracies"}
# )

In [12]:
# target = source.metadata['categories']['trinary'] # multi / trinary
# labels = df[target].astype('category').cat.codes
# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in tqdm(seasons):
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         all_data.append(np.load(f))
    
# model_name = f'conv_all_mean_seasons_trinary.keras'
# score = ConvModelCreator().run(labels, np.stack(all_data, axis=3), model_name)
# score

In [13]:
# target = source.metadata['categories']['trinary'] # multi / trinary

# mask = df[target] != 'cleared'

# labels = df[target][mask].astype('category').cat.codes

# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in tqdm(seasons):
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         data = np.load(f)
#         all_data.append(data[mask])
        
# model_name = f'conv_all_mean_seasons_binary.keras'
# score = ConvModelCreator().run(labels, np.stack(all_data, axis=3), model_name)
# score

In [14]:
# filepaths = sorted(list(Path('london').glob('*.npy')), key=lambda x: int(x.stem.split('_')[-1]))

# n_chunks = 50000
# chunks = [gdf[i: i + n_chunks] for i in range(0, gdf.shape[0], n_chunks)]
# scores = []

# chunk[target].cat.codes

# for chunk, filepath in tqdm(zip(chunks, filepaths), total=len(filepaths)
#                            ):
#     score = LightModelCreator().run_and_eval(chunk[target].cat.codes, [filepath])
#     scores.append(score)