In [None]:
import pandas as pd
import geopandas as gpd
import shapely
import numpy as np
import plotly.express as px
import plotly.offline as py_offline
import matplotlib.pyplot as plt
import json
import pickle
from tqdm import tqdm
import rasterio
import rasterio.mask
from pathlib import Path

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering

In [None]:
import ee
import geemap.plotlymap as geemap

In [None]:
# Authenticate with Google Earth Engine, then initialise the client against the
# Cloud project this notebook uses. The `ee.*` objects built in later cells are
# created after these two calls.
ee.Authenticate()
ee.Initialize(project='sentinel-treeclassification')

In [None]:
# Read only the columns the notebook needs: coordinates, the label column and
# the load date (parsed to datetimes so it can be used for date arithmetic).
target_column = 'tree_name'
usecols = ['latitude', 'longitude', target_column, 'load_date']
trees_df = pd.read_csv(
    "data/Borough_tree_list_2021July.csv",
    usecols=usecols,
    parse_dates=['load_date'],
)
trees_df.info()

In [None]:
# Distinct values found in the label column.
trees_df[target_column].unique()

In [None]:
# Number of rows that have no label.
trees_df[target_column].isna().sum()

In [None]:
# Keep only the rows that carry a label, then show the remaining missing
# values per column.
trees_df = trees_df.loc[trees_df[target_column].notna()]
trees_df.isna().sum()

In [None]:
# Plot a random subset of the trees, coloured by label. The sample is seeded so
# the figure is reproducible, and capped at the frame length so the cell does
# not raise when fewer than 10000 rows are available.
map_sample = trees_df.sample(n=min(10000, len(trees_df)), random_state=42)
london_trees_mapbox = px.scatter_mapbox(
    map_sample, lat="latitude", lon="longitude", color=target_column,
    zoom=10, mapbox_style="carto-darkmatter", height=800,
)
london_trees_mapbox

In [None]:
# How many rows were loaded on each distinct load date.
trees_df['load_date'].value_counts()

In [None]:
# Distinct load dates. value_counts() orders by descending frequency, so
# date_indices[0] is the most common load date, date_indices[1] the next, etc.
date_indices = trees_df['load_date'].value_counts().index

In [None]:
# Fold the third most frequent load date into the second most frequent one.
# A single .loc assignment is used instead of chained indexing
# (`df[col][mask] = value`), which may write to a temporary copy and is a
# silent no-op under pandas Copy-on-Write.
trees_df.loc[trees_df['load_date'] == date_indices[2], 'load_date'] = date_indices[1]
trees_df['load_date'].value_counts()

In [None]:
# Class balance: number of trees per label, bars ordered from most to least frequent.
px.histogram(trees_df, x=target_column, text_auto=True).update_xaxes(categoryorder="total descending")

In [None]:
# Promote the table to a GeoDataFrame of EPSG:4326 points built from the
# longitude/latitude columns. Those two columns are kept alongside the geometry.
trees_gdf = gpd.GeoDataFrame(
    trees_df,
    geometry=gpd.points_from_xy(x=trees_df['longitude'], y=trees_df['latitude'], crs=4326),
)
# Sanity check: the conversion must not leave rows without a label.
trees_gdf[target_column].isna().sum()

In [None]:
# Bounding box around every tree: each point is padded with a square buffer of
# 100 CRS units in EPSG:6933 (metres), converted back to EPSG:4326, and the
# overall extent is handed to Earth Engine as a BBox.
total_bounds = (
    trees_gdf
    .to_crs(epsg=6933)
    .buffer(100, cap_style=3)
    .to_crs(epsg=4326)
    .geometry
    .total_bounds
)
london_trees_bbox = ee.Geometry.BBox(*total_bounds)

In [None]:
# Centroid of all trees merged into a single geometry. It is computed in the
# projected CRS EPSG:6933 and converted back to EPSG:4326 for use as a map centre.
london_trees_centroid = (
    trees_gdf.dissolve()
    .to_crs(epsg=6933)
    .centroid
    .to_crs(epsg=4326)
)[0]

In [None]:
class SentinelGetter:
    """Builds cloud-masked Sentinel-2 composites from Earth Engine."""

    def mask_s2_clouds(self, image):
        """Mask pixels whose QA60 cloud or cirrus bit is set.

        Args:
            image: Earth Engine image exposing a 'QA60' band.

        Returns:
            The result of `image.updateMask(...)` with a mask that is true only
            where both QA60 bits 10 and 11 are zero.
        """
        # Quality assessment with resolution in meters
        qa = image.select('QA60')
        # Bits 10 and 11 are clouds and cirrus, respectively.
        cloud_bit_mask = 1 << 10
        cirrus_bit_mask = 1 << 11
        # Both flags should be set to zero, indicating clear conditions.
        mask = (
            qa.bitwiseAnd(cloud_bit_mask)
            .eq(0)
            .And(qa.bitwiseAnd(cirrus_bit_mask).eq(0))
        )
        return image.updateMask(mask)

    def get_image(self, center_date, bbox):
        """Return a mean composite centred on September of the previous year.

        The window spans two months either side of that September date.
        Granules with CLOUDY_PIXEL_PERCENTAGE >= 20 are dropped, the remaining
        images are cloud-masked, averaged and clipped to `bbox`.

        Args:
            center_date: Timestamp-like object providing `.year`, `.day` and
                `.replace(...)` (e.g. `pd.Timestamp`).
            bbox: Earth Engine geometry used to clip the composite.

        Returns:
            The clipped composite image.
        """
        # September has 30 days: clamp the day so that a date falling on the
        # 31st does not make replace() raise "day is out of range for month".
        modified_data = center_date.replace(
            year=center_date.year - 1,
            month=9,
            day=min(center_date.day, 30),
        )
        month = pd.DateOffset(months=2)
        image = (
            ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
            .filterDate(modified_data - month, modified_data + month)
            # Pre-filter to get less cloudy granules.
            .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
            .map(self.mask_s2_clouds)
            .mean()
            .clip(bbox)
        )
        return image

In [None]:
# One composite per load date: date_indices[0] feeds the "2018" image and
# date_indices[1] the "2020" image (get_image itself shifts each date to
# September of the preceding year).
sentinel_image_2018 = SentinelGetter().get_image(center_date=date_indices[0], bbox=london_trees_bbox)
sentinel_image_2020 = SentinelGetter().get_image(center_date=date_indices[1], bbox=london_trees_bbox)
def visualise_ee_image(center_to_coords, sentinel_image, zoom=10):
    """Show an Earth Engine image as an 'RGB' layer on an interactive map.

    Args:
        center_to_coords: Coordinates passed to the map as its `center`.
        sentinel_image: Image added to the map; bands B4/B3/B2 are rendered
            with a 0-3000 value stretch.
        zoom: Initial zoom level of the map.
    """
    vis_params = dict(min=0.0, max=3000, bands=['B4', 'B3', 'B2'], layer="below")

    ee_map = geemap.Map(center=center_to_coords, zoom=zoom)
    ee_map.addLayer(sentinel_image, vis_params, 'RGB')
    py_offline.iplot(ee_map)

# Centre the map on the trees' centroid; the tuple is built as (y, x).
visualise_ee_image(
    center_to_coords=(london_trees_centroid.y, london_trees_centroid.x),
    sentinel_image=sentinel_image_2018,
)

In [None]:
# Collapse every label whose share of the rows is below `min_freq` into a
# single 'Other_minor' class, then store the label column as a categorical.
min_freq = 0.01
value_counts = trees_gdf[target_column].value_counts()
mask = value_counts.div(value_counts.sum()) < min_freq

is_minor = trees_gdf[target_column].isin(value_counts[mask].index)
grouped_minors = pd.Series(np.where(is_minor, 'Other_minor', trees_gdf[target_column]))
# Assign the raw values so the fresh Series' own index is not used for alignment.
trees_gdf[target_column] = grouped_minors.values
trees_gdf[target_column] = trees_gdf[target_column].astype('category')

px.histogram(trees_gdf, x=target_column, text_auto=True).update_xaxes(categoryorder="total descending")


In [None]:
# Order the trees by load date, then replace every geometry with a square
# buffer of 30 CRS units computed in EPSG:6933 and converted back to EPSG:4326.
# NOTE: this overwrites the geometry in place, so re-running the cell buffers
# the already-buffered shapes again. Re-create trees_gdf before re-running.
trees_gdf = trees_gdf.sort_values(by='load_date')
trees_gdf.geometry = trees_gdf.to_crs(epsg=6933).buffer(30, cap_style=3).to_crs(epsg=4326)

In [None]:
# Quick visual check of the current tree geometries.
trees_gdf.plot()

In [None]:
%%time
# Boolean row masks selecting the trees of each of the two main load dates.
date_mask_2018 = trees_gdf['load_date'].eq(date_indices[0])
date_mask_2020 = trees_gdf['load_date'].eq(date_indices[1])

In [None]:
# Integer code of each row's label category, taken from the categorical dtype.
trees_gdf['code'] = trees_gdf[target_column].cat.codes

In [None]:
%%time
# Work with the trees of the first load date and restrict the matching
# composite to the listed spectral and true-colour bands.
selected_regions = trees_gdf.loc[date_mask_2018]
selected_image = sentinel_image_2018.select(
    'B[2-8]', 'B8A', 'B11', 'B12', 'TCI_R', 'TCI_G', 'TCI_B'
)

# Drive export, kept for reference (presumably how the london_trees*.tif files
# read later were produced; confirm before relying on it).
# task = ee.batch.Export.image.toDrive(
#     image=selected_image,
#     fileNamePrefix='london_trees',
#     description='london_trees',
#     folder='london_trees',
#     scale=10,
# )
# task.start()

In [None]:
%%time
def crop_center(img, cropx=6, cropy=6):
    """Return the central `cropy` x `cropx` window of a 3-D array.

    Args:
        img: Array indexed as (bands, rows, cols).
        cropx: Width of the window, in columns.
        cropy: Height of the window, in rows.

    Returns:
        A view of `img` with all bands and the centred window. If `img` is
        smaller than the requested window along an axis, the whole axis is
        returned, so callers can detect the shortfall from the result's shape.
    """
    _, height, width = img.shape
    # Clamp at zero: a negative start would wrap around to the far edge and
    # silently return a corner sliver instead of the centred data.
    startx = max(width // 2 - cropx // 2, 0)
    starty = max(height // 2 - cropy // 2, 0)
    return img[:, starty:starty + cropy, startx:startx + cropx]

# Cut one fixed-size patch per tree out of the exported GeoTIFF tile(s) and
# cache the (label, patch) pairs on disk.
# Tiles are sorted so the sample order (and therefore the seeded train/test
# split built from it) is reproducible across runs and file systems.
tif_paths = sorted(Path('data').glob('london_trees*.tif'))
# NOTE: despite the .npy suffix, this file is a pickle of a list of tuples.
save_path = Path('data').joinpath('london_trees.npy')

processed_labels = []
processed_images = []

# Footprints that produced a patch of the wrong shape; only filled when the
# cache is rebuilt.
bad_geos = []

if not save_path.is_file():
    if not tif_paths:
        # Without this guard an empty dataset would be written and then be
        # treated as a valid cache on every later run.
        raise FileNotFoundError("No 'data/london_trees*.tif' files found; export the imagery first.")

    for tif_name in tif_paths:
        with rasterio.open(tif_name) as src:
            for _, row in tqdm(selected_regions.iterrows(), total=selected_regions.shape[0]):
                try:
                    out_image, _ = rasterio.mask.mask(src, [row.geometry], crop=True)
                except ValueError:
                    # Deliberate best-effort: a ValueError from the mask step
                    # (e.g. a footprint outside this tile) just skips the tree here.
                    continue

                out_image = crop_center(out_image)

                # Due to the image being split, trees near
                # the split don't generate proper sub-images
                if out_image.shape != (13, 6, 6):
                    bad_geos.append(row.geometry)
                    continue

                processed_images.append(out_image)
                processed_labels.append(row[target_column])

    data_list = list(zip(processed_labels, processed_images))
    with open(save_path, 'wb') as f:
        pickle.dump(data_list, f)

In [None]:
# gpd.GeoDataFrame(geometry=bad_geos).plot()

In [None]:
# Load the cached (label, patch) pairs and split them into two parallel lists.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
save_path = Path('data') / 'london_trees.npy'
with save_path.open('rb') as f:
    processed_data = pickle.load(f)

labels = [pair[0] for pair in processed_data]
features = [pair[1] for pair in processed_data]

In [None]:
y = labels
classes = np.unique(labels)

# Flatten every patch into a single feature vector: one row per sample.
X = np.asarray(features).reshape(len(y), -1)

# Hold out 10% of the samples for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, average_precision_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from flaml import AutoML

def train_and_eval(X_train, X_test, y_train, y_test, classes, save_path):
    """Fit (or load from cache) a one-vs-rest AutoML classifier and evaluate it.

    If `save_path` exists, the pickled classifier stored there is loaded and
    `X_train` / `y_train` are ignored, so delete the file to force retraining.
    Otherwise a FLAML AutoML estimator restricted to 'lgbm' is wrapped in a
    OneVsRestClassifier, fitted and pickled to `save_path`.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        classes: Class labels, used both as the label set and as the row names
            of the report.
        save_path: `pathlib.Path` of the pickled classifier.

    Returns:
        Tuple `(y_pred, cr)`: predictions for `X_test` and the
        `classification_report` dictionary.
    """
    if not save_path.is_file():
        # Create the output directory before training so a missing folder
        # cannot throw away a finished fit at save time.
        save_path.parent.mkdir(parents=True, exist_ok=True)

        automl = AutoML(
            # time_budget=60*60,
            estimator_list=['lgbm'],
            n_jobs=1
        )
        clf = OneVsRestClassifier(automl, n_jobs=2)
        clf.fit(X_train, y_train)

        with save_path.open(mode='wb') as f:
            pickle.dump(clf, f)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
        with save_path.open(mode='rb') as f:
            clf = pickle.load(f)

    y_pred = clf.predict(X_test)

    # Passing `labels` keeps the report aligned with `target_names` even when a
    # class is absent from both y_test and y_pred (scikit-learn raises otherwise).
    cr = classification_report(
        y_test,
        y_pred,
        labels=classes,
        target_names=classes,
        zero_division=0,
        output_dict=True,
    )

    return y_pred, cr

In [None]:
%%time
save_path = Path('models').joinpath('londontrees_lgbm_10split.pkl')
y_pred, cr = train_and_eval(X_train, X_test, y_train, y_test, classes, save_path)

# train_and_eval() returns only the predictions and the report, but it always
# leaves the fitted model at save_path. Reload it to get the class scores
# instead of relying on a `y_score` that is never defined in this scope.
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
with save_path.open(mode='rb') as f:
    clf = pickle.load(f)
y_score = clf.predict_proba(X_test)

# One-hot encode the true labels in the column order of predict_proba
# (clf.classes_) so every score column is compared with the right class.
y_test_onehot = (np.asarray(y_test)[:, None] == np.asarray(clf.classes_)[None, :]).astype(int)

df = pd.DataFrame(cr).transpose()

# Average precision per class plus the aggregate variants, keyed by report row
# label. Aligning by label (rather than by position) leaves NaN for rows with
# no matching value, such as the 'accuracy' row of a multiclass report.
ap_by_row = dict(zip(clf.classes_, average_precision_score(y_test_onehot, y_score, average=None)))
ap_by_row.update({
    f'{n} avg': average_precision_score(y_test_onehot, y_score, average=n)
    for n in ['micro', 'macro', 'weighted', 'samples']
})
df.insert(3, 'mAP', pd.Series(ap_by_row).reindex(df.index))

# Split per-class rows from summary rows by name, not by a fixed row count.
is_summary_row = df.index.isin(['accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'])
display(df[~is_summary_row].sort_index())
display(df[is_summary_row])


In [None]:
# rgb_data = tif_data[1][[3,2,1],:,:]
# rasterio.plot.show(rgb_data/3000)

In [None]:
# download_image = sentinel_image_2018.select('B[2-8]', 'B8A', 'B11', 'B12', 'TCI_R', 'TCI_G', 'TCI_B')
# band_names = download_image.bandNames().getInfo()

# task = ee.batch.Export.image.toDrive(
#     image=download_image,
#     description='london_trees',
#     folder='london_trees',
#     # region=region,
#     scale=10,
#     # crs='EPSG:5070',
#     # maxPixels=1e13
# )
# task.start()

In [None]:
# trees_regions_2018_ee = geemap.geopandas_to_ee(selected_regions)
# for chunk in tqdm(np.array_split(selected_regions, 10)):
#     for i, row in tqdm(chunk.iterrows(), total=chunk.shape[0]):
#         region = np.asarray(row.geometry.exterior.coords).flatten()
#         ee_region = ee.Geometry.Polygon(*region)
    
#         prefix = f'{row[target_column]}_{row["code"]}_{row["load_date"].year}_{i}'
#         download_image(ee_region, prefix)

# trees_regions_2018_ee.iterate(download_image)
# clip = selected_bands.clipToBoundsAndScale(trees_regions_2018_ee)
# clip = selected_bands.clipToCollection(trees_regions_2018_ee)
# coords = selected.geometry.iloc[0].exterior.coords[0]
# visualise_ee_image(coords[::-1], selected_bands, zoom=16)

In [None]:
# task = ee.batch.Export.image.toCloudStorage(
#     image=selected_bands,
#     description='london_trees_task',
#     bucket='gs://gcp-public-data-sentinel-2',
#     fileNamePrefix='london_trees_task',
#     # region=region,
#     # crsTransform=[30, 0, -2493045, 0, -30, 3310005],
#     # crs='EPSG:5070'
# )

# task.start()

In [None]:
# from google.cloud import storage

# storage_client = storage.Client(project='sentinel-treeclassificatio')
# bucket = storage_client.get_bucket('gcp-public-data-sentinel-2')
# blob = bucket.get_blob('london_trees_task')
# print(blob.size)

In [None]:
# train_y, test_y = train_test_split(trees_gdf, train_size=0.7)
# # Train a 10-tree random forest classifier from the training sample.
# trained_classifier = ee.Classifier.smileRandomForest(10).train(
#     features=training_sample,
#     classProperty=label,
#     inputProperties=img.bandNames(),
# )

In [None]:
# points_in_m = trees_gdf.to_crs(epsg=6933)
# xy = list(map(list, zip(points_in_m.geometry.x, points_in_m.geometry.y)))
# cluster = AgglomerativeClustering(
#     n_clusters=None, 
#     linkage='single',
#     metric='euclidean',
#     distance_threshold=50)
    
# cluster.fit(xy)
# trees_squares['group'] = cluster.labels_
# counts = trees_squares.value_counts('group')
# counts.shape, trees_squares.shape