#### Imports & Setup

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import ee
import geemap
import matplotlib.pyplot as plt
import geopandas as gpd
from google.colab import drive
from datetime import datetime, timedelta
import concurrent.futures

In [None]:
# Mount Google Drive at /content/drive; the CSV input and output paths used
# in this notebook live under that mount point.
drive.mount('/content/drive')

In [None]:
# authenticate & init GEE
# NOTE(review): the project ID is an empty string here — presumably a
# placeholder that has to be replaced with a real Cloud project; confirm.
ee.Authenticate()
ee.Initialize(project='')

#### Retrieve filtered data and extraction filters

In [None]:
# Load the table written by 1_dataretrieval_2016 (already filtered by date) and
# expose the suffixed coordinate columns under their plain names.
df = pd.read_csv(
    "/content/drive/2016_LA_merged_scaled.csv",
    delimiter=",",
).rename(columns={'latitude_x': 'latitude', 'longitude_x':'longitude'})

In [None]:
def get_bounding_box(df):
  '''
  create bounding box based on available weather stations

  df: table with numeric `longitude` and `latitude` columns; rows that lack
      either coordinate are ignored
  returns: ee.Geometry.Rectangle built from [min_lon, min_lat, max_lon, max_lat]
  raises: ValueError if no row has both coordinates
  '''
  coords = df[['longitude', 'latitude']].dropna()
  if coords.empty:
      raise ValueError('cannot build a bounding box: no rows with both longitude and latitude')

  # numpy's min/max propagate NaN, so incomplete rows are dropped above;
  # otherwise a single missing coordinate would turn a whole corner into NaN
  values = coords.to_numpy()
  min_lon, min_lat = values.min(axis=0)
  max_lon, max_lat = values.max(axis=0)

  bounding_box = ee.Geometry.Rectangle([min_lon, min_lat, max_lon, max_lat])
  print('min lon,lat:', min_lon, min_lat)
  print('max lon, lat:', max_lon, max_lat)
  return bounding_box


def get_landsat_tiles(landsat_path, bounding_box, start_date, end_date, overlap):
  '''
  load an image collection from GEE and keep the scenes covering enough of the bounding box

  landsat_path: GEE image collection ID
  bounding_box: ee geometry used for the spatial filter and the overlap ratio
  start_date, end_date: forwarded unchanged to filterDate
  overlap: minimum value of (scene footprint ∩ box area) / box area a scene must reach
  returns: (collection filtered by bounds and date only,
            the same collection additionally filtered by overlap)
  '''
  collection = ee.ImageCollection(landsat_path)
  landsat = collection.filterBounds(bounding_box).filterDate(start_date, end_date)
  box_area = bounding_box.area()

  def _tag_overlap(scene):
      # share of the bounding box that this scene's footprint covers
      shared = scene.geometry().intersection(bounding_box)
      return scene.set('overlap_fraction', shared.area().divide(box_area))

  enough_overlap = ee.Filter.gte('overlap_fraction', overlap)
  landsat_filt = landsat.map(_tag_overlap).filter(enough_overlap)

  # the value printed is the number of images left in the filtered collection
  n_scenes = landsat_filt.size().getInfo()
  print('number of days:', n_scenes)
  return landsat, landsat_filt


def get_avail_landsat_dates(landsat_filt):
  '''
  list the distinct acquisition timestamps of the collection (use correct timezone!!)

  landsat_filt: ee.ImageCollection whose 'system:time_start' values are read
  returns: list of 'YYYY-MM-dd HH:mm:ss' strings formatted in the
           America/Los_Angeles timezone
  '''
  def _to_local_string(millis):
      return ee.Date(millis).format('YYYY-MM-dd HH:mm:ss', 'America/Los_Angeles')

  start_times = landsat_filt.aggregate_array('system:time_start')
  local_strings = start_times.map(_to_local_string).distinct()
  return local_strings.getInfo()

In [None]:
# Acquisition window passed to get_landsat_tiles, which forwards it to filterDate.
# NOTE(review): Earth Engine's filterDate treats the end date as exclusive, so
# scenes acquired on 2016-12-31 are presumably left out — confirm this is intended.
start_date = '2016-01-01'
end_date = '2016-12-31'

In [None]:
# Station bounding box -> scenes whose footprint covers at least 0.7 of that
# box -> local acquisition timestamps of those scenes.
bounding_box = get_bounding_box(df)
landsat, landsat_filt = get_landsat_tiles(
    landsat_path="LANDSAT/LC08/C02/T1_L2",
    bounding_box=bounding_box,
    start_date=start_date,
    end_date=end_date,
    overlap=0.7,
)
avail_dates = get_avail_landsat_dates(landsat_filt)
print(avail_dates)

#### Extract data and compute statistics from buffered data

In [None]:
def extract_bands(landsat):
  '''
  mask, rescale and rename the bands of one landsat image and derive spectral indices

  landsat: ee.Image with SR_B*, ST_B10, QA_PIXEL and QA_RADSAT bands and the
           REFLECTANCE_*/TEMPERATURE_* scaling properties
  returns: ee.Image with bands LST, Red, Green, Blue, SWIR1, SWIR2, NDVI, EVI,
           SAVI, MNDWI, NBAI, NDBI (in that order)
  '''
  def _constant_from_properties(name_pattern):
      # one constant band per image property whose name matches the pattern
      matching = landsat.toDictionary().select([name_pattern]).values()
      return ee.Image.constant(matching)

  # Masks come from the untouched QA bands: a pixel is kept only when the five
  # lowest QA_PIXEL bits are all zero and QA_RADSAT is zero.
  keep_clear = landsat.select('QA_PIXEL').bitwiseAnd(0b11111).eq(0)
  keep_unsaturated = landsat.select('QA_RADSAT').eq(0)

  # value * MULT + ADD, with the factors read from the image's own metadata
  gains = _constant_from_properties(
      'REFLECTANCE_MULT_BAND_.|TEMPERATURE_MULT_BAND_ST_B10')
  offsets = _constant_from_properties(
      'REFLECTANCE_ADD_BAND_.|TEMPERATURE_ADD_BAND_ST_B10')
  rescaled = landsat.select('SR_B.|ST_B10').multiply(gains).add(offsets)

  # overwrite the raw bands with the rescaled ones, then apply both masks
  landsat = landsat.addBands(rescaled, None, True)
  landsat = landsat.updateMask(keep_clear).updateMask(keep_unsaturated)

  def _band(source, label):
      return landsat.select(source).rename(label)

  def _norm_diff(first, second, label):
      return landsat.normalizedDifference([first, second]).rename(label)

  # 273.15 is subtracted from the rescaled ST_B10 values
  lst = _band('ST_B10', 'LST').subtract(273.15)
  plain_bands = [
      _band('SR_B4', 'Red'),
      _band('SR_B3', 'Green'),
      _band('SR_B2', 'Blue'),
      _band('SR_B6', 'SWIR1'),
      _band('SR_B7', 'SWIR2'),
  ]

  # normalized-difference indices
  ndvi = _norm_diff('SR_B5', 'SR_B4', 'NDVI')
  ndbi = _norm_diff('SR_B6', 'SR_B5', 'NDBI')
  nbai = _norm_diff('SR_B7', 'SR_B5', 'NBAI')
  mndwi = _norm_diff('SR_B3', 'SR_B6', 'MNDWI')

  # expression-based indices share the same three input bands
  expression_inputs = {
      'NIR': landsat.select('SR_B5'),
      'RED': landsat.select('SR_B4'),
      'BLUE': landsat.select('SR_B2'),
  }
  evi = landsat.expression(
      '2.5 * ((NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1))',
      expression_inputs,
  ).rename('EVI')
  savi = landsat.expression(
      '((NIR - RED) / (NIR + RED + 0.5)) * 1.5',
      expression_inputs,
  ).rename('SAVI')

  return lst.addBands(plain_bands + [ndvi, evi, savi, mndwi, nbai, ndbi])



def compute_buffer_stats(feature, bands_date):
  '''
  compute mean and standard deviation of each band for each buffer size

  feature: GeoJSON-like dict; only its 'geometry' entry (the station point) is read
  bands_date: ee.Image whose bands are reduced
  returns: flat dict mapping '<reducer output key>_<size>m' to the statistic
           computed inside the buffer of that size

  reads the module-level `buffer_sizes` (radii handed to Geometry.buffer)
  '''
  point = ee.Geometry(feature['geometry'])
  reducer = ee.Reducer.mean().combine(ee.Reducer.stdDev(), sharedInputs=True)

  # One reduction per buffer. Reducing over the union of all buffers and
  # copying that single result under every size suffix would make all buffer
  # sizes carry identical statistics.
  per_buffer = {
      str(size): bands_date.reduceRegion(
          reducer=reducer,
          geometry=point.buffer(size),
          scale=30,
          bestEffort=True,
          maxPixels=1e13
      )
      for size in buffer_sizes
  }

  # bundle the reductions so that a single request fetches all of them
  reductions = ee.Dictionary(per_buffer).getInfo()

  stats = {}
  for size in buffer_sizes:
      for key, value in reductions[str(size)].items():
          stats[f"{key}_{size}m"] = value

  return stats


def sample_bands_via_target_location(landsat_filt, df):
  '''
  for every image in the collection, build the band stack (landsat bands, terrain,
  distance to water) and sample buffer statistics at the stations observed that day

  landsat_filt: ee.ImageCollection to iterate over
  df: station table with columns date, longitude, latitude and sid
  returns: list of dicts, one per (image, station row), holding 'sid', 'date' and
           every entry returned by compute_buffer_stats

  reads the module-level `bounding_box`
  '''
  # Layers that do not depend on the scene are built once, not once per image.
  srtm = ee.Image("USGS/SRTMGL1_003").select("elevation").rename("Elevation")
  elevation = srtm.clip(bounding_box)
  slope = ee.Terrain.slope(srtm).rename("Slope").clip(bounding_box)
  # distance to water using ESA WorldCover (water mask = 80)
  # NOTE(review): fastDistanceTransform's defaults presumably give a squared
  # distance in pixel units rather than metres — confirm against the API docs.
  water = ee.Image("ESA/WorldCover/v200/2021").select("Map").eq(80).selfMask()
  water_dist = water.fastDistanceTransform().rename("Water_dist")
  static_bands = [elevation, slope, water_dist]

  total_arr = []
  for i in range(landsat_filt.size().getInfo()):
      image = ee.Image(landsat_filt.toList(1, i).get(0))
      # NOTE(review): no timezone is passed here, unlike get_avail_landsat_dates;
      # confirm the resulting date matches the convention used in df.date.
      datet = ee.Date(image.get('system:time_start')).format('YYYY-MM-dd').getInfo()
      print(datet)

      # df.date is compared as a string; if it holds datetime.date objects,
      # compare against datetime.strptime(datet, '%Y-%m-%d').date() instead
      stations = df[df.date == datet]
      if stations.empty:
          continue  # nothing to sample for this scene

      bands_date = extract_bands(image).addBands(static_bands)

      # parallelise for SPEEEED
      with concurrent.futures.ThreadPoolExecutor() as executor:
          futures = []
          for _, row in stations.iterrows():
              # Build the GeoJSON point locally; uploading a FeatureCollection and
              # fetching it back only returned the values that are already here.
              feature = {
                  'geometry': {
                      'type': 'Point',
                      'coordinates': [float(row['longitude']), float(row['latitude'])],
                  },
              }
              future = executor.submit(compute_buffer_stats, feature, bands_date)
              futures.append((future, row['sid'], str(row['date'])))

          # collect in submission order so rows follow the station order of df
          for future, sid, date in futures:
              stats = future.result()
              total_arr.append({'sid': sid, 'date': date, **stats})
              print('.', end='', flush=True)

  return total_arr

In [None]:
# Buffer radii around each station; compute_buffer_stats reads this
# module-level list and appends "_<size>m" to the resulting column names.
buffer_sizes = [ 30, 100, 200, 500, 1000]
# Run the extraction over every filtered scene and collect one row per
# (station, scene) in a DataFrame.
data_list = sample_bands_via_target_location(landsat_filt, df)
buffer_df = pd.DataFrame(data_list)

#### Merge with target data

In [None]:
# Attach the buffer-derived features to the station table (left join on
# station id and date), then drop every row that contains a missing value
# in any column.
merged_with_buffers = df.merge(buffer_df, how='left', on=['sid', 'date'])
df_la_2016_buffer = merged_with_buffers.dropna()
df_la_2016_buffer

In [None]:
# Correlate the columns from position 31 onwards, plus the target, with each other.
# NOTE(review): the positional slice 31: presumably selects the buffer-derived
# columns — verify whenever the column layout of df changes.
df_numeric = df_la_2016_buffer[list(df_la_2016_buffer.columns[31:])+['temperature']]
corr_matrix = df_numeric.select_dtypes(include='number').corr()

# variables whose absolute correlation with temperature is bigger than 0.14
corr_high = pd.DataFrame(corr_matrix[abs(corr_matrix) > 0.14]['temperature'].dropna().sort_values(ascending=False))
display(corr_high)

In [None]:
# save the merged table (station data + buffer features) to file, without the index column
df_la_2016_buffer.to_csv('/content/drive/2016_LA_merged_scaled_buffer.csv', index=False)