In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import geopandas as gpd

In [None]:
import sys
# Add the parent directory to the module search path. Presumably this is where
# the project-local helper modules imported in the next cell live — TODO confirm.
sys.path.append('..')

In [None]:
import sshutils
import mysecrets
import sqlite_db_utils
import rsutils.s2_grid_utils

In [None]:
# Local path of the datacube catalog database. It is used below as the download
# target and as the `database` argument of the sqlite_db_utils calls.
LOCAL_DATACUBE_CATALOG_DB_PATH = '../data/datacubes/catalog.db'

In [None]:
# Fetch the cluster's datacube catalog database into LOCAL_DATACUBE_CATALOG_DB_PATH.
# NOTE(review): overwrite = True presumably replaces any existing local copy, so
# every run of this cell re-downloads the file — confirm against sshutils.
sshutils.download_file_from_cluster(
    sshcreds = mysecrets.SSH_UMD,
    remotepath = '/gpfs/data1/cmongp2/sasirajann/fetch_satdata/data/datacubes/catalog.db',
    download_filepath = LOCAL_DATACUBE_CATALOG_DB_PATH,
    overwrite = True,
)

In [None]:
# Inspect the downloaded catalog; presumably this returns the table names, which
# are shown as the cell output — TODO confirm against sqlite_db_utils.
sqlite_db_utils.get_tables_in_db(
    database = LOCAL_DATACUBE_CATALOG_DB_PATH,
)

In [None]:
# Load the rows of the 'sentinel-2-l2a' catalog table.
# The columns listed in timestamp_cols are presumably converted to timestamps by
# the helper — TODO confirm. Later cells compare 'last_update' against a tz-aware
# UTC pd.Timestamp, so that column is expected to be tz-aware as well.
s2l2a_datacube_catalog_gdf = sqlite_db_utils.fetch_rows_from_db(
    database = LOCAL_DATACUBE_CATALOG_DB_PATH,
    table = 'sentinel-2-l2a',
    timestamp_cols = ['last_update', 'startdate', 'enddate']
)

In [None]:
# Figure geometry: `scale` is the height in inches, the width is
# `aspect_ratio` times that.
scale = 5
aspect_ratio = 3
fig, ax = plt.subplots(figsize=(scale*aspect_ratio, scale))

# Histogram of `last_update`, restricted to catalog rows updated after
# 2025-02-24 00:00 UTC.
update_cutoff = pd.Timestamp(2025, 2, 24, tz='UTC')
updated_after_cutoff = s2l2a_datacube_catalog_gdf['last_update'] > update_cutoff
g = sns.histplot(
    data = s2l2a_datacube_catalog_gdf[updated_after_cutoff],
    x = 'last_update',
    bins = 100,
    ax = ax,
)

### Checking failed datacubes

In [None]:
# Read the datacube requests that were submitted, then parse both date columns
# with the same string-to-timestamp helper.
inputs_df = pd.read_csv('../data/ethiopia/inference2019_wolayita2020-2021_s2l2a_datacube_input.csv')
for date_col in ('startdate', 'enddate'):
    inputs_df[date_col] = inputs_df[date_col].apply(sqlite_db_utils.str_to_ts)

In [None]:
# A datacube is identified by (roi, startdate, enddate, config_id). Requested
# keys that are absent from the catalog are treated as failed.
datacube_key_cols = ['roi', 'startdate', 'enddate', 'config_id']
requested_keys = set(zip(*(inputs_df[col] for col in datacube_key_cols)))
catalogued_keys = set(zip(*(s2l2a_datacube_catalog_gdf[col] for col in datacube_key_cols)))
failed_indexes = list(requested_keys - catalogued_keys)
failed_indexes

In [None]:
# Select the input rows whose (roi, startdate, enddate, config_id) key is in
# `failed_indexes`, then turn the key levels back into ordinary columns.
index_cols = ['roi', 'startdate', 'enddate', 'config_id']
inputs_by_key = inputs_df.set_index(index_cols)
failed_inputs_df = inputs_by_key.loc[failed_indexes].reset_index()

In [None]:
def get_shapes_gdf(roi:str):
    """Return a one-row GeoDataFrame with the geometry described by ``roi``.

    Parameters
    ----------
    roi : str
        Region identifier of the form ``'<geom_type>=<geom_id>'``. The
        accepted types are ``s2grid`` and ``geom``; only ``s2grid`` is
        implemented.

    Returns
    -------
    geopandas.GeoDataFrame
        Single row whose ``geometry`` is the grid geometry looked up from
        ``geom_id``, with CRS ``epsg:4326``.

    Raises
    ------
    ValueError
        If ``roi`` is not exactly one ``type=id`` pair, or the type is
        neither ``s2grid`` nor ``geom``.
    NotImplementedError
        If the type is ``geom``.
    """
    # Validate the shape of the string before unpacking, so that a missing or
    # repeated '=' produces the descriptive error below instead of a bare
    # tuple-unpacking ValueError.
    parts = roi.split('=')
    if len(parts) != 2 or parts[0] not in ('s2grid', 'geom'):
        raise ValueError(f'Invalid roi={roi}. roi must start with s2grid= or geom=')
    geom_type, geom_id = parts

    if geom_type != 's2grid':
        # 'geom=' ROIs are recognised but not supported yet.
        raise NotImplementedError()

    geom = rsutils.s2_grid_utils.get_grid_geometry_from_id(grid_id = geom_id)

    return gpd.GeoDataFrame(
        data = {'geometry': [geom]},
        crs = 'epsg:4326',
    )

In [None]:
# Attach the ROI geometry to every failed input. get_shapes_gdf builds a one-row
# GeoDataFrame per call, so ['geometry'][0] picks that single geometry (label 0).
failed_inputs_df['geometry'] = failed_inputs_df['roi'].apply(lambda x: get_shapes_gdf(x)['geometry'][0])

In [None]:
# Wrap the table in a GeoDataFrame with CRS epsg:4326, the same CRS
# get_shapes_gdf assigns to the geometries it builds.
failed_inputs_gdf = gpd.GeoDataFrame(failed_inputs_df, crs='epsg:4326')

In [None]:
# Export the failed inputs for inspection. No driver is passed, so the output
# format presumably follows from the .geojson extension — TODO confirm for the
# installed geopandas version.
failed_inputs_gdf.to_file('../data/ethiopia/failed_inputs.geojson')

### Creating training datacube ids list

In [None]:
# Write one datacube id per line, keeping only catalog ids that start with 'geom='.
with open('../data/ethiopia/training_datacube_ids.txt', 'w') as f:
    catalog_ids = s2l2a_datacube_catalog_gdf['id']
    geom_ids = catalog_ids[catalog_ids.str.startswith('geom=')]
    f.writelines(f'{_id}\n' for _id in geom_ids)

### Creating crops.npy

In [None]:
import numpy as np
import geopandas as gpd

In [None]:
# Load the training array and the ids saved alongside it (presumably one id per
# entry of `data` — TODO confirm).
data = np.load('../data/ethiopia/s2l2a_training_data/data.npy')
# allow_pickle=True lets numpy unpickle object arrays; only use it on trusted files.
# The trailing [()] indexes with an empty tuple. Presumably ids.npy stores a Python
# object as a 0-d object array and this unwraps it — TODO confirm.
ids = np.load('../data/ethiopia/s2l2a_training_data/ids.npy', allow_pickle=True)[()]

In [None]:
# Show the dimensions of the loaded training array.
data.shape

In [None]:
# Read the combined training features; the 'id' and 'c_class' columns are used
# below to map each id to its crop class.
training_data_gdf = gpd.read_file('../data/ethiopia/normalised/combined.geojson')

In [None]:
# Preview the id parsing on the first entry: take the segment after the first '=',
# split it on '_' and drop the last three tokens.
ids[0].split('=')[1].split('_')[:-3]

In [None]:
# Display the training ids, for comparison with the tokens parsed from `ids`.
training_data_gdf['id']

In [None]:
# Lookup table from training id to crop class (`c_class`). If an id occurs more
# than once, the last occurrence wins.
id_crop_dict = {
    sample_id: crop_class
    for sample_id, crop_class in zip(training_data_gdf['id'], training_data_gdf['c_class'])
}

In [None]:
# One crop label per entry of `ids`, in the same order. Each datacube id is reduced
# to its training id (segment after the first '=', minus the last three
# '_'-separated tokens) and looked up in `id_crop_dict`. An unknown id raises KeyError.
crops = np.array([
    id_crop_dict['_'.join(datacube_id.split('=')[1].split('_')[:-3])]
    for datacube_id in ids
])

In [None]:
# Save the crop labels in the same directory as data.npy and ids.npy; the label
# order follows `ids`.
np.save('../data/ethiopia/s2l2a_training_data/crops.npy', crops)