In [1]:
import os
import pandas as pd
import shapely
import ast
import geopandas as gpd

In [2]:
import sys
# Put the parent directory on the module search path; presumably this is what
# lets the project-local `rsutils` package be imported from this notebook.
sys.path.append('..')

In [3]:
import rsutils.s2_grid_utils

In [4]:
# Input folders, one per label source; every file inside each folder is read
# as a CSV when the data is loaded.
# NOTE(review): the folder names mix 'CIMMYT-wheat' and 'CIMMYT-maize' --
# confirm each constant points at the intended export.
YUVAL_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-wheat_1_EOS_1_PFS_1_train'
YUVAL_2021 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2021-03-01_2022-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train'
PUBLIC_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train_eth'

In [5]:
# (folder path, source label) pairs. The label is written into the 'source'
# column of every row read from that folder.
data_folderpaths_and_source_name = [
    (YUVAL_2020, 'yuval2020'),
    (YUVAL_2021, 'yuval2021'),
    (PUBLIC_2020, 'public2020'),
]

In [6]:
# Read every file of every source folder into one DataFrame.
#
# Each frame is tagged with the folder it came from and its source label.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies the accumulated frame on every file.
# Filenames are sorted because os.listdir returns them in arbitrary order,
# which would make the row order differ between runs/machines.
#
# Results:
#   common_cols -- set of column names present in every file read (None if no
#                  file was read).
#   combined_df -- all rows stacked, each file keeping its own row labels
#                  (None if no file was read).
common_cols = None
_frames = []
for folderpath, source in data_folderpaths_and_source_name:
    for filename in sorted(os.listdir(folderpath)):
        filepath = os.path.join(folderpath, filename)
        _df = pd.read_csv(filepath)
        _df['folderpath'] = folderpath
        _df['source'] = source
        _frame_cols = set(_df.columns)
        common_cols = _frame_cols if common_cols is None else common_cols & _frame_cols
        _frames.append(_df)

combined_df = pd.concat(_frames) if _frames else None

In [7]:
# The concatenated frame keeps each file's own row labels, so labels repeat
# across files. Replace them with a fresh 0..n-1 index; the label-based `.loc`
# selections on the derived GeoDataFrame depend on labels being unique.
combined_df = combined_df.reset_index(drop=True)

In [8]:
# Turn each '.geo' string into a shapely geometry: the text is evaluated as a
# Python literal and the resulting mapping is handed to shapely.
# NOTE(review): ast.literal_eval rejects JSON-only tokens (true/false/null);
# if '.geo' holds GeoJSON text that can contain them, json.loads is the safer
# parser -- confirm against the input files.
combined_df['geometry'] = [
    shapely.geometry.shape(ast.literal_eval(geo_text))
    for geo_text in combined_df['.geo']
]

In [9]:
# Wrap the combined frame in a GeoDataFrame and declare its coordinates as
# EPSG:4326 (assumes the parsed '.geo' coordinates are lon/lat -- TODO confirm).
combined_gdf = gpd.GeoDataFrame(combined_df, crs='epsg:4326')

In [10]:
# Rows whose geometry is anything other than a Point get a zero-distance
# buffer applied (presumably to clean up invalid polygon rings -- confirm).
# Point rows are left untouched.
non_point_indexes = combined_gdf[combined_gdf.type != 'Point'].index
_buffered = combined_gdf.loc[non_point_indexes, 'geometry'].apply(
    lambda geom: geom.buffer(0)
)
combined_gdf.loc[non_point_indexes, 'geometry'] = _buffered

In [11]:
def get_s2_id(shape, res=30):
    """Return the S2 grid id for the centroid of ``shape``.

    Parameters
    ----------
    shape
        Geometry exposing ``.centroid.xy`` (x and y coordinate sequences).
    res
        Passed through unchanged as ``res`` to
        ``rsutils.s2_grid_utils.get_id_from_latlon``.

    Returns
    -------
    Whatever ``get_id_from_latlon`` returns for the centroid, with the
    centroid's y used as latitude and its x as longitude.
    """
    xs, ys = shape.centroid.xy
    # A centroid is a single point, so only the first coordinate is needed.
    lon = xs[0]
    lat = ys[0]
    return rsutils.s2_grid_utils.get_id_from_latlon(lat=lat, lon=lon, res=res)

In [12]:
# Tag every row with the S2 grid id of its geometry's centroid
# (get_s2_id is called with its default res).
combined_gdf['s2_grid_id'] = combined_gdf['geometry'].apply(get_s2_id)

In [None]:
# Number of rows per distinct (source, geometry, s2_grid_id) combination;
# any count above 1 is a repeated sample.
combined_gdf[['source', 'geometry', 's2_grid_id']].value_counts()

In [None]:
# (rows, columns) before de-duplication.
combined_gdf.shape

In [15]:
# Keep only the first row of every distinct (source, geometry) pair; all other
# columns of the surviving rows are retained.
combined_gdf = combined_gdf[
    ~combined_gdf.duplicated(subset=['source', 'geometry'])
]

In [16]:
# Dropping duplicate rows leaves gaps in combined_gdf's index, so renumber it
# 0..n-1. (This previously reset `combined_df`, which had already been reset
# right after loading, leaving the de-duplicated GeoDataFrame untouched.)
combined_gdf = combined_gdf.reset_index(drop=True)

In [None]:
# (rows, columns) after de-duplication.
combined_gdf.shape

In [None]:
# Remaining rows per source label.
combined_gdf['source'].value_counts()

In [19]:
# Columns that, going by the variable name, are metadata rather than band
# values. All of them must exist in combined_gdf, since the list is used to
# select columns when checking for missing values.
non_band_columns = [
    'FID', 'c_class', 'crop_number', 'x',
    'y', '.geo', 'folderpath', 'source',
    'id', 'c_group', 'c_sbcls', 'comment',
    'id_c_cl', 'id_c_gr', 'id_c_sb', 'id_src1',
    'id_src2', 'id_src3', 'lat', 'lnd_cvr',
    'locatin', 'long', 'quality', 'sorc_nm',
    'sub_dat', 'geometry'
]

In [None]:
# Missing-value count for each non-band column, fewest missing first.
(
    combined_gdf[non_band_columns]
    .isna()
    .sum()
    .sort_values(ascending=True)
)

In [21]:
# Columns carried over into the per-source selection that gets exported.
important_cols = [
    'c_class', 'source', 'geometry', 's2_grid_id'
]

In [52]:
# Source label to export; it is compared against the 'source' column and also
# used as the output file's base name.
selected_source = 'yuval2021'

In [53]:
# Rows of the chosen source, restricted to the columns worth exporting.
# A single .loc call plus an explicit copy replaces the chained
# `[...][...]` indexing: the result gets a new column assigned afterwards, and
# assigning into a chained-indexing result triggers pandas'
# SettingWithCopyWarning.
selected_gdf = combined_gdf.loc[
    combined_gdf['source'] == selected_source, important_cols
].copy()

In [54]:
# Sample identifier of the form '<source>_<s2_grid_id>'.
# NOTE(review): rows were de-duplicated on geometry, not on s2_grid_id, so two
# distinct geometries falling in the same grid cell would share an id --
# confirm whether ids are expected to be unique.
_source_prefix = selected_gdf['source'] + '_'
selected_gdf['id'] = _source_prefix + selected_gdf['s2_grid_id']

In [None]:
# Display the selection, including the newly built 'id' column.
selected_gdf

In [56]:
# Folder the exported file is written to; created (with parents) if missing,
# and left alone if it already exists.
export_folderpath = '../data/ethiopia/normalised'
os.makedirs(export_folderpath, exist_ok=True)

In [57]:
# Write the selection to '<export_folderpath>/<selected_source>.geojson'.
# The driver is named explicitly so the output format does not depend on
# geopandas inferring it from the '.geojson' extension.
export_filepath = os.path.join(export_folderpath, f'{selected_source}.geojson')
selected_gdf.to_file(export_filepath, driver='GeoJSON')

In [None]:
# Display the frame that was just written to disk.
selected_gdf

In [None]:
# Same display as the previous cell (redundant; candidate for removal).
selected_gdf