In [1]:
import os
import pandas as pd
import shapely
import ast
import geopandas as gpd

In [2]:
import sys
sys.path.append('..')

In [3]:
import rsutils.s2_grid_utils

In [4]:
YUVAL_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-wheat_1_EOS_1_PFS_1_train'
YUVAL_2021 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2021-03-01_2022-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train'
PUBLIC_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train_eth'

In [5]:
data_folderpaths_and_source_name = [
    (YUVAL_2020, 'yuval2020'),
    (YUVAL_2021, 'yuval2021'),
    (PUBLIC_2020, 'public2020'),
]

In [6]:
common_cols = None
combined_df = None
for folderpath, source in data_folderpaths_and_source_name:
    for filename in os.listdir(folderpath):
        filepath = os.path.join(folderpath, filename)
        _df = pd.read_csv(filepath)
        _df['folderpath'] = folderpath
        _df['source'] = source
        if common_cols is None:
            common_cols = set(_df.columns)
        common_cols = common_cols & set(_df.columns)
        if combined_df is None:
            combined_df = _df
        else:
            combined_df = pd.concat([combined_df, _df])

In [7]:
combined_df = combined_df.reset_index(drop=True)

In [8]:
combined_df['geometry'] = combined_df['.geo'].apply(lambda x: shapely.geometry.shape(ast.literal_eval(x)))

In [9]:
combined_gdf = gpd.GeoDataFrame(combined_df, crs='epsg:4326')

In [10]:
def get_s2_id(shape, res=30):
    x, y = shape.centroid.xy
    x, y = x[0], y[0]
    return rsutils.s2_grid_utils.get_id_from_latlon(
        lat = y, lon = x, res = res
    )

In [11]:
combined_gdf['s2_grid_id'] = combined_gdf['geometry'].apply(get_s2_id)

In [12]:
combined_gdf[['source', 'geometry', 's2_grid_id']].value_counts()

source      geometry                                           s2_grid_id      
public2020  POLYGON ((37.4596 10.55862, 37.45961 10.55862,...  164fe7d22ef1bd7b    2
            POLYGON ((40.06101 7.09661, 40.06101 7.09661, ...  17ca017e7b53194d    2
yuval2021   POLYGON ((38.89654 8.21337, 38.89734 8.21327, ...  17b4b136280acb51    2
            POLYGON ((38.71701 7.9797, 38.71756 7.97952, 3...  17b49ebdbdd49437    2
public2020  POLYGON ((37.80667 11.04923, 37.80668 11.04922...  16456d62577ebbd7    2
                                                                                  ..
            POLYGON ((38.10953 10.74947, 38.10954 10.74946...  1645879f3e9b4065    1
            POLYGON ((38.13726 10.75094, 38.13726 10.75094...  164587dcb264aca5    1
            POLYGON ((38.16417 10.70695, 38.16418 10.70695...  164f62bf47ddb6a5    1
            POLYGON ((38.18042 10.67223, 38.18043 10.67222...  164f630e3e241ab5    1
yuval2021   POLYGON ((38.67083 7.7359, 38.67086 7.73588, 3...  17b47f9

In [13]:
combined_gdf.shape

(3155, 249)

In [14]:
combined_gdf = combined_gdf.loc[combined_gdf[['source', 'geometry']].drop_duplicates().index]

In [15]:
combined_df = combined_df.reset_index(drop=True)

In [16]:
combined_gdf.shape

(3138, 249)

In [17]:
combined_gdf['source'].value_counts()

source
public2020    2788
yuval2021      183
yuval2020      167
Name: count, dtype: int64

In [18]:
non_band_columns = [
    'FID', 'c_class', 'crop_number', 'x',
    'y', '.geo', 'folderpath', 'source',
    'id', 'c_group', 'c_sbcls', 'comment',
    'id_c_cl', 'id_c_gr', 'id_c_sb', 'id_src1',
    'id_src2', 'id_src3', 'lat', 'lnd_cvr',
    'locatin', 'long', 'quality', 'sorc_nm',
    'sub_dat', 'geometry'
]

In [19]:
combined_gdf[non_band_columns].isna().sum().sort_values(ascending=True)

geometry          0
c_class           0
crop_number       0
.geo              0
folderpath        0
source            0
id              167
sorc_nm         350
quality         350
long            350
locatin         350
lnd_cvr         350
lat             350
id_c_cl         350
sub_dat         350
c_group         350
id_c_gr         350
id_src1        1875
id_src3        2342
id_src2        2409
y              2971
x              2971
FID            2971
id_c_sb        3062
c_sbcls        3062
comment        3138
dtype: int64

In [20]:
important_cols = [
    'c_class', 'source', 'geometry', 's2_grid_id'
]

In [21]:
selected_source = 'public2020'

In [22]:
selected_gdf = combined_gdf[combined_gdf['source']==selected_source][important_cols]

In [23]:
selected_gdf['id'] = selected_gdf['source'] + '_' + selected_gdf['s2_grid_id']

In [24]:
selected_gdf

Unnamed: 0,c_class,source,geometry,s2_grid_id,id
356,wheat,public2020,"POLYGON ((39.37279 9.61808, 39.37279 9.61808, ...",16499e920ea0c7f3,public2020_16499e920ea0c7f3
357,wheat,public2020,"POLYGON ((38.9196 9.79826, 38.91961 9.79826, 3...",16493ebb872f7a3d,public2020_16493ebb872f7a3d
358,wheat,public2020,"POLYGON ((38.89899 9.81235, 38.89899 9.81234, ...",16493955fdaec78d,public2020_16493955fdaec78d
359,wheat,public2020,"POLYGON ((37.92765 10.2445, 37.92765 10.2445, ...",164fa97b0eb8c749,public2020_164fa97b0eb8c749
360,wheat,public2020,"POLYGON ((37.05984 10.59111, 37.05985 10.59111...",16501617becc5c71,public2020_16501617becc5c71
...,...,...,...,...,...
3150,wheat,public2020,"POLYGON ((39.67921 9.91053, 39.67922 9.91052, ...",1649cac445c99695,public2020_1649cac445c99695
3151,wheat,public2020,"POLYGON ((39.67057 9.92064, 39.67057 9.92064, ...",1649cab5ba2946c5,public2020_1649cab5ba2946c5
3152,wheat,public2020,"POLYGON ((39.64905 9.9254, 39.64906 9.9254, 39...",1649b554e19478bd,public2020_1649b554e19478bd
3153,wheat,public2020,"POLYGON ((39.63647 9.93316, 39.63647 9.93316, ...",16484ab155b70d81,public2020_16484ab155b70d81


In [25]:
export_folderpath = '../data/ethiopia/normalised'
os.makedirs(export_folderpath, exist_ok=True)

In [26]:
selected_gdf.to_file(os.path.join(export_folderpath, f'{selected_source}.geojson'))