In [1]:
import os
import pandas as pd
import shapely
import ast
import geopandas as gpd

In [2]:
import sys
sys.path.append('..')

In [3]:
import rsutils.s2_grid_utils

In [4]:
YUVAL_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-wheat_1_EOS_1_PFS_1_train'
YUVAL_2021 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2021-03-01_2022-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train'
PUBLIC_2020 = '../data/ethiopia/S2L1COnlyCloudProb_ethiopia_2020-03-01_2021-01-31_10m_20Days_allBands_CIMMYT-maize_1_EOS_1_PFS_1_train_eth'

In [5]:
data_folderpaths_and_source_name = [
    (YUVAL_2020, 'yuval2020'),
    (YUVAL_2021, 'yuval2021'),
    (PUBLIC_2020, 'public2020'),
]

In [8]:
common_cols = None
combined_df = None
for folderpath, source in data_folderpaths_and_source_name:
    for filename in os.listdir(folderpath):
        if not filename.endswith('.csv'):
            continue
        filepath = os.path.join(folderpath, filename)
        _df = pd.read_csv(filepath)
        _df['folderpath'] = folderpath
        _df['source'] = source
        if common_cols is None:
            common_cols = set(_df.columns)
        common_cols = common_cols & set(_df.columns)
        if combined_df is None:
            combined_df = _df
        else:
            combined_df = pd.concat([combined_df, _df])

In [9]:
combined_df = combined_df.reset_index(drop=True)

In [10]:
combined_df['geometry'] = combined_df['.geo'].apply(lambda x: shapely.geometry.shape(ast.literal_eval(x)))

In [11]:
combined_gdf = gpd.GeoDataFrame(combined_df, crs='epsg:4326')

In [12]:
non_point_indexes = combined_gdf[combined_gdf.type != 'Point'].index
combined_gdf.loc[non_point_indexes, 'geometry'] = combined_gdf.loc[non_point_indexes, 'geometry'].apply(lambda x: x.buffer(0))

In [13]:
def get_s2_id(shape, res=30):
    x, y = shape.centroid.xy
    x, y = x[0], y[0]
    return rsutils.s2_grid_utils.get_id_from_latlon(
        lat = y, lon = x, res = res
    )

In [14]:
combined_gdf['s2_grid_id'] = combined_gdf['geometry'].apply(get_s2_id)

In [15]:
combined_gdf[['source', 'geometry', 's2_grid_id']].value_counts()

source      geometry                                           s2_grid_id      
yuval2021   POLYGON ((38.71701 7.9797, 38.71701 7.97973, 3...  17b49ebdbdd49437    3
            POLYGON ((38.80696 8.25267, 38.80704 8.25283, ...  17b4adef01e7a92f    3
            POLYGON ((38.76867 8.17473, 38.7687 8.17488, 3...  17b4af7d59cd634d    3
            POLYGON ((38.89618 8.22457, 38.89618 8.22459, ...  17b4b151a8fca7d5    3
            POLYGON ((38.89654 8.21337, 38.89661 8.21384, ...  17b4b136280acb51    3
                                                                                  ..
public2020  POLYGON ((38.10953 10.74947, 38.10953 10.74947...  1645879f3e9b4065    1
            POLYGON ((38.13726 10.75094, 38.13726 10.75095...  164587dcb264aca5    1
            POLYGON ((38.16417 10.70695, 38.16417 10.70696...  164f62bf47ddb6a5    1
            POLYGON ((38.18042 10.67223, 38.18042 10.67223...  164f630e3e241ab5    1
            POLYGON ((39.24192 8.16732, 39.24192 8.16733, ...  17b4d6c

In [16]:
combined_gdf.shape

(3338, 250)

In [17]:
combined_gdf = combined_gdf.loc[combined_gdf[['source', 'geometry']].drop_duplicates().index]

In [18]:
combined_df = combined_df.reset_index(drop=True)

In [19]:
combined_gdf.shape

(3138, 250)

In [20]:
combined_gdf['source'].value_counts()

source
public2020    2788
yuval2021      183
yuval2020      167
Name: count, dtype: int64

In [21]:
non_band_columns = [
    'FID', 'c_class', 'crop_number', 'x',
    'y', '.geo', 'folderpath', 'source',
    'id', 'c_group', 'c_sbcls', 'comment',
    'id_c_cl', 'id_c_gr', 'id_c_sb', 'id_src1',
    'id_src2', 'id_src3', 'lat', 'lnd_cvr',
    'locatin', 'long', 'quality', 'sorc_nm',
    'sub_dat', 'geometry'
]

In [22]:
combined_gdf[non_band_columns].isna().sum().sort_values(ascending=True)

geometry          0
c_class           0
crop_number       0
.geo              0
folderpath        0
source            0
id              167
sorc_nm         350
quality         350
long            350
locatin         350
lnd_cvr         350
lat             350
id_c_cl         350
sub_dat         350
c_group         350
id_c_gr         350
id_src1        1875
id_src3        2342
id_src2        2409
y              2971
x              2971
FID            2971
id_c_sb        3062
c_sbcls        3062
comment        3138
dtype: int64

In [23]:
important_cols = [
    'c_class', 'source', 'geometry', 's2_grid_id'
]

In [24]:
# selected_source = 'yuval2021'

In [25]:
# selected_gdf = combined_gdf[combined_gdf['source']==selected_source][important_cols]

In [26]:
# selected_gdf['id'] = selected_gdf['source'] + '_' + selected_gdf['s2_grid_id']

In [27]:
# selected_gdf

In [28]:
# export_folderpath = '../data/ethiopia/normalised'
# os.makedirs(export_folderpath, exist_ok=True)

In [29]:
# selected_gdf.to_file(os.path.join(export_folderpath, f'{selected_source}.geojson'))

In [30]:
# selected_gdf

In [31]:
combined_gdf = combined_gdf[important_cols]

In [32]:
combined_gdf

Unnamed: 0,c_class,source,geometry,s2_grid_id
0,wheat,yuval2020,POINT (39.26022 8.28079),164b2b14e558ac41
1,wheat,yuval2020,POINT (39.26197 8.30771),164b2bbc3f43010f
2,wheat,yuval2020,POINT (39.26895 8.27533),164b2b22d273a99b
3,wheat,yuval2020,POINT (39.27599 8.31466),164b2bcba373b0f7
4,wheat,yuval2020,POINT (39.28384 8.32974),164b2980ced1795b
...,...,...,...,...
3333,wheat,public2020,"POLYGON ((39.67921 9.91053, 39.67921 9.91053, ...",1649cac445c99695
3334,wheat,public2020,"POLYGON ((39.67057 9.92064, 39.67057 9.92065, ...",1649cab5ba2946c5
3335,wheat,public2020,"POLYGON ((39.64905 9.9254, 39.64905 9.92541, 3...",1649b554e19478bd
3336,wheat,public2020,"POLYGON ((39.63647 9.93316, 39.63647 9.93317, ...",16484ab155b70d81


In [33]:
3138 / 8 * 80 / 60 / 60

8.716666666666667

In [34]:
point_indexes = combined_gdf[combined_gdf['geometry'].type == 'Point'].index

In [35]:
point_indexes = combined_gdf[combined_gdf['geometry'].type == 'Point'].index

In [36]:
point_to_polygon_buffer = 0.0005 / 4.5

In [37]:
combined_gdf.loc[point_indexes, 'geometry'] = combined_gdf.loc[point_indexes, 'geometry'].buffer(point_to_polygon_buffer)


  combined_gdf.loc[point_indexes, 'geometry'] = combined_gdf.loc[point_indexes, 'geometry'].buffer(point_to_polygon_buffer)


In [38]:
y_2020_gdf = combined_gdf[
    combined_gdf['source'].isin([
        'yuval2020', 'public2020'
    ])
]

In [39]:
y_2020_gdf.shape[0] // 6

492

In [40]:
y_2020_gdf.to_file('../data/ethiopia/normalised/2020.geojson')

In [42]:
combined_gdf['id'] = combined_gdf['source'] + '_' + combined_gdf['s2_grid_id']

In [43]:
combined_gdf.to_file('../data/ethiopia/normalised/combined.geojson')