In [1]:
import numpy as np
import pandas as pd
import ast

## Prepare Senegal 2022 data in format for MLC
- Exported data is 16 timesteps x 18 bands, start date 01-01-2022. We want to subset to 12 months to match time series length of CropHarvest.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [2]:
senegal = pd.read_csv('../data/datasets/Senegal_CEO_2022.csv')
# Create an empty dataframe for the new data format
senegal_mlc = pd.DataFrame()

nan_count = 0
for idx, row in senegal.iterrows():
    data_string = senegal.loc[idx, 'eo_data']
    # check if nan
    if type(data_string) == float:
#         print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue
    # Converting the string to a list of lists using ast.literal_eval
    list_of_lists = ast.literal_eval(data_string)
    # Converting the list of lists to a multi-dimensional numpy array
    numpy_array = np.array(list_of_lists)
    # subset to feb to feb (12 months)
    # start date is 01-01-2022, end date is 04-30-2023
    numpy_array = numpy_array[1:12+1]
    
    # Convert the numpy array back into a string
    string_array = np.array2string(numpy_array, separator=', ')

    # Since array2string adds additional characters like brackets and line breaks,
    # you might want to format it to look exactly like your original string
    formatted_string = string_array.replace('\n', '').replace(' ', '')
    # skip uncertain labels
    class_label = senegal.loc[idx, 'class_probability']
    if class_label != 0 and class_label != 1:
        continue
    else:
        senegal_mlc.loc[idx, 'label'] = class_label
        senegal_mlc.loc[idx, 'lat'] = senegal.loc[idx, 'lat']
        senegal_mlc.loc[idx, 'lon'] = senegal.loc[idx, 'lon']
    senegal_mlc.loc[idx, 'eo_data'] = formatted_string

print('Number of nans skipped (disagreement points) = %d' % nan_count)

Number of nans skipped (disagreement points) = 158


In [3]:
senegal_mlc

Unnamed: 0,label,lat,lon,eo_data
0,0.0,15.033306,-16.937735,"[[-1.76433033e+01,-2.25110991e+01,2.46100000e+..."
1,0.0,15.805857,-14.934492,"[[-2.17139820e+01,-2.99819205e+01,2.70000000e+..."
2,0.0,16.192133,-14.772795,"[[-3.18851894e+01,-2.54796631e+01,2.81900000e+..."
3,0.0,15.015340,-13.173794,"[[-1.31223512e+01,-2.21594728e+01,2.61800000e+..."
4,0.0,14.799744,-15.329750,"[[-2.40862310e+01,-2.52919268e+01,2.72400000e+..."
...,...,...,...,...
1495,0.0,12.841417,-15.841790,"[[-1.06019017e+01,-1.62267535e+01,2.13700000e+..."
1496,0.0,12.491074,-12.688703,"[[-1.22743027e+01,-1.62596847e+01,2.24800000e+..."
1497,1.0,13.766682,-13.613968,"[[-1.79771286e+01,-2.97583562e+01,2.87800000e+..."
1498,1.0,13.227693,-14.242789,"[[-1.68223114e+01,-2.33634491e+01,2.48900000e+..."


In [4]:
# Question: should we balance? should we even use this set for the competition?
senegal_mlc['label'].value_counts()

0.0    1235
1.0     105
Name: label, dtype: int64

In [13]:
# Save to file
senegal_mlc.to_csv('/Users/hkerner/data/mlcommons/cropharvest/test-senegal.csv')

## Prepare Tigray 2021 data in format for MLC
- Exported data is 24 timesteps x 18 bands, start date 01-01-2021. We want to do 12 months starting March 2021.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [5]:
tigray2021 = pd.read_csv('../data/datasets/Ethiopia_Tigray_2021.csv')
# Create an empty dataframe for the new data format
tigray2021_mlc = pd.DataFrame()

nan_count = 0
for idx, row in tigray2021.iterrows():
    data_string = tigray2021.loc[idx, 'eo_data']
    # check if nan
    if type(data_string) == float:
#         print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue
    # Converting the string to a list of lists using ast.literal_eval
    list_of_lists = ast.literal_eval(data_string)
    # Converting the list of lists to a multi-dimensional numpy array
    numpy_array = np.array(list_of_lists)
    # subset to feb to feb (12 months)
    # start date is 01-01-2021
    numpy_array = numpy_array[1:12+1]
    
    # Convert the numpy array back into a string
    string_array = np.array2string(numpy_array, separator=', ')

    # Since array2string adds additional characters like brackets and line breaks,
    # you might want to format it to look exactly like your original string
    formatted_string = string_array.replace('\n', '').replace(' ', '')
    # skip uncertain labels
    class_label = tigray2021.loc[idx, 'class_probability']
    if class_label != 0 and class_label != 1:
        continue
    else:
        tigray2021_mlc.loc[idx, 'label'] = class_label
        tigray2021_mlc.loc[idx, 'lat'] = tigray2021.loc[idx, 'lat']
        tigray2021_mlc.loc[idx, 'lon'] = tigray2021.loc[idx, 'lon']
    tigray2021_mlc.loc[idx, 'eo_data'] = formatted_string

print('Number of nans skipped (disagreement points) = %d' % nan_count)

Number of nans skipped (disagreement points) = 168


In [6]:
tigray2021_mlc

Unnamed: 0,label,lat,lon,eo_data
0,0.0,13.254093,39.9645,"[[-1.76221770e+00,-8.27914225e+00,1.50500000e+..."
1,0.0,14.219648,37.0535,"[[-3.11058206e+00,-9.87578374e+00,1.34700000e+..."
4,0.0,14.494778,38.7575,"[[-1.27793761e+01,-2.01185000e+01,1.36500000e+..."
5,0.0,12.354055,39.8225,"[[-6.83410548e+00,-9.96168927e+00,1.20500000e+..."
6,0.0,13.668388,37.8345,"[[-8.94170731e+00,-1.57011376e+01,1.15500000e+..."
...,...,...,...,...
881,0.0,14.288462,36.6275,"[[-1.14470958e+01,-2.01212620e+01,1.45300000e+..."
882,1.0,12.908305,36.9825,"[[-9.19253733e+00,-1.55409617e+01,1.27600000e+..."
883,0.0,14.219648,39.4675,"[[3.17237204e-01,-1.02145322e+01,1.16900000e+0..."
884,0.0,14.563507,37.2665,"[[-1.07267844e+01,-1.96903184e+01,1.51100000e+..."


In [7]:
tigray2021_mlc['label'].value_counts()

0.0    475
1.0    181
Name: label, dtype: int64

In [12]:
# Save to file
tigray2021_mlc.to_csv('/Users/hkerner/data/mlcommons/cropharvest/test-tigray2021.csv')

## Prepare Tigray 2020 data in format for MLC
- Exported data is 24 timesteps x 18 bands, start date 01-01-2021. We want to do 12 months starting March 2021.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [8]:
tigray2020 = pd.read_csv('../data/datasets/Ethiopia_Tigray_2020.csv')
# Create an empty dataframe for the new data format
tigray2020_mlc = pd.DataFrame()

nan_count = 0
for idx, row in tigray2020.iterrows():
    data_string = tigray2020.loc[idx, 'eo_data']
    # check if nan
    if type(data_string) == float:
#         print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue
    # Converting the string to a list of lists using ast.literal_eval
    list_of_lists = ast.literal_eval(data_string)
    # Converting the list of lists to a multi-dimensional numpy array
    numpy_array = np.array(list_of_lists)
    # subset to march to march (12 months)
    # start date is 01-01-2020
    numpy_array = numpy_array[2:2+12]
    
    # Convert the numpy array back into a string
    string_array = np.array2string(numpy_array, separator=', ')

    # Since array2string adds additional characters like brackets and line breaks,
    # you might want to format it to look exactly like your original string
    formatted_string = string_array.replace('\n', '').replace(' ', '')
    # skip uncertain labels
    class_label = tigray2020.loc[idx, 'class_probability']
    if class_label != 0 and class_label != 1:
        continue
    else:
        tigray2020_mlc.loc[idx, 'label'] = class_label
        tigray2020_mlc.loc[idx, 'lat'] = tigray2020.loc[idx, 'lat']
        tigray2020_mlc.loc[idx, 'lon'] = tigray2020.loc[idx, 'lon']
    tigray2020_mlc.loc[idx, 'eo_data'] = formatted_string

print('Number of nans skipped (disagreement points) = %d' % nan_count)

Number of nans skipped (disagreement points) = 173


In [9]:
tigray2020_mlc

Unnamed: 0,label,lat,lon,eo_data
0,0.0,14.767569,39.679153,"[[-3.40507540e+00,-1.19441144e+01,1.33200000e+..."
1,0.0,14.422460,39.469386,"[[-1.01614959e+01,-1.50998810e+01,1.05700000e+..."
2,0.0,14.152642,39.965847,"[[-1.37047025e+01,-1.96595139e+01,1.57200000e+..."
3,0.0,13.733266,39.463458,"[[-7.53476129e+00,-1.52523661e+01,1.16300000e+..."
4,1.0,13.388338,39.328105,"[[-1.02791819e+01,-2.28819453e+01,1.08400000e+..."
...,...,...,...,...
1195,0.0,12.283548,37.972997,"[[-1.29620620e+01,-2.10993888e+01,1.16900000e+..."
1196,0.0,12.768717,37.761434,"[[-1.35332022e+01,-2.75207233e+01,1.21100000e+..."
1197,0.0,13.043966,37.759098,"[[-1.91325054e+00,-1.07038918e+01,1.16900000e+..."
1198,1.0,13.321531,37.765566,"[[-1.01801434e+01,-2.09272196e+01,1.24800000e+..."


In [10]:
tigray2020_mlc['label'].value_counts()

0.0    736
1.0    291
Name: label, dtype: int64

In [11]:
# Save to file
tigray2020_mlc.to_csv('/Users/hkerner/data/mlcommons/cropharvest/test-tigray2020.csv')