In [1]:
import numpy as np
import pandas as pd
import ast

## Prepare Senegal 2022 data in format for MLC
- Exported data is 16 timesteps x 18 bands, start date 01-01-2022. We want to subset to 12 months to match time series length of CropHarvest.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [2]:
# Reformat the Senegal 2022 export for MLC: keep only points with a unanimous
# label (class_probability exactly 0 or 1) and re-serialise a 12-timestep
# window of each point's EO time series as a compact string.
senegal = pd.read_csv('../data/datasets/Senegal_CEO_2022.csv')

# Collect kept rows in plain lists and build the frame once at the end;
# growing a DataFrame one `.loc` assignment at a time copies it repeatedly.
kept_index = []
kept_labels = []
kept_eo_data = []

nan_count = 0
for idx, data_string, class_label in zip(
    senegal.index, senegal['eo_data'], senegal['class_probability']
):
    # Rows with no exported EO data hold NaN instead of a string.
    # pd.isna also recognises numpy float NaNs, which `type(x) == float` misses.
    if pd.isna(data_string):
        print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue

    # Skip uncertain labels before doing any parsing work for the row.
    if class_label not in (0, 1):
        continue

    # The cell stores a Python-literal list of lists; parse it into an array.
    numpy_array = np.array(ast.literal_eval(data_string))

    # Start date is 01-01-2022, end date is 04-30-2023 (16 timesteps).
    # Dropping the first two and last two leaves the 12 timesteps from
    # March 2022 onward (assuming one timestep per month).
    numpy_array = numpy_array[2:-2]

    # Serialise back to a string. `threshold` is set explicitly so numpy never
    # abbreviates the array with '...' (it does so by default above 1000
    # elements), which would silently drop data.
    # NOTE(review): numpy's default print precision (8 digits) still applies,
    # so values are rounded on the way out - confirm that is acceptable.
    string_array = np.array2string(
        numpy_array, separator=', ', threshold=max(numpy_array.size, 1)
    )
    # array2string inserts line breaks and padding spaces; strip them so the
    # result is a single compact bracketed string.
    formatted_string = string_array.replace('\n', '').replace(' ', '')

    kept_index.append(idx)
    kept_labels.append(class_label)
    kept_eo_data.append(formatted_string)

# Rows keep their original index from `senegal`; labels are stored as floats.
senegal_mlc = pd.DataFrame(
    {'label': kept_labels, 'eo_data': kept_eo_data},
    index=kept_index,
).astype({'label': 'float64'})

print('Number of nans skipped = %d' % nan_count)

skipping nan at idx 25
skipping nan at idx 49
skipping nan at idx 52
skipping nan at idx 65
skipping nan at idx 67
skipping nan at idx 68
skipping nan at idx 76
skipping nan at idx 80
skipping nan at idx 106
skipping nan at idx 130
skipping nan at idx 132
skipping nan at idx 134
skipping nan at idx 145
skipping nan at idx 147
skipping nan at idx 158
skipping nan at idx 159
skipping nan at idx 162
skipping nan at idx 165
skipping nan at idx 167
skipping nan at idx 169
skipping nan at idx 170
skipping nan at idx 172
skipping nan at idx 176
skipping nan at idx 177
skipping nan at idx 178
skipping nan at idx 190
skipping nan at idx 197
skipping nan at idx 203
skipping nan at idx 220
skipping nan at idx 228
skipping nan at idx 261
skipping nan at idx 271
skipping nan at idx 305
skipping nan at idx 306
skipping nan at idx 307
skipping nan at idx 319
skipping nan at idx 322
skipping nan at idx 336
skipping nan at idx 355
skipping nan at idx 382
skipping nan at idx 383
skipping nan at idx 389


In [3]:
# Preview the reformatted Senegal frame (columns: label, eo_data).
senegal_mlc

Unnamed: 0,label,eo_data
0,0.0,"[[-1.42355510e+01,-2.15492665e+01,2.63500000e+..."
1,0.0,"[[-2.44485617e+01,-2.88332132e+01,2.89100000e+..."
2,0.0,"[[-2.38402360e+01,-2.93729312e+01,3.01000000e+..."
3,0.0,"[[-1.47781415e+01,-1.87598883e+01,2.85700000e+..."
4,0.0,"[[-1.96500718e+01,-2.58454098e+01,2.93000000e+..."
...,...,...
1495,0.0,"[[-1.14005453e+01,-1.74335046e+01,2.51100000e+..."
1496,0.0,"[[-1.00196942e+01,-1.59960228e+01,2.35000000e+..."
1497,1.0,"[[-1.77268047e+01,-3.07639144e+01,2.84400000e+..."
1498,1.0,"[[-1.60961355e+01,-2.38341798e+01,2.72400000e+..."


In [4]:
# Question: should we balance? should we even use this set for the competition?
# Count rows per label value to see how imbalanced the kept points are.
senegal_mlc['label'].value_counts()

0.0    1235
1.0     105
Name: label, dtype: int64

## Prepare Tigray 2021 data in format for MLC
- Exported data is 24 timesteps x 18 bands, start date 01-01-2021. We want to do 12 months starting March 2021.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [5]:
# Reformat the Tigray 2021 export for MLC: keep only points with a unanimous
# label (class_probability exactly 0 or 1) and re-serialise a 12-timestep
# window of each point's EO time series as a compact string.
tigray2021 = pd.read_csv('../data/datasets/Ethiopia_Tigray_2021.csv')

# Collect kept rows in plain lists and build the frame once at the end;
# growing a DataFrame one `.loc` assignment at a time copies it repeatedly.
kept_index = []
kept_labels = []
kept_eo_data = []

nan_count = 0
for idx, data_string, class_label in zip(
    tigray2021.index, tigray2021['eo_data'], tigray2021['class_probability']
):
    # Rows with no exported EO data hold NaN instead of a string.
    # pd.isna also recognises numpy float NaNs, which `type(x) == float` misses.
    if pd.isna(data_string):
        print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue

    # Skip uncertain labels before doing any parsing work for the row.
    if class_label not in (0, 1):
        continue

    # The cell stores a Python-literal list of lists; parse it into an array.
    numpy_array = np.array(ast.literal_eval(data_string))

    # Start date is 01-01-2021. Skip the first two timesteps, then keep the
    # next 12 (March 2021 onward, assuming one timestep per month); every
    # later timestep is dropped.
    numpy_array = numpy_array[2:2+12]

    # Serialise back to a string. `threshold` is set explicitly so numpy never
    # abbreviates the array with '...' (it does so by default above 1000
    # elements), which would silently drop data.
    # NOTE(review): numpy's default print precision (8 digits) still applies,
    # so values are rounded on the way out - confirm that is acceptable.
    string_array = np.array2string(
        numpy_array, separator=', ', threshold=max(numpy_array.size, 1)
    )
    # array2string inserts line breaks and padding spaces; strip them so the
    # result is a single compact bracketed string.
    formatted_string = string_array.replace('\n', '').replace(' ', '')

    kept_index.append(idx)
    kept_labels.append(class_label)
    kept_eo_data.append(formatted_string)

# Rows keep their original index from `tigray2021`; labels are stored as floats.
tigray2021_mlc = pd.DataFrame(
    {'label': kept_labels, 'eo_data': kept_eo_data},
    index=kept_index,
).astype({'label': 'float64'})

print('Number of nans skipped = %d' % nan_count)

skipping nan at idx 2
skipping nan at idx 3
skipping nan at idx 8
skipping nan at idx 10
skipping nan at idx 17
skipping nan at idx 18
skipping nan at idx 19
skipping nan at idx 20
skipping nan at idx 24
skipping nan at idx 26
skipping nan at idx 28
skipping nan at idx 32
skipping nan at idx 33
skipping nan at idx 35
skipping nan at idx 42
skipping nan at idx 44
skipping nan at idx 46
skipping nan at idx 48
skipping nan at idx 65
skipping nan at idx 66
skipping nan at idx 73
skipping nan at idx 79
skipping nan at idx 80
skipping nan at idx 81
skipping nan at idx 83
skipping nan at idx 84
skipping nan at idx 88
skipping nan at idx 98
skipping nan at idx 99
skipping nan at idx 124
skipping nan at idx 125
skipping nan at idx 127
skipping nan at idx 128
skipping nan at idx 138
skipping nan at idx 141
skipping nan at idx 142
skipping nan at idx 149
skipping nan at idx 151
skipping nan at idx 152
skipping nan at idx 153
skipping nan at idx 169
skipping nan at idx 171
skipping nan at idx 172


In [6]:
# Preview the reformatted Tigray 2021 frame (columns: label, eo_data).
tigray2021_mlc

Unnamed: 0,label,eo_data
0,0.0,"[[-3.96095435e+00,-9.43330254e+00,1.55000000e+..."
1,0.0,"[[-4.12571337e+00,-9.21231625e+00,1.39000000e+..."
4,0.0,"[[-1.23317485e+01,-1.92344105e+01,1.38400000e+..."
5,0.0,"[[-7.71946565e+00,-1.06734701e+01,1.26000000e+..."
6,0.0,"[[-8.36950661e+00,-1.63944270e+01,1.13800000e+..."
...,...,...
881,0.0,"[[-1.44614578e+01,-1.96913773e+01,1.41600000e+..."
882,1.0,"[[-9.77523749e+00,-1.78364651e+01,1.21500000e+..."
883,0.0,"[[-5.64229765e-01,-8.84975504e+00,1.22900000e+..."
884,0.0,"[[-1.08297359e+01,-1.92155071e+01,1.57900000e+..."


In [7]:
# Count rows per label value to see how imbalanced the kept points are.
tigray2021_mlc['label'].value_counts()

0.0    475
1.0    181
Name: label, dtype: int64

## Prepare Tigray 2020 data in format for MLC
- Exported data is 24 timesteps x 18 bands, start date 01-01-2020. We want to do 12 months starting March 2020.
- We want to be confident about the labels in this test set, so only use unanimous label points.

In [8]:
# Reformat the Tigray 2020 export for MLC: keep only points with a unanimous
# label (class_probability exactly 0 or 1) and re-serialise a 12-timestep
# window of each point's EO time series as a compact string.
tigray2020 = pd.read_csv('../data/datasets/Ethiopia_Tigray_2020.csv')

# Collect kept rows in plain lists and build the frame once at the end;
# growing a DataFrame one `.loc` assignment at a time copies it repeatedly.
kept_index = []
kept_labels = []
kept_eo_data = []

nan_count = 0
for idx, data_string, class_label in zip(
    tigray2020.index, tigray2020['eo_data'], tigray2020['class_probability']
):
    # Rows with no exported EO data hold NaN instead of a string.
    # pd.isna also recognises numpy float NaNs, which `type(x) == float` misses.
    if pd.isna(data_string):
        print('skipping nan at idx %d' % idx)
        nan_count += 1
        continue

    # Skip uncertain labels before doing any parsing work for the row.
    if class_label not in (0, 1):
        continue

    # The cell stores a Python-literal list of lists; parse it into an array.
    numpy_array = np.array(ast.literal_eval(data_string))

    # Start date is 01-01-2020. Skip the first two timesteps, then keep the
    # next 12 (March 2020 onward, assuming one timestep per month); every
    # later timestep is dropped.
    numpy_array = numpy_array[2:2+12]

    # Serialise back to a string. `threshold` is set explicitly so numpy never
    # abbreviates the array with '...' (it does so by default above 1000
    # elements), which would silently drop data.
    # NOTE(review): numpy's default print precision (8 digits) still applies,
    # so values are rounded on the way out - confirm that is acceptable.
    string_array = np.array2string(
        numpy_array, separator=', ', threshold=max(numpy_array.size, 1)
    )
    # array2string inserts line breaks and padding spaces; strip them so the
    # result is a single compact bracketed string.
    formatted_string = string_array.replace('\n', '').replace(' ', '')

    kept_index.append(idx)
    kept_labels.append(class_label)
    kept_eo_data.append(formatted_string)

# Rows keep their original index from `tigray2020`; labels are stored as floats.
tigray2020_mlc = pd.DataFrame(
    {'label': kept_labels, 'eo_data': kept_eo_data},
    index=kept_index,
).astype({'label': 'float64'})

print('Number of nans skipped = %d' % nan_count)

skipping nan at idx 8
skipping nan at idx 9
skipping nan at idx 36
skipping nan at idx 39
skipping nan at idx 45
skipping nan at idx 47
skipping nan at idx 60
skipping nan at idx 66
skipping nan at idx 86
skipping nan at idx 88
skipping nan at idx 93
skipping nan at idx 94
skipping nan at idx 100
skipping nan at idx 105
skipping nan at idx 109
skipping nan at idx 114
skipping nan at idx 115
skipping nan at idx 125
skipping nan at idx 128
skipping nan at idx 131
skipping nan at idx 139
skipping nan at idx 154
skipping nan at idx 166
skipping nan at idx 175
skipping nan at idx 192
skipping nan at idx 199
skipping nan at idx 203
skipping nan at idx 206
skipping nan at idx 207
skipping nan at idx 209
skipping nan at idx 214
skipping nan at idx 228
skipping nan at idx 232
skipping nan at idx 238
skipping nan at idx 241
skipping nan at idx 245
skipping nan at idx 247
skipping nan at idx 254
skipping nan at idx 262
skipping nan at idx 269
skipping nan at idx 272
skipping nan at idx 273
skippi

In [9]:
# Preview the reformatted Tigray 2020 frame (columns: label, eo_data).
tigray2020_mlc

Unnamed: 0,label,eo_data
0,0.0,"[[-3.40507540e+00,-1.19441144e+01,1.33200000e+..."
1,0.0,"[[-1.01614959e+01,-1.50998810e+01,1.05700000e+..."
2,0.0,"[[-1.37047025e+01,-1.96595139e+01,1.57200000e+..."
3,0.0,"[[-7.53476129e+00,-1.52523661e+01,1.16300000e+..."
4,1.0,"[[-1.02791819e+01,-2.28819453e+01,1.08400000e+..."
...,...,...
1195,0.0,"[[-1.29620620e+01,-2.10993888e+01,1.16900000e+..."
1196,0.0,"[[-1.35332022e+01,-2.75207233e+01,1.21100000e+..."
1197,0.0,"[[-1.91325054e+00,-1.07038918e+01,1.16900000e+..."
1198,1.0,"[[-1.01801434e+01,-2.09272196e+01,1.24800000e+..."


In [10]:
# Question: should we balance this a bit more?
# Count rows per label value to see how imbalanced the kept points are.
tigray2020_mlc['label'].value_counts()

0.0    736
1.0    291
Name: label, dtype: int64