## Area of Interest - Germany

In [None]:
# Jupyter notebook setup:
# - autoreload in mode 2 reloads imported modules before each cell is executed,
#   so edits to the local helper modules are picked up without a kernel restart
# - the inline backend renders matplotlib figures inside the notebook
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# ensure you have the required python packages
import sys
! pip install -q -r requirements.txt

[K     |████████████████████████████████| 962 kB 7.0 MB/s 
[K     |████████████████████████████████| 18.3 MB 487 kB/s 
[K     |████████████████████████████████| 196 kB 73.9 MB/s 
[K     |████████████████████████████████| 40 kB 6.6 MB/s 
[K     |████████████████████████████████| 67.4 MB 91 kB/s 
[K     |████████████████████████████████| 10.5 MB 2.6 MB/s 
[K     |████████████████████████████████| 69 kB 8.7 MB/s 
[K     |████████████████████████████████| 15.4 MB 59.9 MB/s 
[K     |████████████████████████████████| 6.3 MB 57.0 MB/s 
[K     |████████████████████████████████| 131 kB 71.1 MB/s 
[K     |████████████████████████████████| 8.5 MB 55.4 MB/s 
[K     |████████████████████████████████| 128 kB 70.4 MB/s 
[K     |████████████████████████████████| 132 kB 74.7 MB/s 
[K     |████████████████████████████████| 62 kB 952 kB/s 
[K     |████████████████████████████████| 133 kB 59.0 MB/s 
[K     |████████████████████████████████| 53.9 MB 78 kB/s 
[K     |███████████████████████

In [None]:
# Built-in modules
import os
import time
import random
import glob
import json
from typing import Tuple, List
from datetime import datetime, timedelta
import pickle
import shutil
from pathlib import Path
from scipy.stats import gmean
import warnings
warnings.filterwarnings('ignore')

# Basics of Python data handling and visualization
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.colors import ListedColormap
from tqdm.auto import tqdm

# Data reading for training and validation purposes:
from utils import unzipper
from utils.utils_meoteq import S1Extractor, S2Extractor, PlanetExtractor

 # Machine learning
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold, KFold
from boostaroota import BoostARoota
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [None]:
def seed_setter(seed_value: int) -> None:
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator, so that calling the
    function again with the same value reproduces the same random draws.

    Args:
        seed_value: Integer seed shared by all generators.

    Returns:
        None. In particular no ``SEED`` variable is created: code that needs
        the value must keep its own copy.
    """
    # Exported for processes started after this point; the hash seed of the
    # already running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)

seed_setter(2021)

## Paths

In [None]:
# Root folder for everything this notebook downloads or generates
output_path = Path('./Germany')

# Downloaded data (Sentinel-2, Sentinel-1, Planet 5 days) is stored under data/
data_path = output_path / 'data'

# Dataframes created by the notebook are stored under data_frames/
df_path = output_path / 'data_frames'

# Create both folders (and any missing parents); existing folders are left as they are
for folder in (data_path, df_path):
    folder.mkdir(parents=True, exist_ok=True)


## Download the data
You do not need to run this section if you already downloaded the data - but make sure to modify the data path above to where you saved the data.

In [None]:
# Set the download credentials for Radiant MLHub.
from getpass import getpass

from radiant_mlhub import Dataset
from radiant_mlhub import Collection

# The API key must not be stored in the notebook. It is taken from the
# MLHUB_API_KEY environment variable when that is already set; otherwise it is
# requested interactively (getpass does not echo what is typed).
# NOTE(review): the key that used to be hard-coded in this cell has been shared
# together with the notebook and should be revoked.
if not os.environ.get('MLHUB_API_KEY'):
    os.environ['MLHUB_API_KEY'] = getpass('Radiant MLHub API key: ')

In [None]:
# Download every collection of the Germany dataset except the two listed here.
# To skip the 5-day Planet collections instead, list these ids:
#   'dlr_fusion_competition_germany_train_source_planet_5day'
#   'dlr_fusion_competition_germany_test_source_planet_5day'
skipped_collections = (
    'dlr_fusion_competition_germany_train_source_planet',
    'dlr_fusion_competition_germany_test_source_planet',
)

ds = Dataset.fetch('dlr_fusion_competition_germany')
for c in ds.collections:
    if c.id in skipped_collections:
        continue
    print(f'Downloading {c.id} ...')
    coll = Collection.fetch(c.id)
    coll.download(data_path)
    # Pause for a minute between collections
    # NOTE(review): presumably to stay below an API rate limit - confirm
    time.sleep(60)

Downloading dlr_fusion_competition_germany_train_source_sentinel_1 ...


  0%|          | 0/8852.4 [00:00<?, ?M/s]

Downloading dlr_fusion_competition_germany_train_source_sentinel_2 ...


  0%|          | 0/11835.6 [00:00<?, ?M/s]

Downloading dlr_fusion_competition_germany_test_source_sentinel_1 ...


  0%|          | 0/8745.7 [00:00<?, ?M/s]

Downloading dlr_fusion_competition_germany_test_source_sentinel_2 ...


  0%|          | 0/11681.4 [00:00<?, ?M/s]

Downloading dlr_fusion_competition_germany_train_labels ...


  0%|          | 0/2.3 [00:00<?, ?M/s]

Downloading dlr_fusion_competition_germany_test_labels ...


  0%|          | 0/1.5 [00:00<?, ?M/s]

In [None]:
# Unzip the downloaded archives: every *.gz file found directly under data_path
# is handed to the project helper `unzipper`
zip_files = glob.glob(f'{data_path}/*.gz')
unzipper(zip_files)

INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_train_source_sentinel_1, no need to unzip
INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_train_source_sentinel_2, no need to unzip
INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_test_source_sentinel_1, no need to unzip
INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_test_source_sentinel_2, no need to unzip
INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_train_labels, no need to unzip
INFO: Found folder in drive/MyDrive/Germany/data/dlr_fusion_competition_germany_test_labels, no need to unzip
INFO: Unzipping drive/MyDrive/Germany/data/dlr_fusion_competition_germany_test_source_planet_5day.tar.gz to drive/MyDrive/Germany/data
INFO: Unzipping drive/MyDrive/Germany/data/dlr_fusion_competition_germany_train_source_planet_5day.tar.gz to drive/MyDrive/Germany/data


## Extract the fields 
Extract the fields from each data source (Sentinel-1, Sentinel-2, Planet 5 days) as numpy and store them in folders

#### Labels

In [None]:
# Folder and tile names of the train labels
tr_labels_dir = 'dlr_fusion_competition_germany_train_labels'
tr_labels_tile = 'dlr_fusion_competition_germany_train_labels_33N_18E_242N'

# Folder and tile names of the test labels
te_labels_dir = 'dlr_fusion_competition_germany_test_labels'
te_labels_tile = 'dlr_fusion_competition_germany_test_labels_33N_17E_243N'

# Read both label files (GeoJSON) into GeoDataFrames
tr_labels = gpd.read_file(f'{data_path}/{tr_labels_dir}/{tr_labels_tile}/labels.geojson')
te_labels = gpd.read_file(f'{data_path}/{te_labels_dir}/{te_labels_tile}/labels.geojson')

In [None]:
# One row per field, so the row count is the number of fields
print('Number of training fields: ', len(tr_labels))
print('Number of testing fields: ', len(te_labels))

Number of training fields:  2534
Number of testing fields:  2064


In [None]:
# Distinct crop class ids present in the training labels
tr_labels['crop_id'].unique()

array([9, 1, 8, 2, 3, 5, 6, 7, 4])

#### Sentinel-1

In [None]:
# Folders that receive the fields extracted from Sentinel-1.
# The train folder gets one sub-folder per class (named 0 to 8); the test
# folder is flat. `exist_ok=True` makes the cell safe to re-run and also
# repairs a folder tree that was only partially created by an earlier run.
npy_dir = [f'{output_path}/train/s1', f'{output_path}/test/s1']
for directory in npy_dir:
    os.makedirs(directory, exist_ok=True)
    if directory == f'{output_path}/train/s1':
        for c in range(9):
            os.makedirs(f"{directory}/{c}", exist_ok=True)

# Folder and tile names of the Sentinel-1 training data
train_s1_folder = 'dlr_fusion_competition_germany_train_source_sentinel_1'
train_s1_tile1 = 'dlr_fusion_competition_germany_train_source_sentinel_1_asc_33N_18E_242N_2018'

# Folder and tile names of the Sentinel-1 testing data
test_s1_folder = 'dlr_fusion_competition_germany_test_source_sentinel_1'
test_s1_tile1 = 'dlr_fusion_competition_germany_test_source_sentinel_1_asc_33N_17E_243N_2019'

In [None]:
# Extract the Sentinel-1 fields of the train and test tiles.
# NOTE(review): the return values are discarded - the extractor is presumably
# called for its side effect of writing the extracted fields below `npyfolder`;
# confirm in utils.utils_meoteq.

# Extract Train data
S1Extractor(rootpath=f'{data_path}/{train_s1_folder}/{train_s1_tile1}/',              # Path of train sentinel-1
            label_dir=f'{data_path}/{tr_labels_dir}/{tr_labels_tile}/labels.geojson', # Path of train labels
            npyfolder= output_path,                                                   # Where to save the extracted fields
            data_type='train')


# Extract Test data
S1Extractor(rootpath=f'{data_path}/{test_s1_folder}/{test_s1_tile1}/',               # Path of test sentinel-1
            label_dir=f'{data_path}/{te_labels_dir}/{te_labels_tile}/labels.geojson',# Path of test labels
            npyfolder= output_path,                                                  # Where to save the extracted fields
            data_type='test')


INFO: Extracting Sentinel-1 time series: 100%|██████████| 2534/2534 [01:09<00:00, 36.46it/s]
INFO: Extracting Sentinel-1 time series: 100%|██████████| 2064/2064 [00:52<00:00, 39.17it/s]


In [None]:
# Sanity check: count the extracted .npz files
# (train files sit one folder level deeper than test files)
s1_train = glob.glob(f'{output_path}/train/s1/*/*.npz')
s1_test = glob.glob(f'{output_path}/test/s1/*.npz')

print('Sentinel-1 train fields: ',len(s1_train))
print('Sentinel-1 test fields: ',len(s1_test))

Sentinel-1 train fields:  2534
Sentinel-1 test fields:  2064


#### Sentinel-2

In [None]:
# Folders that receive the fields extracted from Sentinel-2.
# The train folder gets one sub-folder per class (named 0 to 8); the test
# folder is flat. `exist_ok=True` makes the cell safe to re-run and also
# repairs a folder tree that was only partially created by an earlier run.
npy_dir = [f'{output_path}/train/s2', f'{output_path}/test/s2']
for directory in npy_dir:
    os.makedirs(directory, exist_ok=True)
    if directory == f'{output_path}/train/s2':
        for c in range(9):
            os.makedirs(f"{directory}/{c}", exist_ok=True)

# Folder and tile names of the Sentinel-2 training data
s2_train_folder = 'dlr_fusion_competition_germany_train_source_sentinel_2'
s2_train_tile = 'dlr_fusion_competition_germany_train_source_sentinel_2_33N_18E_242N_2018'

# Folder and tile names of the Sentinel-2 testing data
s2_test_folder = 'dlr_fusion_competition_germany_test_source_sentinel_2'
s2_test_tile = 'dlr_fusion_competition_germany_test_source_sentinel_2_33N_17E_243N_2019'


In [None]:
# Extract the Sentinel-2 fields of the train and test tiles.
# NOTE(review): the return values are discarded - the extractor is presumably
# called for its side effect of writing the extracted fields below `npyfolder`;
# confirm in utils.utils_meoteq.

# Extract train Sentinel-2 data
S2Extractor(rootpath= f"{data_path}/{s2_train_folder}/{s2_train_tile}/",             # Path of train sentinel-2
            label_dir=f'{data_path}/{tr_labels_dir}/{tr_labels_tile}/labels.geojson',# Path of train labels
            npyfolder= output_path,                                                  # Where to save the extracted fields
            data_type='train')


# Extract Test Sentinel-2 data
S2Extractor(rootpath= f"{data_path}/{s2_test_folder}/{s2_test_tile}/",               # Path of test sentinel-2
            label_dir=f'{data_path}/{te_labels_dir}/{te_labels_tile}/labels.geojson',# Path of test labels
            npyfolder= output_path,                                                  # Where to save the extracted fields
            data_type='test')

INFO: Extracting Sentinel-1 time series: 100%|██████████| 2534/2534 [07:26<00:00,  5.68it/s]
INFO: Extracting Sentinel-1 time series: 100%|██████████| 2064/2064 [04:57<00:00,  6.94it/s]


In [None]:
# Sanity check: count the extracted .npz files
# (train files sit one folder level deeper than test files)
s2_train = glob.glob(f'{output_path}/train/s2/*/*.npz')
s2_test = glob.glob(f'{output_path}/test/s2/*.npz')

print('Sentinel-2 train fields: ',len(s2_train))
print('Sentinel-2 test fields: ',len(s2_test))

Sentinel-2 train fields:  2534
Sentinel-2 test fields:  2064


#### Planet 5 days

In [None]:
# Folders that receive the fields extracted from the Planet 5-day data.
# The train folder gets one sub-folder per class (named 0 to 8); the test
# folder is flat. `exist_ok=True` makes the cell safe to re-run and also
# repairs a folder tree that was only partially created by an earlier run.
npy_dir = [f'{output_path}/train/planet_5day', f'{output_path}/test/planet_5day']
for directory in npy_dir:
    os.makedirs(directory, exist_ok=True)
    if directory == f'{output_path}/train/planet_5day':
        for c in range(9):
            os.makedirs(f"{directory}/{c}", exist_ok=True)

# Folder and tile names of the Planet 5-day training data
planet_5days_train_folder = 'dlr_fusion_competition_germany_train_source_planet_5day'
planet_5day_train_tile = '33N_18E_242N'

# Folder and tile names of the Planet 5-day testing data
# NOTE(review): this is the same tile id as the train tile, while the test
# labels and the Sentinel test tiles are named 33N_17E_243N - confirm how
# PlanetExtractor uses `tile` before changing it.
planet_5days_test_folder = 'dlr_fusion_competition_germany_test_source_planet_5day'
planet_5day_test_tile = '33N_18E_242N'

In [None]:
# Extract the Planet 5-day fields of the train and test data.
# NOTE(review): the return values are discarded - the extractor is presumably
# called for its side effect of writing the extracted fields below `npyfolder`;
# confirm in utils.utils_meoteq.
# The train `rootpath` has no trailing slash while the test one does.

# Extract train planet_5days
PlanetExtractor(rootpath= f'{data_path}/{planet_5days_train_folder}',                # Path of train planet 5days
            label_dir=f'{data_path}/{tr_labels_dir}/{tr_labels_tile}/labels.geojson',# Path of train labels
            npyfolder= output_path,                                                  # Where to save the extracted fields
            tile=planet_5day_train_tile,                                             # Train tile
            aoi='germany',                                                           # Area of interest
            data_type='train',                                                       # Data type (train/test)
            planet='planet_5day')                                                    # Planet data (Planet for daily, planet_5day for 5days interval )

# Extract test planet_5days
PlanetExtractor(rootpath= f'{data_path}/{planet_5days_test_folder}/',                    # Path of test planet 5days
                label_dir=f'{data_path}/{te_labels_dir}/{te_labels_tile}/labels.geojson',# Path of test labels
                npyfolder= output_path,                                                  # Where to save the extracted fields
                tile=planet_5day_test_tile,                                              # Test tile
                aoi='germany',                                                           # Area of interest
                data_type='test',                                                        # Data type (train/test)
                planet='planet_5day')                                                    # Planet data (Planet for daily, planet_5day for 5days interval )

In [None]:
# Sanity check: count the extracted .npz files
# (train files sit one folder level deeper than test files)
planet5days_train = glob.glob(f'{output_path}/train/planet_5day/*/*.npz')
planet5days_test = glob.glob(f'{output_path}/test/planet_5day/*.npz')

print('planet5days train fields: ',len(planet5days_train))
print('planet5days test fields: ',len(planet5days_test))

planet5days train fields:  2534
planet5days test fields:  2064


## Prepare the data for Tree models


### Planet 5 days

In [None]:
# Set "Preprocess" to True when running the notebook for the first time: the
# cells below then build the dataframes from the extracted .npz files and save
# them as CSV. With False, they only reload the previously saved CSV files.
Preprocess = True

In [None]:
# Prepare Planet 5 days train data: one row per field holding the mean of the
# field's pixels for every (time step, band) pair, plus field_id and label.
if Preprocess:
    planet5days_train = sorted(glob.glob(f'{output_path}/train/planet_5day/*/*.npz'))

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for field_path in tqdm(planet5days_train):
        # Files are laid out as .../<class folder>/<field id>.npz;
        # the label is the class folder number plus one
        label = int(field_path.split('/')[-2]) + 1
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0)
        features = features[mask != 0]

        train_data = pd.DataFrame(features)
        train_data['field_id'] = fid
        train_data['label'] = label

        # Average all pixels of the field into a single row. A field with no
        # pixel left after masking gives an empty frame and adds no row.
        field_frames.append(train_data.groupby('field_id').mean().reset_index())

    traininig_data = pd.concat(field_frames, ignore_index=True)
    traininig_data.to_csv(f"{df_path}/planet_5days_train_df.csv", index=False)

planet_5days_train_df = pd.read_csv(f"{df_path}/planet_5days_train_df.csv")
planet_5days_train_df.shape

  0%|          | 0/2534 [00:00<?, ?it/s]

(2534, 294)

In [None]:
# Prepare Planet 5 days test data: one row per field holding the mean of the
# field's pixels for every (time step, band) pair, plus field_id.
if Preprocess:
    # NOTE(review): unlike the train cell this list is not sorted, so the row
    # order of the CSV follows whatever order glob returns - confirm that
    # downstream code matches rows by field_id.
    planet5days_test = glob.glob(f'{output_path}/test/planet_5day/*.npz')

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for field_path in tqdm(planet5days_test):
        # Files are named <field id>.npz
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0)
        features = features[mask != 0]

        test_data = pd.DataFrame(features)
        test_data['field_id'] = fid

        # Average all pixels of the field into a single row. A field with no
        # pixel left after masking gives an empty frame and adds no row.
        field_frames.append(test_data.groupby('field_id').mean().reset_index())

    testing_data = pd.concat(field_frames, ignore_index=True)
    testing_data.to_csv(f"{df_path}/planet_5days_test_df.csv", index=False)

planet_5days_test_df = pd.read_csv(f"{df_path}/planet_5days_test_df.csv")
planet_5days_test_df.shape

  0%|          | 0/2064 [00:00<?, ?it/s]

(2064, 293)

### Sentinel-2

In [None]:
# Prepare Sentinel-2 train data: one row per field holding the mean of the
# field's pixels for every (time step, band) pair, plus field_id and label.
if Preprocess:
    s2_train = sorted(glob.glob(f'{output_path}/train/s2/*/*.npz'))

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for field_path in tqdm(s2_train):
        # Files are laid out as .../<class folder>/<field id>.npz;
        # the label is the class folder number plus one
        label = int(field_path.split('/')[-2]) + 1
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0)
        features = features[mask != 0]

        train_data = pd.DataFrame(features)
        train_data['field_id'] = fid
        train_data['label'] = label

        # Average all pixels of the field into a single row. A field with no
        # pixel left after masking gives an empty frame and adds no row, so
        # the dataframe can have fewer rows than there are files.
        field_frames.append(train_data.groupby('field_id').mean().reset_index())

    traininig_data = pd.concat(field_frames, ignore_index=True)
    traininig_data.to_csv(f"{df_path}/s2_train_df.csv", index=False)

s2_train_df = pd.read_csv(f"{df_path}/s2_train_df.csv")
s2_train_df.shape

  0%|          | 0/2534 [00:00<?, ?it/s]

(2532, 1730)

In [None]:
# Prepare Sentinel-2 test data: one row per field holding the mean of the
# field's pixels for every (time step, band) pair, plus field_id.
if Preprocess:
    # NOTE(review): unlike the train cell this list is not sorted, so the row
    # order of the CSV follows whatever order glob returns - confirm that
    # downstream code matches rows by field_id.
    s2_test = glob.glob(f'{output_path}/test/s2/*.npz')

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for field_path in tqdm(s2_test):
        # Files are named <field id>.npz
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0). Fields 739 and 12278 keep all
        # their pixels.
        # NOTE(review): presumably the masks of these two fields are empty, so
        # masking would drop them from the test set - confirm.
        if fid != 739 and fid != 12278:
            features = features[mask != 0]

        test_data = pd.DataFrame(features)
        test_data['field_id'] = fid

        # Average all pixels of the field into a single row
        field_frames.append(test_data.groupby('field_id').mean().reset_index())

    testing_data = pd.concat(field_frames, ignore_index=True)
    testing_data.to_csv(f"{df_path}/s2_test_df.csv", index=False)

s2_test_df = pd.read_csv(f"{df_path}/s2_test_df.csv")
s2_test_df.shape

  0%|          | 0/2064 [00:00<?, ?it/s]

(2064, 1729)

### Ascending Sentinel-1

In [None]:
# Prepare Sentinel-1 train data: one row per field holding the mean of the
# field's pixels for every (time step, channel) pair, plus field_id and label.
if Preprocess:
    s1_train = sorted(glob.glob(f'{output_path}/train/s1/*/*.npz'))

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for field_path in tqdm(s1_train):
        # Files are laid out as .../<class folder>/<field id>.npz;
        # the label is the class folder number plus one
        label = int(field_path.split('/')[-2]) + 1
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0)
        features = features[mask != 0]

        train_data = pd.DataFrame(features)
        train_data['field_id'] = fid
        train_data['label'] = label

        # Average all pixels of the field into a single row. A field with no
        # pixel left after masking gives an empty frame and adds no row, so
        # the dataframe can have fewer rows than there are files.
        field_frames.append(train_data.groupby('field_id').mean().reset_index())

    traininig_data = pd.concat(field_frames, ignore_index=True)

    # NOTE(review): columns 240-243 are presumably extra time steps that only
    # some fields have; dropping them leaves the 240 feature columns that the
    # test dataframe also has - confirm.
    traininig_data = traininig_data.drop(columns=[240, 241, 242, 243])
    traininig_data.to_csv(f"{df_path}/asc_s1_train_df.csv", index=False)

asc_s1_train_df = pd.read_csv(f"{df_path}/asc_s1_train_df.csv")

asc_s1_train_df.shape

  0%|          | 0/2534 [00:00<?, ?it/s]

(2532, 242)

In [None]:
# Prepare Sentinel-1 test data: one row per field holding the mean of the
# field's pixels for every (time step, channel) pair, plus field_id.
# Fields are processed in the order in which they appear in the test labels.
if Preprocess:
    no_fields = list(te_labels['fid'])

    # One aggregated frame per field, concatenated once after the loop.
    # Calling pd.concat inside the loop would re-copy all accumulated rows on
    # every iteration.
    field_frames = []

    for no_field in tqdm(no_fields):
        field_path = f'{output_path}/test/s1/{no_field}.npz'
        fid = int(field_path.split('/')[-1][:-4])

        # Named `field_npz` so the built-in `object` is not shadowed; the
        # `with` block closes the archive once both arrays are read
        with np.load(field_path) as field_npz:
            features = field_npz['image_stack'].transpose(0, 2, 3, 1)
            mask = field_npz["mask"]
        t, h, w, f = features.shape

        # One row per pixel; columns hold the f values of step 0, then step 1, ...
        features = np.hstack([features[i].reshape(h * w, f) for i in range(t)])
        mask = np.reshape(mask, (w * h))

        # Remove pixels with no field_ids (0). Fields 739 and 12278 keep all
        # their pixels.
        # NOTE(review): presumably the masks of these two fields are empty, so
        # masking would drop them from the test set - confirm.
        if fid != 739 and fid != 12278:
            features = features[mask != 0]

        test_data = pd.DataFrame(features)
        test_data['field_id'] = fid

        # Average all pixels of the field into a single row
        field_frames.append(test_data.groupby('field_id').mean().reset_index())

    testing_data = pd.concat(field_frames, ignore_index=True)
    testing_data.to_csv(f"{df_path}/asc_s1_test_df.csv", index=False)

asc_s1_test_df = pd.read_csv(f"{df_path}/asc_s1_test_df.csv")
asc_s1_test_df.shape

  0%|          | 0/2064 [00:00<?, ?it/s]

(2064, 241)

## Preprocessing & More Features
Rename the columns with the band names and compute more features (vegetation indices, red-edge indices, flowering phenology, growing rate of the crops).

In [None]:
# Rename the columns - interpolate the missing values
def rename_cols(df, source='s2', s1='asc'):
    """Name the positional feature columns by band and time step, then fill gaps.

    The feature columns of ``df`` are expected to be labelled with the strings
    ``'0'``, ``'1'``, ... where column ``t * n_bands + i`` holds band ``i`` at
    time step ``t``. Each one is renamed to ``'<band>_time_<t + 1>'``. For the
    optical sources (``'s2'`` and ``'planet'``) every 0 in the frame is first
    replaced by NaN. Missing values in the renamed columns are then filled by
    linear interpolation.

    NOTE(review): ``interpolate(axis=0)`` works down each column, i.e. between
    consecutive rows of the frame, not between the time steps of one row -
    confirm that this is intended.

    Args:
        df: Dataframe with positional feature columns; other columns (such as
            ``field_id`` or ``label``) keep their names.
        source: ``'s2'`` (12 bands, 144 steps), ``'planet'`` (4 bands, 73 steps)
            or ``'s1'`` (2 channels).
        s1: Only used when ``source == 's1'``: ``'asc'`` (120 steps) or
            ``'des'`` (118 steps).

    Returns:
        A new dataframe; ``df`` itself is not modified.

    Raises:
        ValueError: If ``source`` or ``s1`` is not one of the supported values.
        KeyError: If an expected feature column is missing from ``df``.
    """
    if source == 's2':
        df = df.replace(0, np.nan)
        features = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07','B08', 'B8A', 'B09', 'B11', 'B12']
        time_step = 144
    elif source == 'planet':
        df = df.replace(0, np.nan)
        features = ['B01', 'B02', 'B03', 'NIR']
        time_step = 73
    elif source == 's1':
        features = ['VV', 'VH']
        if s1 == 'asc':
            time_step = 120
        elif s1 == 'des':
            time_step = 118
        else:
            raise ValueError(f"Unknown s1 orbit {s1!r}; expected 'asc' or 'des'")
    else:
        raise ValueError(f"Unknown source {source!r}; expected 's2', 'planet' or 's1'")

    # Build the complete old-name -> new-name mapping and rename once, instead
    # of copying the whole frame for every single column.
    n_features = len(features)
    mapping = {
        f'{t * n_features + i}': f'{name}_time_{t + 1}'
        for i, name in enumerate(features)
        for t in range(time_step)
    }
    df = df.rename(columns=mapping)

    # Each column is interpolated independently, so all of them can be
    # processed in a single call.
    cols = list(mapping.values())
    df[cols] = df[cols].interpolate(axis=0)
    return df

In [None]:
# Replace the positional column names of every dataframe with
# "<band>_time_<step>" names. Each variable is rebound to the dataframe
# returned by rename_cols.

# Rename Planet5days dataframe
planet_5days_train_df = rename_cols(planet_5days_train_df, source='planet')
planet_5days_test_df = rename_cols(planet_5days_test_df, source='planet')

# Rename Sentinel-2 dataframe
s2_train_df = rename_cols(s2_train_df, source='s2')
s2_test_df = rename_cols(s2_test_df, source='s2')

# Rename Ascending Sentinel-1 dataframe
asc_s1_train_df = rename_cols(asc_s1_train_df, source='s1', s1='asc')
asc_s1_test_df = rename_cols(asc_s1_test_df, source='s1', s1='asc')


### Compute more Features - Sentinel-2 Indices
#### Vegetation Indices

* ```NDSI: B03 /B11 ```
* ```NDMI: (B08 - B11) / (B08 + B11)```
* ```NDBI: (B11 - B08) / (B11 + B08)```
* ```NDCI: (B05 - B04) / (B05 + B04)```
* ```SAVI: (B01 - B02) / (B01 + B02 + 0.428) * (1 + 0.428)```
* ```BSI: (B11 - B04) / (B08 + B02)```
* ```NDVI_R: (B08 - B07) / (B08 + B07)```
* ```CHL: (B07 / B05) - 1```

In [None]:
def s2_veg_indices(df ,times, data_type='train'):
    """Compute the Sentinel-2 indices listed in this section for each time step.

    For every ``step`` in ``times`` eight columns are produced from the band
    columns ``'<band>_time_<step>'`` of ``df``:

    * ``NDSI``   = B03 / B11
    * ``NDMI``   = (B08 - B11) / (B08 + B11)
    * ``NDCI``   = (B05 - B04) / (B05 + B04)
    * ``NDBI``   = (B11 - B08) / (B11 + B08)
    * ``SAVI``   = (B01 - B02) / (B01 + B02 + 0.428) * (1 + 0.428)
    * ``BSI``    = (B11 - B04) / (B08 + B02)
    * ``NDVI_R`` = (B08 - B07) / (B08 + B07)
    * ``CHL``    = (B07 / B05) - 1

    Args:
        df: Dataframe holding the band columns and a ``field_id`` column (and a
            ``label`` column when ``data_type == 'train'``).
        times: Iterable of time-step numbers as used in the column names.
        data_type: ``'train'`` copies the ``label`` column to the result; any
            other value leaves it out.

    Returns:
        A new dataframe with the index columns ordered by time step, followed
        by ``field_id`` and, for train data, ``label``.

    Raises:
        KeyError: If a required band column is missing from ``df``.
    """
    # All columns are collected first and the dataframe is built once:
    # inserting more than a thousand columns one at a time fragments the frame.
    # The loop variable is called `step` so that it does not hide the `time`
    # module inside this function.
    columns = {}
    for step in times:
        b01, b02, b03, b04, b05, b07, b08, b11 = (
            df[f'{band}_time_{step}']
            for band in ('B01', 'B02', 'B03', 'B04', 'B05', 'B07', 'B08', 'B11')
        )
        columns[f'NDSI_time_{step}'] = b03 / b11
        columns[f'NDMI_time_{step}'] = (b08 - b11) / (b08 + b11)
        columns[f'NDCI_time_{step}'] = (b05 - b04) / (b05 + b04)
        columns[f'NDBI_time_{step}'] = (b11 - b08) / (b11 + b08)
        columns[f'SAVI_time_{step}'] = (b01 - b02) / (b01 + b02 + 0.428) * (1.0 + 0.428)
        columns[f'BSI_time_{step}'] = (b11 - b04) / (b08 + b02)
        columns[f'NDVI_R_time_{step}'] = (b08 - b07) / (b08 + b07)
        columns[f'CHL_time_{step}'] = (b07 / b05) - 1

    veg_df = pd.DataFrame(columns)

    veg_df['field_id'] = list(df['field_id'])
    if data_type == 'train':
        veg_df['label'] = list(df['label'])
    return veg_df

In [None]:
# Time steps are numbered 1..144 in the Sentinel-2 column names
s2_time_steps = list(range(1, 145))

train_veg_indices = s2_veg_indices(s2_train_df, s2_time_steps)
test_veg_indices = s2_veg_indices(s2_test_df, s2_time_steps, data_type='test')

train_veg_indices.head()

Unnamed: 0,NDSI_time_1,NDMI_time_1,NDCI_time_1,NDBI_time_1,SAVI_time_1,BSI_time_1,NDVI_R_time_1,CHL_time_1,NDSI_time_2,NDMI_time_2,NDCI_time_2,NDBI_time_2,SAVI_time_2,BSI_time_2,NDVI_R_time_2,CHL_time_2,NDSI_time_3,NDMI_time_3,NDCI_time_3,NDBI_time_3,SAVI_time_3,BSI_time_3,NDVI_R_time_3,CHL_time_3,NDSI_time_4,NDMI_time_4,NDCI_time_4,NDBI_time_4,SAVI_time_4,BSI_time_4,NDVI_R_time_4,CHL_time_4,NDSI_time_5,NDMI_time_5,NDCI_time_5,NDBI_time_5,SAVI_time_5,BSI_time_5,NDVI_R_time_5,CHL_time_5,...,NDCI_time_140,NDBI_time_140,SAVI_time_140,BSI_time_140,NDVI_R_time_140,CHL_time_140,NDSI_time_141,NDMI_time_141,NDCI_time_141,NDBI_time_141,SAVI_time_141,BSI_time_141,NDVI_R_time_141,CHL_time_141,NDSI_time_142,NDMI_time_142,NDCI_time_142,NDBI_time_142,SAVI_time_142,BSI_time_142,NDVI_R_time_142,CHL_time_142,NDSI_time_143,NDMI_time_143,NDCI_time_143,NDBI_time_143,SAVI_time_143,BSI_time_143,NDVI_R_time_143,CHL_time_143,NDSI_time_144,NDMI_time_144,NDCI_time_144,NDBI_time_144,SAVI_time_144,BSI_time_144,NDVI_R_time_144,CHL_time_144,field_id,label
0,2.058622,0.367159,0.021572,-0.367159,0.130345,-0.191187,0.018518,0.045822,1.720013,0.259857,0.016642,-0.259857,0.101597,-0.159653,0.006839,0.001989,0.928618,0.151477,0.03197,-0.151477,-8.3e-05,0.017165,0.047847,0.204953,1.378876,0.261198,0.030193,-0.261198,0.052439,-0.103335,0.041706,0.113304,1.638435,0.240001,0.003341,-0.240001,0.027666,-0.184753,0.014219,-0.038766,...,0.006672,-0.222198,0.151262,-0.133639,0.006251,0.076456,1.553483,0.257524,-0.028949,-0.257524,-0.020805,-0.172681,0.028947,0.065067,1.403827,0.122214,0.009196,-0.122214,0.093938,-0.103139,0.005497,-0.042155,1.259391,0.176342,0.015827,-0.176342,0.001077,-0.105093,0.038902,-0.007688,1.69907,0.272725,0.023862,-0.272725,0.019333,-0.201224,0.006268,-0.045248,121094,1
1,1.329108,0.21125,0.03445,-0.21125,0.114124,-0.092139,0.023319,0.057421,2.2898,0.344875,0.016359,-0.344875,0.133023,-0.225924,-0.013764,-0.035441,1.042345,0.131419,0.010995,-0.131419,0.010906,-0.025713,0.052877,0.078787,1.252337,0.207469,0.023126,-0.207469,-0.003873,-0.070431,0.055366,0.084772,1.455243,0.185277,0.006484,-0.185277,0.029152,-0.147423,0.017896,-0.046814,...,0.013146,-0.237137,0.047531,-0.054511,0.029542,0.284405,1.527209,0.25202,-0.028509,-0.25202,-0.019078,-0.172166,0.027287,0.063731,2.678426,0.289515,-0.00955,-0.289515,0.163778,-0.209777,-0.002959,-0.11589,1.224045,0.168749,0.014709,-0.168749,-0.018646,-0.104971,0.033955,-0.0035,1.514308,0.21293,0.021856,-0.21293,0.006243,-0.153236,0.015185,-0.042288,165496,1
2,1.475215,0.201508,0.018602,-0.201508,0.155486,-0.141423,0.004058,-0.014496,1.397573,0.229189,0.031677,-0.229189,0.013123,-0.174844,-0.002806,-0.014801,0.868283,0.106229,0.031356,-0.106229,0.092697,0.029048,0.021841,0.186595,0.775066,0.07647,0.052916,-0.07647,0.087676,0.100281,0.024764,0.227264,1.525408,0.215272,0.008004,-0.215272,0.023978,-0.173507,0.008027,-0.035323,...,0.00223,-0.27277,0.141088,-0.178323,0.013961,0.025937,1.319555,0.213259,-0.022665,-0.213259,-0.058033,-0.121367,0.03973,0.092818,1.63355,0.196412,0.00796,-0.196412,0.084493,-0.162787,0.008706,-0.064855,1.420164,0.229778,0.010046,-0.229778,-0.048518,-0.158782,0.044287,-0.034974,1.675869,0.246175,0.015462,-0.246175,0.009029,-0.156173,0.041122,-0.060869,165687,1
3,1.600419,0.246196,0.016198,-0.246196,0.099012,-0.154337,0.022252,-0.011866,1.234554,0.17721,0.030384,-0.17721,0.031722,-0.107131,0.013089,0.011983,1.346606,0.185041,-0.007436,-0.185041,0.031305,-0.108871,0.052941,0.004469,0.543612,0.094402,0.110576,-0.094402,-0.147874,0.248131,0.031186,0.523664,2.348261,0.331343,0.00199,-0.331343,0.092988,-0.237797,0.000309,-0.082982,...,0.005937,-0.283458,0.127198,-0.167096,0.019547,0.06135,1.204778,0.234179,-0.005359,-0.234179,-0.069876,-0.083916,0.034851,0.222179,1.374716,0.155472,0.015639,-0.155472,0.023812,-0.117794,0.02567,-0.055911,1.248558,0.173101,0.018079,-0.173101,-0.005263,-0.101883,0.034554,-0.001152,1.388499,0.208919,0.026024,-0.208919,-0.016875,-0.138311,0.022116,-0.021641,166028,1
4,1.345095,0.194576,0.024924,-0.194576,0.110235,-0.099922,0.022979,0.028283,1.610861,0.247129,0.022058,-0.247129,0.100605,-0.130528,0.004133,0.052942,1.114196,0.148235,0.005799,-0.148235,0.013766,-0.044047,0.05834,0.064405,1.277774,0.2186,0.031302,-0.2186,0.082404,-0.080513,0.032223,0.114307,1.670754,0.233528,0.00223,-0.233528,0.041067,-0.182982,0.012957,-0.048999,...,0.005041,-0.196953,0.06513,-0.071199,0.028523,0.159709,1.486965,0.239481,-0.029803,-0.239481,-0.021512,-0.163365,0.027659,0.066444,3.225905,0.309997,-0.021754,-0.309997,0.184523,-0.225002,-0.014497,-0.152731,1.096512,0.136562,0.019791,-0.136562,-0.030003,-0.067005,0.034585,0.015573,1.569643,0.237945,0.020953,-0.237945,-0.015377,-0.166856,0.024456,-0.049577,166513,1


### RedEdge indices - add more indices from rededge bands (vegetation bands)

In [None]:
def s2_rededge_indices(df ,times, data_type='train'):
  """Compute Sentinel-2 red-edge spectral indices for each requested time step.

  For every `time` in `times` the columns `B03_time_<time>`, `B04_time_<time>`,
  `B05_time_<time>`, `B06_time_<time>`, `B07_time_<time>` and
  `B08_time_<time>` are read from `df` and 18 index columns are derived, in
  this order: NDVIre1-3, NDRE1-3, CIre1-3, MCARI1-3, TCARI1-3, MTCI1-3.
  The suffix 1/2/3 selects the red-edge band used (B05/B06/B07), or for
  NDRE/MTCI the band pairs (B06,B05), (B07,B05), (B07,B06).

  Args:
    df: DataFrame holding the band columns above plus `field_id`
      (and `label` when `data_type == 'train'`).
    times: iterable of time-step identifiers used in the column names.
    data_type: `'train'` copies the `label` column to the result; any other
      value omits it.

  Returns:
    A new DataFrame with the index columns followed by `field_id`
    (and `label` for train). `df` itself is not modified.

  Raises:
    KeyError: if a required band column is missing from `df`.
  """
  # Collect every column first and build the frame once: inserting thousands
  # of columns one by one leaves a highly fragmented DataFrame
  # (pandas PerformanceWarning) and is needlessly slow.
  features = {}
  for time in times:
    b03 = df[f'B03_time_{time}']
    b04 = df[f'B04_time_{time}']
    b05 = df[f'B05_time_{time}']
    b06 = df[f'B06_time_{time}']
    b07 = df[f'B07_time_{time}']
    b08 = df[f'B08_time_{time}']

    red_edges = (b05, b06, b07)
    # (upper, lower) red-edge pairs shared by NDRE and MTCI
    edge_pairs = ((b06, b05), (b07, b05), (b07, b06))

    # Red Edge Indices
    for k, edge in enumerate(red_edges, start=1):
      features[f'NDVIre{k}_time_{time}'] = (b08 - edge) / (b08 + edge)

    for k, (upper, lower) in enumerate(edge_pairs, start=1):
      features[f'NDRE{k}_time_{time}'] = (upper - lower) / (upper + lower)

    for k, edge in enumerate(red_edges, start=1):
      features[f'CIre{k}_time_{time}'] = (b08 / edge) - 1

    for k, edge in enumerate(red_edges, start=1):
      features[f'MCARI{k}_time_{time}'] = ((edge - b04) - 0.2*(edge - b03)) * (edge / b04)

    # TCARI (Haboudane et al., 2002): the band ratio scales only the
    # 0.2*(edge - B03) term. Multiplying the whole bracket instead would make
    # TCARI exactly 3*MCARI, i.e. a redundant feature.
    for k, edge in enumerate(red_edges, start=1):
      features[f'TCARI{k}_time_{time}'] = 3*((edge - b04) - 0.2*(edge - b03) * (edge / b04))

    for k, (upper, lower) in enumerate(edge_pairs, start=1):
      features[f'MTCI{k}_time_{time}'] = (upper - lower) / (lower - b04)

  rededge_df = pd.DataFrame(features)
  # Copied positionally (as lists) so the ids/labels line up row by row.
  rededge_df['field_id'] = list(df['field_id'])
  if data_type == 'train':
    rededge_df['label'] = list(df['label'])
  return rededge_df

In [None]:
# Sentinel-2 columns are numbered time_1 .. time_144
train_rededge_indices = s2_rededge_indices(s2_train_df, list(range(1, 145)))
test_rededge_indices = s2_rededge_indices(s2_test_df, list(range(1, 145)), data_type='test')

train_rededge_indices.head()

Unnamed: 0,NDVIre1_time_1,NDVIre2_time_1,NDVIre3_time_1,NDRE1_time_1,NDRE2_time_1,NDRE3_time_1,CIre1_time_1,CIre2_time_1,CIre3_time_1,MCARI1_time_1,MCARI2_time_1,MCARI3_time_1,TCARI1_time_1,TCARI2_time_1,TCARI3_time_1,MTCI1_time_1,MTCI2_time_1,MTCI3_time_1,NDVIre1_time_2,NDVIre2_time_2,NDVIre3_time_2,NDRE1_time_2,NDRE2_time_2,NDRE3_time_2,CIre1_time_2,CIre2_time_2,CIre3_time_2,MCARI1_time_2,MCARI2_time_2,MCARI3_time_2,TCARI1_time_2,TCARI2_time_2,TCARI3_time_2,MTCI1_time_2,MTCI2_time_2,MTCI3_time_2,NDVIre1_time_3,NDVIre2_time_3,NDVIre3_time_3,NDRE1_time_3,...,MTCI2_time_142,MTCI3_time_142,NDVIre1_time_143,NDVIre2_time_143,NDVIre3_time_143,NDRE1_time_143,NDRE2_time_143,NDRE3_time_143,CIre1_time_143,CIre2_time_143,CIre3_time_143,MCARI1_time_143,MCARI2_time_143,MCARI3_time_143,TCARI1_time_143,TCARI2_time_143,TCARI3_time_143,MTCI1_time_143,MTCI2_time_143,MTCI3_time_143,NDVIre1_time_144,NDVIre2_time_144,NDVIre3_time_144,NDRE1_time_144,NDRE2_time_144,NDRE3_time_144,CIre1_time_144,CIre2_time_144,CIre3_time_144,MCARI1_time_144,MCARI2_time_144,MCARI3_time_144,TCARI1_time_144,TCARI2_time_144,TCARI3_time_144,MTCI1_time_144,MTCI2_time_144,MTCI3_time_144,field_id,label
0,0.040899,0.013048,0.018518,0.027865,0.022398,-0.005471,0.085285,0.026441,0.037734,125.762629,257.257852,229.783524,377.287888,771.773556,689.350573,1.357455,1.084991,-0.115575,0.007833,0.006717,0.006839,0.001115,0.000993,-0.000122,0.015789,0.013526,0.013773,92.186435,96.732882,96.234874,276.559305,290.198647,288.704621,0.068207,0.06075,-0.006981,0.140175,0.066623,0.047847,0.074245,...,-2.313,3953.878947,0.035049,0.033817,0.038902,0.001233,-0.003859,-0.005092,0.072644,0.070001,0.080954,90.956811,100.040718,62.957371,272.870433,300.122154,188.872114,0.079252,-0.246718,-0.302033,-0.016882,-0.000147,0.006268,-0.016735,-0.023148,-0.006415,-0.033204,-0.000294,0.012615,297.335814,67.054091,-15.327543,892.007442,201.162272,-45.982629,-0.70625,-0.970756,-0.900446,121094,1
1,0.051195,0.024476,0.023319,0.026753,0.027909,0.001157,0.107915,0.050179,0.047752,109.252614,202.085068,206.422253,327.757841,606.255203,619.266758,0.825424,0.86212,0.020103,-0.031797,-0.020213,-0.013764,-0.011591,-0.01804,-0.006451,-0.061634,-0.039626,-0.027155,109.279656,59.638049,33.440832,327.838968,178.914147,100.322497,-0.71185,-1.100913,-1.350208,0.090596,0.060549,0.052877,0.030212,...,6.009658,0.536681,0.032204,0.031141,0.033955,0.001064,-0.001753,-0.002817,0.066551,0.064283,0.070297,86.704961,96.970813,69.939866,260.114882,290.912439,209.819598,0.073509,-0.120712,-0.180922,-0.006418,0.007229,0.015185,-0.013646,-0.0216,-0.007957,-0.012753,0.014564,0.030838,230.320897,92.930346,17.789478,690.962691,278.791037,53.368435,-0.629412,-0.98855,-0.969102,165496,1
2,-0.003243,-0.000331,0.004058,-0.002912,-0.007301,-0.004389,-0.006465,-0.000662,0.008149,97.698088,82.729537,60.647817,293.094264,248.188611,181.943451,-0.159,-0.396896,-0.282872,-0.010262,-0.005151,-0.002806,-0.005112,-0.007456,-0.002344,-0.020315,-0.010248,-0.005597,503.487858,375.96181,318.741793,1510.463575,1127.88543,956.22538,-0.165628,-0.241026,-0.090365,0.106978,0.040685,0.021841,0.066582,...,-4.106082,1.237344,0.02651,0.035762,0.044287,-0.009261,-0.017798,-0.008538,0.054463,0.074178,0.092679,64.182907,-60.789084,-169.974439,192.548722,-182.367252,-509.923318,-0.922647,-1.758234,-10.802247,0.009744,0.029532,0.041122,-0.019793,-0.03139,-0.011604,0.01968,0.060861,0.08577,150.289312,23.842531,-43.648736,450.867937,71.527594,-130.946209,-1.274703,-1.99882,2.635997,165687,1
3,0.016285,0.018046,0.022252,-0.001762,-0.005969,-0.004207,0.03311,0.036756,0.045516,135.793594,123.381564,94.255953,407.380782,370.144693,282.767859,-0.110329,-0.37222,-0.294369,0.019043,0.014002,0.013089,0.005043,0.005956,0.000913,0.038826,0.028401,0.026525,288.00718,352.608654,364.507019,864.021541,1057.825962,1093.521056,0.171887,0.203191,0.026712,0.055163,0.048769,0.052941,0.006411,...,-1.81545,16.708366,0.033979,0.031897,0.034554,0.002084,-0.000576,-0.00266,0.070347,0.065896,0.071582,111.223935,127.292351,106.814599,333.671806,381.877054,320.443796,0.117597,-0.032437,-0.134247,0.011179,0.016589,0.022116,-0.005411,-0.010939,-0.005528,0.022612,0.033738,0.045231,219.553419,166.031806,113.071806,658.660258,498.095419,339.215417,-0.212175,-0.426608,-0.272183,166028,1
4,0.036912,0.019551,0.022979,0.017374,0.013944,-0.00343,0.076653,0.039881,0.047039,116.123818,197.670272,180.924157,348.371454,593.010815,542.772471,0.727072,0.581538,-0.084266,0.029918,0.012163,0.004133,0.017761,0.025788,0.008031,0.061681,0.024625,0.008299,68.290847,111.994436,133.246044,204.872542,335.983307,399.738131,0.837861,1.226548,0.211489,0.089375,0.062683,0.05834,0.026843,...,3.434014,0.48551,0.0423,0.033757,0.034585,0.008555,0.007726,-0.000829,0.088337,0.069874,0.071648,99.521491,170.66049,163.614112,298.564473,511.98147,490.842335,0.444618,0.401226,-0.030037,-0.000963,0.013909,0.024456,-0.014871,-0.025419,-0.010552,-0.001924,0.028209,0.050139,229.292575,68.746291,-36.864834,687.877724,206.238872,-110.594503,-0.713984,-1.207827,-1.726629,166513,1


In [None]:
## Drop the Sentinel-2 bands that also exist in the Planet data
bands = ['B02','B03','B04','B08']
cols = [f'{band}_time_{step}' for band in bands for step in range(1, 145)]
s2_train_df = s2_train_df.drop(columns=cols)
s2_test_df = s2_test_df.drop(columns=cols)

print(s2_train_df.shape)
print(s2_test_df.shape)

(2532, 1154)
(2064, 1153)


In [None]:
# Sentinel-2 train: join the vegetation and red-edge index tables on field and label
s2_train_df = (
  s2_train_df
  .merge(train_veg_indices, on=['field_id','label'], how='inner')
  .merge(train_rededge_indices, on=['field_id','label'], how='inner')
)
s2_train_df.to_csv(f"{df_path}/all_s2_train_df.csv", index=False)

# Sentinel-2 test: same joins, keyed on field only (the test frames are merged without `label`)
s2_test_df = (
  s2_test_df
  .merge(test_veg_indices, on=['field_id'], how='inner')
  .merge(test_rededge_indices, on=['field_id'], how='inner')
)
s2_test_df.to_csv(f"{df_path}/all_s2_test_df.csv", index=False)

s2_train_df.head()

Unnamed: 0,field_id,B01_time_1,B05_time_1,B06_time_1,B07_time_1,B8A_time_1,B09_time_1,B11_time_1,B12_time_1,B01_time_2,B05_time_2,B06_time_2,B07_time_2,B8A_time_2,B09_time_2,B11_time_2,B12_time_2,B01_time_3,B05_time_3,B06_time_3,B07_time_3,B8A_time_3,B09_time_3,B11_time_3,B12_time_3,B01_time_4,B05_time_4,B06_time_4,B07_time_4,B8A_time_4,B09_time_4,B11_time_4,B12_time_4,B01_time_5,B05_time_5,B06_time_5,B07_time_5,B8A_time_5,B09_time_5,B11_time_5,...,TCARI3_time_142,MTCI1_time_142,MTCI2_time_142,MTCI3_time_142,NDVIre1_time_143,NDVIre2_time_143,NDVIre3_time_143,NDRE1_time_143,NDRE2_time_143,NDRE3_time_143,CIre1_time_143,CIre2_time_143,CIre3_time_143,MCARI1_time_143,MCARI2_time_143,MCARI3_time_143,TCARI1_time_143,TCARI2_time_143,TCARI3_time_143,MTCI1_time_143,MTCI2_time_143,MTCI3_time_143,NDVIre1_time_144,NDVIre2_time_144,NDVIre3_time_144,NDRE1_time_144,NDRE2_time_144,NDRE3_time_144,CIre1_time_144,CIre2_time_144,CIre3_time_144,MCARI1_time_144,MCARI2_time_144,MCARI3_time_144,TCARI1_time_144,TCARI2_time_144,TCARI3_time_144,MTCI1_time_144,MTCI2_time_144,MTCI3_time_144
0,121094,3822.568,2454.766,2595.494,2567.2476,2542.249,2153.8801,1233.1891,772.88916,3528.5916,2344.9028,2350.1392,2349.5667,2268.649,1987.5536,1399.3394,991.9104,2692.1885,2823.6792,3276.5957,3402.4004,3492.8528,2904.61,2759.213,2563.0083,8667.653,7749.298,8518.308,8627.324,8675.181,10597.699,5493.732,4714.4204,8882.884,7720.5303,7538.8486,7421.24,7338.408,5934.457,4679.707,...,-25.907551,-1.000332,-2.313,3953.878947,0.035049,0.033817,0.038902,0.001233,-0.003859,-0.005092,0.072644,0.070001,0.080954,90.956811,100.040718,62.957371,272.870433,300.122154,188.872114,0.079252,-0.246718,-0.302033,-0.016882,-0.000147,0.006268,-0.016735,-0.023148,-0.006415,-0.033204,-0.000294,0.012615,297.335814,67.054091,-15.327543,892.007442,201.162272,-45.982629,-0.70625,-0.970756,-0.900446
1,165496,2444.2097,1746.6251,1842.6497,1846.9187,1875.3254,1592.8134,1260.1198,991.6572,4059.0557,2546.7883,2488.426,2456.528,2358.276,2363.9595,1164.1466,726.82086,4115.821,3894.0317,4136.658,4200.83,4258.538,3856.8054,3585.0312,3318.3296,7488.0005,7167.703,7746.467,7775.3228,7813.6235,9038.668,5701.6113,4869.6177,9481.183,8262.272,8030.0835,7875.484,7788.585,7806.93,5610.652,...,-263.574001,3.561557,6.009658,0.536681,0.032204,0.031141,0.033955,0.001064,-0.001753,-0.002817,0.066551,0.064283,0.070297,86.704961,96.970813,69.939866,260.114882,290.912439,209.819598,0.073509,-0.120712,-0.180922,-0.006418,0.007229,0.015185,-0.013646,-0.0216,-0.007957,-0.012753,0.014564,0.030838,230.320897,92.930346,17.789478,690.962691,278.791037,53.368435,-0.629412,-0.98855,-0.969102
2,165687,4300.2734,3004.0,2986.5547,2960.4531,3000.586,3175.289,1983.4766,1623.8359,12709.789,14264.602,14119.516,14053.469,13873.008,15741.57,8763.453,6886.742,1966.6016,1906.1562,2178.0938,2261.836,2338.0078,1833.1719,1909.0469,1609.1953,2548.2344,2913.4297,3418.5469,3575.5469,3802.3281,3411.4219,3223.336,2490.461,10961.023,10158.117,9926.75,9799.305,9788.141,8672.977,6430.0312,...,-297.623138,-2.38829,-4.106082,1.237344,0.02651,0.035762,0.044287,-0.009261,-0.017798,-0.008538,0.054463,0.074178,0.092679,64.182907,-60.789084,-169.974439,192.548722,-182.367252,-509.923318,-0.922647,-1.758234,-10.802247,0.009744,0.029532,0.041122,-0.019793,-0.03139,-0.011604,0.01968,0.060861,0.08577,150.289312,23.842531,-43.648736,450.867937,71.527594,-130.946209,-1.274703,-1.99882,2.635997
3,166028,5637.3276,4120.636,4106.1426,4071.739,4063.0957,5176.7485,2575.0322,2271.7961,7167.6567,7085.44,7157.267,7170.348,7131.3276,7606.131,5144.5195,3835.0999,8097.0938,6728.5874,6815.423,6758.655,6698.0625,6084.665,5167.607,4315.8223,959.7526,2113.7473,2979.4573,3220.6404,3428.4282,1831.1112,2836.5945,2054.1912,5392.4272,3550.7275,3376.053,3256.081,3137.0083,2762.2017,1636.3524,...,-258.320441,-1.046049,-1.81545,16.708366,0.033979,0.031897,0.034554,0.002084,-0.000576,-0.00266,0.070347,0.065896,0.071582,111.223935,127.292351,106.814599,333.671806,381.877054,320.443796,0.117597,-0.032437,-0.134247,0.011179,0.016589,0.022116,-0.005411,-0.010939,-0.005528,0.022612,0.033738,0.045231,219.553419,166.031806,113.071806,658.660258,498.095419,339.215417,-0.212175,-0.426608,-0.272183
4,166513,3463.5852,2515.372,2604.319,2586.515,2597.3494,2451.7751,1825.9476,1505.064,2059.1777,1316.0975,1363.6938,1385.7739,1351.0142,1049.0452,843.511,602.31976,4749.586,4243.3916,4477.4834,4516.686,4553.7085,4216.984,3765.6558,3471.274,8404.627,7396.8667,8139.51,8242.377,8329.189,10383.05,5637.194,4766.5605,8290.138,6802.4316,6607.612,6469.1216,6367.3027,5508.4297,4125.2163,...,-387.796062,1.984843,3.434014,0.48551,0.0423,0.033757,0.034585,0.008555,0.007726,-0.000829,0.088337,0.069874,0.071648,99.521491,170.66049,163.614112,298.564473,511.98147,490.842335,0.444618,0.401226,-0.030037,-0.000963,0.013909,0.024456,-0.014871,-0.025419,-0.010552,-0.001924,0.028209,0.050139,229.292575,68.746291,-36.864834,687.877724,206.238872,-110.594503,-0.713984,-1.207827,-1.726629


### Compute more Features - Planet Indices
#### Vegetation Indices

* ```MNDVI: (NIR - B03)/(NIR + B03 - 2*B01)```
* ```NDVI: (NIR - B03)/(NIR + B03)```
* ```NDWI: (B02 - NIR) / (B02 + NIR)```
* ```GNDVI: (NIR - B02) / (NIR + B02)```
* ```EVI2:  2.4 * (NIR - B03) / (NIR + B03 + 1.0)```
* ```EVI:   2.5 * (NIR - B03) / ((NIR + 6.0 * B03 - 7.5 * B01) + 1.0)```
* ```CVI: (NIR / B02) * (B03 / B02)```
* ```BI: (B03**2 + B02**2 + B01*2) /3```
* ```SI: (B03 - B01) /(B03 + B01)```

In [None]:
def planet_veg_indices(df ,times, data_type='train'):
  """Compute Planet vegetation indices for each requested time step.

  For every `time` in `times` the columns `B01_time_<time>`, `B02_time_<time>`,
  `B03_time_<time>` and `NIR_time_<time>` are read from `df` and nine index
  columns are derived, in this order: MNDVI, NDVI, NDWI, GNDVI, EVI2, EVI,
  CVI, BI, SI.

  Args:
    df: DataFrame holding the band columns above plus `field_id`
      (and `label` when `data_type == 'train'`).
    times: iterable of time-step identifiers used in the column names.
    data_type: `'train'` copies the `label` column to the result; any other
      value omits it.

  Returns:
    A new DataFrame with the index columns followed by `field_id`
    (and `label` for train). `df` itself is not modified.

  Raises:
    KeyError: if a required band column is missing from `df`.
  """
  # Collect every column first and build the frame once: inserting hundreds
  # of columns one by one leaves a highly fragmented DataFrame
  # (pandas PerformanceWarning) and is needlessly slow.
  features = {}
  for time in times:
    b01 = df[f'B01_time_{time}']
    b02 = df[f'B02_time_{time}']
    b03 = df[f'B03_time_{time}']
    nir = df[f'NIR_time_{time}']

    features[f'MNDVI_time_{time}'] = (nir - b03) / (nir + b03 - 2*b01)
    features[f'NDVI_time_{time}'] = (nir - b03) / (nir + b03)
    features[f'NDWI_time_{time}'] = (b02 - nir) / (b02 + nir)
    # GNDVI is the exact negation of NDWI as defined here.
    features[f'GNDVI_time_{time}'] = (nir - b02) / (nir + b02)
    features[f'EVI2_time_{time}'] = 2.4*((nir - b03) / (nir + b03 + 1.0))
    features[f'EVI_time_{time}'] = 2.5*((nir - b03) / (nir + 6 * b03 - 7.5 * b01 + 1.0))
    features[f'CVI_time_{time}'] = (nir / b02) * (b03 / b02)
    # NOTE(review): `b01*2` (times two) looks like a typo for `b01**2`, but it
    # matches the formula documented in the notebook, so it is kept as-is -
    # confirm the intended definition before changing the feature.
    features[f'BI_time_{time}'] = (b03**2 + b02**2 + b01*2) / 3
    features[f'SI_time_{time}'] = (b03 - b01) / (b03 + b01)

  veg_df = pd.DataFrame(features)
  # Copied positionally (as lists) so the ids/labels line up row by row.
  veg_df['field_id'] = list(df['field_id'])
  if data_type == 'train':
    veg_df['label'] = list(df['label'])
  return veg_df


In [None]:
# Planet 5-day columns are numbered time_1 .. time_73
planet_train_veg_indices = planet_veg_indices(planet_5days_train_df, list(range(1, 74)))
planet_test_veg_indices = planet_veg_indices(planet_5days_test_df, list(range(1, 74)), data_type='test')

planet_train_veg_indices.head()

Unnamed: 0,MNDVI_time_1,NDVI_time_1,NDWI_time_1,GNDVI_time_1,EVI2_time_1,EVI_time_1,CVI_time_1,BI_time_1,SI_time_1,MNDVI_time_2,NDVI_time_2,NDWI_time_2,GNDVI_time_2,EVI2_time_2,EVI_time_2,CVI_time_2,BI_time_2,SI_time_2,MNDVI_time_3,NDVI_time_3,NDWI_time_3,GNDVI_time_3,EVI2_time_3,EVI_time_3,CVI_time_3,BI_time_3,SI_time_3,MNDVI_time_4,NDVI_time_4,NDWI_time_4,GNDVI_time_4,EVI2_time_4,EVI_time_4,CVI_time_4,BI_time_4,SI_time_4,MNDVI_time_5,NDVI_time_5,NDWI_time_5,GNDVI_time_5,...,BI_time_69,SI_time_69,MNDVI_time_70,NDVI_time_70,NDWI_time_70,GNDVI_time_70,EVI2_time_70,EVI_time_70,CVI_time_70,BI_time_70,SI_time_70,MNDVI_time_71,NDVI_time_71,NDWI_time_71,GNDVI_time_71,EVI2_time_71,EVI_time_71,CVI_time_71,BI_time_71,SI_time_71,MNDVI_time_72,NDVI_time_72,NDWI_time_72,GNDVI_time_72,EVI2_time_72,EVI_time_72,CVI_time_72,BI_time_72,SI_time_72,MNDVI_time_73,NDVI_time_73,NDWI_time_73,GNDVI_time_73,EVI2_time_73,EVI_time_73,CVI_time_73,BI_time_73,SI_time_73,field_id,label
0,0.789569,0.466436,-0.503388,0.503388,1.11917,1.458598,3.334506,711757.63833,0.131851,0.791195,0.46656,-0.5031,0.5031,1.119467,1.466829,3.328308,713882.398489,0.130469,0.770074,0.446546,-0.49153,0.49153,1.071448,1.380944,3.292179,767538.119771,0.136946,0.778189,0.456226,-0.499613,0.499613,1.094668,1.411269,3.353795,711524.104607,0.135811,0.797266,0.479733,-0.519254,0.519254,...,341048.376519,0.268211,0.598939,0.358824,-0.469943,0.469943,0.860845,0.815545,3.62886,362557.417669,0.230575,0.618231,0.367669,-0.472609,0.472609,0.882071,0.86602,3.604735,370001.152108,0.218811,0.63377,0.37553,-0.475324,0.475324,0.900935,0.908352,3.589501,377399.517586,0.210287,0.520695,0.320829,-0.467135,0.467135,0.769692,0.637123,3.897973,385016.064199,0.27782,121094,1
1,0.70535,0.38963,-0.460252,0.460252,0.934882,1.149156,3.214896,869635.176478,0.153843,0.702502,0.386821,-0.457022,0.457022,0.928141,1.140301,3.183727,882718.837655,0.154169,0.669186,0.358733,-0.440767,0.440767,0.860753,1.038381,3.132611,994662.99297,0.160461,0.687431,0.377503,-0.451491,0.451491,0.905783,1.090038,3.164516,902264.321138,0.159917,0.718147,0.408698,-0.471468,0.471468,...,208574.95399,0.027207,0.97527,0.714982,-0.672189,0.672189,1.715453,2.510444,4.324505,188000.597188,0.032849,0.96893,0.718448,-0.680495,0.680495,1.723777,2.444556,4.532524,184881.472874,0.042658,0.96809,0.72063,-0.684251,0.684251,1.72901,2.43406,4.619793,180213.781859,0.0444,0.960883,0.711326,-0.673104,0.673104,1.706725,2.377374,4.418773,227117.092382,0.052805,165496,1
2,0.709501,0.396034,-0.489543,0.489543,0.949973,1.159826,3.683866,173077.459352,0.155054,0.734588,0.435133,-0.535333,0.535333,1.043797,1.230608,4.297093,168547.15236,0.16166,0.72736,0.434556,-0.538124,0.538124,1.042456,1.201122,4.371221,200118.967205,0.168271,0.718987,0.432133,-0.538328,0.538328,1.036677,1.169245,4.402446,234188.937625,0.174691,0.724366,0.439617,-0.547108,0.547108,...,84304.931275,0.047879,0.933192,0.537721,-0.535405,0.535405,1.289705,2.37027,3.283391,87790.835265,0.043446,0.936715,0.545686,-0.548067,0.548067,1.308834,2.389257,3.448783,88934.076356,0.04229,0.927122,0.545917,-0.555807,0.555807,1.309402,2.297911,3.603435,90240.808816,0.049595,0.903224,0.564053,-0.584887,0.584887,1.352983,2.066885,4.063003,98916.525046,0.074478,165687,1
3,0.870226,0.628149,-0.640277,0.640277,1.507024,1.770795,4.748686,176671.607486,0.144108,0.877396,0.635755,-0.645788,0.645788,1.525279,1.809854,4.807256,175249.925923,0.138885,0.85026,0.602421,-0.623994,0.623994,1.445323,1.67114,4.628363,216592.940902,0.153967,0.806644,0.557104,-0.600392,0.600392,1.33662,1.469967,4.562126,280896.529693,0.177521,0.810494,0.567656,-0.611929,0.611929,...,247136.772406,0.068636,0.958451,0.734928,-0.679429,0.679429,1.763391,2.330503,4.19329,244517.205309,0.063937,0.954904,0.724948,-0.673435,0.673435,1.739444,2.309288,4.187141,255780.367034,0.066365,0.950768,0.710953,-0.665668,0.665668,1.705866,2.287524,4.193261,277233.5884,0.068013,0.931723,0.681177,-0.648368,0.648368,1.63442,2.158612,4.167423,314448.537166,0.084931,166028,1
4,0.861764,0.553602,-0.574516,0.574516,1.328297,1.785174,3.934689,459957.918998,0.110453,0.861651,0.552456,-0.573751,0.573751,1.325548,1.785642,3.929718,461812.783789,0.110002,0.825032,0.513334,-0.553842,0.553842,1.231681,1.603826,3.900607,526627.902967,0.125933,0.818699,0.509413,-0.552394,0.552394,1.222268,1.572133,3.909477,512112.213229,0.12991,0.843997,0.535591,-0.567358,0.567358,...,249420.648553,0.074998,0.944529,0.680139,-0.653946,0.653946,1.631845,2.266337,4.348831,210949.565892,0.066598,0.93503,0.678719,-0.659325,0.659325,1.628441,2.187874,4.540347,209677.230101,0.079207,0.932106,0.678478,-0.662097,0.662097,1.627858,2.164276,4.634713,203074.274822,0.083252,0.886744,0.629583,-0.63477,0.63477,1.510564,1.876425,4.55401,272029.559585,0.121756,166513,1


#### FLOWERING PHENOLOGY
Flowering is an essential phenological period, so we added indices that reflect the spectral response of different flowers, since crops differ in flower color.

* ```Normalized Difference Yellowness Index "NDYI" ```
* ```Yellowness ratio "DYI" ```
* ``` Normalized Difference Greenness Index "NDGI" ``` 
* ``` Normalized Difference Purpleness Index "NDPI" ```
* ```The Enhanced Bloom Index for yellow flowers 'YEBI'``` 
* ```The Enhanced Bloom Index for purple flowers 'PEBI'``` 

In [None]:
def planet_bloom_indices(df ,times, data_type='train'):
  """Compute Planet flowering ("bloom") indices for each requested time step.

  For every `time` in `times` the columns `B01_time_<time>`, `B02_time_<time>`,
  `B03_time_<time>` and `NIR_time_<time>` are read from `df` and six index
  columns are derived, in this order: NDGI, DYI, NDPI, PEBI, NDYI, YEBI.
  PEBI and YEBI divide NDPI / NDYI by `(NDGI + 1) * NIR`.

  Args:
    df: DataFrame holding the band columns above plus `field_id`
      (and `label` when `data_type == 'train'`).
    times: iterable of time-step identifiers used in the column names.
    data_type: `'train'` copies the `label` column to the result; any other
      value omits it.

  Returns:
    A new DataFrame with the index columns followed by `field_id`
    (and `label` for train). `df` itself is not modified.

  Raises:
    KeyError: if a required band column is missing from `df`.
  """
  # Collect every column first and build the frame once: inserting hundreds
  # of columns one by one leaves a highly fragmented DataFrame
  # (pandas PerformanceWarning) and is needlessly slow.
  features = {}
  for time in times:
    b01 = df[f'B01_time_{time}']
    b02 = df[f'B02_time_{time}']
    b03 = df[f'B03_time_{time}']
    nir = df[f'NIR_time_{time}']

    # Blooming Indices (to detect flowers colors (purple, yellow) of different crops)
    ndgi = (b03 - b02) / (b03 + b02)
    ndpi = (0.5*(b03 + b01) - b02) / (0.5*(b03 + b01) + b02)
    ndyi = (0.5*(b03 + b02) - b01) / (0.5*(b03 + b02) + b01)

    features[f'NDGI_time_{time}'] = ndgi
    features[f'DYI_time_{time}'] = b03 / b02
    features[f'NDPI_time_{time}'] = ndpi
    # Enhanced bloom indices: colour index normalised by (NDGI + 1) * NIR
    features[f'PEBI_time_{time}'] = ndpi / ((ndgi + 1) * nir)
    features[f'NDYI_time_{time}'] = ndyi
    features[f'YEBI_time_{time}'] = ndyi / ((ndgi + 1) * nir)

  bloom_df = pd.DataFrame(features)
  # Copied positionally (as lists) so the ids/labels line up row by row.
  bloom_df['field_id'] = list(df['field_id'])
  if data_type == 'train':
    bloom_df['label'] = list(df['label'])
  return bloom_df

In [None]:
# Planet 5-day columns are numbered time_1 .. time_73
planet_train_bloom_indices = planet_bloom_indices(planet_5days_train_df, list(range(1, 74)))
planet_test_bloom_indices = planet_bloom_indices(planet_5days_test_df, list(range(1, 74)), data_type='test')

planet_train_bloom_indices.head()

Unnamed: 0,NDGI_time_1,DYI_time_1,NDPI_time_1,PEBI_time_1,NDYI_time_1,YEBI_time_1,NDGI_time_2,DYI_time_2,NDPI_time_2,PEBI_time_2,NDYI_time_2,YEBI_time_2,NDGI_time_3,DYI_time_3,NDPI_time_3,PEBI_time_3,NDYI_time_3,YEBI_time_3,NDGI_time_4,DYI_time_4,NDPI_time_4,PEBI_time_4,NDYI_time_4,YEBI_time_4,NDGI_time_5,DYI_time_5,NDPI_time_5,PEBI_time_5,NDYI_time_5,YEBI_time_5,NDGI_time_6,DYI_time_6,NDPI_time_6,PEBI_time_6,NDYI_time_6,YEBI_time_6,NDGI_time_7,DYI_time_7,NDPI_time_7,PEBI_time_7,...,NDYI_time_67,YEBI_time_67,NDGI_time_68,DYI_time_68,NDPI_time_68,PEBI_time_68,NDYI_time_68,YEBI_time_68,NDGI_time_69,DYI_time_69,NDPI_time_69,PEBI_time_69,NDYI_time_69,YEBI_time_69,NDGI_time_70,DYI_time_70,NDPI_time_70,PEBI_time_70,NDYI_time_70,YEBI_time_70,NDGI_time_71,DYI_time_71,NDPI_time_71,PEBI_time_71,NDYI_time_71,YEBI_time_71,NDGI_time_72,DYI_time_72,NDPI_time_72,PEBI_time_72,NDYI_time_72,YEBI_time_72,NDGI_time_73,DYI_time_73,NDPI_time_73,PEBI_time_73,NDYI_time_73,YEBI_time_73,field_id,label
0,0.048291,1.101482,-0.013598,-4e-06,0.108613,3.5e-05,0.047748,1.100284,-0.013531,-4e-06,0.107479,3.4e-05,0.057635,1.122319,-0.006474,-2.067956e-06,0.109355,3.5e-05,0.056197,1.119086,-0.007417,-2.408043e-06,0.108885,3.5e-05,0.052632,1.111113,-0.009668,-3e-06,0.107532,3.6e-05,0.049943,1.105136,-0.011773,-3.994239e-06,0.10745,3.6e-05,0.041901,1.087467,-0.01726,-6e-06,...,0.216993,0.000104,0.185811,1.456432,0.05774,3e-05,0.217926,0.000113,0.166176,1.398587,0.048889,2.6e-05,0.19552,0.000103,0.133657,1.308555,0.030711,1.5e-05,0.170393,8.6e-05,0.12701,1.290976,0.028753,1.4e-05,0.161205,7.9e-05,0.121478,1.27655,0.026646,1.3e-05,0.154891,7.5e-05,0.172099,1.415747,0.051206,2.6e-05,0.203058,0.000102,121094,1
1,0.086053,1.188311,0.014716,5e-06,0.113311,3.7e-05,0.085278,1.186455,0.013794,5e-06,0.113993,3.7e-05,0.097441,1.215921,0.023338,7.525132e-06,0.114861,3.7e-05,0.08919,1.195849,0.015252,5.015703e-06,0.118032,3.9e-05,0.077752,1.168614,0.005029,2e-06,0.120203,4.1e-05,0.074615,1.161263,0.002544,8.779713e-07,0.120075,4.1e-05,0.072346,1.155975,0.006119,2e-06,...,0.155513,5.1e-05,-0.081791,0.848785,-0.10424,-3.5e-05,0.088796,3e-05,-0.091705,0.831997,-0.104996,-3.8e-05,0.075165,2.7e-05,-0.08239,0.847763,-0.098418,-3.7e-05,0.075707,2.8e-05,-0.074259,0.861749,-0.094995,-3.5e-05,0.081086,3e-05,-0.071766,0.866079,-0.093338,-3.4e-05,0.081483,3e-05,-0.073334,0.863353,-0.098871,-3.3e-05,0.090685,3.1e-05,165496,1
2,0.115998,1.262439,0.04442,3.1e-05,0.101094,6.9e-05,0.130628,1.300512,0.056394,3.5e-05,0.101354,6.3e-05,0.135178,1.312615,0.058182,3.280151e-05,0.106092,6e-05,0.138388,1.321231,0.058712,3.061901e-05,0.111232,5.8e-05,0.141532,1.32973,0.061588,3e-05,0.110636,5.4e-05,0.142959,1.33361,0.063032,2.918553e-05,0.110037,5.1e-05,0.146471,1.343212,0.065339,3e-05,...,0.083988,6.1e-05,0.011998,1.024287,-0.022292,-1.7e-05,0.06506,5e-05,0.003924,1.007879,-0.019458,-1.6e-05,0.045925,3.8e-05,-0.003253,0.993515,-0.024513,-2e-05,0.045072,3.8e-05,0.003397,1.006816,-0.017312,-1.4e-05,0.040598,3.2e-05,0.014198,1.028805,-0.010003,-8e-06,0.042561,3.3e-05,0.031092,1.06418,-0.004815,-3e-06,0.059237,4e-05,165687,1
3,0.020288,1.041415,-0.046987,-2e-05,0.13426,5.7e-05,0.017022,1.034633,-0.047965,-2e-05,0.130599,5.5e-05,0.034568,1.071611,-0.037004,-1.506819e-05,0.137336,5.6e-05,0.065043,1.139136,-0.016569,-6.417844e-06,0.146845,5.7e-05,0.067837,1.145548,-0.015378,-6e-06,0.149413,5.6e-05,0.071841,1.154802,-0.012357,-4.604656e-06,0.149979,5.6e-05,0.076904,1.166622,-0.006722,-2e-06,...,0.130695,4.2e-05,-0.088253,0.837808,-0.128704,-4e-05,0.130971,4.1e-05,-0.105277,0.809501,-0.137974,-4.4e-05,0.123727,4e-05,-0.110852,0.80042,-0.141345,-4.5e-05,0.122157,3.9e-05,-0.100652,0.817105,-0.132341,-4.2e-05,0.11894,3.8e-05,-0.085971,0.841669,-0.118524,-3.7e-05,0.112585,3.5e-05,-0.058761,0.889,-0.099259,-3.1e-05,0.114906,3.6e-05,166028,1
4,0.030669,1.063278,-0.021703,-7e-06,0.09551,3.1e-05,0.031177,1.064361,-0.02099,-7e-06,0.094812,3.1e-05,0.056599,1.11999,-0.002646,-8.59328e-07,0.098754,3.2e-05,0.059811,1.127232,-0.001187,-3.927029e-07,0.101255,3.4e-05,0.045635,1.095633,-0.010685,-4e-06,0.097251,3.3e-05,0.041927,1.087523,-0.013921,-4.835172e-06,0.097934,3.4e-05,0.04042,1.084244,-0.014654,-5e-06,...,0.170065,5.8e-05,-0.025453,0.950357,-0.074714,-2.6e-05,0.116569,4e-05,-0.044491,0.914809,-0.080505,-2.9e-05,0.097583,3.6e-05,-0.047174,0.909902,-0.079279,-3e-05,0.090609,3.4e-05,-0.035102,0.932177,-0.073099,-2.7e-05,0.096934,3.6e-05,-0.029742,0.942235,-0.069621,-2.6e-05,0.098224,3.6e-05,0.00864,1.01743,-0.048769,-1.7e-05,0.117517,4.1e-05,166513,1


In [None]:
# Planet train: join the vegetation and bloom index tables on field and label
planet_5days_train_df = (
  planet_5days_train_df
  .merge(planet_train_veg_indices, on=['field_id','label'], how='inner')
  .merge(planet_train_bloom_indices, on=['field_id','label'], how='inner')
)
planet_5days_train_df.to_csv(f"{df_path}/all_planet_5days_train_df.csv", index=False)

# Planet test: same joins, keyed on field only (the test frames are merged without `label`)
planet_5days_test_df = (
  planet_5days_test_df
  .merge(planet_test_veg_indices, on=['field_id'], how='inner')
  .merge(planet_test_bloom_indices, on=['field_id'], how='inner')
)
planet_5days_test_df.to_csv(f"{df_path}/all_planet_5days_test_df.csv", index=False)

planet_5days_train_df.head()

Unnamed: 0,field_id,B01_time_1,B02_time_1,B03_time_1,NIR_time_1,B01_time_2,B02_time_2,B03_time_2,NIR_time_2,B01_time_3,B02_time_3,B03_time_3,NIR_time_3,B01_time_4,B02_time_4,B03_time_4,NIR_time_4,B01_time_5,B02_time_5,B03_time_5,NIR_time_5,B01_time_6,B02_time_6,B03_time_6,NIR_time_6,B01_time_7,B02_time_7,B03_time_7,NIR_time_7,B01_time_8,B02_time_8,B03_time_8,NIR_time_8,B01_time_9,B02_time_9,B03_time_9,NIR_time_9,B01_time_10,B02_time_10,B03_time_10,...,NDPI_time_67,PEBI_time_67,NDYI_time_67,YEBI_time_67,NDGI_time_68,DYI_time_68,NDPI_time_68,PEBI_time_68,NDYI_time_68,YEBI_time_68,NDGI_time_69,DYI_time_69,NDPI_time_69,PEBI_time_69,NDYI_time_69,YEBI_time_69,NDGI_time_70,DYI_time_70,NDPI_time_70,PEBI_time_70,NDYI_time_70,YEBI_time_70,NDGI_time_71,DYI_time_71,NDPI_time_71,PEBI_time_71,NDYI_time_71,YEBI_time_71,NDGI_time_72,DYI_time_72,NDPI_time_72,PEBI_time_72,NDYI_time_72,YEBI_time_72,NDGI_time_73,DYI_time_73,NDPI_time_73,PEBI_time_73,NDYI_time_73,YEBI_time_73
0,121094,829.514192,981.841604,1081.480907,2972.319504,832.68147,983.892062,1082.560965,2976.227191,859.700259,1009.096067,1132.528184,2960.052318,828.57998,973.124867,1089.010168,2916.364069,770.038691,905.305129,1005.896355,2860.947196,719.395199,848.025342,937.183495,2807.397129,663.778093,785.846888,854.582674,2753.778145,642.053986,770.133176,843.345662,2702.038082,585.587837,701.022074,789.547599,2579.081188,527.087445,644.862392,732.05157,...,0.037105,1.8e-05,0.216993,0.000104,0.185811,1.456432,0.05774,3e-05,0.217926,0.000113,0.166176,1.398587,0.048889,2.6e-05,0.19552,0.000103,0.133657,1.308555,0.030711,1.5e-05,0.170393,8.6e-05,0.12701,1.290976,0.028753,1.4e-05,0.161205,7.9e-05,0.121478,1.27655,0.026646,1.3e-05,0.154891,7.5e-05,0.172099,1.415747,0.051206,2.6e-05,0.203058,0.000102
1,165496,905.975359,1039.637854,1235.413579,2812.669405,911.567244,1048.390806,1243.869003,2813.245038,964.899954,1096.899355,1333.743223,2825.971516,913.784926,1055.049635,1261.679698,2791.926499,837.801812,983.792188,1149.673529,2738.940953,794.2823,935.618994,1086.499736,2696.088765,759.706384,874.582473,1010.99577,2625.259033,739.483238,852.6038,988.227412,2580.906582,689.696619,807.159446,938.252265,2471.907146,618.219657,755.727183,901.820496,...,-0.051022,-1.7e-05,0.155513,5.1e-05,-0.081791,0.848785,-0.10424,-3.5e-05,0.088796,3e-05,-0.091705,0.831997,-0.104996,-3.8e-05,0.075165,2.7e-05,-0.08239,0.847763,-0.098418,-3.7e-05,0.075707,2.8e-05,-0.074259,0.861749,-0.094995,-3.5e-05,0.081086,3e-05,-0.071766,0.866079,-0.093338,-3.4e-05,0.081483,3e-05,-0.073334,0.863353,-0.098871,-3.3e-05,0.090685,3.1e-05
2,165687,412.864954,447.065452,564.392709,1304.561723,406.484673,433.100249,563.251864,1431.030655,438.472245,469.209611,615.891466,1562.544325,469.24855,505.511185,667.897266,1684.402651,491.746479,527.178956,701.0058,1800.874896,505.481359,540.346313,720.611433,1889.555095,513.179785,548.026512,736.11599,1916.689312,500.160729,524.541839,707.391881,1778.516156,452.377796,466.012428,641.68517,1587.591549,423.183927,445.163215,613.545153,...,-0.028244,-2e-05,0.083988,6.1e-05,0.011998,1.024287,-0.022292,-1.7e-05,0.06506,5e-05,0.003924,1.007879,-0.019458,-1.6e-05,0.045925,3.8e-05,-0.003253,0.993515,-0.024513,-2e-05,0.045072,3.8e-05,0.003397,1.006816,-0.017312,-1.4e-05,0.040598,3.2e-05,0.014198,1.028805,-0.010003,-8e-06,0.042561,3.3e-05,0.031092,1.06418,-0.004815,-3e-06,0.059237,4e-05
3,166028,392.54795,503.867531,524.735397,2297.554644,393.910711,503.535314,520.974226,2339.596151,431.790209,549.593389,588.950209,2373.732803,481.589874,605.264603,689.478912,2424.023431,474.127448,597.233305,684.159331,2480.72477,492.054644,617.868368,713.5159,2503.632971,519.313975,643.7759,751.043431,2498.676067,548.556485,675.952552,806.908703,2463.211046,538.321255,681.393975,841.955314,2397.71523,558.180921,716.089958,901.12159,...,-0.129341,-4.1e-05,0.130695,4.2e-05,-0.088253,0.837808,-0.128704,-4e-05,0.130971,4.1e-05,-0.105277,0.809501,-0.137974,-4.4e-05,0.123727,4e-05,-0.110852,0.80042,-0.141345,-4.5e-05,0.122157,3.9e-05,-0.100652,0.817105,-0.132341,-4.2e-05,0.11894,3.8e-05,-0.085971,0.841669,-0.118524,-3.7e-05,0.112585,3.5e-05,-0.058761,0.889,-0.099259,-3.1e-05,0.114906,3.6e-05
4,166513,685.128797,804.372286,855.271141,2976.602474,687.465512,805.557068,857.403774,2974.189483,727.52094,836.756449,937.159171,2914.183792,713.673059,822.178534,926.78571,2851.488153,671.455243,778.87974,853.366594,2821.697742,614.90597,717.043675,779.801732,2763.256233,574.642932,669.758924,726.182307,2669.150387,536.882029,636.781998,690.867677,2654.18837,587.331581,693.999196,733.937148,2711.638788,567.482895,708.050665,762.116424,...,-0.03808,-1.3e-05,0.170065,5.8e-05,-0.025453,0.950357,-0.074714,-2.6e-05,0.116569,4e-05,-0.044491,0.914809,-0.080505,-2.9e-05,0.097583,3.6e-05,-0.047174,0.909902,-0.079279,-3e-05,0.090609,3.4e-05,-0.035102,0.932177,-0.073099,-2.7e-05,0.096934,3.6e-05,-0.029742,0.942235,-0.069621,-2.6e-05,0.098224,3.6e-05,0.00864,1.01743,-0.048769,-1.7e-05,0.117517,4.1e-05


### Compute more Features - Vegetation Indices
#### Sentinel-1 Indices
* ``` Radar Vegetation Index (RVI): (4*VH)/(VV+VH) ```
* ``` Radar Vegetation Index for Sentinel-1 (RVI4): (Sqrt(DOP))*((4*(VH))/(VV+VH)), where DOP = VV/(VV+VH) ```
* ``` Polarization Ratio (columns named PolarRation): VV/VH ```

In [None]:
def s1_feature_calc(df, times):
    """Add Sentinel-1 radar indices to ``df`` in place, one set per time step.

    For every ``t`` in ``times`` the frame must already contain the columns
    ``VV_time_{t}`` and ``VH_time_{t}``. Three columns are appended per step,
    in this order:

    * ``PolarRation_time_{t}`` = VV / VH
    * ``RVI_time_{t}``         = 4 * VH / (VV + VH)
    * ``RVI4_time_{t}``        = sqrt(DOP) * RVI, with DOP = VV / (VV + VH)

    Args:
        df: DataFrame holding the VV/VH backscatter columns; it is modified.
        times: iterable of time-step identifiers used in the column names.

    Returns:
        None. The new columns are written directly into ``df``.
    """
    for step in times:
        vv = df[f'VV_time_{step}']
        vh = df[f'VH_time_{step}']
        backscatter_sum = vv + vh

        df[f'PolarRation_time_{step}'] = vv / vh

        rvi = (4 * vh) / backscatter_sum
        df[f'RVI_time_{step}'] = list(rvi)

        # DOP as computed here is the VV share of the summed backscatter.
        dop = vv / backscatter_sum
        df[f'RVI4_time_{step}'] = list(np.sqrt(dop) * rvi)

In [None]:
# Ascending Sentinel-1 indices: add the radar features for time steps 1..120
# to both the train and the test frame (s1_feature_calc works in place).
s1_time_steps = list(range(1, 121))
for s1_frame in (asc_s1_train_df, asc_s1_test_df):
    s1_feature_calc(s1_frame, s1_time_steps)

asc_s1_train_df.head()

Unnamed: 0,field_id,VV_time_1,VH_time_1,VV_time_2,VH_time_2,VV_time_3,VH_time_3,VV_time_4,VH_time_4,VV_time_5,VH_time_5,VV_time_6,VH_time_6,VV_time_7,VH_time_7,VV_time_8,VH_time_8,VV_time_9,VH_time_9,VV_time_10,VH_time_10,VV_time_11,VH_time_11,VV_time_12,VH_time_12,VV_time_13,VH_time_13,VV_time_14,VH_time_14,VV_time_15,VH_time_15,VV_time_16,VH_time_16,VV_time_17,VH_time_17,VV_time_18,VH_time_18,VV_time_19,VH_time_19,VV_time_20,...,RVI4_time_107,PolarRation_time_108,RVI_time_108,RVI4_time_108,PolarRation_time_109,RVI_time_109,RVI4_time_109,PolarRation_time_110,RVI_time_110,RVI4_time_110,PolarRation_time_111,RVI_time_111,RVI4_time_111,PolarRation_time_112,RVI_time_112,RVI4_time_112,PolarRation_time_113,RVI_time_113,RVI4_time_113,PolarRation_time_114,RVI_time_114,RVI4_time_114,PolarRation_time_115,RVI_time_115,RVI4_time_115,PolarRation_time_116,RVI_time_116,RVI4_time_116,PolarRation_time_117,RVI_time_117,RVI4_time_117,PolarRation_time_118,RVI_time_118,RVI4_time_118,PolarRation_time_119,RVI_time_119,RVI4_time_119,PolarRation_time_120,RVI_time_120,RVI4_time_120
0,121094,0.144321,0.036296,0.181967,0.031217,0.059237,0.008695,0.177003,0.031214,0.054992,0.008112,0.125797,0.017551,0.124405,0.026362,0.12715,0.016713,0.11863,0.024262,0.173587,0.024368,0.12274,0.021871,0.160504,0.026443,0.051557,0.005679,0.065052,0.007036,0.08666,0.010929,0.087472,0.011115,0.071677,0.010417,0.147778,0.016806,0.039328,0.003933,0.048468,...,0.312848,11.876715,0.310638,0.298332,9.358376,0.386161,0.367048,12.811652,0.289611,0.278929,12.572266,0.294719,0.283654,11.473646,0.320676,0.307553,9.719448,0.373154,0.355322,13.004754,0.285617,0.275231,8.852885,0.405972,0.38482,12.127054,0.304714,0.292878,9.646911,0.375696,0.357617,14.218779,0.262833,0.254051,9.910344,0.366625,0.349419,12.591159,0.294309,0.283275
1,165496,0.184594,0.025344,0.238136,0.024025,0.069043,0.007745,0.233886,0.021156,0.065121,0.007313,0.149054,0.013867,0.156768,0.018294,0.155999,0.012245,0.154372,0.018402,0.216814,0.020122,0.155352,0.0169,0.191102,0.023964,0.054972,0.005175,0.076151,0.006924,0.093693,0.00897,0.128834,0.011988,0.078689,0.009199,0.154193,0.011886,0.043881,0.00411,0.055115,...,0.415178,7.524655,0.469227,0.440847,6.783175,0.513929,0.479779,8.533974,0.419552,0.39694,10.667311,0.342838,0.327817,6.568197,0.528527,0.492373,7.627107,0.463655,0.435955,7.151274,0.490721,0.459635,5.129788,0.652551,0.596955,6.718861,0.518211,0.483479,6.654909,0.52254,0.487215,7.105923,0.493466,0.462026,5.921699,0.577893,0.53452,7.578971,0.466256,0.43824
2,165687,0.168936,0.024238,0.164856,0.024545,0.062988,0.010448,0.159546,0.021471,0.054443,0.010933,0.120313,0.014618,0.112157,0.014539,0.10453,0.014609,0.115862,0.018517,0.121658,0.015449,0.138479,0.014726,0.133911,0.01428,0.051727,0.009419,0.047044,0.009429,0.108717,0.012469,0.086564,0.011639,0.120433,0.01302,0.128434,0.013402,0.041324,0.007968,0.042799,...,0.523932,7.424357,0.474814,0.445743,6.224553,0.553668,0.513922,8.32373,0.429013,0.405354,6.551415,0.529702,0.493384,5.643402,0.602101,0.554938,5.423836,0.622681,0.572165,8.911357,0.403577,0.382677,5.959283,0.574772,0.531876,8.55385,0.418679,0.396162,4.946114,0.672708,0.613539,8.125638,0.438326,0.413613,6.122332,0.561614,0.520697,8.751221,0.410205,0.388603
3,166028,0.185036,0.033611,0.239769,0.027374,0.07451,0.010086,0.226098,0.02773,0.061799,0.007029,0.178301,0.018057,0.195989,0.023489,0.162283,0.013406,0.176302,0.021086,0.241195,0.022323,0.172506,0.018544,0.225709,0.023237,0.053169,0.00535,0.070852,0.005261,0.139535,0.012071,0.123792,0.01036,0.084169,0.009414,0.20343,0.015552,0.044119,0.004514,0.053864,...,0.480777,6.327891,0.54586,0.507249,5.90014,0.579698,0.536049,8.469361,0.422415,0.399489,3.652484,0.859756,0.761775,6.154217,0.559111,0.518565,5.643925,0.602054,0.554898,7.600374,0.465096,0.437221,5.499576,0.615425,0.566105,5.756385,0.592033,0.546466,6.509126,0.532685,0.495949,7.246486,0.485055,0.454695,5.713592,0.595806,0.549645,7.60473,0.464861,0.437015
4,166513,0.171757,0.030449,0.23991,0.031049,0.077332,0.009734,0.224602,0.029467,0.073443,0.009549,0.149486,0.018127,0.157776,0.024009,0.162193,0.018953,0.151128,0.022189,0.226194,0.025733,0.152527,0.019404,0.20196,0.029159,0.061727,0.006106,0.07858,0.009056,0.095968,0.010902,0.129076,0.01485,0.08321,0.010844,0.151281,0.015984,0.049842,0.004945,0.057064,...,0.47198,6.003266,0.571162,0.528814,5.341733,0.630742,0.578881,6.778375,0.514246,0.480053,6.88507,0.507288,0.47403,5.285515,0.636384,0.583569,7.093943,0.494197,0.462662,6.870651,0.508217,0.474835,4.553257,0.720298,0.652228,5.655472,0.601009,0.554021,6.207687,0.554963,0.515028,7.463785,0.472602,0.443805,5.65626,0.600938,0.553961,6.298774,0.548037,0.509112


In [None]:
# Save the Sentinel-1 data frames together with the newly computed features.
for s1_frame, csv_path in (
    (asc_s1_train_df, f"{df_path}/asc_s1_train_df.csv"),
    (asc_s1_test_df, f"{df_path}/asc_s1_test_df.csv"),
):
    s1_frame.to_csv(csv_path, index=False)

## Modeling

### Read final dataframes

In [None]:
# Load the per-sensor feature tables (train/test pairs) saved under df_path.

## Ascending Sentinel-1
asc_s1_train_df, asc_s1_test_df = (
    pd.read_csv(f"{df_path}/asc_s1_train_df.csv"),
    pd.read_csv(f"{df_path}/asc_s1_test_df.csv"),
)

## Sentinel-2
s2_train_df, s2_test_df = (
    pd.read_csv(f"{df_path}/all_s2_train_df.csv"),
    pd.read_csv(f"{df_path}/all_s2_test_df.csv"),
)

## Planet (5-day composites)
planet_5days_train_df, planet_5days_test_df = (
    pd.read_csv(f"{df_path}/all_planet_5days_train_df.csv"),
    pd.read_csv(f"{df_path}/all_planet_5days_test_df.csv"),
)


In [None]:
## Merge the per-sensor tables into one train and one test frame

# Train: Sentinel-1 + Sentinel-2 + Planet, matched on field id and label.
# Inner joins keep only the fields present in all three sources.
train_keys = ['field_id','label']
train_df = (
    asc_s1_train_df
    .merge(s2_train_df, on=train_keys, how='inner')
    .merge(planet_5days_train_df, on=train_keys, how='inner')
)

# Test: same three sources, matched on field id only (no label column is used).
test_keys = ['field_id']
test_df = (
    asc_s1_test_df
    .merge(s2_test_df, on=test_keys, how='inner')
    .merge(planet_5days_test_df, on=test_keys, how='inner')
)


### Catboost Classifier

In [None]:
seed_setter(2021)
# Drawn right after seeding; not referenced again in this cell.
seeds = np.random.randint(low=1, high=3000, size=10)
seed = 2021  # used for both the fold splitter and CatBoost

# Features are every column except the identifier (and the target for train).
X = train_df.drop(columns=['field_id','label'])
y = train_df['label'].astype(int)
X_test = test_df.drop(columns=['field_id'])
test_fields = test_df['field_id']

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # 10-fold stratified CV
sklearnscores = []  # validation log-loss per fold
catboostpreds = []  # test-set class probabilities per fold

# Same hyper-parameters for every fold; a fresh estimator is built per fold.
catboost_params = dict(
    iterations=2000, has_time=True, bootstrap_type='No', random_strength=0,
    learning_rate=0.08, objective='MultiClass', use_best_model=True,
    reg_lambda=4, random_seed=seed, task_type='GPU', loss_function='MultiClass',
)

for fold, (train, val) in enumerate(skf.split(X, y), start=1):
    print('########### Fold number {} '.format(fold))
    x_train, y_train = X.iloc[train], y.iloc[train]
    x_val, y_val = X.iloc[val], y.iloc[val]

    # The validation fold doubles as the early-stopping / best-model eval set.
    estimator = CatBoostClassifier(**catboost_params)
    estimator.fit(x_train, y_train, eval_set=(x_val, y_val),
                  verbose=500, early_stopping_rounds=300)

    # Cross-entropy on the held-out fold
    score = log_loss(y_val, estimator.predict_proba(x_val))
    print('Logloss score: {}'.format(score))
    sklearnscores.append(score)

    # Class probabilities for the test set from this fold's model
    catboostpred = estimator.predict_proba(X_test)
    catboostpreds.append(catboostpred)

print('mean logloss scores from sklearn: {} '.format(np.mean(sklearnscores)))

########### Fold number 1 
0:	learn: 1.8958902	test: 1.9024323	best: 1.9024323 (0)	total: 147ms	remaining: 4m 54s
500:	learn: 0.0714490	test: 0.3483220	best: 0.3473282 (430)	total: 52.1s	remaining: 2m 36s
bestTest = 0.3418485987
bestIteration = 677
Shrink model to first 678 iterations.
Logloss score: 0.3418485284644767
########### Fold number 2 
0:	learn: 1.8834946	test: 1.8914425	best: 1.8914425 (0)	total: 139ms	remaining: 4m 37s
500:	learn: 0.0632017	test: 0.4290421	best: 0.4290421 (500)	total: 53.5s	remaining: 2m 40s
1000:	learn: 0.0237159	test: 0.4259552	best: 0.4238121 (929)	total: 1m 46s	remaining: 1m 45s
bestTest = 0.4238121453
bestIteration = 929
Shrink model to first 930 iterations.
Logloss score: 0.4238121270964064
########### Fold number 3 
0:	learn: 1.8978794	test: 1.9229545	best: 1.9229545 (0)	total: 150ms	remaining: 4m 59s
500:	learn: 0.0639448	test: 0.4006864	best: 0.4005332 (496)	total: 1m	remaining: 3m 1s
bestTest = 0.3995844423
bestIteration = 583
Shrink model to firs

### Feature Selection

In [None]:
# Start from the full feature matrix so that this cell can be re-run safely.
X = train_df.drop(columns=['field_id','label'])
y = train_df['label'].astype(int)

seed_setter(2021)

# NOTE(review): the original comment here stated that multi-class targets only
# support "mlogloss" as the metric, yet 'logloss' is what is passed. Left
# unchanged because switching it alters which features are kept -- confirm
# which metric is intended for this multi-class label.
br = BoostARoota(metric='logloss',max_rounds =1)

# Fit the selector on the training features
br.fit(X, y)

# Keep only the features the selector retained.
X = br.transform(X)
test_fields = test_df['field_id']
# Write the reduced test matrix to x_test and leave test_df untouched:
# overwriting test_df would drop 'field_id', so re-running this cell (or the
# CatBoost cell above) would then fail with a KeyError.
x_test = br.transform(test_df)

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  1  iteration:  5
Round:  1  iteration:  6
Round:  1  iteration:  7
Round:  1  iteration:  8
Round:  1  iteration:  9
Round:  1  iteration:  10
BoostARoota ran successfully! Algorithm went through  1  rounds.


### XGBClassifier

In [None]:
seed_setter(2021)
seed = 1140  # used for both the fold splitter and XGBoost

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # 10-fold stratified CV
sklearnscores = []  # validation log-loss per fold
xgbmpreds = []      # test-set class probabilities per fold

# Same hyper-parameters for every fold; a fresh model is built per fold.
xgb_params = dict(
    colsample_bytree=0.10, colsample_bylevel=0.10,
    learning_rate=0.03, n_estimators=2000, tree_method='gpu_hist', gpu_id=0,
    objective="multi:softmax", reg_alpha=2.0, reg_lambda=0.5,
    seed=seed, silent=True, subsample=0.7,
)

# X, y and x_test are the feature-selected matrices prepared earlier.
for fold, (train, val) in enumerate(skf.split(X, y), start=1):
    print('########### Fold number {} '.format(fold))
    x_train, y_train = X.iloc[train], y.iloc[train]
    x_val, y_val = X.iloc[val], y.iloc[val]

    # No eval_set is passed, so all n_estimators rounds are trained.
    model = XGBClassifier(**xgb_params)
    model.fit(x_train, y_train, verbose=100)

    # Cross-entropy on the held-out fold
    score = log_loss(y_val, model.predict_proba(x_val))
    print('Logloss score: {}'.format(score))
    sklearnscores.append(score)

    # Class probabilities for the test set from this fold's model
    xgbmpred = model.predict_proba(x_test)
    xgbmpreds.append(xgbmpred)

print('mean logloss scores from sklearn: {} '.format(np.mean(sklearnscores)))

########### Fold number 1 
Logloss score: 0.36705454058283354
########### Fold number 2 
Logloss score: 0.327290473442147
########### Fold number 3 
Logloss score: 0.21236259491816015
########### Fold number 4 
Logloss score: 0.3138876910663758
########### Fold number 5 
Logloss score: 0.41509952589459054
########### Fold number 6 
Logloss score: 0.376616487673874
########### Fold number 7 
Logloss score: 0.35240269486889936
########### Fold number 8 
Logloss score: 0.32182589658672217
########### Fold number 9 
Logloss score: 0.37293363745041785
########### Fold number 10 
Logloss score: 0.3942186530824868
mean logloss scores from sklearn: 0.34536921955665073 


### LGBMClassifier

In [None]:
seed_setter(2021)
seed = 257  # used for both the fold splitter and LightGBM

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # 10-fold stratified CV
sklearnscores = []  # validation log-loss per fold
lgbmpreds = []      # test-set class probabilities per fold

# Same hyper-parameters for every fold; a fresh model is built per fold.
lgbm_params = dict(
    boosting_type='gbdt', num_class=9, learning_rate=0.05, n_estimators=2000,
    max_depth=-1, num_leaves=30, subsample_for_bin=300000, deterministic=True,
    objective='multiclass', class_weight='balanced',
    subsample=0.65, subsample_freq=20, colsample_bytree=0.05, reg_alpha=0.5,
    random_state=seed, n_jobs=-1,
)

# X, y and x_test are the feature-selected matrices prepared earlier.
for fold, (train, val) in enumerate(skf.split(X, y), start=1):
    print('########### Fold number {} '.format(fold))
    x_train, y_train = X.iloc[train], y.iloc[train]
    x_val, y_val = X.iloc[val], y.iloc[val]

    # The validation fold doubles as the early-stopping eval set.
    model = LGBMClassifier(**lgbm_params)
    model.fit(x_train, y_train, eval_set=(x_val, y_val),
              verbose=100, early_stopping_rounds=200)

    # Cross-entropy on the held-out fold
    score = log_loss(y_val, model.predict_proba(x_val))
    print('Logloss score: {}'.format(score))
    sklearnscores.append(score)

    # Class probabilities for the test set from this fold's model
    lgbmpred = model.predict_proba(x_test)
    lgbmpreds.append(lgbmpred)

print('mean log scores from sklearn: {} '.format(np.mean(sklearnscores)))

########### Fold number 1 
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.36722
[200]	valid_0's multi_logloss: 0.29713
[300]	valid_0's multi_logloss: 0.286935
[400]	valid_0's multi_logloss: 0.290535
[500]	valid_0's multi_logloss: 0.29014
Early stopping, best iteration is:
[317]	valid_0's multi_logloss: 0.285086
Logloss score: 0.28508577767268695
########### Fold number 2 
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.369896
[200]	valid_0's multi_logloss: 0.295436
[300]	valid_0's multi_logloss: 0.286772
[400]	valid_0's multi_logloss: 0.283605
[500]	valid_0's multi_logloss: 0.284923
Early stopping, best iteration is:
[374]	valid_0's multi_logloss: 0.282703
Logloss score: 0.2827030056110776
########### Fold number 3 
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.380609
[200]	valid_0's multi_logloss: 0.313917
[300]	valid_0's multi_logloss: 0.3

In [None]:
# Ensemble the per-fold predictions of each boosting model using the geometric mean
cbpreds_mean = gmean(catboostpreds, axis=0)
lgbm_mean = gmean(lgbmpreds, axis=0)
xgbm_mean = gmean(xgbmpreds, axis=0)

# Blend the three models with a weighted sum of their predictions
predictions = cbpreds_mean*0.1 + lgbm_mean*0.80 + xgbm_mean*.10

# A geometric mean of probability vectors is not itself normalised, so the
# blended rows no longer sum to 1. Rescale every row so that the submitted
# class probabilities are proper distributions; the per-row argmax (and hence
# the predicted crop) is unaffected by this rescaling.
predictions = predictions / predictions.sum(axis=1, keepdims=True)

predictions = pd.DataFrame(predictions)
# test_fields is attached positionally (as a plain list), not by index.
predictions['fid'] = list(test_fields)

In [None]:
# Give the DataFrame the column names and order of the sample submission file.

# Crop names listed in crop-id order (id 1 is the first entry).
crop_labels = ['Wheat', 'Rye', 'Barley', 'Oats', 'Corn',
               'Oil Seeds', 'Root Crops', 'Meadows', 'Forage Crops']

# Step 1: shift the 0-based probability columns to the 1-based crop ids and
# record, per row, the id with the highest probability.
predictions = predictions.rename(
    columns={position: position + 1 for position in range(len(crop_labels))})
crop_id = list(predictions.drop(columns=['fid']).idxmax(axis = 1))
predictions['crop_id'] = crop_id

# Step 2: replace the numeric ids with the crop names and record, per row,
# the name with the highest probability.
predictions = predictions.rename(columns=dict(enumerate(crop_labels, start=1)))
crop_name = list(predictions.drop(columns=['fid','crop_id']).idxmax(axis = 1))
predictions['crop_name'] = crop_name

In [None]:
# Preview the first rows of the formatted predictions table.
predictions.head()

Unnamed: 0,Wheat,Rye,Barley,Oats,Corn,Oil Seeds,Root Crops,Meadows,Forage Crops,fid,crop_id,crop_name
0,0.044455,0.033085,0.026255,0.019511,0.059778,0.057619,0.044788,0.463565,0.230037,637,8,Meadows
1,0.100711,0.080466,0.039751,0.063711,0.068556,0.139976,0.073669,0.172293,0.222318,739,9,Forage Crops
2,0.107365,0.117492,0.07644,0.070832,0.048889,0.129258,0.054812,0.159603,0.203272,764,9,Forage Crops
3,0.050277,0.028992,0.040235,0.04993,0.353522,0.12849,0.180552,0.047108,0.084962,1124,5,Corn
4,0.116283,0.061126,0.031747,0.038198,0.036374,0.082605,0.035481,0.26087,0.308272,1152,9,Forage Crops


In [None]:
# Columns holding the per-crop probabilities, in submission order.
probability_columns = ['Wheat','Rye', 'Barley','Oats','Corn','Oil Seeds','Root Crops','Meadows','Forage Crops']

# One record per field: id, predicted crop (id and name) and the full
# probability vector.
output_list = [
    {
        'fid': int(row['fid']),
        'crop_id': int(row['crop_id']),
        'crop_name': str(row['crop_name']),
        'crop_probs': list(row[probability_columns]),
    }
    for _, row in predictions.iterrows()
]

In [None]:
# Write the list of prediction records to disk as JSON.
output_name = './germany_submssion'
output_frame = pd.DataFrame(output_list)
output_frame.to_json(output_name)
print('Submission was saved to location: {}'.format(output_name))

Submission was saved to location: ./germany_submssion
