## Feature Extraction

In [None]:
import math
import pandas as pd

import warnings

In [None]:
# Load the preprocessed dataset that every later cell works on.
df = pd.read_csv(
    filepath_or_buffer="../../shared_pemetaan/2023/Mixed Method/[2024] Preprocessed Dataset/[03] df_all.csv"
)


## Feature Extraction for Data Sample

In [None]:
## Extract 17 features
## Features are constructed to represent the phenological growth phases of paddy.
## Each feature is derived from either one growth cycle or one full year, as described in the following table.


def _count_wave_cycles(values):
    """Count full up/down cycles in every row of a 2-D float array.

    An interior point is a turning point when it is strictly greater than
    both neighbours (peak) or strictly smaller than both (valley). The
    number of cycles is the number of turning points floor-divided by two.
    Comparisons involving NaN are False, so NaN never creates a turning point.

    Parameters
    ----------
    values : numpy.ndarray of shape (n_rows, n_time_points)

    Returns
    -------
    numpy.ndarray of shape (n_rows,) with integer cycle counts.
    """
    previous, current, following = values[:, :-2], values[:, 1:-1], values[:, 2:]
    is_peak = (previous < current) & (current > following)
    is_valley = (previous > current) & (current < following)
    return (is_peak | is_valley).sum(axis=1) // 2


def extract_features(data):
    """Add the 17 features F1..F17 to ``data`` and return it.

    The frame must contain the columns ``VH_0`` .. ``VH_31``. Columns are
    selected by name, so their order inside the frame does not matter.

    NOTE: ``data`` is modified in place; the returned object is the same
    frame that was passed in.

    Growth-cycle features (computed on ``VH_0`` .. ``VH_9``):
        F1  value at t0 (``VH_0``)
        F2  minimum over the window
        F3  maximum over the window
        F4  time index (0-9) of the first minimum
        F5  time index (0-9) of the first maximum
    Annual features (computed on ``VH_0`` .. ``VH_31``):
        F6  minimum, F7 maximum, F8 range (F7 - F6)
        F9  number of full up/down cycles (turning points // 2)
    Slope features (each slope is followed by ``atan`` of that slope):
        F10/F11  VH_0 - VH_1
        F12/F13  (VH_0 - F2) / F4, i.e. towards the window minimum
        F14/F15  (VH_0 - F3) / F5, i.e. towards the window maximum
        F16/F17  VH_0 - VH_10

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with columns ``F1`` .. ``F17`` added.

    Raises
    ------
    KeyError
        If any of ``VH_0`` .. ``VH_31`` is missing.
    """
    cycle_cols = [f'VH_{i}' for i in range(10)]
    year_cols = [f'VH_{i}' for i in range(32)]
    cycle_position = {name: position for position, name in enumerate(cycle_cols)}

    # Explicit column lists instead of label slices: a slice such as
    # 'VH_0':'VH_9' silently depends on the physical column order.
    cycle = data[cycle_cols]
    year = data[year_cols]

    ## Growth-phase features

    # backscatter at t0
    data['F1'] = data['VH_0']
    # min backscatter in the last 10 time points (one growth cycle)
    data['F2'] = cycle.min(axis=1)
    # max backscatter in the last 10 time points (one growth cycle)
    data['F3'] = cycle.max(axis=1)
    # time range from t0 to the min backscatter of the growth cycle
    data['F4'] = cycle.idxmin(axis=1).map(cycle_position).astype(int)
    # time range from t0 to the max backscatter of the growth cycle
    data['F5'] = cycle.idxmax(axis=1).map(cycle_position).astype(int)

    ## Annual features

    # min backscatter in the last year
    data['F6'] = year.min(axis=1)
    # max backscatter in the last year
    data['F7'] = year.max(axis=1)
    # range (max - min) over the last year
    data['F8'] = data['F7'] - data['F6']
    # total harvesting events in one year, approximated by the number of
    # full peak/valley cycles; computed on a plain array so no positional
    # indexing of a label-indexed Series is involved
    data['F9'] = _count_wave_cycles(year.to_numpy(dtype=float))

    ## Slope

    # Small offset added to numerators and denominators, as in the original
    # formulas. NOTE: when the extreme sits at t0 (F4 == 0 or F5 == 0) the
    # ratio becomes eps / eps == 1.0, not 0.
    eps = 1e-10

    # slope between t_0 and t_1
    data['F10'] = data['VH_0'] - data['VH_1'] + eps
    # degree of slope between t_0 and t_1
    data['F11'] = data['F10'].map(math.atan)
    # slope between t_0 and the lowest value of the growth cycle
    data['F12'] = (data['VH_0'] - data['F2'] + eps) / (data['F4'] + eps)
    # degree of slope between t_0 and the lowest value
    data['F13'] = data['F12'].map(math.atan)
    # slope between t_0 and the highest value of the growth cycle
    data['F14'] = (data['VH_0'] - data['F3'] + eps) / (data['F5'] + eps)
    # degree of slope between t_0 and the highest value
    data['F15'] = data['F14'].map(math.atan)
    # slope between t_0 and t_10
    data['F16'] = data['VH_0'] - data['VH_10'] + eps
    # degree of slope between t_0 and t_10
    data['F17'] = data['F16'].map(math.atan)

    return data

In [None]:
# extract_features writes F1..F17 into the frame it receives and returns that
# same frame, so `df_extracted` and `df` refer to one object after this call.
df_extracted = extract_features(data=df)

In [None]:
# Keep only the identifier columns, the 32 VH time steps and the 17 features,
# in that order, then sort the rows by segment and sub-segment.
id_columns = ['idsegment', 'id_subsegment', 'nth', 'periode', 'observation']
vh_columns = [f'VH_{step}' for step in range(32)]
feature_columns = [f'F{number}' for number in range(1, 18)]

df_extracted = df_extracted[id_columns + vh_columns + feature_columns]
df_extracted = df_extracted.sort_values(by=['idsegment', 'id_subsegment'])
df_extracted

In [None]:
# One boxplot per feature (F1..F17), grouped by the observation label.
variabel_list = [f'F{i}' for i in range(1, 18)]

# Size the grid from the number of features: a fixed 3x3 grid only has 9
# slots, so plt.subplot(3, 3, 10) would raise for the 10th feature.
n_cols = 3
n_rows = math.ceil(len(variabel_list) / n_cols)

# Same per-row height as the original 10x8 figure with 3 rows.
plt.figure(figsize=(10, 8 / 3 * n_rows))
for position, var in enumerate(variabel_list, start=1):
    plt.subplot(n_rows, n_cols, position)
    # df_extracted keeps the label under 'observation'; there is no 'obs'
    # column after the column selection above.
    sns.boxplot(x='observation', y=var, data=df_extracted)
plt.tight_layout()
plt.show()

In [None]:
# df_extracted.to_csv('sampel_to_explore.csv')

## Calculate DTW Features

In [None]:
## Reference: https://chatgpt.com/share/68481f45-bb91-4228-b45a-179a52d9954b
## Get the median of every observation time step for each label over one year

def prepare_median(X, id_vars, value_vars, group_var):
    """Return the per-group median of each time-step column in long format.

    Parameters
    ----------
    X : pandas.DataFrame
        Wide frame holding the identifier columns and the time-step columns.
        It is not modified.
    id_vars : list of str
        Identifier columns kept while melting; must include ``group_var``.
    value_vars : list of str
        Time-step columns. The integer between the first and second
        underscore of each name (e.g. ``t_31_10m`` -> 31) becomes
        ``variable_dt``.
    group_var : str
        Column whose values (cast to ``str``) define the groups.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable_dt`` (time-step number), ``variable``
        (``median_<group>``) and ``value`` (the median). Within each group
        the rows are ordered by ``variable_dt`` descending.
    """
    long_form = pd.melt(X, id_vars=id_vars, value_vars=value_vars)
    long_form[group_var] = long_form[group_var].astype(str)

    # The string alias 'median' gives the same result and the same 'median'
    # column level as passing np.median, without pandas' deprecated
    # callable-to-alias translation.
    grouped = pd.pivot_table(
        data=long_form,
        index='variable',
        values='value',
        columns=group_var,
        aggfunc=['median'],
    ).reset_index()

    # Flatten ('median', '<group>') -> 'median_<group>' and ('variable', '')
    # -> 'variable'.
    grouped.columns = grouped.columns.map('_'.join).str.strip('_')
    grouped['variable_dt'] = grouped['variable'].map(
        lambda name: int(name.split('_')[1])
    )
    grouped = grouped.sort_values('variable_dt', ascending=False)

    median_columns = [col for col in grouped.columns if col.startswith('median')]
    return pd.melt(
        grouped[['variable_dt'] + median_columns],
        id_vars='variable_dt',
        value_vars=median_columns,
    )

In [None]:
## Calculate the DTW distance between each observation and each class median, up to up_limit
## Adds one DTW-distance column per paddy growth-phase class

def calculate_dtw(X, median, up_limit, suffix, s_var):
    """Add one DTW-distance column per class median to ``X`` and return it.

    For every class ``i`` the distance between each row's series
    ``t_0_<suffix>`` .. ``t_<up_limit-1>_<suffix>`` and the class median over
    the same time steps is stored in ``dtw_<i>_<s_var>``.

    NOTE: ``X`` is modified in place; the returned object is the same frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``t_<k>_<suffix>`` for k in 0..up_limit-1.
    median : pandas.DataFrame
        Long-format medians with columns ``variable_dt``, ``variable``
        (``median_<i>``) and ``value``.
    up_limit : int
        Number of time steps (0 .. up_limit-1) to compare.
    suffix : str
        Suffix of the time-step column names in ``X``.
    s_var : str
        Tag appended to the names of the new columns.

    Raises
    ------
    ValueError
        If ``median`` has no rows for an expected class ``median_<i>``;
        class labels are assumed to be 0 .. n_classes-1.
    """
    # Strict '<' so the median covers exactly the time steps selected from X
    # below; '<=' made the median one point longer than the observation
    # series whenever up_limit was below the full length.
    window = (
        median.query('variable_dt < @up_limit')
        .sort_values('variable_dt', ascending=True)
    )
    series_columns = [f't_{step}_{suffix}' for step in range(up_limit)]

    # Plain float rows: the distance routine then indexes arrays by position
    # instead of label-indexed Series (dtw is presumably dtaidistance's
    # module - confirm).
    observations = np.ascontiguousarray(X[series_columns].to_numpy(dtype=float))

    n_classes = len(window['variable'].unique())
    for i in tqdm(range(n_classes)):
        class_name = 'median_' + str(i)
        reference = window.loc[window['variable'] == class_name, 'value'].to_numpy(dtype=float)
        if reference.size == 0:
            raise ValueError(
                f"no median values found for class '{class_name}'"
            )
        X[f'dtw_{i}_{s_var}'] = [dtw.distance(row, reference) for row in observations]
    return X

In [None]:
# Stratified 60/20/20 split: 40% is held out first, then halved into the
# test and validation sets.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.4, stratify=Y, random_state=42
)
X_test, X_val, Y_test, Y_val = train_test_split(
    X_temp, Y_temp, test_size=0.5, stratify=Y_temp, random_state=42
)

# Training copy carrying the label ('target') and the row index ('keys').
X_copy = X_train.copy().assign(target=Y_train, keys=lambda frame: frame.index)

# Class medians of the 32 '10m' time steps, listed from t_31 down to t_0.
time_step_columns = [f't_{step}_10m' for step in range(31, -1, -1)]
median_10m = prepare_median(
    X_copy,
    id_vars=['keys', 'target'],
    value_vars=time_step_columns,
    group_var='target',
)

In [None]:
# DTW distances to the class medians over three window lengths; each pair is
# (up_limit, tag used in the new column names).
for window_length, column_tag in ((32, '31'), (12, '11'), (8, '7')):
    X_train = calculate_dtw(X_train, median_10m, window_length, '10m', column_tag)