In [313]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
import featuretools as ft
import warnings

warnings.filterwarnings("ignore")

#### Loading data

In [314]:
df = pd.read_excel('./Data/2410_iPAGE_SoilData_original.xlsx')
# converting all the 'non numerical' values in the numerical columns to NaN
for i in range(4,len(df.columns)):
    df[df.columns[i]] = pd.to_numeric(df[df.columns[i]],errors='coerce')

# Now we drop the rows with NaN, but we also have text columns with NaN values, which we are avoiding here
df = df.dropna(subset=df.columns[4:])
df = df[(df['Nitrogen N (%)']<2)&(df['Potassium K (meq/100)']<10)&(df['SOC (%)']<5)&(df['Boron B (ug/g)']<5)]

## Train test split

#X = df[['Area', 'soil group', 'Land class', 'knit (surface)', 'pH',
#       'Nitrogen N (%)', 'Potassium K (meq/100)', 'Phosphorus P (ug/g)',
#       'Sulfur S (ug/g)']]
X = df[[ 'Area','knit (surface)','pH',
       'Nitrogen N (%)', 'Potassium K (meq/100)', 'Phosphorus P (ug/g)',
       'Sulfur S (ug/g)']]
y = df[['SOC (%)']]
print('shape of feature matrix before transformation:',X.shape)

shape of feature matrix before transformation: (573, 7)


In [315]:
print(X.head())

        Area knit (surface)   pH  Nitrogen N (%)  Potassium K (meq/100)  \
0  Mithpukur     Clay loam   5.0            0.08                   0.15   
1  Mithpukur     Clay loam   4.9            0.09                   0.25   
2  Mithpukur     Clay loam   4.6            0.05                   0.09   
3  Mithpukur     Clay loam   5.2            0.06                   0.30   
4  Mithpukur     Clay loam   5.3            0.11                   0.17   

   Phosphorus P (ug/g)  Sulfur S (ug/g)  
0                 19.6             37.7  
1                  4.1             32.0  
2                 13.3             13.5  
3                 20.2             30.0  
4                 20.5             27.8  


#### Train test split

In [316]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
#X_train =X_train.astype('float32')

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(458, 7) (458, 1)
(115, 7) (115, 1)


#### Encoding categorical variables


In [317]:
# label encoding for 
categorical_cols = [col for col in X.columns if df[col].dtype==object]
for col in categorical_cols:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

In [318]:
y_train.head()

Unnamed: 0,SOC (%)
186,2.73
63,1.8
162,1.69
60,1.88
15,1.51


#### Feature engineering using the featuretools library

In [319]:
#Adding columns derived from these numerical columns, ie after transformation using featuretools
# need to have an id column
X_train['id'] = [i for i in range(X_train.shape[0])]
X_test['id'] = [i for i in range(X_test.shape[0])]
y_train.reset_index(inplace=True,drop=True)
y_test.reset_index(inplace=True,drop=True)


dataframes_train = {
    "soil_numerical_train": (X_train, "id"),
}
dataframes_test = {
    "soil_numerical_test": (X_test, "id"),
}
feature_primitives = ft.list_primitives()
#transform_numeric =feature_primitives[(feature_primitives['type']=='transform')&(feature_primitives['valid_inputs']=="<ColumnSchema (Semantic Tags = ['numeric'])>")]
trans_primitives = ['divide_by_feature','square_root','divide_numeric','natural_logarithm','multiply_numeric']
feature_matrix_train,feature_dfs_train =ft.dfs(dataframes_train,
                                   target_dataframe_name='soil_numerical_train', 
                                    trans_primitives=trans_primitives)
print('shape of feature matrix of train with transformations:',feature_matrix_train.shape)
feature_matrix_test,feature_dfs_test =ft.dfs(dataframes_test,
                                   target_dataframe_name='soil_numerical_test', 
                                    trans_primitives=trans_primitives)

print('shape of feature matrix of test with transformations:',feature_matrix_test.shape)

shape of feature matrix of train with transformations: (458, 91)
shape of feature matrix of test with transformations: (115, 91)


In [320]:
y_train.tail()

Unnamed: 0,SOC (%)
453,1.28
454,1.53
455,1.46
456,2.75
457,1.68


In [321]:
#rows where there are NaNs or Infs present

# for training dataset
indices_to_remove=feature_matrix_train[feature_matrix_train.isin([np.nan, np.inf, -np.inf]).any(axis=1)].index
feature_matrix_train=feature_matrix_train.drop(axis=1,index=indices_to_remove)
y_train=y_train.drop(axis=1,index=indices_to_remove)
#y_train.reset_index(inplace=True,drop=True)
print(feature_matrix_train.shape,y_train.shape)

# for testing dataset
indices_to_remove=feature_matrix_test[feature_matrix_test.isin([np.nan, np.inf, -np.inf]).any(axis=1)].index
feature_matrix_test=feature_matrix_test.drop(axis=1,index=indices_to_remove)
y_test=y_test.drop(axis=1,index=indices_to_remove)
#y_test.reset_index(inplace=True,drop=True)
print(feature_matrix_test.shape,y_test.shape)

(370, 91) (370, 1)
(96, 91) (96, 1)


### Feature selection

In [322]:
# Using SelectKBest feature selection method

num_features_selected = 10

print(y_train.columns)
best_features_ = SelectKBest(score_func=f_regression,k=num_features_selected)
select = best_features_.fit(feature_matrix_train,y_train)
features_train_selected = select.transform(feature_matrix_train)
best_features=best_features_.get_feature_names_out()
print(best_features)

Index(['SOC (%)'], dtype='object')
['1 / Nitrogen N (%)' 'Area / Nitrogen N (%)' 'Nitrogen N (%) / Area'
 'Nitrogen N (%) / knit (surface)' 'Phosphorus P (ug/g) / Nitrogen N (%)'
 'Potassium K (meq/100) / Nitrogen N (%)'
 'knit (surface) / Nitrogen N (%)' 'pH / Nitrogen N (%)'
 'NATURAL_LOGARITHM(Nitrogen N (%))' 'SQUARE_ROOT(Nitrogen N (%))']


### Model fitting

In [323]:
# Random forest regressor on SOC with only numerical columns
ml_models = [RandomForestRegressor(),LinearRegression(),Ridge()]
print(y_train.columns)
for model in ml_models:
    print(model)
    model_fit = model.fit(X=feature_matrix_train[best_features],y=y_train)
    y_predict = model_fit.predict(feature_matrix_test[best_features])
    r2 = r2_score(y_test,y_predict)
    mape = mean_absolute_percentage_error(y_pred=y_predict,y_true=y_test)
    mae = mean_absolute_error(y_test,y_predict)
    print('r2 : ',r2)
    print('MAPE: ',mape)
    print('MAE : ',mae)

Index(['SOC (%)'], dtype='object')
RandomForestRegressor()
r2 :  0.8043311969927996
MAPE:  0.11135190208236319
MAE :  0.19057385416666658
LinearRegression()
r2 :  0.7579620643329216
MAPE:  0.12541076408905447
MAE :  0.21719716157027394
Ridge()
r2 :  0.6555133255988796
MAPE:  0.14725354519285117
MAE :  0.26244281823313037


#### To do
1. vary the no.of features selected for each target and plot a performance chart
2. Hyperparameter optimisation
3. use sklearn pipelines