In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from sklearn.pipeline import Pipeline
import featuretools as ft
import warnings

# NOTE(review): with no category or module given, this suppresses every warning for
# the rest of the session, including any that pandas or scikit-learn would raise in
# later cells. Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

#### Loading data

In [3]:
# Read the merged CSV (path is relative to the notebook's working directory)
# and preview its first rows.
DATA_PATH = './Data/merged_v2.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,longitude,latitude,Area,Soil group,Land class,Soil type,pH,SOC,Nitrogen,Potassium,Phosphorus,Sulfur,Boron,Zinc,Sand,Silt,Clay
0,89.2767,25.5678,Mithpukur,belab,high ground,Clay loam,5.0,1.27,0.08,0.15,19.6,37.7,0.26,0.86,33.0,33.0,33.0
1,89.2767,25.5678,Mithpukur,belab,high ground,Clay loam,4.9,1.47,0.09,0.25,4.1,32.0,0.25,0.75,33.0,33.0,33.0
2,89.2767,25.5678,Mithpukur,belab,high ground,Clay loam,4.6,1.07,0.05,0.09,13.3,13.5,0.27,0.95,33.0,33.0,33.0
3,89.2767,25.5678,Mithpukur,belab,high ground,Clay loam,5.2,1.51,0.06,0.3,20.2,30.0,0.28,1.0,33.0,33.0,33.0
4,89.2767,25.5678,Mithpukur,belab,high ground,Clay loam,5.3,1.08,0.11,0.17,20.5,27.8,0.3,1.04,33.0,33.0,33.0


In [4]:
# Print the frame's dimensions, then its column labels.
for summary in (df.shape, df.columns):
    print(summary)

(2584, 17)
Index(['longitude', 'latitude', 'Area', 'Soil group', 'Land class',
       'Soil type', 'pH', 'SOC', 'Nitrogen', 'Potassium', 'Phosphorus',
       'Sulfur', 'Boron', 'Zinc', 'Sand', 'Silt', 'Clay'],
      dtype='object')


#### Preprocessing

In [5]:
# Numeric columns used as model inputs. 'Boron' is left out because it is the
# prediction target selected in the next section.
numerical_features = [
    'pH',
    'Nitrogen',
    'Potassium',
    'Phosphorus',
    'Sulfur',
    'Zinc',
    'Sand',
    'Silt',
    'Clay',
]

#### Train test split

In [6]:

# Target: Boron, selected with a list so it stays a one-column DataFrame.
target_columns = ['Boron']
y = df[target_columns]

# Inputs: the numeric feature columns listed in `numerical_features`.
X = df[numerical_features]
print('shape of feature matrix before transformation:',X.shape)

shape of feature matrix before transformation: (2584, 9)


In [7]:
# Preview the first five rows of the input features.
X.head(n=5)

Unnamed: 0,pH,Nitrogen,Potassium,Phosphorus,Sulfur,Zinc,Sand,Silt,Clay
0,5.0,0.08,0.15,19.6,37.7,0.86,33.0,33.0,33.0
1,4.9,0.09,0.25,4.1,32.0,0.75,33.0,33.0,33.0
2,4.6,0.05,0.09,13.3,13.5,0.95,33.0,33.0,33.0
3,5.2,0.06,0.3,20.2,30.0,1.0,33.0,33.0,33.0
4,5.3,0.11,0.17,20.5,27.8,1.04,33.0,33.0,33.0


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show (features shape, target shape) for the train split, then the test split.
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape,target_part.shape)

(2067, 9) (2067, 1)
(517, 9) (517, 1)


#### Encoding categorical variables


#### Feature engineering using the featuretools library

In [9]:
# Feature engineering with featuretools: derive extra columns (ratios, products,
# square roots, natural logs) from the numeric input features.

# featuretools needs a unique key column on every dataframe. `assign` returns a new
# frame, so the split outputs are rebound instead of being written to in place
# (writing a column into a frame derived from `X` is the pattern pandas reports as
# SettingWithCopyWarning). Re-running this cell simply overwrites `id` again.
X_train = X_train.assign(id=np.arange(len(X_train)))
X_test = X_test.assign(id=np.arange(len(X_test)))

# Relabel the targets 0..n-1 so they carry the same labels as the `id` key; the
# NaN/Inf clean-up later drops rows from features and targets by those labels.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

dataframes_train = {
    "soil_numerical_train": (X_train, "id"),
}
dataframes_test = {
    "soil_numerical_test": (X_test, "id"),
}

# Catalogue of the primitives featuretools offers. Not used by the code below; kept
# available for inspecting which transforms could be added to `trans_primitives`.
feature_primitives = ft.list_primitives()
trans_primitives = ['divide_by_feature','square_root','divide_numeric','natural_logarithm','multiply_numeric']


def build_feature_matrix(dataframes, target_dataframe_name):
    """Run featuretools DFS on one split with the shared `trans_primitives`.

    Parameters
    ----------
    dataframes : dict
        Mapping of dataframe name to a ``(DataFrame, index_column)`` tuple, in the
        form passed to ``ft.dfs``.
    target_dataframe_name : str
        Key of `dataframes` to build features for.

    Returns
    -------
    tuple
        ``(feature_matrix, feature_definitions)`` exactly as returned by ``ft.dfs``.
    """
    return ft.dfs(dataframes,
                  target_dataframe_name=target_dataframe_name,
                  trans_primitives=trans_primitives)


feature_matrix_train,feature_dfs_train = build_feature_matrix(dataframes_train, 'soil_numerical_train')
print('shape of feature matrix of train with transformations:',feature_matrix_train.shape)

feature_matrix_test,feature_dfs_test = build_feature_matrix(dataframes_test, 'soil_numerical_test')
print('shape of feature matrix of test with transformations:',feature_matrix_test.shape)

# Each feature matrix must keep one row per target row, otherwise features and
# targets can no longer be filtered together.
assert len(feature_matrix_train) == len(y_train)
assert len(feature_matrix_test) == len(y_test)

shape of feature matrix of train with transformations: (2067, 144)
shape of feature matrix of test with transformations: (517, 144)


In [10]:
# Show the last five target rows (the index was reset in the previous cell).
y_train.tail(n=5)

Unnamed: 0,Boron
2062,0.034797
2063,0.010466
2064,0.011669
2065,0.018369
2066,0.146921


In [11]:
# Remove rows whose engineered features contain NaN or +/-Inf (e.g. from dividing by
# zero or taking the log of zero), and remove the same rows from the targets.

def drop_non_finite_rows(features, target):
    """Drop rows containing NaN/Inf from `features` and the matching rows of `target`.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix to clean.
    target : pandas.DataFrame
        Targets carrying the same index labels as `features`.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(features, target)`` with the offending rows removed from both.

    Raises
    ------
    KeyError
        If a label to drop is missing from `target`, i.e. the two frames are not
        aligned on the same index labels.
    """
    has_bad_value = features.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    bad_labels = features.index[has_bad_value]
    # `index=` alone selects rows; the earlier `axis=1` argument contradicted it and
    # was only harmless because pandas ignores `axis` when `index` is supplied.
    return features.drop(index=bad_labels), target.drop(index=bad_labels)


# for training dataset
feature_matrix_train, y_train = drop_non_finite_rows(feature_matrix_train, y_train)
print(feature_matrix_train.shape,y_train.shape)

# for testing dataset
feature_matrix_test, y_test = drop_non_finite_rows(feature_matrix_test, y_test)
print(feature_matrix_test.shape,y_test.shape)

# Features and targets must still describe the same rows.
assert len(feature_matrix_train) == len(y_train)
assert len(feature_matrix_test) == len(y_test)

(2066, 144) (2066, 1)
(517, 144) (517, 1)


### Feature selection

In [12]:
# Univariate feature selection: keep the k engineered features with the highest
# f_regression score against the target.

num_features_selected = 10

print(y_train.columns)

# scikit-learn expects a 1-D target here; passing the (n, 1) DataFrame makes it
# flatten the column itself and emit a DataConversionWarning, so flatten explicitly.
y_train_1d = y_train.to_numpy().ravel()

selector = SelectKBest(score_func=f_regression,k=num_features_selected)
selector.fit(feature_matrix_train,y_train_1d)
features_train_selected = selector.transform(feature_matrix_train)

# Names of the selected columns; the model-fitting cell uses them to index both
# the train and the test feature matrices.
best_features = selector.get_feature_names_out()
print(best_features)

# The test matrix is built by a separate DFS run, so confirm it has every selected column.
assert set(best_features) <= set(feature_matrix_test.columns)

Index(['Boron'], dtype='object')
['Sand' 'Silt' '1 / Sand' 'Silt / Sand' 'Silt / pH' 'Clay * Silt'
 'Nitrogen * Silt' 'NATURAL_LOGARITHM(Sand)' 'SQUARE_ROOT(Sand)'
 'SQUARE_ROOT(Silt)']


### Model fitting

In [13]:
# Fit each regressor on the selected features to predict Boron, then score it on
# the held-out test split.
# The random forest is seeded so its metrics are reproducible across runs, matching
# the seeded train/test split.
ml_models = [RandomForestRegressor(random_state=42),LinearRegression(),Ridge()]
print(y_train.columns)

# Loop-invariant inputs: selected columns of each split and a 1-D training target
# (a column-vector target triggers scikit-learn's DataConversionWarning).
X_fit = feature_matrix_train[best_features]
X_eval = feature_matrix_test[best_features]
y_fit = y_train.to_numpy().ravel()

for model in ml_models:
    print(model)
    model_fit = model.fit(X=X_fit,y=y_fit)
    y_predict = model_fit.predict(X_eval)
    r2 = r2_score(y_test,y_predict)
    # NOTE: MAPE divides by the true value, so targets close to zero inflate it.
    mape = mean_absolute_percentage_error(y_pred=y_predict,y_true=y_test)
    mae = mean_absolute_error(y_test,y_predict)
    print('r2 : ',r2)
    print('MAPE: ',mape)
    print('MAE : ',mae)

Index(['Boron'], dtype='object')
RandomForestRegressor()
r2 :  0.5842428632897173
MAPE:  0.8427987321997565
MAE :  0.052317596375750106
LinearRegression()
r2 :  0.6313090336732199
MAPE:  1.117532481386081
MAE :  0.050430271865713326
Ridge()
r2 :  0.6243321297556836
MAPE:  1.1241796246830782
MAE :  0.05086427531687368


#### To do
1. Vary the number of features selected for each target and plot a performance chart.
2. Hyperparameter optimisation.
3. Create a pipeline that includes the transformations, and add a custom function for feature engineering using featuretools.