# Machine learning models based on tabular feature data

This notebook shows the code used to run the machine learning models.

The data used in this notebook was prepared in "tab_processing.ipynb".

In [None]:
# Import libraries
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold

# Use the first CUDA device when torch reports one, otherwise fall back to CPU
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

In [None]:
# DNN inputs: the train/validation frame and the holdout (test) frame
df_train_val = pd.read_csv('/data/fielddata/df_train_val.csv')
df_test = pd.read_csv('/data/fielddata/df_test.csv')

# XGBoost / Random Forest inputs for train/validation:
# ordinal features, one-hot features and the target
X, X_ohe, Y = (
    pd.read_csv('/data/fielddata/X_ordinal.csv'),
    pd.read_csv('/data/fielddata/X_ohe.csv'),
    pd.read_csv('/data/fielddata/Y.csv'),
)

# XGBoost / Random Forest inputs for the holdout (test) set
Xtest, Xtest_ohe, Ytest = (
    pd.read_csv('/data/fielddata/Xtest_ordinal.csv'),
    pd.read_csv('/data/fielddata/Xtest_ohe.csv'),
    pd.read_csv('/data/fielddata/Ytest.csv'),
)

In [None]:
# Single seeded random train:validation split over the rows of df_train_val
splitter = RandomSplitter(seed=42)
row_ids = range_of(df_train_val)
splits = splitter(row_ids)
splits

def kfold_splitter(df, column='Year', splits=5, shuffle=True, random_state=None):
    """Build stratified k-fold train/validation index lists.

    The rows of ``df`` are split with scikit-learn's ``StratifiedKFold``,
    stratifying on ``df[column]``.

    Args:
        df: DataFrame to split; must contain ``column``.
        column: Name of the column used as the stratification label.
        splits: Number of folds.
        shuffle: Whether to shuffle the rows before building the folds.
        random_state: Optional seed that makes the shuffled folds
            reproducible across calls. The default (``None``) keeps the
            previous behaviour of drawing new folds on every call.

    Returns:
        A ``(train_idx, val_idx)`` tuple of two lists with one entry per
        fold. Each entry is an ``L`` holding the indices yielded by
        ``StratifiedKFold.split`` for that fold.
    """
    from sklearn.model_selection import StratifiedKFold

    # scikit-learn rejects a seed when shuffling is disabled, so only
    # forward it when it can actually have an effect.
    seed = random_state if shuffle else None
    kfold = StratifiedKFold(n_splits=splits, shuffle=shuffle, random_state=seed)

    train_idx = []
    val_idx = []
    for train_index, val_index in kfold.split(df.index, df[column]):
        train_idx.append(L(train_index, use_list=True))
        val_idx.append(L(val_index, use_list=True))

    return train_idx, val_idx

# DNN 

## Kfold stratified validation

In [None]:
# Stratified k-fold validation of the tabular DNN.
# One learner is trained per fold; the per-fold validation loss, RMSE,
# RMSE as a percentage of the target range, and R2 are collected in lists.
procs = [Categorify, Normalize, FillMissing]
cat_names = ['Parental 1', 'Parental 2', 'Planting', 'Stock', 'Fertilizer']
cont_names = ['Days_after_sowing']

val_loss = []
rmse_kfold = []
rmse_pct_kfold = []
r2_kfold = []

# append=True so that every fold adds to the same metrics file
csvlogger = CSVLogger(f'/data/results/DNN_5fold_metrics.csv', append=True)
early_stopping = EarlyStoppingCallback(monitor='valid_loss', patience=3, min_delta=0.01)
train_index, val_index = kfold_splitter(df_train_val)

# Range of the target, used to express RMSE as a percentage; it does not
# depend on the fold, so compute it once.
df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

for fold_train, fold_val in zip(train_index, val_index):
    data_fold = (TabularDataLoaders.from_df(df_train_val,
                                            y_names="Yield",
                                            cat_names=cat_names,
                                            cont_names=cont_names,
                                            procs=procs,
                                            splits=(fold_train, fold_val)))

    config = tabular_config(ps=0.5, embed_p=0.5)
    learn = tabular_learner(data_fold,
                            config=config,
                            layers=[200, 100],
                            metrics=[rmse, R2Score()],
                            opt_func=ranger,
                            y_range=[0, 20],
                            wd=0.3)

    # Disable the fastai progress bar and logging while fitting.
    # Both context managers must be listed with a comma: `a() and b()`
    # evaluates to b() only, so no_bar() would never be entered.
    with learn.no_bar(), learn.no_logging():
        learn.fit_one_cycle(100, 1e-3, cbs=[csvlogger, early_stopping])

    # validate() returns the loss followed by the metrics in the order
    # they were passed to the learner (rmse, R2Score).
    val_loss_k, rmse_k, r2score_k = learn.validate()
    val_loss.append(val_loss_k)
    rmse_kfold.append(rmse_k)
    rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
    r2_kfold.append(r2score_k)

In [None]:
# Stratified kfold metrics per round: one row per fold, written to CSV
d = {"validation loss": val_loss, "rmse": rmse_kfold, "rmse %": rmse_pct_kfold, "r2score": r2_kfold}

dnnkfold = pd.DataFrame(data=d)
# Reduce each 'rmse %' entry to a scalar. The source column is dnnkfold's
# own 'rmse %' (the same pattern used by the XGBoost and Random Forest
# summary cells); the previous `fastkfold` name was never defined.
dnnkfold['rmse %'] = dnnkfold['rmse %'].apply(lambda x: np.mean(x))
dnnkfold.to_csv('/data/results/DNN_5fold_summary_metrics.csv', index=False)

## Test predictions

Train a DNN model and measure its performance on the holdout dataset.

In [None]:
# Pre-processing steps and the feature columns fed to the tabular model
procs = [Categorify, Normalize, FillMissing]
cat_names = ['Parental 1', 'Parental 2', 'Planting', 'Stock', 'Fertilizer']
cont_names = ['Days_after_sowing']

# DataLoaders built on the random train:validation split
data_init = TabularDataLoaders.from_df(
    df_train_val,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names="Yield",
    splits=splits,
)

# Same architecture and hyper-parameters as the k-fold runs
config = tabular_config(ps=0.5, embed_p=0.5)
learn_tab = tabular_learner(
    data_init,
    layers=[200, 100],
    config=config,
    y_range=[0, 20],
    opt_func=ranger,
    metrics=[rmse, R2Score()],
    wd=0.3,
)

# Train for up to 100 epochs, with early stopping monitoring valid_loss
early_stopping = EarlyStoppingCallback(monitor='valid_loss', min_delta=0.01, patience=3)
learn_tab.fit_one_cycle(100, 1e-3, cbs=[early_stopping])

# Persist the trained model
learn_tab.save('/data/model_weights/DNN_model')

In [None]:
# If you want to load the model use the command below
#learn_tab.load('/data/model_weights/DNN_model')

In [None]:
# Predict the grain yield on the holdout dataset with the trained DNN
dl = learn_tab.dls.test_dl(df_test)
test_preds = learn_tab.get_preds(dl=dl, reorder=False)
# Decoded test items; not used further in this cell
test_2019 = dl.dataset.decode()

# Keep the input features and the observed yield, then attach the predictions
holdout_columns = ['Stock', 'Parental 1', 'Parental 2', 'Planting',
                   'Fertilizer', 'Days_after_sowing', 'Yield']
df_test2019 = df_test[holdout_columns].copy()
df_test2019['Predictions'] = test_preds[0].flatten()

df_test2019.to_csv('/data/results/DNN_prediction_on_holdout_dataset.csv')

# XGBoost kfold

### Ordinal encoding

In [None]:
# Kfold validation with XGB on the ordinal features (X).
# Collects per-fold RMSE, RMSE as a percentage of the target range, and R2.
rmse_kfold = []
rmse_pct_kfold = []
r2_kfold = []

# NOTE(review): the folds are computed on df_train_val and applied by
# position to X and Y, which assumes all three share the same row order.
train_index, val_index = kfold_splitter(df_train_val)

# Range of the target, used to express RMSE as a percentage; it does not
# depend on the fold, so compute it once.
df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

for fold_train, fold_val in zip(train_index, val_index):
    Xtrain, Xval = X.iloc[fold_train], X.iloc[fold_val]
    # Y is a DataFrame, so plain [] would look the integers up as column
    # labels; .iloc selects the fold's rows by position instead.
    ytrain, yval = Y.iloc[fold_train], Y.iloc[fold_val]

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                 subsample=0.75,
                                 max_depth=9,
                                 eta=1e-2,
                                 n_estimators=150,
                                 min_child_weight=0.35)

    # Fit on this fold's training rows (the names defined just above;
    # X_train / y_train do not exist).
    xgb_model.fit(Xtrain, ytrain)

    # Metrics on this fold's validation rows
    ypred = xgb_model.predict(Xval)
    mse_val = mean_squared_error(yval, ypred)

    rmse_k = mse_val**(1/2.0)
    rmse_kfold.append(rmse_k)
    rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
    r2_kfold.append(r2_score(yval, ypred))

In [None]:
# Per-fold XGBoost metrics: one row per fold, written to CSV
d = {
    "rmse": rmse_kfold,
    "rmse %": rmse_pct_kfold,
    "r2score": r2_kfold,
}
xgbfold = pd.DataFrame(d)
xgbfold['rmse %'] = xgbfold['rmse %'].apply(lambda x: np.mean(x))
xgbfold.to_csv('/data/results/XGB_5fold_summary_metrics.csv', index=False)
xgbfold

In [None]:
# Predict the holdout dataset with the XGBoost model (ordinal features).
# NOTE(review): xgb_model is the one left by the last fold of the k-fold loop.
ypred = xgb_model.predict(Xtest)
test_df = df_test.assign(Predictions=ypred)
test_df.to_csv('/data/results/XGB_prediction_on_holdout_dataset.csv')

### OHE


In [None]:
# Kfold validation with XGB on the one-hot features (X_ohe).
# Collects per-fold RMSE, RMSE as a percentage of the target range, and R2.
rmse_kfold = []
rmse_pct_kfold = []
r2_kfold = []

# NOTE(review): the folds are computed on df_train_val and applied by
# position to X_ohe and Y, which assumes all three share the same row order.
train_index, val_index = kfold_splitter(df_train_val)

# Range of the target, used to express RMSE as a percentage; it does not
# depend on the fold, so compute it once.
df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

for fold_train, fold_val in zip(train_index, val_index):
    Xtrain, Xval = X_ohe.iloc[fold_train], X_ohe.iloc[fold_val]
    # Y is a DataFrame, so plain [] would look the integers up as column
    # labels; .iloc selects the fold's rows by position instead.
    ytrain, yval = Y.iloc[fold_train], Y.iloc[fold_val]

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                 subsample=0.75,
                                 max_depth=9,
                                 eta=1e-2,
                                 n_estimators=150,
                                 min_child_weight=0.35)

    xgb_model.fit(Xtrain, ytrain)

    # Metrics on this fold's validation rows
    ypred = xgb_model.predict(Xval)
    mse_val = mean_squared_error(yval, ypred)

    rmse_k = mse_val**(1/2.0)
    rmse_kfold.append(rmse_k)
    rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
    r2_kfold.append(r2_score(yval, ypred))

In [None]:
# Per-fold XGBoost (OHE) metrics: one row per fold, written to CSV
d = {
    "rmse": rmse_kfold,
    "rmse %": rmse_pct_kfold,
    "r2score": r2_kfold,
}
xgbfold = pd.DataFrame(d)
xgbfold['rmse %'] = xgbfold['rmse %'].apply(lambda x: np.mean(x))
xgbfold.to_csv('/data/results/XGB_OHE_5fold_summary_metrics.csv')
xgbfold

In [None]:
# Predict the test dataset with the XGBoost model trained on the OHE features.
# The OHE holdout features are loaded as Xtest_ohe; `testX` was never defined.
# NOTE(review): xgb_model is the one left by the last fold of the k-fold loop.
ypred = xgb_model.predict(Xtest_ohe)
df = df_test.copy()
df['Predicted'] = ypred
df.to_csv('/data/results/XGB_OHE_prediction_on_holdout_dataset.csv')

# Random Forests kfold

### Ordinal encoding

In [None]:
# Kfold validation with Random Forests on the ordinal features (X).
# Collects per-fold RMSE, RMSE as a percentage of the target range, and R2.
rmse_kfold = []
rmse_pct_kfold = []
r2_kfold = []

# NOTE(review): the folds are computed on df_train_val and applied by
# position to X and Y, which assumes all three share the same row order.
train_index, val_index = kfold_splitter(df_train_val)

# Range of the target, used to express RMSE as a percentage; it does not
# depend on the fold, so compute it once.
df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

for fold_train, fold_val in zip(train_index, val_index):
    Xtrain, Xval = X.iloc[fold_train], X.iloc[fold_val]
    # Y is a DataFrame, so plain [] would look the integers up as column
    # labels; .iloc selects the fold's rows by position instead.
    ytrain, yval = Y.iloc[fold_train], Y.iloc[fold_val]

    rf = RandomForestRegressor(n_estimators=100, random_state=10)
    rf.fit(Xtrain, ytrain)

    # Metrics on this fold's validation rows
    ypred = rf.predict(Xval)
    mse_val = mean_squared_error(yval, ypred)

    rmse_k = mse_val**(1/2.0)
    rmse_kfold.append(rmse_k)
    rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
    r2_kfold.append(r2_score(yval, ypred))
    

In [None]:
# Per-fold Random Forest metrics: one row per fold, written to CSV
d = {
    "rmse": rmse_kfold,
    "rmse %": rmse_pct_kfold,
    "r2score": r2_kfold,
}
rffold = pd.DataFrame(d)
rffold['rmse %'] = rffold['rmse %'].apply(lambda x: np.mean(x))
rffold.to_csv('/data/results/RF_5fold_summary_metrics.csv')
rffold

In [None]:
# Predict the holdout dataset with the Random Forest model (ordinal features).
# NOTE(review): rf is the model left by the last fold of the k-fold loop.
ypred = rf.predict(Xtest)
df = df_test.assign(Predicted=ypred)
# Observed yield range of the holdout set (not used further in this cell)
df_ymin = df['Yield'].min()
df_ymax = df['Yield'].max()
df.to_csv('/data/results/RF_prediction_on_holdout_dataset.csv')

### OHE

In [None]:
# Kfold validation with Random Forests on the one-hot features (X_ohe).
# Collects per-fold RMSE, RMSE as a percentage of the target range, and R2.
rmse_kfold = []
rmse_pct_kfold = []
r2_kfold = []

# NOTE(review): the folds are computed on df_train_val and applied by
# position to X_ohe and Y, which assumes all three share the same row order.
train_index, val_index = kfold_splitter(df_train_val)

# Range of the target, used to express RMSE as a percentage; it does not
# depend on the fold, so compute it once.
df_ymin, df_ymax = df_train_val['Yield'].min(), df_train_val['Yield'].max()

for fold_train, fold_val in zip(train_index, val_index):
    Xtrain, Xval = X_ohe.iloc[fold_train], X_ohe.iloc[fold_val]
    # Y is a DataFrame, so plain [] would look the integers up as column
    # labels; .iloc selects the fold's rows by position instead.
    ytrain, yval = Y.iloc[fold_train], Y.iloc[fold_val]

    rf_ohe = RandomForestRegressor(n_estimators=100, random_state=10)
    rf_ohe.fit(Xtrain, ytrain)

    # Metrics on this fold's validation rows
    ypred = rf_ohe.predict(Xval)
    mse_val = mean_squared_error(yval, ypred)

    rmse_k = mse_val**(1/2.0)
    rmse_kfold.append(rmse_k)
    rmse_pct_kfold.append(((rmse_k/(df_ymax - df_ymin))*100))
    r2_kfold.append(r2_score(yval, ypred))
    

In [None]:
# Per-fold Random Forest (OHE) metrics: one row per fold, written to CSV
d = {
    "rmse": rmse_kfold,
    "rmse %": rmse_pct_kfold,
    "r2score": r2_kfold,
}
rffold = pd.DataFrame(d)
rffold['rmse %'] = rffold['rmse %'].apply(lambda x: np.mean(x))
rffold.to_csv('/data/results/RF_OHE_5fold_summary_metrics.csv', index=False)

In [None]:
# Predict grain yield on the holdout dataset with the Random Forest model
# trained on the OHE features.
# The OHE holdout features are loaded as Xtest_ohe; `testX` was never defined.
# NOTE(review): rf_ohe is the model left by the last fold of the k-fold loop.
ypred = rf_ohe.predict(Xtest_ohe)
df = df_test.copy()
df['Predicted'] = ypred
df.to_csv('/data/results/RF_OHE_prediction_on_holdout_dataset.csv')