In [1]:
import pandas as pd

# Workbook path inside the Kaggle input directory, and the worksheet to read from it.
DATA = '/kaggle/input/liming-and-rhizoctonia-root-rot-of-soybean/h6v8j338ds-1/Dataset.xlsx'
SHEET_NAME = 'Data'

# Load the chosen worksheet into the frame that every later cell works from.
df = pd.read_excel(DATA, sheet_name=SHEET_NAME)

# Show the first few rows as the cell's output.
df.head()

Unnamed: 0,Plot,Field,Site,Block,Dose,PLH,INSH,BRA,TNOD,FNOD,PODS,GRAINS,WEIGHT,PLANTS,YIELD,INC,AUDPC
0,33,NIT,Patch,1,0,77.9,3.2,3.0,16.8,12.6,46.0,114.4,155.5,6.333333,1865.057471,44.18,753.98
1,34,NIT,Patch,1,3,85.4,3.05,0.9,17.9,11.8,45.4,112.2,176.3,13.333333,4059.829715,15.51,657.92
2,35,NIT,Patch,1,6,82.3,3.0,1.1,17.4,12.6,41.3,99.5,163.266667,12.222222,3253.08642,11.94,355.69
3,36,NIT,Patch,1,9,87.5,3.05,0.8,17.5,12.1,46.7,112.7,176.333333,12.555556,3829.118774,9.7,387.57
4,37,NIT,Patch,2,0,83.2,2.25,0.3,17.1,10.6,34.8,84.4,156.4,7.666667,1819.003831,49.78,522.98


It is not obvious from the data or from the data card what the signal variable should be, so we will need to do some exploration and see what, if anything, the data can tell us.

In [2]:
from plotly import express

# Distribution of the measured yield, displayed as the cell's output.
express.histogram(df, x='YIELD')

We would like the dependent variable to be yield; the actual measured yield has a Gaussian-looking distribution.

In [3]:
import warnings

# Silence every FutureWarning for the rest of the session (no message or module restriction).
warnings.simplefilter(action='ignore', category=FutureWarning)

Let's use all of our floating-point data (the columns whose names are written in all capital letters) and see if it predicts any of the other columns.

In [4]:
import arrow
from umap import UMAP

# Column used to colour the embedding, and the measurement columns that feed UMAP
# (YIELD is part of the input here).
TARGET = 'Site'
COLUMNS = [
    'PLH', 'INSH', 'BRA', 'TNOD', 'FNOD', 'PODS',
    'GRAINS', 'WEIGHT', 'PLANTS', 'YIELD', 'INC', 'AUDPC',
]

time_start = arrow.now()

# Two-component UMAP with fixed seeds so repeated runs use the same random state.
# NOTE(review): the columns are passed in unscaled; large-magnitude columns may
# dominate the neighbour distances - confirm whether standardising is wanted.
reducer = UMAP(
    n_components=2,
    n_epochs=1000,
    n_jobs=1,
    random_state=2024,
    transform_seed=2024,
    verbose=True,
)
embedding = reducer.fit_transform(X=df[COLUMNS])

# Pair each embedded point with its TARGET value, row for row.
umap_df = pd.DataFrame(data=embedding, columns=['x', 'y']).assign(**{TARGET: df[TARGET].tolist()})

figure = express.scatter(data_frame=umap_df, x='x', y='y', color=TARGET)
figure.show()

# The reported time covers the fit and the rendering of the figure.
elapsed = arrow.now() - time_start
print('UMAP done in {}'.format(elapsed))

2024-04-29 12:57:28.042728: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 12:57:28.042881: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 12:57:28.208652: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=1000, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Mon Apr 29 12:57:40 2024 Construct fuzzy simplicial set
Mon Apr 29 12:57:40 2024 Finding Nearest Neighbors
Mon Apr 29 12:57:44 2024 Finished Nearest Neighbor Search
Mon Apr 29 12:57:47 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Mon Apr 29 12:57:49 2024 Finished embedding


UMAP done in 0:00:09.109466


Interestingly, the Site variable is the only one that UMAP really finds.

In [5]:
import arrow
from umap import UMAP

# YIELD is left out of the feature list so it can serve purely as the colour of the plot.
YIELD = 'YIELD'
YIELD_COLUMNS = [
    'PLH', 'INSH', 'BRA', 'TNOD', 'FNOD', 'PODS',
    'GRAINS', 'WEIGHT', 'PLANTS', 'INC', 'AUDPC',
]

time_start = arrow.now()

# Same UMAP settings as the Site embedding: two components, fixed seeds, 1000 epochs.
yield_umap = UMAP(
    n_components=2,
    n_epochs=1000,
    n_jobs=1,
    random_state=2024,
    transform_seed=2024,
    verbose=True,
)
yield_embedding = yield_umap.fit_transform(X=df[YIELD_COLUMNS])

# Attach the measured yield to each embedded point, row for row.
yield_df = pd.DataFrame(data=yield_embedding, columns=['x', 'y']).assign(**{YIELD: df[YIELD].tolist()})

yield_figure = express.scatter(data_frame=yield_df, x='x', y='y', color=YIELD)
yield_figure.show()

# The reported time covers the fit and the rendering of the figure.
yield_elapsed = arrow.now() - time_start
print('UMAP done in {}'.format(yield_elapsed))

UMAP(n_epochs=1000, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Mon Apr 29 12:57:49 2024 Construct fuzzy simplicial set
Mon Apr 29 12:57:49 2024 Finding Nearest Neighbors
Mon Apr 29 12:57:49 2024 Finished Nearest Neighbor Search
Mon Apr 29 12:57:49 2024 Construct embedding


Epochs completed:   0%|            0/1000 [00:00]

	completed  0  /  1000 epochs
	completed  100  /  1000 epochs
	completed  200  /  1000 epochs
	completed  300  /  1000 epochs
	completed  400  /  1000 epochs
	completed  500  /  1000 epochs
	completed  600  /  1000 epochs
	completed  700  /  1000 epochs
	completed  800  /  1000 epochs
	completed  900  /  1000 epochs
Mon Apr 29 12:57:50 2024 Finished embedding


UMAP done in 0:00:01.456835


This looks promising. Let's build a regression model and see what happens.

In [6]:
from plotly import express
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[YIELD_COLUMNS], df[YIELD], test_size=0.2, random_state=2024
)

# Ordinary least-squares fit of YIELD on the remaining measurement columns.
model = linear_model.LinearRegression().fit(X=X_train, y=y_train)
y_pred = model.predict(X=X_test)

# Label the held-out metrics so the printed numbers can be read on their own.
print('MSE: {}'.format(mean_squared_error(y_true=y_test, y_pred=y_pred)))
print('R^2: {}'.format(r2_score(y_true=y_test, y_pred=y_pred)))

# One bar per feature: a bar chart (not a histogram) is the right plot for a single
# coefficient per named feature.
# NOTE(review): the features were not scaled before fitting, so coefficient sizes
# reflect each column's units and are not directly comparable - confirm before
# reading this as feature importance.
express.bar(
    x=YIELD_COLUMNS,
    y=model.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Linear regression coefficients for YIELD',
)

207672.34526671222
0.6793031270833108
