In [1]:
import pandas as pd

PLANTS = '/kaggle/input/plant-growth-data-classification/plant_growth_data.csv'
TARGET = 'Growth_Milestone'

df = pd.get_dummies(data=pd.read_csv(filepath_or_buffer=PLANTS), columns=['Soil_Type', 'Water_Frequency', 'Fertilizer_Type'])
                    
df.head()

Unnamed: 0,Sunlight_Hours,Temperature,Humidity,Growth_Milestone,Soil_Type_clay,Soil_Type_loam,Soil_Type_sandy,Water_Frequency_bi-weekly,Water_Frequency_daily,Water_Frequency_weekly,Fertilizer_Type_chemical,Fertilizer_Type_none,Fertilizer_Type_organic
0,5.192294,31.719602,61.591861,0,False,True,False,True,False,False,True,False,False
1,4.033133,28.919484,52.422276,1,False,False,True,False,False,True,False,False,True
2,8.892769,23.179059,44.660539,0,False,True,False,True,False,False,False,True,False
3,8.241144,18.465886,46.433227,0,False,True,False,True,False,False,False,True,False
4,8.374043,18.128741,63.625923,0,False,False,True,True,False,False,False,False,True


In [2]:
df['Growth_Milestone'].value_counts().to_dict()

{0: 97, 1: 96}

Our target class is balanced, which is good news.

In [3]:
import arrow
from umap import UMAP

COLUMNS = ['Sunlight_Hours', 'Temperature', 'Humidity', 
       'Soil_Type_clay', 'Soil_Type_loam', 'Soil_Type_sandy',
       'Water_Frequency_bi-weekly', 'Water_Frequency_daily',
       'Water_Frequency_weekly', 'Fertilizer_Type_chemical',
       'Fertilizer_Type_none', 'Fertilizer_Type_organic']
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-21 19:46:34.479345: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 19:46:34.479513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 19:46:34.646708: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Sun Jul 21 19:46:46 2024 Construct fuzzy simplicial set
Sun Jul 21 19:46:47 2024 Finding Nearest Neighbors
Sun Jul 21 19:46:51 2024 Finished Nearest Neighbor Search
Sun Jul 21 19:46:55 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Sun Jul 21 19:46:56 2024 Finished embedding
done with UMAP in 0:00:09.996259


In [4]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df, x='x', y='y', color=TARGET, )

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=10000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 156 iterations
accuracy: 0.4359


In [6]:
from plotly import express

express.histogram(x=COLUMNS, y=model.coef_[0])

In [7]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.45      0.45      0.45        20
           1       0.42      0.42      0.42        19

    accuracy                           0.44        39
   macro avg       0.44      0.44      0.44        39
weighted avg       0.44      0.44      0.44        39



In [8]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

MODEL = {
    'Naive Bayes': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
    '3 Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    '5 Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    '7 Nearest Neighbors': KNeighborsClassifier(n_neighbors=7),
    '8 Nearest Neighbors': KNeighborsClassifier(n_neighbors=8),
    '9 Nearest Neighbors': KNeighborsClassifier(n_neighbors=9),
    '10 Nearest Neighbors': KNeighborsClassifier(n_neighbors=10),
    '11 Nearest Neighbors': KNeighborsClassifier(n_neighbors=11),
    'Linear SVM': SVC(kernel='linear', C=0.025, random_state=2024),
    'Gaussian Process':GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=2024),
    '10 estimator Random Forest': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=2024),
    '20 estimator Random Forest': RandomForestClassifier(max_depth=5, n_estimators=20, max_features=1, random_state=2024),
    'Neural Net': MLPClassifier(alpha=1, max_iter=1000, random_state=2024),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=2024),
    'RBF SVM': SVC(gamma=2, C=1, random_state=2024),
}

result = []
for name, clf in MODEL.items():
    time_start = arrow.now()
    clf.fit(X=X_train, y=y_train)
    score = f1_score(y_true=y_test, y_pred=clf.predict(X=X_test))
    result.append((score, name))
    print('{:5.4f} {} {}'.format(score, arrow.now() - time_start, name))
result = sorted(result, key=lambda x: x[0], reverse=True)
print('best: {} {}'.format(result[0][0], result[0][1]))

0.4211 0:00:00.007609 Naive Bayes
0.1667 0:00:00.021608 QDA
0.5128 0:00:00.010802 3 Nearest Neighbors
0.5455 0:00:00.010388 5 Nearest Neighbors
0.5238 0:00:00.011812 7 Nearest Neighbors
0.4211 0:00:00.012515 8 Nearest Neighbors
0.4286 0:00:00.012125 9 Nearest Neighbors
0.4211 0:00:00.010357 10 Nearest Neighbors
0.5581 0:00:00.011097 11 Nearest Neighbors
0.4211 0:00:00.010619 Linear SVM



Variables are collinear


The optimal value found for dimension 0 of parameter k1__constant_value is close to the specified lower bound 1e-05. Decreasing the bound and calling fit again may find a better value.



0.5455 0:00:00.230339 Gaussian Process
0.6047 0:00:00.015178 Decision Tree
0.6364 0:00:00.108070 10 estimator Random Forest
0.6047 0:00:00.052886 20 estimator Random Forest
0.6552 0:00:00.069067 Neural Net
0.6222 0:00:00.172291 AdaBoost
0.1818 0:00:00.009419 RBF SVM
best: 0.6551724137931034 Neural Net


In [9]:
from sklearn.neural_network import MLPClassifier

neural_net = MLPClassifier(alpha=1, max_iter=10000, random_state=2024).fit(X=X_train, y=y_train)

print(classification_report(zero_division=0 , y_true=y_test, y_pred=neural_net.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.49      1.00      0.66        19

    accuracy                           0.49        39
   macro avg       0.24      0.50      0.33        39
weighted avg       0.24      0.49      0.32        39

