# Preprocessing Data

In [1]:
# import package
import sys
import numpy as np
import pandas as pd

In [2]:
# load dataset
datasets = {
    'kale' : '../datasets/dataset_kale.csv',
    'seledri' : '../datasets/dataset_seledri.csv'
}

raw_data = {}
for plant_name, path in datasets.items():
    raw_data[plant_name] = pd.read_csv(path)

In [3]:
# preprocessing
# set label kale
# 1 - 3 - 5 - 2 - 4 ( Optimal - Non Optimal)

# set label seledri
# 2 - 5 - 1 - 4 - 3 ( Optimal - Non Optimal)
optimal_list = {
    'kale' : ['Kale1',
                'Kale3',
               'Kale5',
              ],
    'seledri' : ['Seledri 2',
                'Seledri 5',
                'Seledri 1',]
}

not_optimal_list = {
    'kale' : ['Kale2',
               'Kale4'],
    'seledri' : ['Seledri 4',
                'Seledri 3']
}


pre_processing_data = {}

for plant_name, path in datasets.items():
    list_pd = []
    for index, row in raw_data[plant_name].iterrows():
        # change soil moisture to 0 : Low, 50 : Normwl, 100 : High

        if row['name'] in optimal_list[plant_name]:
            # change status to optimal,

            # modify status if low to normal or high, as its not possible to high
            if row['soil_moisture'] == 'Low':
                row['soil_moisture'] = 'Normal'

            row['status'] = 'Optimal'

        elif row['name'] in not_optimal_list[plant_name]:
            # change status to not optimal_list
            row['soil_moisture'] = 'Low' # change soil_moisture to Low
            row['status'] = 'Not Optimal'

        # add new column for numerical features in knn instead of categorical features
        if row['soil_moisture'] == 'Low':
            row['soil_moisture_encode'] = 0

        if row['soil_moisture'] == 'Normal':
            row['soil_moisture_encode'] = 50

        if row['soil_moisture'] == 'High':
            row['soil_moisture_encode'] = 100

        list_pd.append(row)
    pre_processing_data[plant_name] = pd.DataFrame(list_pd)

    # remove nan values
    pre_processing_data[plant_name].dropna(inplace=True)


In [4]:
# save to excel format
preprocessing_save_path = {
    'kale' : '../datasets/kale_datasets_preprocessing.xlsx',
    'seledri' : '../datasets/seledri_datasets_preprocessing.xlsx'
}

for plant_name, saved_path in preprocessing_save_path.items():
    # save prerpocessing result as excel
    pre_processing_data[plant_name].to_excel(saved_path, header=True, index=False)

# Training and Testing Decision Tree Model

just run this step, skip preprocessing step and load from exisiting preprocessing file

In [1]:
# import library
import pickle
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# load from preprocessing files
preprocessing_save_path = {
    'kale' : '../datasets/kale_datasets_preprocessing.xlsx',
    'seledri' : '../datasets/seledri_datasets_preprocessing.xlsx'
}
pre_processing_data = {}

for plant_name, saved_path in preprocessing_save_path.items():
    # save prerpocessing result as excel
    pre_processing_data[plant_name] = pd.read_excel(saved_path)

In [4]:
# prepare training and testing data
training_features_column = ['temperature', 'humidity', 'light_intensity', 'soil_moisture_encode']
target_column = ['status']
list_plant_name = ['kale','seledri']
X_train = {}
X_test = {}
y_train = {}
y_test = {}
for plant_name in list_plant_name:
    # Split dataset into training set and test set
    X_train[plant_name], X_test[plant_name], y_train[plant_name], y_test[plant_name] = train_test_split(pre_processing_data[plant_name][training_features_column], pre_processing_data[plant_name][target_column], test_size=0.3)

#pre_processing_data['kale'][training_features_column]

In [5]:
# make model and testing
saved_model_path = {
    'kale' : '../trained/kale_dt_model.sav',
    'seledri' : '../trained/seledri_dt_model.sav'
}

#Train the model using the training sets
for plant_name in list_plant_name:
    # create decision tree classifier
    clf = DecisionTreeClassifier(random_state=0,max_depth=3)

    # train model dt
    decision_tree_clf = clf.fit(X_train[plant_name], np.ravel(y_train[plant_name]))

    #Predict the response for test dataset
    y_pred = decision_tree_clf.predict(X_test[plant_name])

    # Model Accuracy, how often is the classifier correct?'
    clf_report = classification_report(y_test[plant_name], y_pred, target_names=["Optimal", "Not Optimal"])
    accuracy = metrics.accuracy_score(y_test[plant_name], y_pred) * 100

    print(f"Decision Tree Performance Report for {plant_name} models: ")
    print(f"Accuracy : {accuracy:.2f} %")
    print(f"Classification Report : {clf_report}")

    # save the model
    pickle.dump(decision_tree_clf, open(saved_model_path[plant_name].format(accuracy), 'wb'))

Decision Tree Performance Report for kale models: 
Accuracy : 100.00 %
Classification Report :               precision    recall  f1-score   support

     Optimal       1.00      1.00      1.00      7343
 Not Optimal       1.00      1.00      1.00     11218

    accuracy                           1.00     18561
   macro avg       1.00      1.00      1.00     18561
weighted avg       1.00      1.00      1.00     18561

Decision Tree Performance Report for seledri models: 
Accuracy : 100.00 %
Classification Report :               precision    recall  f1-score   support

     Optimal       1.00      1.00      1.00      7103
 Not Optimal       1.00      1.00      1.00     10372

    accuracy                           1.00     17475
   macro avg       1.00      1.00      1.00     17475
weighted avg       1.00      1.00      1.00     17475

