# Preprocessing Data

In [1]:
# import package
import sys
import numpy as np
import pandas as pd

In [2]:
# load dataset
datasets = {
    'bayam' : '../datasets/bayam_datasets.csv',
    'caisim' : '../datasets/caisim_datasets.csv'
}

raw_data = {}
for plant_name, path in datasets.items():
    raw_data[plant_name] = pd.read_csv(path)

In [3]:
# preprocessing
# set label bayam merah
# 4 - 2 - 1 - 3 - 5 ( Optimal - Non Optimal)

# set label caisim
# 1 - 2 - 4 -5 - 3 ( Optimal - Non Optimal)
optimal_list = {
    'bayam' : ['Bayam_Pot4',
                'Bayam_Pot2',
               'Bayam_Pot1',
              ],
    'caisim' : ['Caisim 1',
                'Caisim 2',
                'Caisim 4',]
}

not_optimal_list = {
    'bayam' : ['Bayam_Pot3',
               'Bayam_Pot5'],
    'caisim' : ['Caisim 5',
                'Caisim 3']
}


pre_processing_data = {}

for plant_name, path in datasets.items():
    list_pd = []
    for index, row in raw_data[plant_name].iterrows():
        # change soil moisture to 0 : Low, 50 : Normwl, 100 : High

        if row['name'] in optimal_list[plant_name]:
            # change status to optimal,

            # modify status if low to normal or high, as its not possible to high
            if row['soil_moisture'] == 'Low':
                row['soil_moisture'] = 'Normal'

            row['STATUS'] = 'Optimal'

        elif row['name'] in not_optimal_list[plant_name]:
            # change status to not optimal_list
            row['soil_moisture'] = 'Low' # change soil_moisture to Low
            row['STATUS'] = 'Not Optimal'

        # add new column for numerical features in knn instead of categorical features
        if row['soil_moisture'] == 'Low':
            row['soil_moisture_encode'] = 0

        if row['soil_moisture'] == 'Normal':
            row['soil_moisture_encode'] = 50

        if row['soil_moisture'] == 'High':
            row['soil_moisture_encode'] = 100

        list_pd.append(row)
    pre_processing_data[plant_name] = pd.DataFrame(list_pd)


In [4]:
# save to excel format
preprocessing_save_path = {
    'bayam' : '../datasets/bayam_datasets_preprocessing.xlsx',
    'caisim' : '../datasets/caisim_datasets_preprocessing.xlsx'
}

for plant_name, saved_path in preprocessing_save_path.items():
    # save prerpocessing result as excel
    pre_processing_data[plant_name].to_excel(saved_path, header=True, index=False)

# Training and Testing KNN Model

just run this step, skip preprocessing step and load from existing preprocesing file

In [1]:
# import library
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
# load from preprocessing files
preprocessing_save_path = {
    'bayam' : '../datasets/bayam_datasets_preprocessing.xlsx',
    'caisim' : '../datasets/caisim_datasets_preprocessing.xlsx'
}
pre_processing_data = {}

for plant_name, saved_path in preprocessing_save_path.items():
    # save prerpocessing result as excel
    pre_processing_data[plant_name] = pd.read_excel(saved_path)

In [3]:
# prepare training and testing data

training_features_column = ['temperature', 'humidity', 'light_intensity', 'soil_moisture_encode']
target_column = ['STATUS']
list_plant_name = ['bayam', 'caisim']
X_train = {}
X_test = {}
y_train = {}
y_test = {}
for plant_name in list_plant_name:
    # Split dataset into training set and test set
    X_train[plant_name], X_test[plant_name], y_train[plant_name], y_test[plant_name] = train_test_split(pre_processing_data[plant_name][training_features_column], pre_processing_data[plant_name][target_column], test_size=0.3)

#pre_processing_data['bayam'][training_features_column]

In [4]:
saved_model_path = {
    'bayam' : '../trained/bayam_knn_model.sav',
    'caisim' : '../trained/caisim_knn_model.sav'
}

#Train the model using the training sets
for plant_name in list_plant_name:
    #Create KNN Classifier
    knn = KNeighborsClassifier(n_neighbors=5)

    # train model
    knn.fit(X_train[plant_name], np.ravel(y_train[plant_name]))

    #Predict the response for test dataset
    y_pred = knn.predict(X_test[plant_name])

    # Model Accuracy, how often is the classifier correct?'
    clf_report = classification_report(y_test[plant_name], y_pred, target_names=["Optimal", "Not Optimal"])
    accuracy = metrics.accuracy_score(y_test[plant_name], y_pred) * 100

    print(f"KNN Performance Report for {plant_name} models: ")
    print(f"Accuracy : {accuracy:.2f} %")
    print(f"Classification Report : {clf_report}")

    # save the model
    pickle.dump(knn, open(saved_model_path[plant_name].format(accuracy), 'wb'))

KNN Performance Report for bayam models: 
Accuracy : 99.95 %
Classification Report :               precision    recall  f1-score   support

     Optimal       1.00      1.00      1.00      7760
 Not Optimal       1.00      1.00      1.00     11700

    accuracy                           1.00     19460
   macro avg       1.00      1.00      1.00     19460
weighted avg       1.00      1.00      1.00     19460

KNN Performance Report for caisim models: 
Accuracy : 99.37 %
Classification Report :               precision    recall  f1-score   support

     Optimal       0.99      0.99      0.99      1665
 Not Optimal       1.00      0.99      0.99      2445

    accuracy                           0.99      4110
   macro avg       0.99      0.99      0.99      4110
weighted avg       0.99      0.99      0.99      4110

