In [2]:
import os
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [3]:
# 1. Read the data, then discard the 'Unnamed: 0' column
#    (presumably a leftover index column from an earlier CSV export — TODO confirm).
DATA_PATH = 'data/yield_df.csv'
df = pd.read_csv(DATA_PATH)
df = df.drop(['Unnamed: 0'], axis=1)

In [4]:
# Feature columns, grouped by the kind of preprocessing they receive.
numerical_cols = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
]
categorical_cols = ['Area']

In [5]:
# Evaluation settings (previously inline magic numbers).
MIN_ROWS_PER_ITEM = 100   # items with fewer rows than this are skipped
TEST_SIZE = 0.2           # fraction of each item's rows held out for testing
RANDOM_STATE = 42         # seed used by both the split and the random forest


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Columns in ``numerical_cols`` go through ``StandardScaler``; columns in
    ``categorical_cols`` go through ``OneHotEncoder(handle_unknown='ignore')``.
    A fresh instance is returned on every call so that each pipeline owns its
    own fitted state.
    """
    return ColumnTransformer([
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


# One dict per (item, model) pair; converted to `metrics_df` after the loop so
# the scores can be compared programmatically instead of only read from stdout.
metrics_records = []

for item in df['Item'].unique():
    df_item = df[df['Item'] == item]
    if len(df_item) < MIN_ROWS_PER_ITEM:  # not enough rows for this item
        continue

    # 3a. Features (X) and target (y) for this item
    X = df_item.drop(columns=['hg/ha_yield', 'Item'])
    y = df_item['hg/ha_yield']

    # 3b/3c. One pipeline per model. Pipeline does not clone its steps, so
    # passing the same transformer object to both pipelines would make them
    # share a single fitted preprocessor; each pipeline gets its own instead.
    preprocessor = make_preprocessor()
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE))
    ])
    knn_pipeline = Pipeline([
        ('preprocessor', make_preprocessor()),
        ('regressor', KNeighborsRegressor(n_neighbors=5))
    ])

    # 3d. Train/test split
    # NOTE(review): this is a random row-wise split; if rows from the same Area
    # (and neighbouring years) land in both train and test, the scores below
    # may be optimistic — consider a grouped or time-based split. TODO confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # 3e. Train
    rf_pipeline.fit(X_train, y_train)
    knn_pipeline.fit(X_train, y_train)

    # 3f. Predict
    y_pred_rf  = rf_pipeline.predict(X_test)
    y_pred_knn = knn_pipeline.predict(X_test)

    # 3g. Compute, print and record the metrics
    print(f"\n=== Evaluasi untuk Item: {item} ===")
    for model_name, y_pred in [('Random Forest', y_pred_rf), ('KNN', y_pred_knn)]:
        r2   = r2_score(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_test, y_pred)

        print(f"\nModel: {model_name}")
        print(f" - R²   : {r2:.3f}")
        print(f" - MSE  : {mse:.3f}")
        print(f" - RMSE : {rmse:.3f}")
        print(f" - MAE  : {mae:.3f}")

        metrics_records.append({
            'Item': item,
            'Model': model_name,
            'R2': r2,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
        })

# Tidy summary of every score printed above (empty if every item was skipped).
metrics_df = pd.DataFrame(metrics_records)

print("\nSelesai menghitung metrik untuk semua item.")


=== Evaluasi untuk Item: Maize ===

Model: Random Forest
 - R²   : 0.965
 - MSE  : 25888170.703
 - RMSE : 5088.042
 - MAE  : 2315.260

Model: KNN
 - R²   : 0.960
 - MSE  : 29289519.872
 - RMSE : 5411.979
 - MAE  : 2905.433

=== Evaluasi untuk Item: Potatoes ===

Model: Random Forest
 - R²   : 0.968
 - MSE  : 297536687.763
 - RMSE : 17249.252
 - MAE  : 9138.556

Model: KNN
 - R²   : 0.967
 - MSE  : 306418983.164
 - RMSE : 17504.827
 - MAE  : 10326.526

=== Evaluasi untuk Item: Rice, paddy ===

Model: Random Forest
 - R²   : 0.974
 - MSE  : 9897340.409
 - RMSE : 3146.004
 - MAE  : 1569.424

Model: KNN
 - R²   : 0.961
 - MSE  : 14659437.975
 - RMSE : 3828.765
 - MAE  : 2155.078

=== Evaluasi untuk Item: Sorghum ===

Model: Random Forest
 - R²   : 0.936
 - MSE  : 12484489.997
 - RMSE : 3533.340
 - MAE  : 1660.376

Model: KNN
 - R²   : 0.897
 - MSE  : 20204660.505
 - RMSE : 4494.959
 - MAE  : 2089.330

=== Evaluasi untuk Item: Soybeans ===

Model: Random Forest
 - R²   : 0.928
 - MSE  : 43