In [None]:
import os
import csv
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,  accuracy_score

In [7]:
# 1. Read the data
DATA_PATH = 'data/yield_df.csv'
# 'Unnamed: 0' is presumably the index column written out by an earlier
# DataFrame.to_csv call (TODO confirm). errors='ignore' keeps the load working
# when the CSV was exported without it instead of raising KeyError.
df = pd.read_csv(DATA_PATH).drop(columns=['Unnamed: 0'], errors='ignore')

In [8]:
# Feature columns handed to the ColumnTransformer further down:
# numeric features are standardised, categorical features are one-hot encoded.
numerical_cols = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
]
categorical_cols = [
    'Area',
]

In [9]:
# Per-item class boundaries: maps each 'Item' value to the (0.33, 0.66)
# quantiles of its 'hg/ha_yield' column. Items with fewer than 100 rows get
# no entry. sort=False keeps the items in order of first appearance.
thresholds = {
    crop: (
        crop_rows['hg/ha_yield'].quantile(0.33),
        crop_rows['hg/ha_yield'].quantile(0.66),
    )
    for crop, crop_rows in df.groupby('Item', sort=False)
    if len(crop_rows) >= 100
}


In [10]:
def categorize(item, value):
    """Map a yield value to 'rendah', 'sedang' or 'tinggi' for the given item.

    The two cut points are read from the module-level ``thresholds`` dict,
    which must contain ``item`` (a missing item raises ``KeyError``).

    Args:
        item: Key into ``thresholds``.
        value: Yield value to classify.

    Returns:
        'rendah' if ``value`` is at or below the lower cut point, 'sedang' if
        it is at or below the upper cut point, otherwise 'tinggi'.
    """
    lower_cut, upper_cut = thresholds[item]
    # Walk the cut points in ascending order; the first one that bounds the
    # value decides the label.
    for cut_point, label in ((lower_cut, 'rendah'), (upper_cut, 'sedang')):
        if value <= cut_point:
            return label
    return 'tinggi'

In [11]:
# NOTE(review): accuracy_score (like every other sklearn name used here) comes
# from the import cell at the top of the notebook. The saved NameError output
# suggests this cell was last run in a kernel where that import cell had not
# been executed after accuracy_score was added - re-run the imports
# (Restart Kernel -> Run All) before this cell.


def make_preprocessor():
    """Build a fresh, unfitted ColumnTransformer for the feature columns.

    A new instance is returned on every call so that each pipeline owns its
    own transformer. Sharing one instance between pipelines means the second
    ``fit`` re-fits the very object the first pipeline is still holding.

    Returns:
        ColumnTransformer that standardises ``numerical_cols`` and one-hot
        encodes ``categorical_cols`` (categories unseen at fit time are
        ignored at transform time).
    """
    return ColumnTransformer([
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


# One dict of metrics per (item, model); turned into a DataFrame after the
# loop so the results can be compared without re-reading the printed log.
metric_records = []

for item in df['Item'].unique():
    if item not in thresholds:
        continue  # skip items without thresholds (fewer than 100 rows)

    df_item = df[df['Item'] == item]
    X = df_item.drop(columns=['hg/ha_yield', 'Item'])
    y = df_item['hg/ha_yield']

    # 5a/5b. Regression pipelines, each with its own preprocessor instance
    rf_pipeline = Pipeline([
        ('pre', make_preprocessor()),
        ('reg', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    knn_pipeline = Pipeline([
        ('pre', make_preprocessor()),
        ('reg', KNeighborsRegressor(n_neighbors=5))
    ])

    # 5c. Split the data (fixed seed so the split is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 5d. Train
    rf_pipeline.fit(X_train, y_train)
    knn_pipeline.fit(X_train, y_train)

    # 5e. Predict
    y_pred_rf  = rf_pipeline.predict(X_test)
    y_pred_knn = knn_pipeline.predict(X_test)

    # 5f. Bucket true and predicted yields into classes for the accuracy metric
    y_test_cls     = [categorize(item, val) for val in y_test]
    y_pred_rf_cls  = [categorize(item, val) for val in y_pred_rf]
    y_pred_knn_cls = [categorize(item, val) for val in y_pred_knn]

    # 5g. Print and record the results
    print(f"\n=== Evaluasi untuk Item: {item} ===")
    for model_name, y_pred, y_pred_cls in [
        ('Random Forest', y_pred_rf,  y_pred_rf_cls),
        ('KNN Regressor',  y_pred_knn, y_pred_knn_cls)
    ]:
        r2   = r2_score(y_test, y_pred)
        mse  = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_test, y_pred)
        acc  = accuracy_score(y_test_cls, y_pred_cls)

        metric_records.append({
            'Item': item,
            'Model': model_name,
            'R2': r2,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'Accuracy': acc,
        })

        print(f"\nModel: {model_name}")
        print(f" - R²       : {r2:.3f}")
        print(f" - MSE      : {mse:.3f}")
        print(f" - RMSE     : {rmse:.3f}")
        print(f" - MAE      : {mae:.3f}")
        print(f" - Accuracy : {acc:.3f} (kelas rendah/sedang/tinggi)")

# Tabular view of every metric computed above (one row per item/model pair).
metrics_df = pd.DataFrame(metric_records)

print("\nSelesai menghitung metrik untuk semua item.")


=== Evaluasi untuk Item: Maize ===


NameError: name 'accuracy_score' is not defined