In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# ==========================================
# Configuration
# ==========================================

# File Paths (relative to the notebook's working directory)
TRAIN_PATH = "dataset/train_interpolated.csv"
VALID_PATH = "dataset/valid_interpolated.csv"
TEST_PATH  = "dataset/test_interpolated.csv"

# Crop ID to Name mapping; rows whose CR_ID is not a key here are discarded below
CROP_MAPPING = {
    27: "Sesame", 2: "Pepper", 8: "Aralia", 1: "Sweet potato",
    17: "Sudangrass", 29: "Soybean", 9: "Perilla", 19: "Greenhouse",
    24: "Yuzu", 23: "Maize", 28: "Kiwi", 22: "Onion",
    16: "Apple", 30: "Grape", 14: "Peach", 10: "Garlic",
    12: "Pear", 13: "Cabbage", 11: "Sapling", 31: "Radish"
}

# Feature definitions (Bands and Time steps)
BANDS = ['b02', 'b03', 'b04', 'b05', 'b06', 'b07', 'b08', 'b8a', 'b11', 'b12']
MONTHS = [f"2021{m:02d}" for m in range(7, 13)]  # "202107" ... "202112"


def _attach_crop_names(frame, split_name):
    """Return a copy of ``frame`` with a ``crop_name`` column, keeping only mapped rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw split; must contain a ``CR_ID`` column.
    split_name : str
        Human-readable split label, used only in the error message.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not mutated) restricted to rows whose
        ``CR_ID`` is a key of ``CROP_MAPPING``. The original index labels
        are preserved.

    Raises
    ------
    ValueError
        If no row survives the mapping, which would otherwise surface much
        later as an unrelated-looking error inside the estimators.
    """
    labelled = (
        frame
        .assign(crop_name=frame["CR_ID"].map(CROP_MAPPING))
        .dropna(subset=["crop_name"])
    )
    if labelled.empty:
        raise ValueError(
            f"No rows left in the {split_name} split after mapping CR_ID to crop "
            "names; check that the CR_ID values match the CROP_MAPPING keys."
        )
    return labelled


# 1. Load Data
print("--- Loading Data ---")
raw_train = pd.read_csv(TRAIN_PATH)
raw_valid = pd.read_csv(VALID_PATH)
raw_test  = pd.read_csv(TEST_PATH)

# 2. Apply Mapping & Preprocessing
# Built as new frames instead of mutating in place, so re-running the cell
# always starts again from the freshly read CSV contents.
df_train = _attach_crop_names(raw_train, "train")
df_valid = _attach_crop_names(raw_valid, "valid")
df_test  = _attach_crop_names(raw_test, "test")

print(f"Train shape: {df_train.shape}")
print(f"Valid shape: {df_valid.shape}")
print(f"Test shape:  {df_test.shape}")

In [None]:
# 3. Define Feature List
# Column naming scheme: {band}_{YYYYMM}_{interval}, with interval running 1..3
# for every band/month pair (band varies slowest, interval fastest).
features = []
for band in BANDS:
    for month in MONTHS:
        for interval in (1, 2, 3):
            features.append(f"{band}_{month}_{interval}")
print(f"Number of input features: {len(features)}")

# 4. Prepare X (Features) and y (Target)
# The encoder is fitted on the training crop names only; the same fitted
# encoder is then applied to every split.
le = LabelEncoder()
le.fit(df_train["crop_name"])


def _to_xy(frame):
    """Return (feature matrix, encoded labels) for one split DataFrame."""
    matrix = frame[features].values
    targets = le.transform(frame["crop_name"])
    return matrix, targets


X_train, y_train = _to_xy(df_train)
X_valid, y_valid = _to_xy(df_valid)
X_test, y_test = _to_xy(df_test)

print("Data preparation complete.")
print(f"Classes: {le.classes_}")

In [None]:
# 5. Train Random Forest Classifier
print("--- Training Model ---")

# Hyper-parameters gathered in one place: 500 trees, a fixed seed for
# repeatable runs, and n_jobs=-1 as in the original configuration.
rf_settings = {
    "n_estimators": 500,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_settings)

# Fit on the training split only; validation/test are used purely for scoring
rf.fit(X_train, y_train)
print("Training complete.")

In [None]:
# 6. Evaluate on Validation Set
print("\n=== Validation Set Performance ===")
y_pred_val = rf.predict(X_valid)
print(f"Validation Accuracy: {accuracy_score(y_valid, y_pred_val):.4f}")

# 7. Evaluate on Test Set
print("\n=== Test Set Performance ===")
# perf_counter() is the clock intended for timing short intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
y_pred_test = rf.predict(X_test)
end_time = time.perf_counter()

# Calculate Metrics
acc = accuracy_score(y_test, y_pred_test)
macro_f1 = f1_score(y_test, y_pred_test, average="macro")
weighted_f1 = f1_score(y_test, y_pred_test, average='weighted')
kappa = cohen_kappa_score(y_test, y_pred_test)

# Print Metrics
print(f"Accuracy: {acc:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print("-" * 60)

# Pin the label set to the encoder's classes. Without `labels`, the report
# infers the classes from y_test/y_pred_test and raises a ValueError when that
# set is smaller than `target_names` (e.g. a crop absent from the test split).
class_ids = np.arange(len(le.classes_))
print(classification_report(
    y_test, y_pred_test, labels=class_ids, target_names=le.classes_
))

In [None]:
# 8. Model Complexity & Efficiency Analysis
print("\n=== Model Complexity & Efficiency ===")

# Model-size proxy: node count summed over every fitted tree in the forest
n_params = sum(tree.tree_.node_count for tree in rf.estimators_)
print(f"Total Parameters (Total Nodes): {n_params}")

# Time the forest's test-set inference inside this cell. The notebook-level
# `start_time` / `end_time` are reassigned by the XGBoost evaluation cell, so
# reading them here would report XGBoost's latency under the Random Forest
# heading whenever this cell is re-run after that one.
rf_timer_start = time.perf_counter()
rf.predict(X_test)
inference_time_total = time.perf_counter() - rf_timer_start

# Convert total seconds to milliseconds per test sample
inference_time_ms_per_sample = (inference_time_total / len(X_test)) * 1000
print(f"Inference Time: {inference_time_ms_per_sample:.4f} ms/sample")

In [None]:
# Train XGBoost Model
print("--- Training XGBoost Model ---")

# `use_label_encoder` is deliberately not passed: it is deprecated in recent
# XGBoost releases, where supplying it only produces a warning (confirm against
# the installed version). y_train is already integer-encoded by the
# LabelEncoder, so no additional encoding is needed.
xgb = XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1,
)

# Fit the model with validation monitoring.
# The validation split is only scored (mlogloss) during boosting; no early
# stopping is configured, so all 500 rounds are always trained.
# NOTE(review): the per-round scores should be retrievable afterwards via
# xgb.evals_result() - confirm for the installed XGBoost version.
xgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)
print("Training complete.")

In [None]:
# Pin the label set to the encoder's classes. Without `labels`,
# classification_report infers the classes from the data and raises a
# ValueError when that set is smaller than `target_names` (e.g. a crop that
# is absent from a split).
class_ids = np.arange(len(le.classes_))

# 1. Evaluate on Validation Set
print("\n=== Validation Set Performance ===")
y_pred_val = xgb.predict(X_valid)
print(f"Accuracy: {accuracy_score(y_valid, y_pred_val):.4f}")
print(classification_report(
    y_valid, y_pred_val, labels=class_ids, target_names=le.classes_
))

# 2. Evaluate on Test Set
print("\n=== Test Set Performance ===")
# perf_counter() is the clock intended for timing short intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
y_pred_test = xgb.predict(X_test)
end_time = time.perf_counter()

# Calculate Metrics
acc = accuracy_score(y_test, y_pred_test)
macro_f1 = f1_score(y_test, y_pred_test, average="macro")
weighted_f1 = f1_score(y_test, y_pred_test, average='weighted')
kappa = cohen_kappa_score(y_test, y_pred_test)

# Print Metrics
print(f"Accuracy: {acc:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print("-" * 60)
print(classification_report(
    y_test, y_pred_test, labels=class_ids, target_names=le.classes_
))

# 3. Model Complexity & Efficiency Analysis
print("\n=== Model Complexity & Efficiency ===")

# Calculate Total Parameters (Total Nodes in all trees):
# the booster's tree dump is tabulated and its row count is reported.
booster = xgb.get_booster()
df_trees = booster.trees_to_dataframe()
n_params = df_trees.shape[0]
print(f"Total Parameters (Total Nodes): {n_params}")

# Calculate Inference Time (total seconds -> milliseconds per test sample)
inference_time_total = end_time - start_time
inference_time_ms_per_sample = (inference_time_total / len(X_test)) * 1000
print(f"Inference Time: {inference_time_ms_per_sample:.4f} ms/sample")

In [None]:
# Plot Confusion Matrix
# (matplotlib.pyplot, confusion_matrix and ConfusionMatrixDisplay are imported
# in the notebook's top import cell.)
print("--- Plotting Confusion Matrix ---")

# Predict with the XGBoost model explicitly. `y_pred_test` is also assigned by
# the Random Forest evaluation cell, so reusing it could silently plot the
# wrong model under the "XGBoost" title depending on execution order.
y_pred_xgb = xgb.predict(X_test)

# Compute confusion matrix over the encoder's full class list, so the matrix
# always has one row/column per entry of `display_labels`, even when a crop is
# missing from both y_test and the predictions.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_test, y_pred_xgb, labels=class_ids)

# Plot settings (applied globally through rcParams)
# NOTE(review): 'Times New Roman' must be installed on the machine; otherwise
# matplotlib presumably falls back to its default font with a warning - confirm.
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(figsize=(10, 8)) # Adjusted size for better visibility
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)

# Display plot; tick labels are rotated so the crop names do not overlap
disp.plot(cmap="Blues", ax=ax, xticks_rotation=90)
ax.set_title("XGBoost Confusion Matrix")

plt.tight_layout()
plt.show()