In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [2]:
# Load the training data.
# NOTE: this is an absolute Google Drive path (presumably a Colab mount under
# /content/drive), not a file sitting next to the notebook, so the Drive must
# be available before this cell runs.
df = pd.read_csv(
    "/content/drive/MyDrive/Yearbook of Agricultural Statistics/train1.csv"
)

In [3]:
# --- Preprocessing ---

# --- 1. Bin the target variable
# The continuous production figure is cut into four quantile-based classes so
# the problem can be handled as a classification task.
target_col = "Production (M.Ton)"
category_labels = ["Low", "Medium", "High", "Very High"]
try:
    df["Production_Category"] = pd.qcut(df[target_col], q=4, labels=category_labels)
except ValueError:
    # Quantile edges are not unique (many identical target values).
    # Combining duplicates="drop" with four explicit labels cannot work here:
    # pandas removes the repeated edges, ends up with fewer than four bins and
    # then rejects the label list for having the wrong length. Ranking with
    # method="first" gives every row a distinct rank, which keeps the quartile
    # edges unique so all four labels can be applied. Trade-off: rows with
    # identical target values may fall into adjacent categories.
    df["Production_Category"] = pd.qcut(
        df[target_col].rank(method="first"), q=4, labels=category_labels
    )

# Drop the original continuous target variable so it cannot leak into the features
df = df.drop(columns=[target_col])

In [4]:
# 2. One-hot encode the categorical features
# Every object-dtype column is treated as categorical. drop_first=True omits
# the first level of each encoded column, so a feature with k levels becomes
# k - 1 indicator columns.
categorical_cols = df.select_dtypes(include="object").columns
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)


In [5]:
# 3. Split the encoded frame into the feature matrix X and the label vector y
# X keeps every column except the binned target; y is the binned target alone.
X = df_encoded.drop(columns="Production_Category")
y = df_encoded.loc[:, "Production_Category"]

In [6]:
# 4. Hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs, and
# stratify=y preserves the class proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Preprocessing complete.")

Preprocessing complete.


In [7]:
# --- Random Forest Model Training ---

print("\nTraining Random Forest Classifier...")
# Build the (still unfitted) forest with the settings carried over from the
# previous run: 100 trees, a fixed seed for repeatability, and n_jobs=-1 to
# parallelise across all available cores.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)


Training Random Forest Classifier...


In [8]:
# Fit the forest on the training partition only; the test rows stay unseen
rf_classifier.fit(X=X_train, y=y_train)

print("Model training complete.")


Model training complete.


In [9]:
# --- Model Evaluation ---

print("\nEvaluating model...")
# Produce class predictions for the held-out test rows
y_pred_rf = rf_classifier.predict(X=X_test)



Evaluating model...


In [10]:
# Score the test-set predictions against the true labels
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
# Macro F1 averages the per-class scores equally; weighted F1 weights each
# class by its support.
f1_macro_rf, f1_weighted_rf = (
    f1_score(y_test, y_pred_rf, average=averaging)
    for averaging in ("macro", "weighted")
)

# Print the summary: headline accuracy, per-class report, then both F1 averages
print("\n--- Random Forest Classification Metrics ---")
print(f"Accuracy: {accuracy_rf:.4f}")
print("\nClassification Report:")
print(report_rf)
print(f"Macro F1 Score: {f1_macro_rf:.4f}")
print(f"Weighted Average F1 Score: {f1_weighted_rf:.4f}")


--- Random Forest Classification Metrics ---
Accuracy: 0.8203

Classification Report:
              precision    recall  f1-score   support

        High       0.75      0.80      0.77       256
         Low       0.89      0.86      0.87       256
      Medium       0.76      0.77      0.76       256
   Very High       0.90      0.86      0.88       256

    accuracy                           0.82      1024
   macro avg       0.82      0.82      0.82      1024
weighted avg       0.82      0.82      0.82      1024

Macro F1 Score: 0.8214
Weighted Average F1 Score: 0.8214
