In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = "yield_india.csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
df.drop(columns=['Unnamed: 0', 'Area'], inplace=True)  # Removed 'Year'

# Convert categorical column 'Item' to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['Item'], drop_first=True)

# Define features and target variable
X = df.drop(columns=['hg/ha_yield'])
y = df['hg/ha_yield']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

# Feature Importance
importances = rf_model.feature_importances_
feature_names = X.columns

# Display feature importance
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)


Mean Absolute Error (MAE): 1.744975308641988
Mean Squared Error (MSE): 198.9652295061781
Root Mean Squared Error (RMSE): 14.105503518349783
R² Score: 0.9999999795972386

Feature Importance:
                          Feature  Importance
9             Item_Sweet potatoes    0.261412
6                Item_Rice, paddy    0.183211
5                   Item_Potatoes    0.163204
10                     Item_Wheat    0.143025
4                      Item_Maize    0.082655
8                   Item_Soybeans    0.066920
7                    Item_Sorghum    0.055732
2               pesticides_tonnes    0.043833
3                        avg_temp    0.000007
1   average_rain_fall_mm_per_year    0.000000
0                      Unnamed: 3    0.000000


In [None]:
import numpy as np
import pandas as pd

# Function to take manual input and predict yield
def predict_yield(model, feature_names):
    # Manually enter values
    avg_temp = float(input("Enter the average temperature (°C): "))
    avg_rainfall = float(input("Enter the average rainfall (mm per year): "))
    pesticides = float(input("Enter the amount of pesticides used (tonnes): "))

    # Crop selection
    crop_types = [
        "Maize", "Potatoes", "Rice, paddy", "Sorghum",
        "Soybeans", "Sweet potatoes", "Wheat"
    ]

    print("\nAvailable Crops:", crop_types)
    crop = input("Enter the crop name exactly as listed: ")

    # Create a dictionary with all feature names initialized to 0
    input_data = {feature: 0 for feature in feature_names}

    # Assign input values
    input_data["average_rain_fall_mm_per_year"] = avg_rainfall
    input_data["pesticides_tonnes"] = pesticides
    input_data["avg_temp"] = avg_temp

    # Set the correct crop to 1 in the one-hot encoding
    crop_feature_name = f"Item_{crop}"
    if crop_feature_name in input_data:
        input_data[crop_feature_name] = 1
    else:
        print("Error: Crop name does not match available options.")
        return

    # Convert to DataFrame to match training format
    input_df = pd.DataFrame([input_data])

    # Predict yield
    predicted_yield = model.predict(input_df)[0]

    print(f"\nPredicted Crop Yield: {predicted_yield:.2f} hg/ha")

# Run the manual test function with feature names from trained model
predict_yield(rf_model, X.columns)


In [None]:
# import pickle

# # Save the trained model to a file
# model_filename = "crop_yield_model.pkl"
# with open(model_filename, 'wb') as file:
#     pickle.dump(rf_model, file)

# print(f"Model saved as {model_filename}")


In [None]:
# file_path = "yield_india.csv"
# ff = pd.read_csv(file_path)

# ff['Item'].unique()