In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Data preparation -------------------------------------------------------
# Read the CSV, discard the columns that are not used as features, and
# one-hot encode the crop name ('Item'); drop_first=True leaves out the first
# category's indicator column.
file_path = "yield_india.csv"
df = (
    pd.read_csv(file_path)
    .drop(columns=['Unnamed: 0', 'Area', 'Year'])
    .pipe(pd.get_dummies, columns=['Item'], drop_first=True)
)

# Target is the yield column; every remaining column is a feature.
y = df['hg/ha_yield']
X = df.drop(columns=['hg/ha_yield'])

# --- Train / test split (80% / 20%, fixed seed) -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model fitting ----------------------------------------------------------
# fit() returns the estimator itself, so construction and training can be
# written as a single expression.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# --- Evaluation on the held-out split ---------------------------------------
y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")

# --- Feature importance, largest first --------------------------------------
feature_names = X.columns
importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance_df)


In [None]:
import numpy as np
import pandas as pd

# Function to take manual input and predict yield
def predict_yield(model, feature_names, avg_temp=None, avg_rainfall=None,
                  pesticides=None, crop=None):
    """Predict the crop yield (hg/ha) for a single set of growing conditions.

    Each of ``avg_temp``, ``avg_rainfall``, ``pesticides`` and ``crop`` that
    is left as ``None`` is requested interactively through ``input()``.
    Passing all four lets the function run without any prompt, which keeps
    the notebook reproducible under "Restart & Run All".

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``; it is called with a one-row
        DataFrame whose columns follow ``feature_names``.
    feature_names
        Iterable of the column names the model was trained on. One-hot crop
        columns are recognised by their ``"Item_"`` prefix.
    avg_temp
        Average temperature (°C), or ``None`` to prompt.
    avg_rainfall
        Average rainfall (mm per year), or ``None`` to prompt.
    pesticides
        Amount of pesticides used (tonnes), or ``None`` to prompt.
    crop
        Crop name matching one of the ``Item_<crop>`` columns, or ``None`` to
        prompt. Surrounding whitespace is ignored.

    Returns
    -------
    float or None
        The predicted yield, or ``None`` when the crop name has no matching
        one-hot column (an error message is printed in that case).

    Raises
    ------
    ValueError
        If a numeric value (typed or passed) cannot be converted to float.
    """
    crop_prefix = "Item_"
    feature_names = [str(name) for name in feature_names]

    # Numeric inputs: use the supplied value, otherwise ask the user.
    if avg_temp is None:
        avg_temp = input("Enter the average temperature (°C): ")
    if avg_rainfall is None:
        avg_rainfall = input("Enter the average rainfall (mm per year): ")
    if pesticides is None:
        pesticides = input("Enter the amount of pesticides used (tonnes): ")
    avg_temp = float(avg_temp)
    avg_rainfall = float(avg_rainfall)
    pesticides = float(pesticides)

    # Derive the selectable crops from the model's own one-hot columns so the
    # list shown to the user can never drift from what the model accepts.
    # Note: a category dropped during encoding (e.g. drop_first=True) has no
    # "Item_" column and is therefore not selectable here.
    crop_types = [
        name[len(crop_prefix):]
        for name in feature_names
        if name.startswith(crop_prefix)
    ]

    if crop is None:
        print("\nAvailable Crops:", crop_types)
        crop = input("Enter the crop name exactly as listed: ")
    # Tolerate accidental leading/trailing spaces in the crop name.
    crop = str(crop).strip()

    # Start from an all-zero row in training column order, then fill it in.
    input_data = {feature: 0 for feature in feature_names}
    input_data["average_rain_fall_mm_per_year"] = avg_rainfall
    input_data["pesticides_tonnes"] = pesticides
    input_data["avg_temp"] = avg_temp

    # Switch on the one-hot column of the chosen crop.
    crop_feature_name = f"{crop_prefix}{crop}"
    if crop_feature_name not in input_data:
        print("Error: Crop name does not match available options.")
        return None
    input_data[crop_feature_name] = 1

    # Single-row DataFrame so the column names match the training format.
    input_df = pd.DataFrame([input_data])

    predicted_yield = model.predict(input_df)[0]

    print(f"\nPredicted Crop Yield: {predicted_yield:.2f} hg/ha")
    return float(predicted_yield)

# Interactive check: predict with the fitted forest, using the training
# frame's columns as the expected feature layout.
predict_yield(model=rf_model, feature_names=X.columns)


In [None]:
import pickle

# Serialize the fitted model to disk with pickle.
model_filename = "crop_yield_model.pkl"
with open(model_filename, mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

print(f"Model saved as {model_filename}")


In [None]:
# Re-read the raw CSV and show the distinct values of its 'Item' column.
file_path = "yield_india.csv"
ff = pd.read_csv(file_path)

ff.loc[:, 'Item'].unique()

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

def _time_index(section):
    """Build the left-inclusive UTC date range spanned by an hourly/daily section."""
    return pd.date_range(
        start=pd.to_datetime(section.Time(), unit="s", utc=True),
        end=pd.to_datetime(section.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=section.Interval()),
        inclusive="left",
    )


# Open-Meteo client built on a cached session (expire_after=3600) that is
# wrapped with retries (5 attempts, backoff_factor=0.2).
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Request definition. The position of each variable inside "current",
# "hourly" and "daily" is what the Variables(i) lookups below rely on.
url = "https://api.open-meteo.com/v1/forecast"
params = dict(
    latitude=56,
    longitude=22,
    current=["temperature_2m", "relative_humidity_2m", "surface_pressure"],
    hourly="vapour_pressure_deficit",
    daily=["precipitation_sum", "rain_sum"],
)
responses = openmeteo.weather_api(url, params=params)

# Only the first location is processed; loop over `responses` to handle
# several locations or weather models.
response = responses[0]

# Current values, read in the same order as requested above.
current = response.Current()
(
    current_temperature_2m,
    current_relative_humidity_2m,
    current_surface_pressure,
) = (current.Variables(position).Value() for position in range(3))

# Hourly series: a single variable, index 0.
hourly = response.Hourly()
hourly_vapour_pressure_deficit = hourly.Variables(0).ValuesAsNumpy()

hourly_data = {
    "date": _time_index(hourly),
    "vapour_pressure_deficit": hourly_vapour_pressure_deficit,
}
hourly_dataframe = pd.DataFrame(data=hourly_data)

# Daily series: precipitation_sum at index 0, rain_sum at index 1.
daily = response.Daily()
daily_precipitation_sum = daily.Variables(0).ValuesAsNumpy()
daily_rain_sum = daily.Variables(1).ValuesAsNumpy()

daily_data = {
    "date": _time_index(daily),
    "precipitation_sum": daily_precipitation_sum,
    "rain_sum": daily_rain_sum,
}
daily_dataframe = pd.DataFrame(data=daily_data)

# Mean of the daily rain_sum values over the returned days.
print(daily_dataframe['rain_sum'].mean())